In [29]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [30]:
train_data = pd.read_csv('train.csv')
#train_data.head()

train_data.columns
train_data.columns.tolist()

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [31]:
test_data = pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [32]:
full_data = pd.concat([train_data, test_data], ignore_index=True)  #full data for further optimisation

In [33]:
# some stats (bc I love stats)
total_passengers = (len(full_data))
print('total number of passengers is', total_passengers)

survived = full_data.loc[full_data.Survived == 1]['Survived']
s_rate = len(survived) / len(full_data)
print('% of survivors:', round(s_rate*100, 2))

avg_age = full_data['Age'].mean()
print('average age on the boat:', round(avg_age))

yg_survived = full_data.loc[full_data.Age <= 24]['Survived']
yg_survived_rate = len(yg_survived) / len(full_data)
print('% of people under 24 years to survive:', round(yg_survived_rate*100, 2))


total number of passengers is 1309
% of survivors: 26.13
average age on the boat: 30
% of people under 24 years to survive: 31.17


In [34]:
#lets now train our model

from sklearn.ensemble import RandomForestClassifier




y = train_data['Survived']

params = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"] 

X = pd.get_dummies(train_data[params])
X_test = pd.get_dummies(test_data[params])


model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)


"#now let's upgrade our data:\n\ndef remove_columns(df):\n    df.columns = df.columns.str.strip()\n    df = df.drop(columns = ['Name', 'Ticket', 'Cabin'])\n    return df\n\ndef fill_missing_values(df):\n    most_freq = df['Embarked'].mode()[0]\n    df['Embarked'] = df['Embarked'].fillna(most_freq)\n    median_age = df['Age'].median()\n    df['Age'] = df['Age'].fillna(median_age)\n    median_fare = df['Fare'].median()\n    df['Fare'] = df['Fare'].fillna(median_fare)\n    return df\n\n#def add_features(df):\n\ntrain_data=remove_columns(train_data)\ntrain_data=fill_missing_values(train_data)\n\ntest_data=remove_columns(test_data)\ntest_data=fill_missing_values(test_data)\n\n"

In [47]:
#import some more things

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split,GridSearchCV, StratifiedKFold,RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [37]:
num = ["Pclass", "SibSp", "Parch", "Age", "Fare"]  
cat = ["Sex", "Embarked"]


preprocess = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), num),
    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore"))
    ]), cat),
])

In [38]:
model = Pipeline([
    ("prep", preprocess),
    ("clf", RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42)),
])

X = train_data[num + cat]
y = train_data["Survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

model.fit(X_train, y_train)
print("Val accuracy:", model.score(X_test, y_test))

predictions = model.predict(test_data[num + cat])

Val accuracy: 0.7985074626865671


In [None]:
#now let's try to select what feature are important

from sklearn.inspection import permutation_importance
from scipy.stats import randint

param_grid = {
    "clf__n_estimators": randint(150, 701),           # 200..700
    "clf__max_depth": [None] + list(range(3, 13)),    # None or 3..12
    "clf__min_samples_leaf": randint(1, 8),           # 1..5
    "clf__min_samples_split": randint(2, 14),         # 2..10
    "clf__max_features": ["sqrt", None, 0.5, 0.8],
    "clf__class_weight": [None, "balanced"],
}



cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rs = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=55,              # 
    cv=cv,
    scoring="accuracy",
    n_jobs=-1,
    random_state=42
)

rs.fit(X, y)
print("Best Params:", rs.best_params_)
print("Best CV-Presicion:", rs.best_score_)

best_model = rs.best_estimator_   # уже дообучен на всем X,y




Best Params: {'clf__class_weight': None, 'clf__max_depth': None, 'clf__max_features': None, 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 13, 'clf__n_estimators': 374}
Best CV-Presicion: 0.84509446990145


'\nimport matplotlib.pyplot as plt\n\n# --- 0) Забираем результаты CV ---\ncvres = pd.DataFrame(rs.cv_results_)  # если GridSearchCV: gs.cv_results_\ncols = [\'mean_test_score\',\'std_test_score\',\'rank_test_score\'] +        [c for c in cvres.columns if c.startswith(\'param_\')]\ndf = cvres[cols].sort_values(\'rank_test_score\').reset_index(drop=True)\n\nbest_score = rs.best_score_\nbest_params = rs.best_params_\nprint("Лучшие параметры:", best_params)\nprint(f"Лучшая средняя CV-точность: {best_score:.4f}")\n\n# --- 1) Топ-20 конфигураций ---\ntopn = 20 if len(df) >= 20 else len(df)\nplt.figure(figsize=(10,5))\nplt.bar(range(topn), df.loc[:topn-1, \'mean_test_score\'])\nplt.axhline(best_score, linestyle=\'--\')\nplt.xticks(range(topn), df.loc[:topn-1, \'rank_test_score\'], rotation=0)\nplt.xlabel(\'rank_test_score (меньше = лучше)\')\nplt.ylabel(\'CV accuracy\')\nplt.title(\'Топ-конфигурации по CV\')\nplt.tight_layout()\nplt.show()\n\n\nparam_names = [c for c in df.columns if c.start

In [54]:
#let's find what parameters are important

from sklearn.base import clone

X = train_data[num + cat]
y = train_data["Survived"]

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# 2) Обучаем клон лучшей модели только на трейне
model_val = clone(best_model)
model_val.fit(X_tr, y_tr)

# 3) Permutation importance по ИСХОДНЫМ признакам (длина = len(X_val.columns))
perm = permutation_importance(
    model_val, X_val, y_val,
    n_repeats=20, random_state=42, n_jobs=-1, scoring="accuracy"
)
pi_orig = pd.Series(perm.importances_mean, index=X_val.columns).sort_values(ascending=False)

# 4) Встроенная важность леса (после препроцессинга) и агрегация к исходным колонкам
feat_names = model_val.named_steps["prep"].get_feature_names_out()
rf_imp = pd.Series(model_val.named_steps["clf"].feature_importances_, index=feat_names)

def to_orig(n: str) -> str:
    n = n.split("__", 1)[-1]   # убираем "num__"/"cat__"
    return n.split("_", 1)[0]  # "Sex_female" -> "Sex"

groups = pd.Series([to_orig(n) for n in feat_names], index=feat_names)
rf_imp_agg = rf_imp.groupby(groups).sum()

# 5) Сводная таблица: выравниваем по исходным признакам, чтобы не было рассинхрона
summary = pd.concat(
    [pi_orig.rename("Permutation_importance"),
     rf_imp_agg.rename("RF_importance")],
    axis=1
).fillna(0).sort_values("Permutation_importance", ascending=False)

print(summary.head(10))

          Permutation_importance  RF_importance
Sex                     0.189686       0.440879
Pclass                  0.095067       0.125320
Fare                    0.019731       0.182805
Age                     0.018834       0.190689
Parch                   0.003363       0.013445
Embarked               -0.001570       0.024462
SibSp                  -0.009641       0.022399


In [56]:
predictions = best_model.predict(test_data[num + cat])

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
