In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline

# Set the default figure size (in inches) for every plot drawn in this notebook.
sns.set(rc={'figure.figsize':(11,8)})

In [2]:
# Load the training CSV and preview its first row to check the column layout.
df_src = pd.read_csv('data/titanic/train.csv')
df_src.head(1)
# Uncomment to count the missing values in each column instead:
#df_src.isna().sum()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [3]:
def prepare_dataframe(df, train = True, age_fill = None):
    """Turn a raw passenger frame into a feature matrix and, for training data, a target.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with at least the columns 'PassengerId', 'Pclass', 'Sex',
        'Age' and 'Fare' (plus 'Survived' when ``train`` is True).
        The frame itself is not modified.
    train : bool, default True
        When True, 'Survived' is split off and returned as ``y``.
        When False, ``y`` is None.
    age_fill : float, optional
        Value used to replace missing 'Age' entries. When omitted, the median
        of ``df['Age']`` is used (the original behaviour). Pass the median of
        the training frame when preparing unlabeled data so that both frames
        are imputed with the same statistic.

    Returns
    -------
    X : pandas.DataFrame
        Indexed by 'PassengerId'; holds 'Pclass', 'Age', 'Fare' and the
        one-hot columns that ``pd.get_dummies`` derives from 'Sex'.
    y : pandas.Series or None
        The 'Survived' column, or None when ``train`` is False.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    """
    if train:
        y = df['Survived']
        X = df.drop(columns=['Survived'])
    else:
        y = None
        X = df

    if age_fill is None:
        age_fill = X['Age'].median()

    # Every step below returns a new frame, so the caller's ``df`` stays untouched.
    X = X.fillna({'Age': age_fill})
    X = X.fillna(0)  # any remaining gap (e.g. a missing fare) becomes 0
    X = X.set_index('PassengerId')
    X = X[['Pclass', 'Sex', 'Age', 'Fare']]

    # The dummy columns depend on the categories present in ``df``.
    return pd.get_dummies(X), y

In [4]:
# Goal: tune the hyper-parameters of a decision tree classifier.
# Prepare the features, then hold out 25% of the rows for evaluation.
X, y = prepare_dataframe(df_src, train=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
)
# Display the shapes of all four pieces as a sanity check.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((668, 5), (668,), (223, 5), (223,))

In [9]:
# Run a large sweep over the hyper-parameters.
# Each (max_depth, min_samples_leaf, min_samples_split) combination is scored twice:
# by the mean 5-fold cross-validation score on the training split, and by the
# score of a tree fitted on the training split and evaluated on the hold-out split.
i_arr, l_arr, s_arr = [], [], []
train_scores, test_scores = [], []

size_candidates = [2, 3, 5, 10, 15, 20, 25, 50]  # tried for both leaf and split sizes
param_grid = [
    (i, l, s)
    for i in range(1, 10)
    for l in size_candidates
    for s in size_candidates
]

for i, l, s in param_grid:
    clf = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=i,
        min_samples_leaf=l,
        min_samples_split=s,
        random_state=42,
    )
    # Record the combination first so the parameter lists line up with the scores.
    i_arr.append(i)
    l_arr.append(l)
    s_arr.append(s)

    fold_scores = cross_val_score(clf, X_train, y_train, cv=5)
    train_scores.append(fold_scores.mean())

    clf.fit(X_train, y_train)
    test_scores.append(clf.score(X_test, y_test))


In [10]:
# Collect the sweep results into one table: one row per hyper-parameter combination.
df_res = pd.DataFrame(
    {
        'depth': i_arr,
        'min_leaf': l_arr,
        'min_split': s_arr,
        'train_score': train_scores,
        'test_score': test_scores,
    }
)

In [11]:
# Show every combination that ties for the highest hold-out score.
# NOTE(review): ranking on 'test_score' tunes on the hold-out split; 'train_score'
# (the cross-validation mean) is available in the same table - confirm which is intended.
is_best = df_res['test_score'].eq(df_res['test_score'].max())
df_res.loc[is_best]

Unnamed: 0,depth,min_leaf,min_split,train_score,test_score
192,4,2,2,0.80689,0.825112
193,4,2,3,0.80689,0.825112
194,4,2,5,0.80689,0.825112
195,4,2,10,0.80689,0.825112
469,8,5,20,0.796353,0.825112
470,8,5,25,0.791909,0.825112
534,9,5,25,0.797924,0.825112


In [12]:
# Fit the chosen tree on all labelled rows and write predictions for the unlabeled file.
df_test = pd.read_csv('data/titanic/test.csv')
X_res, _ = prepare_dataframe(df_test, train=False)

final_params = {
    'criterion': 'entropy',
    'max_depth': 8,
    'min_samples_leaf': 5,
    'min_samples_split': 20,
    'random_state': 42,
}
clf = DecisionTreeClassifier(**final_params)
clf.fit(X, y)

# The prediction is computed from the feature columns before 'Survived' is attached.
X_res['Survived'] = clf.predict(X_res)
# Only the index (PassengerId) and the predicted label go into the output file.
X_res.loc[:, ['Survived']].to_csv('data/titanic/result.csv')