In [62]:
from sklearn import tree
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

# Импортируем код для визулизации дерева решений

from IPython.display import SVG
from graphviz import Source
from IPython.display import display

from IPython.display import HTML
# style = "<style>svg{width:50% !important;height:50% !important;}</style>"
# HTML(style)

In [2]:
# Импортируем данные из соревнования, сохраняя путь к файлам в переменной DATA_DIR
DATA_DIR = 'data/'
train = pd.read_csv(DATA_DIR + 'train.csv')
train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Id
0,1,85,66,29,0,26.6,0.351,31,0,1
1,8,183,64,0,0,23.3,0.672,32,1,2
2,1,89,66,23,94,28.1,0.167,21,0,3
3,0,137,40,35,168,43.1,2.288,33,1,4
4,5,116,74,0,0,25.6,0.201,30,0,5


In [38]:
# проверка пустых строк
for column in train.columns:
    print(pd.isna(train[column]).value_counts())

False    552
Name: Pregnancies, dtype: int64
False    552
Name: Glucose, dtype: int64
False    552
Name: BloodPressure, dtype: int64
False    552
Name: SkinThickness, dtype: int64
False    552
Name: Insulin, dtype: int64
False    552
Name: BMI, dtype: int64
False    552
Name: DiabetesPedigreeFunction, dtype: int64
False    552
Name: Age, dtype: int64
False    552
Name: Outcome, dtype: int64
False    552
Name: Id, dtype: int64


In [3]:
X = train.drop(['Outcome'], axis=1)
X = StandardScaler().fit_transform(X)
y = train.Outcome

In [4]:
# clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=6)

In [62]:
# outdated block of code:

# def my_custom_scorer(X, y):
#     f_score_list = []
#     for train_index, test_index in kf.split(X):
#         X_train, X_val = X[train_index], X[test_index]
#         Y_train, Y_val = y.iloc[train_index], y.iloc[test_index]

#         clf.fit(X_train, Y_train)
#         Y_pred = clf.predict(X_val)
#         f_score_list.append(f1_score(Y_val, Y_pred))
#     return sum(f_score_list)/len(f_score_list)

# print(my_custom_scorer(X, y))
# my_scorer = make_scorer(my_custom_scorer, greater_is_better=True)

0.6105177571145303


In [43]:
# бейзлайн 
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=42)

param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [5, 6, 7, 8, 9]    
}

clf_rand = RandomizedSearchCV(estimator=clf, param_distributions=param_grid, n_iter=3, cv=5, n_jobs=-1, random_state=42)

search = clf_rand.fit(X, y)
dt_best_params = search.best_params_
print('best_params_ ', search.best_params_)
print('best_score_ ', search.best_score_)

best_params_  {'splitter': 'best', 'max_depth': 5, 'criterion': 'gini'}
best_score_  0.7176248976248977


In [42]:
# Random Forest 
clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)

param_grid = {
    'n_estimators': [int(x) for x in np.linspace(start=100, stop=2000, num=20)],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [int(x) for x in np.linspace(start=2, stop=20, num=19)],
    'max_depth': [n for n in range(11)]    
}

clf_rand = RandomizedSearchCV(estimator=clf, param_distributions=param_grid, n_iter=3, cv=5, n_jobs=-1, random_state=42)

search = clf_rand.fit(X, y)
rf_best_params = search.best_params_
print('best_params_ ', search.best_params_)
print('best_score_ ', search.best_score_)


best_params_  {'n_estimators': 1100, 'min_samples_split': 5, 'max_depth': 3, 'criterion': 'entropy'}
best_score_  0.75007371007371


In [46]:
# Gradient boosting
# Random Forest 
clf = GradientBoostingClassifier(n_estimators=200, random_state=42)

param_grid = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [x for x in np.linspace(start=0, stop=0.2, num=100)],
    'n_estimators': [int(x) for x in np.linspace(start=100, stop=2000, num=20)],
    'criterion': ['friedman_mse', 'mse', 'mae'],
    'min_samples_split': [int(x) for x in np.linspace(start=2, stop=20, num=19)],
    'max_depth': [n for n in range(11)]    
}

clf_rand = RandomizedSearchCV(estimator=clf, param_distributions=param_grid, n_iter=3, cv=5, n_jobs=-1, random_state=42)

search = clf_rand.fit(X, y)
gb_best_params = search.best_params_
print('best_params_ ', search.best_params_)
print('best_score_ ', search.best_score_)


best_params_  {'n_estimators': 500, 'min_samples_split': 2, 'max_depth': 3, 'loss': 'exponential', 'learning_rate': 0.13333333333333333, 'criterion': 'mae'}
best_score_  0.7318755118755119


In [84]:
# Ensemble of 3 models 
clf_dt = tree.DecisionTreeClassifier(**dt_best_params)
clf_rf = RandomForestClassifier(**rf_best_params)
clf_gb = GradientBoostingClassifier(**gb_best_params)

classifiers = [
    ('dt', clf_dt), 
    ('rf', clf_rf), 
    ('gb', clf_gb)
    ]



meta_clf = LogisticRegression()
stacking_clf = StackingClassifier(estimators=classifiers, final_estimator=meta_clf, cv=10, stack_method='predict_proba', n_jobs=-1, passthrough=True)

X_train, X_val, Y_train, Y_val = train_test_split(X, y, test_size=0.2)
kf = KFold(n_splits=10, shuffle=True)

stacking_clf.fit(X_train, Y_train).score(X_val, Y_val)


0.7927927927927928

In [85]:
stacking_clf.get_params

<bound method _BaseHeterogeneousEnsemble.get_params of StackingClassifier(cv=10,
                   estimators=[('dt', DecisionTreeClassifier(max_depth=5)),
                               ('rf',
                                RandomForestClassifier(criterion='entropy',
                                                       max_depth=3,
                                                       min_samples_split=5,
                                                       n_estimators=1100)),
                               ('gb',
                                GradientBoostingClassifier(criterion='mae',
                                                           learning_rate=0.13333333333333333,
                                                           loss='exponential',
                                                           n_estimators=500))],
                   final_estimator=LogisticRegression(), n_jobs=-1,
                   passthrough=True, stack_method='predict_proba')>

In [86]:
# saving results on test
test = pd.read_csv(DATA_DIR + 'test.csv')
test = StandardScaler().fit_transform(test)

submission = stacking_clf.predict(test)

In [87]:
sample_submission = pd.read_csv(DATA_DIR + 'sample_submission.csv')
# Сохраняем предсказания в отдельный файл, он будет доступен для скачивания справа 
sample_submission['Outcome'] = submission
sample_submission.to_csv('submission.csv', index=False)
sample_submission.head(10)

Unnamed: 0,Id,Outcome
0,0,1
1,9,0
2,11,1
3,12,1
4,14,1
5,17,0
6,20,1
7,22,1
8,24,1
9,26,1
