https://www.kaggle.com/c/titanic

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pylab

In [2]:
# Load the Kaggle Titanic training set; the CSV is expected next to the notebook.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

FileNotFoundError: [Errno 2] File train.csv does not exist: 'train.csv'

## Initial processing and new features

In [None]:
# Per-column count of missing values, restricted to columns that have any.
null_totals = df.isnull().sum()
null_totals.loc[null_totals > 0]

In [None]:
# PassengerId is only a row identifier: verify it is unique, then drop it.
# The original comparison was evaluated and thrown away (it was not the last
# expression of the cell), so it never checked anything.
# The membership guard keeps the cell re-runnable once the column is gone.
if 'PassengerId' in df.columns:
    assert df['PassengerId'].is_unique, 'PassengerId contains duplicates'
    df.drop(columns='PassengerId', inplace=True)

In [None]:
# Share of distinct ticket values among all rows. A high ratio means the raw
# Ticket column is close to an identifier, so dropping it is reasonable.
df['Ticket'].nunique() / len(df)

In [None]:
# Family size = siblings/spouses + parents/children aboard.
df['Family'] = df['SibSp'] + df['Parch']

# Salutation = text between the first comma and the first period of the name.
# The slice starts right after the comma, so any space following it is kept.
df['Greeting'] = [
    full_name[full_name.index(',') + 1 : full_name.index('.')]
    for full_name in df['Name']
]

is_male = df['Sex'] == 'male'
travels_alone = df['Family'] == 0

# 1 for men travelling with no relatives, 0 otherwise.
df['Lone_male_passenger'] = np.where(travels_alone & is_male, 1, 0)

# 1 for men older than 20 travelling with relatives, 0 otherwise.
# NOTE(review): Age still contains NaN here, and NaN > 20 is False, so men
# with unknown age get 0 — confirm that is intended.
df['Not_alone_male_passenger'] = np.where(
    ~travels_alone & is_male & (df['Age'] > 20), 1, 0
)

## Fill the gaps

In [None]:
# Count missing values once, keep only columns that have some, plot and show them.
missing_counts = df.isnull().sum()
missing_counts = missing_counts[missing_counts > 0]
sns.barplot(x=missing_counts.values, y=missing_counts.index)
missing_counts

In [None]:
# Survival rate among passengers with a recorded cabin vs. those without one.
cabin_known = df['Cabin'].notnull()
for caption, row_mask in (
    ('Known values of cabins:', cabin_known),
    ('Nans-values of cabins:', ~cabin_known),
):
    group = df[row_mask]
    print(caption, group['Survived'].sum() / group.shape[0])

In [None]:
# Flag whether a cabin was recorded at all (must be computed before filling).
# NOTE: run once per fresh load — after the fill below every cabin is non-null.
df['Known_Cabin'] = df['Cabin'].notnull()

# Replace missing cabins with the placeholder 'N' and keep only the first
# character of each value. Assigning the result back to the column (instead of
# calling fillna(inplace=True) on `df.Cabin`) is required for the change to
# reach `df` under pandas Copy-on-Write, where chained in-place calls operate
# on a temporary object.
df['Cabin'] = df['Cabin'].fillna('N').str[0]

In [None]:
# Fill missing ports of embarkation with the most frequent port.
# Assign back to the column: a chained `df.Embarked.fillna(..., inplace=True)`
# modifies a temporary object under pandas Copy-on-Write and leaves `df` unchanged.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [None]:
# Impute missing ages with the median age of passengers sharing the same
# salutation. groupby/transform does this for every row at once, replacing the
# original row-by-row `.loc` loop.
age_by_greeting = df.groupby('Greeting')['Age'].transform('median')

# If a salutation has no known ages at all its group median is NaN; fall back
# to the overall median so that no NaN survives into the derived age features.
df['Age'] = df['Age'].fillna(age_by_greeting).fillna(df['Age'].median())

In [None]:
# Bucket ages: shift by 5, then round to the nearest ten.
df['Round_Age'] = np.around(df['Age'] + 5, -1)

# Pass variables by keyword: seaborn >= 0.12 interprets the first positional
# argument as `data`, and `hue` without `x`/`y` is then rejected.
sns.countplot(x='Round_Age', hue='Survived', data=df)

In [None]:
plt.figure(figsize=(12, 5))

# Fares rounded to the nearest ten, used here only to inspect survival per bucket.
df['Round_Fare'] = np.around(df['Fare'], -1)

# Keyword arguments: seaborn >= 0.12 treats the first positional argument as
# `data`, which breaks the original positional-Series call.
sns.countplot(x='Round_Fare', hue='Survived', data=df)

def cat_fare(mas):
    """Map fares rounded to the nearest ten onto coarse category labels.

    The rules are applied one after another, in this order:
    values <= 10 -> 1; 20 or 30 -> 2; 40 or 50 -> 3; 60 or >= 80 -> 5; 70 -> 4.
    Values matching no rule (including NaN) pass through unchanged.

    Parameters
    ----------
    mas : array-like of numbers
        Fare values; callers in this notebook pass fares rounded to tens.

    Returns
    -------
    numpy.ndarray
        The relabelled values converted to strings.
    """
    # (predicate, label) pairs; each predicate sees the output of the previous step.
    relabel_steps = (
        (lambda values: values <= 10, 1),
        (lambda values: (values == 20) | (values == 30), 2),
        (lambda values: (values == 40) | (values == 50), 3),
        (lambda values: (values == 60) | (values >= 80), 5),
        (lambda values: values == 70, 4),
    )

    labelled = mas
    for matches, label in relabel_steps:
        labelled = np.where(matches(labelled), label, labelled)

    return labelled.astype('str')

# Replace the rounded fares with their coarse category labels (strings).
df['Round_Fare'] = cat_fare(df['Fare'].round(-1))

In [None]:
# Survival counts per fare category. Keyword arguments are required because
# seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='Round_Fare', hue='Survived', data=df)

## Feature visualisation

### Barplots

In [None]:
# Keep the three most frequent salutations and merge the rest into 'Other'.
# The top-3 set is computed once (the original recomputed value_counts for
# every row inside the lambda).
top_greetings = df['Greeting'].value_counts().index[:3]
df['Greeting'] = df['Greeting'].where(df['Greeting'].isin(top_greetings), 'Other')

# Take labels and heights from the same value_counts result. The original
# paired `unique()` (order of appearance) with `value_counts()` (descending
# frequency), which attaches counts to the wrong salutations.
total_counts = df['Greeting'].value_counts()
survived_counts = (
    df.loc[df['Survived'] == 1, 'Greeting']
    .value_counts()
    .reindex(total_counts.index, fill_value=0)  # same bar order as the totals
)

plt.figure(figsize=(12, 5))
sns.set_color_codes("pastel")
sns.barplot(x=list(total_counts.index), y=total_counts.values, color='g')

sns.set_color_codes("muted")
sns.barplot(x=list(survived_counts.index), y=survived_counts.values, color='r')

# The bars show counts: green is every passenger, red (drawn on top) is survivors.
plt.title('Passengers and survivors by salutation')
plt.text(2, 450, s='Green - all passengers, red - survivors')
plt.show()

In [None]:
# 3x3 grid: mean of Survived for every level of each categorical-like feature.
plt.figure(figsize=(12, 12))
panel_columns = ['Pclass', 'Sex', 'Embarked', 'Known_Cabin', 'Family',
                 'Cabin', 'Round_Fare', 'SibSp', 'Parch']
for position, column in enumerate(panel_columns, start=1):
    pylab.subplot(3, 3, position)
    sns.barplot(x=column, y='Survived', data=df)
    plt.tight_layout()

All features except Cabin show clearly different survival rates across their categories.

### Distribution plots

In [None]:
# Overlay the fare distributions of non-survivors (0) and survivors (1).
for survived_flag in (0, 1):
    sns.distplot(df.loc[df['Survived'] == survived_flag, 'Fare'])
plt.title('Fare of tickets in target classes')
plt.show()

In [None]:
# Overlay the age distributions of non-survivors (0) and survivors (1).
for survived_flag in (0, 1):
    sns.distplot(df.loc[df['Survived'] == survived_flag, 'Age'])
plt.title('Age in target classes')
plt.show()

## Preparing features for model

In [None]:
# Remove raw columns that were replaced by engineered features or are not used by the model.
unused_columns = ['Ticket', 'Greeting', 'Cabin', 'Name', 'Family', 'Age', 'Fare']
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Object-dtype columns are the ones that get one-hot encoded.
cat_fs = df.select_dtypes(include='O')

In [None]:
# Append the one-hot encoded categorical columns next to the existing ones.
dummy_columns = pd.get_dummies(cat_fs)
new_df = pd.concat([df, dummy_columns], axis='columns')
new_df.head()

In [None]:
# The original categorical columns are redundant once their dummies exist.
new_df.drop(columns=['Sex', 'Embarked', 'Round_Fare'], inplace=True)

In [None]:
# Print column names, dtypes and non-null counts of the model-ready frame.
new_df.info()

## Random forest and grad boosting

In [None]:
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import cross_validate, StratifiedShuffleSplit, GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, make_scorer, accuracy_score

### Baseline

In [None]:
# Hold out 25% of the rows for final evaluation.
# random_state makes the split reproducible across kernel restarts, and
# stratify keeps the survived/not-survived ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    new_df.drop('Survived', axis=1),
    new_df['Survived'],
    test_size=0.25,
    random_state=42,
    stratify=new_df['Survived'],
)

In [None]:
# Baseline models with default hyper-parameters; seeded for reproducible scores.
cls_forest = RandomForestClassifier(random_state=42)
cls_xgb = xgb.XGBClassifier(booster='gbtree', random_state=42)

# Built-in scorer names are used instead of make_scorer(...): in particular
# 'roc_auc' scores predicted probabilities, whereas make_scorer(roc_auc_score)
# would feed it hard 0/1 predictions and understate the ranking quality.
# The dictionary keys are unchanged, so 'test_f1', 'test_ROC_AUC' and
# 'test_accure' are still the keys produced by cross_validate.
scoring = {'f1': 'f1', 'ROC_AUC': 'roc_auc', 'accure': 'accuracy'}

# Stratified shuffle splits with 30% of the training data used for validation.
CV = StratifiedShuffleSplit(test_size=0.3, random_state=42)

In [None]:
# Cross-validate the default XGBoost model and report each metric per split
# together with the mean of the rounded per-split values.
score_cross_xgb = cross_validate(cls_xgb, X_train, y_train, scoring=scoring, cv=CV)

for caption, result_key in (
    ('xgb, f1-score:', 'test_f1'),
    ('xgb, roc_auc-score:', 'test_ROC_AUC'),
    ('xgb, accuracy-score:', 'test_accure'),
):
    rounded_scores = np.around(score_cross_xgb[result_key], decimals=3)
    print(caption, rounded_scores, 'mean:', rounded_scores.mean())

In [None]:
# Cross-validate the default random forest and report each metric per split
# together with the mean of the rounded per-split values.
score_cross_forest = cross_validate(cls_forest, X_train, y_train, scoring=scoring, cv=CV)

for caption, result_key in (
    ('forest, f1-score:', 'test_f1'),
    ('forest, roc_auc-score:', 'test_ROC_AUC'),
    ('forest, accuracy-score:', 'test_accure'),
):
    rounded_scores = np.around(score_cross_forest[result_key], decimals=3)
    print(caption, rounded_scores, 'mean:', rounded_scores.mean())

There is no significant difference between the two models, so I will tune both of them.

### Forest

In [None]:
# Compare the column sets of the hold-out and training feature matrices.
# The original referenced `x_test`, which is only created in the submission
# section further down, so this cell raised NameError on a fresh
# Restart & Run All; `X_test` from the split above is used instead.
X_test.isnull().sum().index, X_train.isnull().sum().index

In [None]:
# Random forest tuned over tree depth with a fixed, large number of trees.
# Both the estimator and the splitter are seeded so the selected parameters
# and the reported best score are reproducible.
cls_forest = RandomForestClassifier(random_state=42)
CV = StratifiedShuffleSplit(test_size=0.3, random_state=42)
params = {'n_estimators': [2000], 'max_depth': [2, 3]}
grid_search = GridSearchCV(cls_forest, params, cv=CV, scoring=make_scorer(f1_score), n_jobs=-1)

In [None]:
# Re-create the train / hold-out split used for tuning.
# random_state makes the split reproducible, and stratify keeps the class
# ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    new_df.drop('Survived', axis=1),
    new_df['Survived'],
    test_size=0.25,
    random_state=42,
    stratify=new_df['Survived'],
)

In [None]:
# Run the grid search over the forest parameters on the training part only.
grid_search.fit(X_train, y_train)

In [None]:
# Keep the best estimator found by the search and report how it was chosen.
bst_est_forest = grid_search.best_estimator_
for caption, value in (
    ('Best_params: ', grid_search.best_params_),
    ('Best_score: ', grid_search.best_score_),
):
    print(caption, value)

In [None]:
# Hard class predictions of the tuned forest on the hold-out part.
y_pred = bst_est_forest.predict(X_test)

In [None]:
# scikit-learn lays the binary confusion matrix out as [[TN, FP], [FN, TP]]
# (rows = true class, columns = predicted class); the original label listed
# the cells in a different order.
print('Confusion matrix [[TN, FP], [FN, TP]]:\n', confusion_matrix(y_test, y_pred))
print('F-score:', f1_score(y_test, y_pred))

# ROC AUC measures ranking quality, so it is computed from the predicted
# probability of the positive class rather than from hard 0/1 labels.
forest_proba = bst_est_forest.predict_proba(X_test)[:, 1]
print('ROC_AUC-score:', roc_auc_score(y_test, forest_proba))

### Gradient boosting trees

In [None]:
cls_xgb = xgb.XGBClassifier(random_state=42)

# Search space for the boosted trees.
# - 'min_samples_split' / 'min_samples_leaf' were removed: they are
#   scikit-learn tree parameters that XGBoost does not use, so sampling them
#   only wasted search iterations. 'min_child_weight' is the XGBoost
#   leaf-size control and takes their place.
# - 'learning_rate' is the scikit-learn wrapper's name for 'eta'.
params = {
    'n_estimators': list(range(70, 500, 30)),
    'max_depth': [2, 3, 4],
    'booster': ['gbtree', 'dart'],
    'min_child_weight': [1, 2, 4],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1],
}

# 'roc_auc' scores predicted probabilities (make_scorer(roc_auc_score) would
# score hard labels); random_state fixes which candidates are sampled.
rand_search_xgb = RandomizedSearchCV(
    cls_xgb, params, cv=CV, scoring='roc_auc', n_jobs=-1, random_state=42
)

In [None]:
# Run the randomized search over the XGBoost parameters on the training part only.
rand_search_xgb.fit(X_train, y_train)

In [None]:
# Keep the best estimator found by the search and report how it was chosen.
bst_est_xgb = rand_search_xgb.best_estimator_
for caption, value in (
    ('Best_params:', rand_search_xgb.best_params_),
    ('Best_score:', rand_search_xgb.best_score_),
):
    print(caption, value)

In [None]:
# Hard class predictions of the tuned XGBoost model on the hold-out part.
y_pred = bst_est_xgb.predict(X_test)

# scikit-learn lays the binary confusion matrix out as [[TN, FP], [FN, TP]]
# (rows = true class, columns = predicted class); the original label listed
# the cells in a different order.
print('Confusion matrix [[TN, FP], [FN, TP]]:\n', confusion_matrix(y_test, y_pred))
print('F-score:', f1_score(y_test, y_pred))

# ROC AUC is computed from the predicted probability of the positive class,
# not from hard 0/1 labels.
xgb_proba = bst_est_xgb.predict_proba(X_test)[:, 1]
print('ROC_AUC-score:', roc_auc_score(y_test, xgb_proba))

## Solution on test sample

In [None]:
# Load the Kaggle test set without the identifier and raw ticket columns.
test_df = pd.read_csv('test.csv').drop(columns=['PassengerId', 'Ticket'])
test_df.head(2)

In [None]:
# Replace missing fares in the test set with the test-set median fare.
fare_median = test_df['Fare'].median()
test_df['Fare'] = test_df['Fare'].fillna(fare_median)

In [None]:
# --- Engineered features, computed the same way as for the training set ----
test_df['Family'] = test_df['SibSp'] + test_df['Parch']
test_df['Greeting'] = test_df['Name'].apply(
    lambda name: name[name.index(',') + 1 : name.index('.')]
)
test_df['Known_Cabin'] = test_df['Cabin'].notnull()
test_df['Round_Fare'] = cat_fare(np.around(test_df['Fare'], -1))

# Computed before the age imputation, exactly as in the training cells, so
# passengers with unknown age get 0 in the second flag in both data sets.
test_is_male = test_df['Sex'] == 'male'
test_df['Lone_male_passenger'] = np.where((test_df['Family'] == 0) & test_is_male, 1, 0)
test_df['Not_alone_male_passenger'] = np.where(
    (test_df['Family'] != 0) & test_is_male & (test_df['Age'] > 20), 1, 0
)

# --- Age imputation --------------------------------------------------------
# Median age per salutation; if a salutation has no known age in the test set
# its group median is NaN, so fall back to the overall median.
test_age_by_greeting = test_df.groupby('Greeting')['Age'].transform('median')
test_df['Age'] = (
    test_df['Age'].fillna(test_age_by_greeting).fillna(test_df['Age'].median())
)

# Round_Age must be derived AFTER imputation (otherwise it inherits the NaNs)
# and with the same +5 shift that the training set uses.
test_df['Round_Age'] = np.around(test_df['Age'] + 5, -1)

# --- Encoding --------------------------------------------------------------
test_df.drop(['Greeting', 'Cabin', 'Name', 'Family', 'Age', 'Fare'], axis=1, inplace=True)

cat_fs = test_df.select_dtypes('O')
new_test_df = pd.concat([test_df, pd.get_dummies(cat_fs)], axis=1)
new_test_df.drop(['Embarked', 'Sex', 'Round_Fare'], axis=1, inplace=True)

# Align with the matrix the models were fitted on: same columns in the same
# order, with 0 for any dummy column that has no occurrences in the test set.
# Without this the feature order differs from training, which scikit-learn
# either rejects or — in older versions — silently mis-reads.
new_test_df = new_test_df.reindex(columns=X_train.columns, fill_value=0)

Submitted to Kaggle

In [None]:
# Feature matrix of the Kaggle test set (note: lower-case `x_test`, distinct
# from the hold-out `X_test` used for evaluation above).
x_test = new_test_df
# Final predictions come from the tuned random forest.
y_pred = bst_est_forest.predict(x_test)

In [154]:
# Take the passenger ids from the test file itself instead of hard-coding the
# range 892..1309: the ids then always match the rows that were predicted, and
# a length mismatch with `y_pred` raises immediately instead of going unnoticed.
submission_ids = pd.read_csv('test.csv', usecols=['PassengerId'])['PassengerId'].to_numpy()
answer = pd.DataFrame({'PassengerId': submission_ids, 'Survived': y_pred})
answer

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [155]:
# Write the submission without the DataFrame index column.
answer.to_csv(path_or_buf='Answer_titanic', index=False)

Идеи для создания признаков: 
* сделать признак-индикатор одинок ли пассажир, 
* является ли взрослым мужчиной


Ошибка состоит из трёх частей: шум, смещение, разброс. Ошибка решающих деревьев обладает низким смещением, но высоким разбросом. Чтобы избежать высокой дисперсии ошибки алгоритма, необходимо сделать деревья в композиции более независимыми (менее коррелированными).
С этой задачей могут справиться бэггинг и метод случайных подпространств (брать не n признаков, а корень из n признаков).
См. файл 4-2 в конспектах с Coursera.