In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
%matplotlib inline

data_train = pd.read_csv('/home/dan/Test/Kaggle/data/Titanic/train.csv')
data_test = pd.read_csv('/home/dan/Test/Kaggle/data/Titanic/test.csv')

data_train.sample(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S
476,477,0,2,"Renouf, Mr. Peter Henry",male,34.0,1,0,31027,21.0,,S
208,209,1,3,"Carr, Miss. Helen ""Ellen""",female,16.0,0,0,367231,7.75,,Q


## Survival - выжил
## SibSp - братьев и сестер на борту
## parch - родителей детей на борту
## fare - тариф
## Embarked - порт погрузки C = Cherbourg, Q = Queenstown, S = Southampton

In [None]:
sns.barplot(x="Embarked", y="Survived", hue="Parch", data=data_train);

In [None]:
sns.pointplot(x="Pclass", y="Survived", hue="Sex", data=data_train,
              palette={"male": "blue", "female": "pink"},
              markers=["*", "o"], linestyles=["-", "--"]);

## Преобразуем признаки.

In [2]:
def simplify_ages(df):
    df.Age = df.Age.fillna(-0.5)
    bins = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
    group_names = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
    categories = pd.cut(df.Age, bins, labels=group_names)
    df.Age = categories
    return df
def simplify_cabins(df):
    df.Cabin = df.Cabin.fillna('N')
    df.Cabin = df.Cabin.apply(lambda x: x[0])
    return df
def simplify_fares(df):
    df.Fare = df.Fare.fillna(-0.5)
    bins = (-1, 0, 8, 15, 31, 1000)
    group_names = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']
    categories = pd.cut(df.Fare, bins, labels=group_names)
    df.Fare = categories
    return df
def format_name(df):
    df['Lname'] = df.Name.apply(lambda x: x.split(' ')[0])
    df['NamePrefix'] = df.Name.apply(lambda x: x.split(' ')[1])
    return df
def drop_features(df):
    return df.drop(['Ticket', 'Name', 'Embarked', 'Lname'], axis=1)

## Применяем к нашим выборкам

In [3]:
def transform_features(df):
    df = simplify_ages(df)
    df = simplify_cabins(df)
    df = simplify_fares(df)
    df = format_name(df)
    df = drop_features(df)
    return df

In [4]:
data_train = transform_features(data_train)

In [5]:
data_test = transform_features(data_test)

## Последняя часть фазы предварительной обработки - это нормализация меток. LabelEncoder в Scikit-learn 

In [6]:
data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,NamePrefix
0,1,0,3,male,Student,1,0,1_quartile,N,Mr.
1,2,1,1,female,Adult,1,0,4_quartile,C,Mrs.
2,3,1,3,female,Young Adult,0,0,1_quartile,N,Miss.
3,4,1,1,female,Young Adult,1,0,4_quartile,C,Mrs.
4,5,0,3,male,Young Adult,0,0,2_quartile,N,Mr.


In [17]:
features = ['Fare','Sex', 'Age', 'Cabin','NamePrefix']

In [18]:
def LE(df_train,df_test, features):
    df_combined = pd.concat([df_train[features], df_test[features]])
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return  df_train, df_test

In [19]:
data_train, data_test = LE(data_train,data_test, features)

In [20]:
data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,NamePrefix
0,1,0,3,1,4,1,0,0,7,19
1,2,1,1,0,0,1,0,3,2,20
2,3,1,3,0,7,0,0,0,7,16
3,4,1,1,0,7,1,0,3,2,20
4,5,0,3,1,7,0,0,1,7,19


## OneHotEncoder

In [21]:
from sklearn.preprocessing import OneHotEncoder

In [29]:
data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,NamePrefix
0,1,0,3,1,4,1,0,0,"(0, 7)\t1.0\n (1, 2)\t1.0\n (2, 7)\t1.0\n ...","(0, 17)\t1.0\n (1, 18)\t1.0\n (2, 14)\t1.0..."
1,2,1,1,0,0,1,0,3,"(0, 7)\t1.0\n (1, 2)\t1.0\n (2, 7)\t1.0\n ...","(0, 17)\t1.0\n (1, 18)\t1.0\n (2, 14)\t1.0..."
2,3,1,3,0,7,0,0,0,"(0, 7)\t1.0\n (1, 2)\t1.0\n (2, 7)\t1.0\n ...","(0, 17)\t1.0\n (1, 18)\t1.0\n (2, 14)\t1.0..."
3,4,1,1,0,7,1,0,3,"(0, 7)\t1.0\n (1, 2)\t1.0\n (2, 7)\t1.0\n ...","(0, 17)\t1.0\n (1, 18)\t1.0\n (2, 14)\t1.0..."
4,5,0,3,1,7,0,0,1,"(0, 7)\t1.0\n (1, 2)\t1.0\n (2, 7)\t1.0\n ...","(0, 17)\t1.0\n (1, 18)\t1.0\n (2, 14)\t1.0..."


In [None]:
from sklearn.model_selection import train_test_split
X_all = data_train.drop(['Survived', 'PassengerId'], axis=1)
y_all = data_train['Survived']
size_test = 0.20
X_train, X_valid, y_train, y_valid = train_test_split(X_all, y_all, test_size=size_test, random_state=17)

# SVM

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
acc_scorer = make_scorer(accuracy_score)

In [None]:
from sklearn.svm import LinearSVC 

In [None]:
clf_SVC = LinearSVC(random_state=17)

In [None]:
parameters = {'C': [0.0001, 0.001, 0.01, 0.1], 
              'kernel': ['linear', 'poly','rbf','sigmoid' ], 
             }

In [None]:
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

In [None]:
SVC_best = grid_obj.best_estimator_

In [None]:
clf_SVC.fit(X_train, y_train)

In [None]:
y_predictions = clf_SVC.predict(X_valid)
print(accuracy_score(y_valid, y_predictions)) 

## SGD

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
clf_sgd = SGDClassifier(random_state = 17, max_iter = 1000, tol = 1e-4)

In [None]:
parameters = {'loss': [ 'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber'], 
              'penalty': ['l1', 'l2','elasticnet'], 
             }

In [None]:
grid_obj = GridSearchCV(clf_sgd, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

In [None]:
SGD_best = grid_obj.best_estimator_

In [None]:
SGD_best.fit(X_train, y_train)

In [None]:
y_predictions = SGD_best.predict(X_valid)
print(accuracy_score(y_valid, y_predictions)) 

## DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
clf_DTC = DecisionTreeClassifier(random_state = 17)

In [None]:
parameters = {'criterion': [ 'gini', 'entropy'], 
              'max_depth': list(np.arange(1,10,2)),
              'min_samples_leaf': list(np.arange(1,4,1))
             }

In [None]:
grid_obj = GridSearchCV(clf_DTC, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

In [None]:
DTC_best = grid_obj.best_estimator_

In [None]:
clf_DTC.fit(X_train, y_train)

In [None]:
y_predictions = clf_DTC.predict(X_valid)
print(accuracy_score(y_valid, y_predictions)) 

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf_RFC = RandomForestClassifier(random_state=17)

In [None]:
clf_RFC.fit(X_train, y_train)

In [None]:
y_predictions = clf_RFC.predict(X_valid)
print(accuracy_score(y_valid, y_predictions)) 

## GaussianNB

In [None]:
from sklearn.naive_bayes import GaussianNB
clf_gnb = GaussianNB()
y_pred = clf_gnb.fit(X_train, y_train).predict(X_valid)

In [None]:
print(roc_auc_score(y_valid, y_pred)) 

## CatBoostClassifier

In [None]:
from catboost import CatBoostClassifier

In [None]:
clf_CB = CatBoostClassifier(iterations=100, learning_rate=1, depth=2, loss_function='Logloss')

In [None]:
clf_CB.fit(X_train, y_train);

In [None]:
y_pred = clf_CB.predict(X_valid)

In [None]:
print(roc_auc_score(y_valid, y_pred))

## LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf_LR = LogisticRegression(solver = 'liblinear', C = 1, random_state=17)

In [None]:
clf_LR.fit(X_train, y_train)

In [None]:
y_pred = clf_LR.predict(X_valid)

In [None]:
print(accuracy_score(y_valid, y_pred))

In [None]:
from sklearn import preprocessing

In [None]:
features = ['Fare', 'Age', 'NamePrefix']

In [None]:
def LE(df_train,df_test, features):
    df_combined = pd.concat([df_train[features], df_test[features]])
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return  df_train, df_test

In [None]:
data_train, data_test = LE(data_train,data_test, features)

## Вынесем целевую переменную и разделим на трен и валид часть

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_all = data_train.drop(['Survived', 'PassengerId'], axis=1)

In [None]:
y_all = data_train['Survived']

In [None]:
size_test = 0.20
X_train, X_valid, y_train, y_valid = train_test_split(X_all, y_all, test_size=size_test, random_state=17)

## Создание модели и настройка параметров

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

In [None]:
##классификатор
clf = RandomForestClassifier()

In [None]:
## выбор параметров
parameters = {'n_estimators': [4, 6, 9], 
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': list(np.arange(1,20,4)), 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8]
             }

In [None]:
## метрика модели
acc_scorer = make_scorer(accuracy_score)

In [None]:
## поиск параметров
%time
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

In [None]:
grid_obj.best_params_, grid_obj.best_estimator_

In [None]:
## выбираем лучшую модель
best_est = grid_obj.best_estimator_

In [None]:
#обучаем данные на лучшей модели
%time
best_est.fit(X_train, y_train)

In [None]:
predictions = best_est.predict(X_valid)
print(accuracy_score(y_valid, predictions))  

## со всеми кодированными признаками получилось  0,7877

# Export to csv

In [None]:
ids = data_test['PassengerId']
predictions = clf.predict(data_test.drop('PassengerId', axis=1))


output = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions })
# output.to_csv('titanic-predictions.csv', index = False)
output.head()