In [1]:
import sys
import pandas as pd
import numpy as np
import sklearn

In [2]:
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.base import clone

In [3]:
import warnings
warnings.filterwarnings('ignore')

import IPython
from IPython.display import display

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*" and
# later releases dropped the old names, so fall back to the new name when the
# legacy one is not available.
style.use('seaborn' if 'seaborn' in style.available else 'seaborn-v0_8')

In [5]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale, StandardScaler

from sklearn import (
    ensemble, 
    gaussian_process,
    linear_model,
    naive_bayes,
    neighbors,
    svm,
    tree, 
    discriminant_analysis
)

from xgboost import XGBClassifier

In [6]:
from sklearn.model_selection import KFold, GridSearchCV, cross_validate, ParameterGrid
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier

In [7]:
# Forward slashes are accepted on Windows as well as on POSIX systems, whereas
# the backslash-separated paths used before only resolved on Windows.
raw_train_data = pd.read_csv('data/train.csv')
raw_test_data = pd.read_csv('data/test.csv')

In [8]:
# Standardiser for the two continuous columns, fitted on the raw training frame;
# ownprocess() applies it to whichever frame it is given.
scaler = StandardScaler()
scaler.fit(raw_train_data[['Age', 'Fare']])
def ownprocess(original_DF, com = True):
    """Build the hand-crafted feature frame from a raw passenger frame.

    The input frame is copied and never modified. Steps, in order:

    * missing ``Age`` / ``Fare`` values are replaced by the truncated (``int``)
      mean of the passenger's ``Pclass`` (classes 1 to 3), computed on the
      frame that was passed in;
    * missing ``Embarked`` values are replaced by the most frequent port;
    * ``Age`` and ``Fare`` are transformed with the module-level ``scaler``;
    * ``FamilySize`` is ``SibSp + Parch``;
    * ``Sex`` is mapped to 1 for ``'male'`` and 0 for ``'female'``;
    * ``Title`` is taken from ``Name`` (text between ``', '`` and the first
      ``'.'``); titles occurring fewer than 10 times in this frame become
      ``'Misc'``;
    * ``Cabin`` becomes 1 when a cabin is recorded and 0 otherwise.

    Parameters
    ----------
    original_DF : pandas.DataFrame
        Raw frame containing at least ``Pclass``, ``Name``, ``Sex``, ``Age``,
        ``SibSp``, ``Parch``, ``Fare``, ``Cabin`` and ``Embarked``.
    com : bool, default True
        When true the result is passed through ``pd.get_dummies`` so the
        string columns are one-hot encoded; when false they are returned as-is.

    Returns
    -------
    pandas.DataFrame
        Columns ``Pclass, Sex, Age, Fare, Cabin, Embarked, FamilySize, Title``
        (one-hot expanded when ``com`` is true).

    Raises
    ------
    ValueError
        If a class has missing ``Age``/``Fare`` values but no observed value to
        average (``int`` of a NaN mean).
    """
    DF = original_DF.copy()

    # Class-wise imputation. The mean is only evaluated for classes that really
    # have gaps, so a frame lacking one of the classes no longer fails on
    # int(nan) for a value that would never be used.
    for column in ('Age', 'Fare'):
        for c in range(1, 4):
            in_class = DF.Pclass == c
            missing = DF[column].isnull() & in_class
            if missing.any():
                DF.loc[missing, column] = int(DF.loc[in_class, column].mean())

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: the chained in-place form is not guaranteed to write
    # through to DF (it is a no-op under pandas copy-on-write).
    DF['Embarked'] = DF['Embarked'].fillna(DF['Embarked'].mode()[0])

    DF[['Age', 'Fare']] = scaler.transform(DF[['Age', 'Fare']])

    DF['FamilySize'] = DF.SibSp + DF.Parch

    sex_mapping = {'male': 1, 'female': 0}
    DF['Sex'] = DF.Sex.map(sex_mapping)

    # "Braund, Mr. Owen Harris" -> "Mr"
    DF['Title'] = DF['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
    # Vectorised replacement of rare titles (fewer than 10 occurrences).
    is_rare = DF['Title'].map(DF['Title'].value_counts()) < 10
    DF['Title'] = DF['Title'].where(~is_rare, 'Misc')

    DF['Cabin'] = DF.Cabin.notnull().astype(int)

    DF = DF[['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked', 'FamilySize', 'Title']]

    if not com:
        return DF

    return pd.get_dummies(DF)

In [9]:
def process(original_DF, col='ori'):
    """Clean a raw passenger frame and return one of several feature sets.

    The input frame is copied and never modified. Missing ``Age`` and ``Fare``
    values are filled with the column median and missing ``Embarked`` values
    with the most frequent port. Derived columns: ``FamilySize``
    (``SibSp + Parch``), ``isAlone`` (bool) / ``IsAlone`` (int), ``FareBin``
    (4 quantile bins), ``AgeBin`` (5 equal-width bins), ``Title`` (titles seen
    fewer than 10 times in this frame become ``'Misc'``) and a ``<name>_Code``
    label-encoded column for ``Sex``, ``Embarked``, ``Title``, ``AgeBin`` and
    ``FareBin``. ``PassengerId``, ``Cabin`` and ``Ticket`` are dropped.

    Parameters
    ----------
    original_DF : pandas.DataFrame
        Raw passenger frame.
    col : {'ori', 'calc', 'bin', 'all', 'dum'}, default 'ori'
        Which columns to return: ``'ori'`` the cleaned original features,
        ``'calc'`` label-encoded features with continuous Age/Fare, ``'bin'``
        label-encoded features with binned Age/Fare, ``'all'`` every remaining
        column, ``'dum'`` the ``'ori'`` set passed through ``pd.get_dummies``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``col`` is not one of the supported names (this used to surface as
        an UnboundLocalError on the final line).
    """
    feature_sets = {
        'ori': ['Sex','Pclass', 'Embarked', 'Title','SibSp', 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone'],
        'calc': ['Sex_Code','Pclass', 'Embarked_Code', 'Title_Code','SibSp', 'Parch', 'Age', 'Fare'],
        'bin': ['Sex_Code','Pclass', 'Embarked_Code', 'Title_Code', 'FamilySize', 'AgeBin_Code', 'FareBin_Code'],
    }
    if col not in feature_sets and col not in ('all', 'dum'):
        raise ValueError(
            "unknown col %r; expected one of 'ori', 'calc', 'bin', 'all', 'dum'" % (col,)
        )

    DF = original_DF.copy()

    #COMPLETE
    # Assign the filled columns back rather than calling fillna(inplace=True)
    # on a column selection, which is not guaranteed to write through to DF.
    DF['Age'] = DF['Age'].fillna(DF['Age'].median())
    DF['Embarked'] = DF['Embarked'].fillna(DF['Embarked'].mode()[0])
    DF['Fare'] = DF['Fare'].fillna(DF['Fare'].median())
    DF['FamilySize'] = DF.SibSp + DF.Parch
    DF['isAlone'] = DF.SibSp + DF.Parch == 0
    DF['IsAlone'] = DF.isAlone.astype(int)

    #CREATE
    # Bins are computed once, in the position of the first (previously
    # overwritten) pair so the column order seen with col='all' is unchanged.
    DF['FareBin'] = pd.qcut(DF.Fare, 4)
    DF['AgeBin'] = pd.cut(DF.Age, 5)

    # "Braund, Mr. Owen Harris" -> "Mr"; rare titles are pooled into 'Misc'.
    DF['Title'] = DF['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
    is_rare = DF['Title'].map(DF['Title'].value_counts()) < 10
    DF['Title'] = DF['Title'].where(~is_rare, 'Misc')

    cols1 = ['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin']
    label = LabelEncoder()
    for c in cols1:
        # The encoder is re-fitted for every column of this particular frame.
        DF[c+'_Code'] = label.fit_transform(DF[c])

    drop_columns = ['PassengerId', 'Cabin', 'Ticket']
    DF = DF.drop(columns=drop_columns)

    if col == 'dum':
        return pd.get_dummies(DF[feature_sets['ori']])
    if col == 'all':
        columns = DF.columns
    else:
        columns = feature_sets[col]
    return DF[columns]

In [10]:
# Both splits must go through the same feature set; the test frame previously
# fell back to the default 'ori' columns and so did not match the training frame.
# NOTE(review): process() fits its bins and label encoders on each frame
# separately, so the integer codes may not line up between the two splits —
# confirm before predicting on test_data.
train_data = process(raw_train_data, 'bin')
test_data = process(raw_test_data, 'bin')

# Training features plus the label, used for the exploratory statistics.
data1 = train_data.copy()
target = raw_train_data.Survived
data1['Survived'] = target

In [11]:
# Correlation of every binned feature with the survival label.
data1.corr()['Survived']

Sex_Code        -0.543351
Pclass          -0.338481
Embarked_Code   -0.167675
Title_Code      -0.083292
FamilySize       0.016639
AgeBin_Code     -0.043800
FareBin_Code     0.299357
Survived         1.000000
Name: Survived, dtype: float64

In [12]:
# One-hot encoded feature frame built from the training data.
O = ownprocess(raw_train_data)

# Same features with the string columns left un-encoded, plus the label.
O1 = ownprocess(raw_train_data, com=False)
O1['Survived'] = target

# Scaler fitted on the encoded frame; applying it is currently disabled.
ownscaler = StandardScaler()
ownscaler.fit(O)
# O = ownscaler.transform(O)

In [13]:
# Candidate classifiers, all with default settings unless noted, grouped by family.
MLA = [
    # Ensemble methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian process
    gaussian_process.GaussianProcessClassifier(),

    # Linear models
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Nearest neighbours
    neighbors.KNeighborsClassifier(),

    # Support vector machines (the first two with probability estimates enabled)
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),

    # Single trees
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),

    # Discriminant analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),

    # Gradient boosting (xgboost)
    XGBClassifier(),
]

In [14]:
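# Parameter tuning (currently disabled): uncommenting this cell replaces MLA with one model per hyper-parameter combination.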
# mlpc_parameters = {
#     'activation': ['relu', 'logistic'], 
#     'solver': ['lbfgs', 'adam'], 
#     'alpha': 10.0 ** -np.arange(1, 7),
#     'warm_start': [True]
# }

# knn_parameters = {
#     'n_neighbors': range(1, 20),
#     'weights': ['uniform', 'distance']
# }

# svc_parameters = {
#     'class_weight': ['balanced', None],
#     'gamma': ['scale', 'auto']    
# }

# sgd_parameters = {
#     'alpha': 10.0**-np.arange(1,7),
#     'class_weight': [None, 'balanced'],
# }

# rf_parameters = {
#     'n_estimators': [1000], 
#     'class_weight': ['balanced', None]
# }

# mlpc_models = [MLPClassifier(**params) for params in ParameterGrid(mlpc_parameters)]
# knn_models = [KNeighborsClassifier(**params) for params in ParameterGrid(knn_parameters)]
# svc_models = [SVC(**params) for params in ParameterGrid(svc_parameters)]
# sgd_models = [SGDClassifier(**params) for params in ParameterGrid(sgd_parameters)]
# rf_models = [RandomForestClassifier(**params) for params in ParameterGrid(rf_parameters)]

# MLA = mlpc_models + knn_models + svc_models + sgd_models + rf_models

In [15]:
# cross_validate() result keys that cv() averages for every model.
metrics = ['fit_time', 'test_score']
def cv(alg, DF, target):
    """Cross-validate one estimator and summarise the result as a single row.

    Parameters
    ----------
    alg : estimator
        Object exposing ``get_params()``; passed unchanged to ``cross_validate``.
    DF : feature data handed to ``cross_validate``.
    target : labels handed to ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One row indexed by the estimator's class name, with integer columns
        0..len(metrics)+1 holding, in order: the stringified parameters, the
        mean of every key listed in the module-level ``metrics``, and the
        standard deviation of ``test_score``.
    """
    label = alg.__class__.__name__
    summary = [str(alg.get_params())]

    folds = pd.DataFrame(cross_validate(alg, DF, target))

    for key in metrics:
        summary.append(folds[key].mean())
    summary.append(folds['test_score'].std())

    # Build a single column and transpose it so every cell keeps object dtype.
    return pd.DataFrame({label: summary}).T
    

In [16]:
# DataFrame.append was removed in pandas 2.0; collect the per-model rows and
# concatenate them once, which also avoids re-copying the frame on every pass.
M = pd.concat([cv(alg, train_data, target) for alg in MLA])

# cv() returns positional column labels; give them readable names.
M = M.rename(columns = {i: c for i, c in enumerate(['params']+metrics+['std_test_score'])})
M = M.sort_values('test_score', ascending = False)

In [17]:
# Five best rows of M, which is sorted by mean test_score in descending order.
M.head(n=5)

Unnamed: 0,params,fit_time,test_score,std_test_score
SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...",0.0112005,0.8305,0.0228267
SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...",0.0114,0.8305,0.0251302
MLPClassifier,"{'activation': 'relu', 'alpha': 1e-05, 'batch_...",1.12857,0.826044,0.016776
MLPClassifier,"{'activation': 'relu', 'alpha': 1e-06, 'batch_...",1.12159,0.824914,0.0211875
MLPClassifier,"{'activation': 'relu', 'alpha': 0.0001, 'batch...",1.13912,0.824914,0.0175181


In [18]:
# DataFrame.append was removed in pandas 2.0; collect the per-model rows and
# concatenate them once, which also avoids re-copying the frame on every pass.
Mown = pd.concat([cv(alg, O, target) for alg in MLA])

# cv() returns positional column labels; give them readable names.
Mown = Mown.rename(columns = {i: c for i, c in enumerate(['params']+metrics+['std_test_score'])})
Mown = Mown.sort_values('test_score', ascending = False)

In [19]:
# Display the one-hot encoded feature frame returned by ownprocess(raw_train_data).
O

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,FamilySize,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
0,3,1,-0.530377,-0.502445,0,1,0,0,1,0,0,0,1,0
1,1,0,0.571831,0.786845,1,1,1,0,0,0,0,0,0,1
2,3,0,-0.254825,-0.488854,0,0,0,0,1,0,0,1,0,0
3,1,0,0.365167,0.420730,1,1,0,0,1,0,0,0,0,1
4,3,1,0.365167,-0.486337,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,1,-0.185937,-0.386671,0,0,0,0,1,0,1,0,0,0
887,1,0,-0.737041,-0.044381,1,0,0,0,1,0,0,1,0,0
888,3,0,-0.323713,-0.176263,0,3,0,0,1,0,0,1,0,0
889,1,1,-0.254825,-0.044381,1,0,1,0,0,0,0,0,1,0


In [20]:
# Ten best rows of Mown, which is sorted by mean test_score in descending order.
Mown.head(n=10)

Unnamed: 0,params,fit_time,test_score,std_test_score
MLPClassifier,"{'activation': 'relu', 'alpha': 0.001, 'batch_...",1.48027,0.835013,0.0256451
SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...",0.0130003,0.833877,0.0257452
MLPClassifier,"{'activation': 'logistic', 'alpha': 1e-06, 'ba...",1.11117,0.832754,0.0285997
SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...",0.0132124,0.832754,0.0263001
MLPClassifier,"{'activation': 'relu', 'alpha': 1e-06, 'batch_...",1.46378,0.831643,0.0261017
MLPClassifier,"{'activation': 'logistic', 'alpha': 0.0001, 'b...",1.04255,0.831636,0.027899
MLPClassifier,"{'activation': 'relu', 'alpha': 0.1, 'batch_si...",1.44328,0.830525,0.0200385
MLPClassifier,"{'activation': 'relu', 'alpha': 0.01, 'batch_s...",1.44929,0.830525,0.0200385
MLPClassifier,"{'activation': 'logistic', 'alpha': 0.1, 'batc...",1.2231,0.830513,0.0234189
MLPClassifier,"{'activation': 'relu', 'alpha': 0.0001, 'batch...",1.47312,0.829408,0.0255936


In [21]:
# Columns produced by ownprocess() other than the scaled Age and Fare.
cols = [
    'Pclass', 'Sex', 'Cabin',
    'Embarked', 'FamilySize', 'Title',
]

In [22]:
# Restrict to numeric columns first: O1 still carries the string-valued
# Embarked/Title columns, which DataFrame.corr() rejects on pandas >= 2.0.
O1.select_dtypes(include='number').corr().abs().sort_values('Survived', ascending = False).Survived

Survived      1.000000
Sex           0.543351
Pclass        0.338481
Cabin         0.316912
Fare          0.257307
Age           0.050118
FamilySize    0.016639
Name: Survived, dtype: float64

In [23]:
def tablo(feature, DF = None):
    """Display a survival breakdown for every value of ``feature``.

    Parameters
    ----------
    feature : str
        Column of ``DF`` to group by.
    DF : pandas.DataFrame, optional
        Frame holding ``feature`` and a ``Survived`` column. When omitted the
        module-level ``data1`` is used; it is looked up at call time. (The
        previous default, ``train_data``, has no ``Survived`` column and was
        bound once when the function was defined.)

    Returns
    -------
    None
        The table is rendered with ``display``. Its columns are ``Survived``
        (sum of the label), ``Total`` (row count), ``Part`` (mean of the label)
        and ``Demised`` (``Total - Survived``).
    """
    if DF is None:
        DF = data1

    survived = DF.groupby(feature)['Survived']
    D = survived.sum().to_frame()
    D['Total'] = survived.count()
    D['Part'] = survived.mean()
    D['Demised'] = D.Total - D.Survived
    display (D)

In [26]:
# Twenty best rows of Mown, which is sorted by mean test_score in descending order.
Mown.head(n=20)

Unnamed: 0,params,fit_time,test_score,std_test_score
MLPClassifier,"{'activation': 'relu', 'alpha': 0.001, 'batch_...",1.48027,0.835013,0.0256451
SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...",0.0130003,0.833877,0.0257452
MLPClassifier,"{'activation': 'logistic', 'alpha': 1e-06, 'ba...",1.11117,0.832754,0.0285997
SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...",0.0132124,0.832754,0.0263001
MLPClassifier,"{'activation': 'relu', 'alpha': 1e-06, 'batch_...",1.46378,0.831643,0.0261017
MLPClassifier,"{'activation': 'logistic', 'alpha': 0.0001, 'b...",1.04255,0.831636,0.027899
MLPClassifier,"{'activation': 'relu', 'alpha': 0.1, 'batch_si...",1.44328,0.830525,0.0200385
MLPClassifier,"{'activation': 'relu', 'alpha': 0.01, 'batch_s...",1.44929,0.830525,0.0200385
MLPClassifier,"{'activation': 'logistic', 'alpha': 0.1, 'batc...",1.2231,0.830513,0.0234189
MLPClassifier,"{'activation': 'relu', 'alpha': 0.0001, 'batch...",1.47312,0.829408,0.0255936
