In [392]:
import pandas as pd
import numpy as np
import os

cwd= os.getcwd() 
path = os.path.join(cwd,'data')

# Loading data

In [395]:
df_train = pd.read_csv(os.path.join(path,'train.csv')) 
df_test = pd.read_csv(os.path.join(path,'test.csv')) 

df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [397]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Preparing data

### Deleting unimportant features

In [401]:
def clean_data (df_train, df_test, columns_to_delete):
    for col in columns_to_delete:
        df_train.drop(col, axis=1, inplace=True)
        df_test.drop(col, axis=1, inplace=True)

In [403]:
columns_to_delete = ['Name', 'PassengerId', 'Ticket', 'Cabin']
clean_data(df_train, df_test, columns_to_delete)

In [405]:
df_train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


### Clearing Nan values

In [408]:
df_train.info()
print("\n")
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 non-nu

In [410]:
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].median())
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].median())

# remaining features have a small amount of Nan, so we can just drop rows that have them
df_train = df_train.dropna()
df_test = df_test.dropna()

In [412]:
df_train.info()
print("\n")
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    object 
 3   Age       889 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Fare      889 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 62.5+ KB


<class 'pandas.core.frame.DataFrame'>
Index: 417 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    417 non-null    int64  
 1   Sex       417 non-null    object 
 2   Age       417 non-null    float64
 3   SibSp     417 non-null    int64  
 4   Parch     417 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  417 non-null    obje

### Encoding features

In [415]:
col_to_encode = ['Sex','Embarked']
from sklearn.preprocessing import LabelEncoder

def label_encoder(df_train, df_test, col_to_encode):
    df_train = df_train.copy()
    df_test = df_test.copy() 
    
    for col in col_to_encode:
        le = LabelEncoder().fit(df_train[col])
        df_train[col] = le.transform(df_train[col])
        df_test[col] = le.transform(df_test[col])
    return df_train, df_test
    
df_train, df_test = label_encoder(df_train, df_test, col_to_encode)

In [417]:
df_train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.2500,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.9250,2
3,1,1,0,35.0,1,0,53.1000,2
4,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,2
887,1,1,0,19.0,0,0,30.0000,2
888,0,3,0,28.0,1,2,23.4500,2
889,1,1,1,26.0,0,0,30.0000,0


### Split

In [420]:
from sklearn.model_selection import train_test_split

X_train = df_train.drop('Survived', axis=1)
y_train = df_train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train)

### Normalize

In [423]:
from sklearn.preprocessing import MinMaxScaler

def normalize(X_train, X_test):
    scaler= MinMaxScaler().fit(X_train)
    X_train_scaled= scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled 

X_train_scaled, X_test_scaled  = normalize(X_train, X_test)

# Train

In [426]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [428]:
def run_GridSearchCV(clf, grid_values, X_train_scaled, y_train, X_test_scaled, y_test):
    grid_clf = GridSearchCV(clf, param_grid = grid_values, scoring = 'accuracy')
    grid_clf.fit(X_train_scaled, y_train)
    print('Grid best parameter: ', grid_clf.best_params_) 
    print('Grid best score: ', grid_clf.best_score_)
    test_score= grid_clf.score(X_test_scaled, y_test)
    print("test score = {}".format(test_score))


# Logistic Regression

In [431]:
print ('\nLogisticRegression')
clf = LogisticRegression(max_iter = 500)
grid_values = {'C': [0.005, 0.01,0.1, 1, 100, 10000]}
run_GridSearchCV(clf, grid_values, X_train_scaled, y_train, X_test_scaled, y_test)


LogisticRegression
Grid best parameter:  {'C': 0.1}
Grid best score:  0.79581416227135
test score = 0.7982062780269058


# Decision Tree Classifier

In [434]:
print ('\nDecisionTreeClassifier')
clf = DecisionTreeClassifier()       
grid_values = {'max_depth': [2,5,7, 20, 50]}
run_GridSearchCV(clf, grid_values, X_train_scaled, y_train, X_test_scaled, y_test)


DecisionTreeClassifier
Grid best parameter:  {'max_depth': 7}
Grid best score:  0.8167321288295366
test score = 0.820627802690583


# RandomForestClassifier

In [437]:
print ('\nRandomForestClassifier.')
clf = RandomForestClassifier()       
grid_values = {'n_estimators': [20,50,100]}
run_GridSearchCV(clf, grid_values, X_train_scaled, y_train, X_test_scaled, y_test)


RandomForestClassifier.
Grid best parameter:  {'n_estimators': 20}
Grid best score:  0.8122657389743015
test score = 0.7892376681614349


# XGBClassifier

In [440]:
print ('\nxgboost.')
clf = XGBClassifier()
grid_values = {'n_estimators': [50, 100],
               'max_depth': [2, 3],
               'learning_rate': [0.05, 0.1]}
run_GridSearchCV(clf, grid_values, X_train_scaled, y_train, X_test_scaled, y_test)


xgboost.
Grid best parameter:  {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100}
Grid best score:  0.815329368196611
test score = 0.8116591928251121


# SVC

In [443]:
print ('\nSVC_poly')
clf = SVC()           
grid_values = {'C': [0.01, 0.1, 1], 'kernel': ['poly', 'rbf']}
run_GridSearchCV(clf, grid_values, X_train_scaled, y_train, X_test_scaled, y_test)


SVC_poly
Grid best parameter:  {'C': 1, 'kernel': 'poly'}
Grid best score:  0.8183144428234765
test score = 0.7937219730941704


# GaussianNB

In [446]:
print ('\nGaussianNB')
clf =  GaussianNB().fit(X_train_scaled, y_train)
train_score = clf.score(X_train_scaled, y_train)
print("train score= {}".format(train_score))
test_score = clf.score(X_test_scaled, y_test)
print("train score= {}".format(test_score))


GaussianNB
train score= 0.7957957957957958
train score= 0.7802690582959642


# Best clasifier in our case is XGBClassifier

In [465]:
clf = XGBClassifier(learning_rate = 0.1,
                   max_depth = 3,
                   n_estimators = 100).fit(X_train_scaled, y_train)

y_predict = clf.predict(df_test)
print(y_predict)


[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0]


In [469]:
df_test['Survived'] = y_predict
df_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,1,34.5,0,0,7.8292,1,0
1,3,0,47.0,1,0,7.0000,2,0
2,2,1,62.0,0,0,9.6875,1,0
3,3,1,27.0,0,0,8.6625,2,0
4,3,0,22.0,1,1,12.2875,2,0
...,...,...,...,...,...,...,...,...
413,3,1,27.0,0,0,8.0500,2,0
414,1,0,39.0,0,0,108.9000,0,1
415,3,1,38.5,0,0,7.2500,2,0
416,3,1,27.0,0,0,8.0500,2,0
