## Project: Titanic Kaggle Competition / Tutorial
## Name: Minghao Gong
## Date: 03/14/2021

In [16]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import os

In [17]:
os.chdir('/Volumes/Documents/Anaconda_Doc/SoftwareMethods/Tdata/Titanic')

In [18]:
file = './Data/titanic.csv'

In [19]:
df = pd.read_csv(file)

In [20]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Survived                 887 non-null    int64  
 1   Pclass                   887 non-null    int64  
 2   Name                     887 non-null    object 
 3   Sex                      887 non-null    object 
 4   Age                      887 non-null    float64
 5   Siblings/Spouses Aboard  887 non-null    int64  
 6   Parents/Children Aboard  887 non-null    int64  
 7   Fare                     887 non-null    float64
dtypes: float64(2), int64(4), object(2)
memory usage: 55.6+ KB


### Data processing

In [22]:
# Delete name column
df.drop(['Name'],inplace=True,axis=1)

In [23]:
# One-hot encoding the category feature Sex
l = LabelEncoder()
ls = l.fit_transform(df['Sex']).reshape(-1,1)
o = OneHotEncoder()
os = pd.DataFrame(o.fit_transform(ls).toarray(),index=df.index)
    
df = pd.concat([df,os],axis=1)

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Survived                 887 non-null    int64  
 1   Pclass                   887 non-null    int64  
 2   Sex                      887 non-null    object 
 3   Age                      887 non-null    float64
 4   Siblings/Spouses Aboard  887 non-null    int64  
 5   Parents/Children Aboard  887 non-null    int64  
 6   Fare                     887 non-null    float64
 7   0                        887 non-null    float64
 8   1                        887 non-null    float64
dtypes: float64(4), int64(4), object(1)
memory usage: 62.5+ KB


In [25]:
# Delete Sex column
df.drop(['Sex'],inplace=True,axis=1)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Survived                 887 non-null    int64  
 1   Pclass                   887 non-null    int64  
 2   Age                      887 non-null    float64
 3   Siblings/Spouses Aboard  887 non-null    int64  
 4   Parents/Children Aboard  887 non-null    int64  
 5   Fare                     887 non-null    float64
 6   0                        887 non-null    float64
 7   1                        887 non-null    float64
dtypes: float64(4), int64(4)
memory usage: 55.6 KB


In [27]:
# The training set and test set are obtained in proportion
train_x,test_x,train_y,test_y = train_test_split(df.iloc[:,1:],df.iloc[:,0],test_size=0.3,random_state=2021)

### Decision Tree Model

In [28]:
# parameters Value range of the parameters for grid search
parameters  = {"criterion":("gini","entropy"), "splitter":("best","random"), "max_depth":[*range(1,10)]
              , "min_samples_leaf":[*range(1,50,5)]}

clf = DecisionTreeClassifier(random_state = 25)
GS = GridSearchCV(clf, parameters, cv=5) # At the same time, it can satisfy three functions: fit, score and cross validation.
GS = GS.fit(train_x, train_y)

In [29]:
# Optimal parameter value of decision tree model
GS.best_params_

{'criterion': 'gini',
 'max_depth': 8,
 'min_samples_leaf': 6,
 'splitter': 'best'}

In [30]:
y_pre = GS.predict(test_x)
df_pre = pd.DataFrame()
df_pre['pre'] = pd.Series(y_pre,dtype='int32')
df_pre

Unnamed: 0,pre
0,0
1,0
2,0
3,1
4,0
...,...
262,1
263,0
264,0
265,0


In [31]:
# The accuracy of the decision tree model
acc = accuracy_score(test_y,df_pre['pre'])
acc

0.7715355805243446

### Random Forest Model

In [32]:
# Grid search for optimal parameters of Random forest model

parameters  = {"n_estimators":[50,100,150], "max_depth":[*range(1,10)]}

rf = RandomForestClassifier()
GS2 = GridSearchCV(rf, parameters, cv=5) # # At the same time, it can satisfy three functions: fit, score and cross validation.
GS2 = GS2.fit(train_x, train_y)

In [33]:
# Optimal parameter value
GS2.best_params_

{'max_depth': 7, 'n_estimators': 150}

In [34]:
y_pre = GS2.predict(test_x)
df_pre = pd.DataFrame()
df_pre['pre'] = pd.Series(y_pre,dtype='int32')
df_pre

Unnamed: 0,pre
0,0
1,0
2,0
3,1
4,0
...,...
262,1
263,0
264,0
265,0


In [35]:
# The accuracy of the random forest model
acc = accuracy_score(test_y,df_pre['pre'])
acc

0.7827715355805244

### K-nearest Neighbor Model

In [36]:
# Grid search for optimal parameters of KNN model
parameters  = { "n_neighbors":[*range(1,10)]}

knn = KNeighborsClassifier()
GS3 = GridSearchCV(knn, parameters, cv=5) 
GS3 = GS3.fit(train_x, train_y)

In [37]:
# Optimal parameter value of KNN Model
GS3.best_params_

{'n_neighbors': 8}

In [38]:
y_pre = GS3.predict(test_x)
df_pre = pd.DataFrame()
df_pre['pre'] = pd.Series(y_pre,dtype='int32')
df_pre

Unnamed: 0,pre
0,0
1,0
2,0
3,0
4,1
...,...
262,0
263,0
264,0
265,1


In [39]:
# The accuracy of the KNN model
acc = accuracy_score(test_y,df_pre['pre'])
acc

0.6816479400749064