In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
classifiers = [
    KNeighborsClassifier(3),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression()
]

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

'''
Original Runs:
LogisticRegression: 0.79
DecisionTrees: 0.78
RandomForest: 0.82
SVC: 0.81

Feature Engineering (Age and Fare bins):
LogisticRegression: 0.79
DecisionTrees: 0.79
RandomForest: 0.83
SVC: 0.79

Latest:
LogisticRegression: 0.80
DecisionTrees: 0.78
RandomForest: 0.82
SVC: 0.82

Title, Family_Size, train_test_split, and kfold
LogisticRegression: 
train: 0.82
validation: 0.79
DecisionTrees: 
train: 0.75
validation: 0.80
RandomForest: 
train: 0.80 
validation: 0.82
SVC: 
train: 0.82
validation: 0.81
'''

train_data = pd.read_csv('train.csv')
#id = train_data['PassengerId']
train_data.drop(['PassengerId','Ticket'], axis=1, inplace=True)

In [2]:
y_data = train_data[['Survived']]
X_data = train_data.drop(['Survived'], axis=1)

In [3]:
X_data['Title'] = X_data['Name'].str.split(', ', expand=True)[1].str.split('. ', expand=True)[0]
title_count = (X_data['Title'].value_counts() < 10)
X_data['Title'] = X_data['Title'].apply(lambda x: 'Misc' if title_count.loc[x] == True else x)

In [4]:
X_data = X_data.drop(['Name'], axis=1)
X_data['Embarked'] = X_data['Embarked'].fillna(X_data['Embarked'].mode()[0])
X_data['Age'] = X_data['Age'].fillna(X_data['Age'].mean())

In [5]:
X_data[['Sex']] = pd.get_dummies(X_data[['Sex']], prefix='Sex', drop_first=True)
X_data[['Embarked_Q', 'Embarked_C']] = pd.get_dummies(X_data[['Embarked']], prefix=['Embarked'], drop_first=True)
X_data.drop('Embarked', axis=1, inplace=True)

In [6]:
X_data['Age_bin'] = pd.qcut(X_data['Age'], 3, labels=['Young', 'Middle', 'Old'])
X_data['Fare_bin'] = pd.qcut(X_data['Fare'], 3, labels=['Low', 'Medium', 'High'])
X_data[['Age_Y', 'Age_O']] = pd.get_dummies(X_data[['Age_bin']], prefix='Age', drop_first=True)
X_data[['Fare_L', 'Fare_H']] = pd.get_dummies(X_data[['Fare_bin']], prefix='Fare', drop_first=True)

In [7]:
X_data['Family_Size'] = X_data['SibSp'] + X_data['Parch'] + 1
#X_data['IsAlone'] = (X_data['Family_Size']>1)
X_data = X_data.drop(['Fare_bin', 'Age_bin', 'SibSp', 'Parch'], axis=1)
#X_data['IsAlone'] = X_data['IsAlone'].apply(lambda x: 1 if x==True else 0)
#X_data.drop(['Age', 'Fare', 'Family_Size'], axis=1, inplace=True)


In [8]:
Titles = pd.get_dummies(X_data['Title'], prefix='Title', drop_first=True)
X_data = pd.concat([X_data, Titles], axis=1)

In [9]:
X_data.drop(['Title', 'Cabin'], axis=1, inplace=True)

In [75]:
X_data.to_csv('')

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked_Q,Embarked_C,IsAlone,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
0,3,1,0.0,0.0,0,1,1,0,0,1,0
1,1,0,0.0,3.0,0,0,1,0,0,0,1
2,3,0,0.0,1.0,0,1,0,0,1,0,0
3,1,0,0.0,3.0,0,1,1,0,0,0,1
4,3,1,0.0,1.0,0,1,0,0,0,1,0
5,3,1,0.0,1.0,1,0,0,0,0,1,0
6,1,1,0.0,3.0,0,1,0,0,0,1,0
7,3,1,0.0,2.0,0,1,1,0,0,0,0
8,3,0,0.0,1.0,0,1,1,0,0,0,1
9,2,0,0.0,2.0,0,0,1,0,0,0,1


In [10]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
X = MinMaxScaler().fit_transform(X_data.values)
y = y_data.values

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
lda = LinearDiscriminantAnalysis()
print(cross_val_score(lda, X_train, y_train, cv=10).mean())
lda.fit(X_train, y_train)
lda.score(X_test, y_test)

0.8040426238474762


0.8444444444444444

In [31]:
gbc = GradientBoostingClassifier()
print(cross_val_score(gbc, X_train, y_train, cv=10).mean())
gbc.fit(X_train, y_train)
gbc.score(X_test, y_test)

0.8128080559462416


0.8444444444444444

In [32]:
ada = AdaBoostClassifier()
print(cross_val_score(ada, X_train, y_train, cv=10).mean())
ada.fit(X_train, y_train)
ada.score(X_test, y_test)

0.795275629004532


0.8555555555555555

In [56]:
test = pd.read_csv('test.csv')
test.drop(['Ticket', 'Cabin'], axis=1, inplace=True)
test_df = test.copy()

In [57]:
test_df['Title'] = test_df['Name'].str.split(', ', expand=True)[1].str.split('. ', expand=True)[0]
title_count = (test_df['Title'].value_counts() < 10)
test_df['Title'] = test_df['Title'].apply(lambda x: 'Misc' if title_count.loc[x] == True else x)


In [58]:
id = test_df['PassengerId'].values
test_df = test_df.drop(['Name', 'PassengerId'], axis=1)
test_df['Embarked'] = test_df['Embarked'].fillna(test_df['Embarked'].mode()[0])
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())


In [59]:
test_df[['Sex']] = pd.get_dummies(test_df[['Sex']], prefix='Sex', drop_first=True)
test_df[['Embarked_Q', 'Embarked_C']] = pd.get_dummies(test_df[['Embarked']], prefix=['Embarked'], drop_first=True)
test_df.drop('Embarked', axis=1, inplace=True)


In [61]:
test_df['Family_Size'] = test_df['SibSp'] + test_df['Parch'] + 1
test_df['IsAlone'] = (test_df['Family_Size']>1)
test_df = test_df.drop(['SibSp', 'Parch'], axis=1)
test_df['IsAlone'] = test_df['IsAlone'].apply(lambda x: 1 if x==True else 0)
#X_data.drop(['Age', 'Fare', 'Family_Size'], axis=1, inplace=True)

In [62]:
Titles = pd.get_dummies(test_df['Title'], prefix='Title', drop_first=True)
test_df = pd.concat([test_df, Titles], axis=1)
test_df

Unnamed: 0,Pclass,Sex,Age,Fare,Title,Embarked_Q,Embarked_C,Family_Size,IsAlone,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
0,3,1,34.50000,7.8292,Mr,1,0,1,0,0,0,1,0
1,3,0,47.00000,7.0000,Mrs,0,1,2,1,0,0,0,1
2,2,1,62.00000,9.6875,Mr,1,0,1,0,0,0,1,0
3,3,1,27.00000,8.6625,Mr,0,1,1,0,0,0,1,0
4,3,0,22.00000,12.2875,Mrs,0,1,3,1,0,0,0,1
5,3,1,14.00000,9.2250,Mr,0,1,1,0,0,0,1,0
6,3,0,30.00000,7.6292,Miss,1,0,1,0,0,1,0,0
7,2,1,26.00000,29.0000,Mr,0,1,3,1,0,0,1,0
8,3,0,18.00000,7.2292,Mrs,0,0,1,0,0,0,0,1
9,3,1,21.00000,24.1500,Mr,0,1,3,1,0,0,1,0


In [67]:
test_df.info()
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 13 columns):
Pclass         418 non-null int64
Sex            418 non-null uint8
Age            418 non-null float64
Fare           417 non-null float64
Title          418 non-null object
Embarked_Q     418 non-null uint8
Embarked_C     418 non-null uint8
Family_Size    418 non-null int64
IsAlone        418 non-null int64
Title_Misc     418 non-null uint8
Title_Miss     418 non-null uint8
Title_Mr       418 non-null uint8
Title_Mrs      418 non-null uint8
dtypes: float64(2), int64(3), object(1), uint8(7)
memory usage: 22.5+ KB


In [68]:
test_df.loc[ test_df['Fare'] <= 7.91, 'Fare'] 						        = 0
test_df.loc[(test_df['Fare'] > 7.91) & (test_df['Fare'] <= 14.454), 'Fare'] = 1
test_df.loc[(test_df['Fare'] > 14.454) & (test_df['Fare'] <= 31), 'Fare']   = 2
test_df.loc[ test_df['Fare'] > 31, 'Fare'] 							        = 3
test_df['Fare'] = test_df['Fare'].astype(int)
    
# Mapping Age
test_df.loc[ test_df['Age'] <= 16, 'Age'] 					       = 0
test_df.loc[(test_df['Age'] > 16) & (test_df['Age'] <= 32), 'Age'] = 1
test_df.loc[(test_df['Age'] > 32) & (test_df['Age'] <= 48), 'Age'] = 2
test_df.loc[(test_df['Age'] > 48) & (test_df['Age'] <= 64), 'Age'] = 3
test_df.loc[ test_df['Age'] > 64, 'Age']                           = 4

# Feature Selection

In [72]:
test_df.drop(['Family_Size'], axis=1, inplace=True)
dec = DecisionTreeClassifier(random_state=48)

In [73]:
TX = MinMaxScaler().fit_transform(test_df.values)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
Pclass        418 non-null int64
Sex           418 non-null uint8
Age           418 non-null float64
Fare          418 non-null int32
Embarked_Q    418 non-null uint8
Embarked_C    418 non-null uint8
IsAlone       418 non-null int64
Title_Misc    418 non-null uint8
Title_Miss    418 non-null uint8
Title_Mr      418 non-null uint8
Title_Mrs     418 non-null uint8
dtypes: float64(1), int32(1), int64(2), uint8(7)
memory usage: 14.4 KB


In [74]:

res_ada = ada.predict(TX)
res_lda = lda.predict(TX)
res_gbc = gbc.predict(TX)
pd.DataFrame({'PassengerId': id, 'Survived': res_ada}).to_csv('preds_ada.csv', index=False)
pd.DataFrame({'PassengerId': id, 'Survived': res_lda}).to_csv('preds_lda.csv', index=False)
pd.DataFrame({'PassengerId': id, 'Survived': res_gbc}).to_csv('preds_gbc.csv', index=False)