In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import RFECV
from sklearn.linear_model import SGDClassifier
from lightgbm import LGBMClassifier

In [2]:
import warnings # Standard-library warnings control, used only by the filter below
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' presumably also hides pandas chained-assignment
# and deprecation warnings raised by later cells -- confirm that is intended.
warnings.filterwarnings('ignore')

# Model 3: LightGBM with one-hot encoded titles

In [3]:
# Load the raw training table from train.csv in the working directory.
train_data = pd.read_csv('train.csv')

In [4]:
# Feature engineering for the Model 3 training frame.

# Ticket is discarded; SibSp and Parch are folded into one companion count.
train_data = train_data.drop(columns=['Ticket'])
train_data['Family_Size'] = train_data['SibSp'] + train_data['Parch']

# Collapse Cabin to a two-valued label: 'NA' when missing, 'C' otherwise.
# Done as one vectorised assignment rather than row by row through chained
# indexing (frame['Cabin'][i] = ...), which pandas does not guarantee to write
# back to the frame.
train_data['Cabin'] = np.where(train_data['Cabin'].isna(), 'NA', 'C')

# Forward-fill gaps and assign the result back to the column.
# fillna(method='ffill', inplace=True) on a column selection is deprecated and
# may operate on a temporary copy. A gap in the very first row stays missing.
train_data['Embarked'] = train_data['Embarked'].ffill()
train_data['Age'] = train_data['Age'].ffill()
train_data = train_data.drop(columns=['PassengerId', 'SibSp', 'Parch'])

# One-hot encode passenger class into Pclass_<class> indicator columns.
lb_pclass = LabelBinarizer()
pclass = pd.DataFrame(
    lb_pclass.fit_transform(train_data['Pclass']),
    columns=lb_pclass.classes_,
    index=train_data.index,
).add_prefix('Pclass_')
train_data = pd.concat([train_data, pclass], axis=1)

# Two-class labels binarize to a single 0/1 column; LabelBinarizer sorts the
# classes, so the alphabetically later label is encoded as 1.
lb_gender = LabelBinarizer()
gender = lb_gender.fit_transform(train_data['Sex'])
train_data['Sex'] = gender.ravel()

# Same encoding for the cabin label: 'C' < 'NA', so 1 marks a missing cabin.
lb_cabin = LabelBinarizer()
cabin = lb_cabin.fit_transform(train_data['Cabin'])
train_data['Cabin'] = cabin.ravel()

# One-hot encode the port of embarkation into Embarked_<port> columns.
# The raw Embarked column is left in place by this cell.
lb_embarked = LabelBinarizer()
embarked = pd.DataFrame(
    lb_embarked.fit_transform(train_data['Embarked']),
    columns=lb_embarked.classes_,
    index=train_data.index,
).add_prefix('Embarked_')
train_data = pd.concat([train_data, embarked], axis=1)

In [6]:
# Check dtypes and non-null counts after the feature-engineering cell.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Survived     891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    int32  
 4   Age          891 non-null    float64
 5   Fare         891 non-null    float64
 6   Cabin        891 non-null    int32  
 7   Embarked     891 non-null    object 
 8   Family_Size  891 non-null    int64  
 9   Pclass_1     891 non-null    int32  
 10  Pclass_2     891 non-null    int32  
 11  Pclass_3     891 non-null    int32  
 12  Embarked_C   891 non-null    int32  
 13  Embarked_Q   891 non-null    int32  
 14  Embarked_S   891 non-null    int32  
dtypes: float64(2), int32(8), int64(3), object(2)
memory usage: 76.7+ KB


In [7]:
# Preview the first rows of the engineered training frame.
train_data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Cabin,Embarked,Family_Size,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,0,3,"Braund, Mr. Owen Harris",1,22.0,7.25,1,S,1,0,0,1,0,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,71.2833,0,C,1,1,0,0,1,0,0
2,1,3,"Heikkinen, Miss. Laina",0,26.0,7.925,1,S,0,0,0,1,0,0,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,53.1,0,S,1,1,0,0,0,0,1
4,0,3,"Allen, Mr. William Henry",1,35.0,8.05,1,S,0,0,0,1,0,0,1


In [8]:
# Honorifics searched for inside passenger names. Order matters: lookups take
# the first entry that occurs as a substring, so 'Mrs' has to come before 'Mr'.
title_list = [
    'Mrs', 'Mr', 'Master', 'Miss',
    'Major', 'Rev', 'Dr', 'Ms', 'Mlle',
    'Col', 'Capt', 'Mme', 'Countess',
    'Don', 'Jonkheer',
]

In [9]:
import string
def substrings_in_string(big_string, substrings):
    """Return the first candidate that occurs anywhere inside ``big_string``.

    Candidates are tried in the order given, and the match is a plain
    case-sensitive substring test, so earlier entries take priority over
    later ones.

    Parameters
    ----------
    big_string : str
        Text to search.
    substrings : iterable of str
        Candidates, in priority order.

    Returns
    -------
    str or float
        The first matching candidate, or ``np.nan`` when none is found.
    """
    hits = (candidate for candidate in substrings if candidate in big_string)
    return next(hits, np.nan)
# Tag each passenger with the first title_list entry found in their Name
# (np.nan when no entry matches).
train_data['Title']=train_data['Name'].map(lambda x: substrings_in_string(x, title_list))

In [12]:
def replace_titles(x):
    """Collapse a passenger's raw title into 'Mr', 'Mrs', 'Miss' or 'Master'.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``x['Title']``. ``x['Sex']`` is read only when the title
        is 'Dr', where a value of 1 selects 'Mr' and anything else 'Mrs'.

    Returns
    -------
    The normalised title. A title that is not covered by any rule is returned
    unchanged.
    """
    title = x['Title']

    # 'Dr' is the only title that needs the Sex column to be resolved.
    if title == 'Dr':
        return 'Mr' if x['Sex'] == 1 else 'Mrs'

    canonical = {
        'Don': 'Mr', 'Major': 'Mr', 'Capt': 'Mr',
        'Jonkheer': 'Mr', 'Rev': 'Mr', 'Col': 'Mr',
        'Countess': 'Mrs', 'Mme': 'Mrs',
        'Mlle': 'Miss', 'Ms': 'Miss',
    }
    return canonical.get(title, title)
# Row-wise normalisation: replace_titles reads both Title and Sex, hence axis=1.
train_data['Title']=train_data.apply(replace_titles, axis=1)

In [13]:
# Sanity check: list the distinct titles left after normalisation.
train_data['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master'], dtype=object)

In [14]:
# One-hot encode the normalised title into Title_<label> indicator columns
# and append them to the training frame.
lb_Title = LabelBinarizer()
Title = pd.DataFrame(
    lb_Title.fit_transform(train_data['Title']),
    columns=lb_Title.classes_,
)
Title = Title.add_prefix('Title_')
train_data = pd.concat([train_data, Title], axis=1)

In [15]:
# Preview the frame with the Title column and its indicator columns added.
train_data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Cabin,Embarked,Family_Size,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Title,Title_Master,Title_Miss,Title_Mr,Title_Mrs
0,0,3,"Braund, Mr. Owen Harris",1,22.0,7.25,1,S,1,0,0,1,0,0,1,Mr,0,0,1,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,71.2833,0,C,1,1,0,0,1,0,0,Mrs,0,0,0,1
2,1,3,"Heikkinen, Miss. Laina",0,26.0,7.925,1,S,0,0,0,1,0,0,1,Miss,0,1,0,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,53.1,0,S,1,1,0,0,0,0,1,Mrs,0,0,0,1
4,0,3,"Allen, Mr. William Henry",1,35.0,8.05,1,S,0,0,0,1,0,0,1,Mr,0,0,1,0


In [16]:
# The free-text name and the raw title are now covered by the Title_* columns.
train_data = train_data.drop(columns=['Name', 'Title'])

In [21]:
# Remove the raw categorical columns; their one-hot versions stay in the frame.
train_data = train_data.drop(columns=['Embarked', 'Pclass'])

In [70]:
# Separate the target from the features, then hold out 25% of the rows
# (fixed seed) for a final accuracy check.
X = train_data.drop(columns=['Survived'])
y = train_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [71]:
# Baseline LightGBM configuration, scored with 20-fold cross-validation on the
# training split only.
LGBM = LGBMClassifier(max_depth=6, learning_rate=0.1, n_estimators=100)

scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_weighted', 'roc_auc']
scores = cross_validate(LGBM, X_train, y_train, scoring=scoring, cv=20)

# cross_validate returns one array per key (one value per fold); average each.
fold_means = {key: values.mean() for key, values in scores.items()}
LGBM_fit_time = fold_means['fit_time']
LGBM_score_time = fold_means['score_time']
LGBM_accuracy = fold_means['test_accuracy']
LGBM_precision = fold_means['test_precision_macro']
LGBM_recall = fold_means['test_recall_macro']
LGBM_f1 = fold_means['test_f1_weighted']
LGBM_roc = fold_means['test_roc_auc']

In [72]:
# Mean accuracy across the 20 cross-validation folds.
LGBM_accuracy

0.8202762923351159

In [73]:
# Hand-tuned configuration used for the hold-out check and the final fit.
LGBM = LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=25,
    max_depth=5,
    learning_rate=0.075,
    n_estimators=100,
)

In [74]:
# Fit on the training split and measure accuracy on the 25% hold-out split.
LGBM.fit(X_train, y_train)
y_pred = LGBM.predict(X_test)
# accuracy_score is documented as (y_true, y_pred). The value is the same in
# either order, but the documented order keeps this consistent with metrics
# that are not symmetric (precision, recall, ...).
accuracy_score(y_test, y_pred)

0.874439461883408

In [64]:
# Load the unlabelled competition rows from test.csv in the working directory.
test = pd.read_csv('test.csv')

In [65]:
# Feature engineering for the competition test frame (Model 3).

# Ticket is discarded; SibSp and Parch are folded into one companion count.
test = test.drop(columns=['Ticket'])
test['Family_Size'] = test['SibSp'] + test['Parch']

# Collapse Cabin to a two-valued label: 'NA' when missing, 'C' otherwise.
# Done as one vectorised assignment rather than row by row through chained
# indexing (frame['Cabin'][i] = ...), which pandas does not guarantee to write
# back to the frame.
test['Cabin'] = np.where(test['Cabin'].isna(), 'NA', 'C')

# Forward-fill gaps and assign the result back to the column.
# fillna(method='ffill', inplace=True) on a column selection is deprecated and
# may operate on a temporary copy. A gap in the very first row stays missing.
test['Embarked'] = test['Embarked'].ffill()
test['Age'] = test['Age'].ffill()
test = test.drop(columns=['PassengerId', 'SibSp', 'Parch'])

# NOTE(review): every encoder below is fitted on the test frame itself, so the
# resulting columns match the training features only if both files contain the
# same category values -- confirm, or reuse the encoders fitted on train.
lb_pclass = LabelBinarizer()
pclass = pd.DataFrame(
    lb_pclass.fit_transform(test['Pclass']),
    columns=lb_pclass.classes_,
    index=test.index,
).add_prefix('Pclass_')
test = pd.concat([test, pclass], axis=1)

# Two-class labels binarize to a single 0/1 column; LabelBinarizer sorts the
# classes, so the alphabetically later label is encoded as 1.
lb_gender = LabelBinarizer()
gender = lb_gender.fit_transform(test['Sex'])
test['Sex'] = gender.ravel()

# Same encoding for the cabin label: 'C' < 'NA', so 1 marks a missing cabin.
lb_cabin = LabelBinarizer()
cabin = lb_cabin.fit_transform(test['Cabin'])
test['Cabin'] = cabin.ravel()

# One-hot encode the port of embarkation, then drop the raw column.
lb_embarked = LabelBinarizer()
embarked = pd.DataFrame(
    lb_embarked.fit_transform(test['Embarked']),
    columns=lb_embarked.classes_,
    index=test.index,
).add_prefix('Embarked_')
test = pd.concat([test, embarked], axis=1)
test = test.drop(columns=['Embarked'])

In [66]:
# Extract and normalise titles for the test rows with the same helpers that
# were applied to the training frame.
raw_titles = test['Name'].map(lambda name: substrings_in_string(name, title_list))
test['Title'] = raw_titles
test['Title'] = test.apply(replace_titles, axis=1)

# One-hot encode into Title_<label> columns, then drop the text columns.
# NOTE(review): the binarizer is fitted on the test titles, so the column set
# follows whatever titles occur in test.csv -- confirm it matches training.
lb_Title = LabelBinarizer()
Title = pd.DataFrame(
    lb_Title.fit_transform(test['Title']),
    columns=lb_Title.classes_,
)
Title = Title.add_prefix('Title_')
test = pd.concat([test, Title], axis=1)
test = test.drop(columns=['Name', 'Title'])

In [67]:
# Preview the engineered test frame (the raw Pclass column is still present).
test.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Family_Size,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs
0,3,1,34.5,7.8292,1,0,0,0,1,0,1,0,0,0,1,0
1,3,0,47.0,7.0,1,1,0,0,1,0,0,1,0,0,0,1
2,2,1,62.0,9.6875,1,0,0,1,0,0,1,0,0,0,1,0
3,3,1,27.0,8.6625,1,0,0,0,1,0,0,1,0,0,1,0
4,3,0,22.0,12.2875,1,2,0,0,1,0,0,1,0,0,0,1


In [75]:
# Raw Pclass is redundant next to the Pclass_* indicator columns.
test = test.drop(columns=['Pclass'])

In [76]:
# Confirm the remaining test columns after dropping raw Pclass.
test.head()

Unnamed: 0,Sex,Age,Fare,Cabin,Family_Size,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs
0,1,34.5,7.8292,1,0,0,0,1,0,1,0,0,0,1,0
1,0,47.0,7.0,1,1,0,0,1,0,0,1,0,0,0,1
2,1,62.0,9.6875,1,0,0,1,0,0,1,0,0,0,1,0
3,1,27.0,8.6625,1,0,0,0,1,0,0,1,0,0,1,0
4,0,22.0,12.2875,1,2,0,0,1,0,0,1,0,0,0,1


In [77]:
# Refit the tuned model on every labelled row before scoring the competition set.
LGBM.fit(X, y)
# Select the test columns by the training frame's column names so the features
# line up by name and order. A column missing from the test frame now fails
# loudly with a KeyError instead of being matched by position.
Y_test = LGBM.predict(test[X.columns])

In [80]:
# Wrap the prediction array in a single-column frame (the column is labelled 0).
Y_test = pd.DataFrame(Y_test)

In [81]:
# Re-read test.csv to recover PassengerId, which was dropped from `test`.
test2 = pd.read_csv('test.csv')

In [82]:
# Place PassengerId next to the predictions; rows are aligned on the index.
Y_test = pd.concat([test2['PassengerId'],Y_test],axis=1)

In [84]:
# Give the prediction column (labelled 0) its submission name.
Y_test = Y_test.rename(columns={0: 'Survived'})

In [129]:
# Write the submission file without the frame index.
Y_test.to_csv('submit4.csv',index=False)

# Model 4: LightGBM with label-encoded titles

In [97]:
# Reload the raw training table so this section starts from unmodified data.
train_data = pd.read_csv('train.csv')

In [98]:
# Feature engineering for the Model 4 training frame.

# Ticket is discarded; SibSp and Parch are folded into one companion count.
train_data = train_data.drop(columns=['Ticket'])
train_data['Family_Size'] = train_data['SibSp'] + train_data['Parch']

# Collapse Cabin to a two-valued label: 'NA' when missing, 'C' otherwise.
# Done as one vectorised assignment rather than row by row through chained
# indexing (frame['Cabin'][i] = ...), which pandas does not guarantee to write
# back to the frame.
train_data['Cabin'] = np.where(train_data['Cabin'].isna(), 'NA', 'C')

# Forward-fill gaps and assign the result back to the column.
# fillna(method='ffill', inplace=True) on a column selection is deprecated and
# may operate on a temporary copy. A gap in the very first row stays missing.
train_data['Embarked'] = train_data['Embarked'].ffill()
train_data['Age'] = train_data['Age'].ffill()
train_data = train_data.drop(columns=['PassengerId', 'SibSp', 'Parch'])

# One-hot encode passenger class into Pclass_<class> indicator columns.
lb_pclass = LabelBinarizer()
pclass = pd.DataFrame(
    lb_pclass.fit_transform(train_data['Pclass']),
    columns=lb_pclass.classes_,
    index=train_data.index,
).add_prefix('Pclass_')
train_data = pd.concat([train_data, pclass], axis=1)

# Two-class labels binarize to a single 0/1 column; LabelBinarizer sorts the
# classes, so the alphabetically later label is encoded as 1.
lb_gender = LabelBinarizer()
gender = lb_gender.fit_transform(train_data['Sex'])
train_data['Sex'] = gender.ravel()

# Same encoding for the cabin label: 'C' < 'NA', so 1 marks a missing cabin.
lb_cabin = LabelBinarizer()
cabin = lb_cabin.fit_transform(train_data['Cabin'])
train_data['Cabin'] = cabin.ravel()

# One-hot encode the port of embarkation, then drop the raw column.
lb_embarked = LabelBinarizer()
embarked = pd.DataFrame(
    lb_embarked.fit_transform(train_data['Embarked']),
    columns=lb_embarked.classes_,
    index=train_data.index,
).add_prefix('Embarked_')
train_data = pd.concat([train_data, embarked], axis=1)
train_data = train_data.drop(columns=['Embarked'])

In [99]:
# Extract the first title_list entry found in each Name, then normalise it
# row-wise (replace_titles reads both Title and Sex, hence axis=1).
train_data['Title']=train_data['Name'].map(lambda x: substrings_in_string(x, title_list))
train_data['Title']=train_data.apply(replace_titles, axis=1)

In [100]:
# Raw Pclass is redundant next to the Pclass_* indicator columns.
train_data = train_data.drop(columns=['Pclass'])

# Integer-encode the normalised titles; the codes are written back to the
# frame in the next cell.
le_Title = LabelEncoder()
Title = le_Title.fit_transform(train_data['Title'])

In [101]:
# Replace the text titles with their integer codes from the label encoder.
train_data['Title'] = Title

In [102]:
# Preview the frame with integer-coded titles (Name is still present).
train_data.head()

Unnamed: 0,Survived,Name,Sex,Age,Fare,Cabin,Family_Size,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Title
0,0,"Braund, Mr. Owen Harris",1,22.0,7.25,1,1,0,0,1,0,0,1,2
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,71.2833,0,1,1,0,0,1,0,0,3
2,1,"Heikkinen, Miss. Laina",0,26.0,7.925,1,0,0,0,1,0,0,1,1
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,53.1,0,1,1,0,0,0,0,1,3
4,0,"Allen, Mr. William Henry",1,35.0,8.05,1,0,0,0,1,0,0,1,2


In [103]:
# The free-text name is no longer needed once Title has been derived.
train_data = train_data.drop(columns=['Name'])

In [104]:
# Display the final Model 4 training frame (the rendering is truncated by pandas).
train_data

Unnamed: 0,Survived,Sex,Age,Fare,Cabin,Family_Size,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Title
0,0,1,22.0,7.2500,1,1,0,0,1,0,0,1,2
1,1,0,38.0,71.2833,0,1,1,0,0,1,0,0,3
2,1,0,26.0,7.9250,1,0,0,0,1,0,0,1,1
3,1,0,35.0,53.1000,0,1,1,0,0,0,0,1,3
4,0,1,35.0,8.0500,1,0,0,0,1,0,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,1,27.0,13.0000,1,0,0,1,0,0,0,1,2
887,1,0,19.0,30.0000,0,0,1,0,0,0,0,1,1
888,0,0,19.0,23.4500,1,3,0,0,1,0,0,1,1
889,1,1,26.0,30.0000,0,0,1,0,0,1,0,0,2


In [105]:
# Separate the target from the features, then hold out 25% of the rows
# (fixed seed) for a final accuracy check.
X = train_data.drop(columns=['Survived'])
y = train_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [106]:
# Baseline LightGBM configuration, scored with 20-fold cross-validation on the
# training split only.
LGBM = LGBMClassifier(max_depth=6, learning_rate=0.1, n_estimators=100)

scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_weighted', 'roc_auc']
scores = cross_validate(LGBM, X_train, y_train, scoring=scoring, cv=20)

# cross_validate returns one array per key (one value per fold); average each.
fold_means = {key: values.mean() for key, values in scores.items()}
LGBM_fit_time = fold_means['fit_time']
LGBM_score_time = fold_means['score_time']
LGBM_accuracy = fold_means['test_accuracy']
LGBM_precision = fold_means['test_precision_macro']
LGBM_recall = fold_means['test_recall_macro']
LGBM_f1 = fold_means['test_f1_weighted']
LGBM_roc = fold_means['test_roc_auc']

In [107]:
# Mean accuracy across the 20 cross-validation folds.
LGBM_accuracy

0.814260249554367

In [120]:
# Hand-tuned configuration: fit on the training split, then measure accuracy
# on the 25% hold-out split.
LGBM = LGBMClassifier(max_depth=5, learning_rate=0.075, n_estimators=100,
                      boosting_type='gbdt', num_leaves=31)
LGBM.fit(X_train, y_train)
y_pred = LGBM.predict(X_test)
# accuracy_score is documented as (y_true, y_pred). The value is the same in
# either order, but the documented order keeps this consistent with metrics
# that are not symmetric (precision, recall, ...).
accuracy_score(y_test, y_pred)

0.8834080717488789

In [121]:
# Load and engineer the competition test frame (Model 4).
test = pd.read_csv('test.csv')

# Ticket is discarded; SibSp and Parch are folded into one companion count.
test = test.drop(columns=['Ticket'])
test['Family_Size'] = test['SibSp'] + test['Parch']

# Collapse Cabin to a two-valued label: 'NA' when missing, 'C' otherwise.
# Done as one vectorised assignment rather than row by row through chained
# indexing (frame['Cabin'][i] = ...), which pandas does not guarantee to write
# back to the frame.
test['Cabin'] = np.where(test['Cabin'].isna(), 'NA', 'C')

# Forward-fill gaps and assign the result back to the column.
# fillna(method='ffill', inplace=True) on a column selection is deprecated and
# may operate on a temporary copy. A gap in the very first row stays missing.
test['Embarked'] = test['Embarked'].ffill()
test['Age'] = test['Age'].ffill()
test = test.drop(columns=['PassengerId', 'SibSp', 'Parch'])

# NOTE(review): every encoder below is fitted on the test frame itself, so the
# resulting columns match the training features only if both files contain the
# same category values -- confirm, or reuse the encoders fitted on train.
lb_pclass = LabelBinarizer()
pclass = pd.DataFrame(
    lb_pclass.fit_transform(test['Pclass']),
    columns=lb_pclass.classes_,
    index=test.index,
).add_prefix('Pclass_')
test = pd.concat([test, pclass], axis=1)

# Two-class labels binarize to a single 0/1 column; LabelBinarizer sorts the
# classes, so the alphabetically later label is encoded as 1.
lb_gender = LabelBinarizer()
gender = lb_gender.fit_transform(test['Sex'])
test['Sex'] = gender.ravel()

# Same encoding for the cabin label: 'C' < 'NA', so 1 marks a missing cabin.
lb_cabin = LabelBinarizer()
cabin = lb_cabin.fit_transform(test['Cabin'])
test['Cabin'] = cabin.ravel()

# One-hot encode the port of embarkation, then drop the raw column.
lb_embarked = LabelBinarizer()
embarked = pd.DataFrame(
    lb_embarked.fit_transform(test['Embarked']),
    columns=lb_embarked.classes_,
    index=test.index,
).add_prefix('Embarked_')
test = pd.concat([test, embarked], axis=1)
test = test.drop(columns=['Embarked'])

In [122]:
# Extract and normalise titles for the test rows with the same helpers that
# were applied to the training frame.
test['Title'] = test['Name'].map(lambda name: substrings_in_string(name, title_list))
test['Title'] = test.apply(replace_titles, axis=1)

# Encode with le_Title, the encoder already fitted on the training titles, so
# each title gets the same integer code the model was trained on. Fitting a
# fresh encoder on the test titles would silently shift the codes whenever the
# two files contain different title sets. A title unseen in training now
# raises a ValueError here.
Title = le_Title.transform(test['Title'])
test = test.drop(columns=['Pclass'])
test['Title'] = Title

In [123]:
# The free-text name is no longer needed once Title has been derived.
test = test.drop(columns=['Name'])

In [124]:
# Preview the engineered test frame that is passed to the model.
test.head()

Unnamed: 0,Sex,Age,Fare,Cabin,Family_Size,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Title
0,1,34.5,7.8292,1,0,0,0,1,0,1,0,2
1,0,47.0,7.0,1,1,0,0,1,0,0,1,3
2,1,62.0,9.6875,1,0,0,1,0,0,1,0,2
3,1,27.0,8.6625,1,0,0,0,1,0,0,1,2
4,0,22.0,12.2875,1,2,0,0,1,0,0,1,3


In [125]:
# Refit the tuned model on every labelled row before scoring the competition set.
LGBM.fit(X, y)
# Select the test columns by the training frame's column names so the features
# line up by name and order. A column missing from the test frame now fails
# loudly with a KeyError instead of being matched by position.
Y_test = LGBM.predict(test[X.columns])

In [126]:
# Assemble the two-column submission frame. PassengerId is read straight from
# test.csv here (it was dropped from `test` during feature engineering), so
# this section does not depend on a frame loaded by an earlier section.
passenger_ids = pd.read_csv('test.csv', usecols=['PassengerId'])['PassengerId']
Y_test = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': Y_test})

In [127]:
# Spot-check the first submission rows before writing the file.
Y_test.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [130]:
# Write the submission file without the frame index.
Y_test.to_csv('submit5.csv',index=False)