In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import RFECV
from sklearn.linear_model import SGDClassifier
from lightgbm import LGBMClassifier

In [2]:
import warnings #Suppressing Warnings
warnings.filterwarnings('ignore')

In [3]:
train_data = pd.read_csv('train.csv')

In [4]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train_data.drop(['Name','Ticket'],axis=1,inplace=True)

In [6]:
train_data['Family_Size'] = train_data['SibSp']+train_data['Parch']

In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Cabin        204 non-null    object 
 9   Embarked     889 non-null    object 
 10  Family_Size  891 non-null    int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 76.7+ KB


In [8]:
train_data['Cabin']=train_data['Cabin'].fillna('NA')

In [9]:
for s in range(len(train_data['Cabin'])):
    if train_data['Cabin'][s]=='NA':
        pass
    else:
        train_data['Cabin'][s] = 'C'

In [10]:
train_data['Cabin'].unique()

array(['NA', 'C'], dtype=object)

In [11]:
train_data['Embarked'].fillna(method='ffill',inplace=True)

In [12]:
train_data['Age'].fillna(method='ffill',inplace=True)

In [13]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Family_Size
0,1,0,3,male,22.0,1,0,7.25,,S,1
1,2,1,1,female,38.0,1,0,71.2833,C,C,1
2,3,1,3,female,26.0,0,0,7.925,,S,0
3,4,1,1,female,35.0,1,0,53.1,C,S,1
4,5,0,3,male,35.0,0,0,8.05,,S,0


In [14]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Cabin        891 non-null    object 
 9   Embarked     891 non-null    object 
 10  Family_Size  891 non-null    int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 76.7+ KB


In [15]:
train_data.drop(['PassengerId','SibSp','Parch'],inplace=True,axis=1)

In [16]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Survived     891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Sex          891 non-null    object 
 3   Age          891 non-null    float64
 4   Fare         891 non-null    float64
 5   Cabin        891 non-null    object 
 6   Embarked     891 non-null    object 
 7   Family_Size  891 non-null    int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 55.8+ KB


In [17]:
lb_pclass = LabelBinarizer()
pclass=lb_pclass.fit_transform(train_data['Pclass'])
pclass = pd.DataFrame(pclass,columns=lb_pclass.classes_).add_prefix('Pclass_')

In [18]:
train_data = pd.concat([train_data,pclass],axis=1)

In [19]:
lb_gender = LabelBinarizer()
gender=lb_gender.fit_transform(train_data['Sex'])

In [20]:
gender = pd.DataFrame(gender)

In [21]:
train_data['Sex'] = gender

In [22]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,Family_Size,Pclass_1,Pclass_2,Pclass_3
0,0,3,1,22.0,7.25,,S,1,0,0,1
1,1,1,0,38.0,71.2833,C,C,1,1,0,0
2,1,3,0,26.0,7.925,,S,0,0,0,1
3,1,1,0,35.0,53.1,C,S,1,1,0,0
4,0,3,1,35.0,8.05,,S,0,0,0,1


In [23]:
lb_cabin = LabelBinarizer()
cabin=lb_cabin.fit_transform(train_data['Cabin'])

In [25]:
cabin = pd.DataFrame(cabin)

In [26]:
train_data = pd.concat([train_data,cabin],axis=1)

In [27]:
lb_embarked = LabelBinarizer()
embarked=lb_embarked.fit_transform(train_data['Embarked'])
embarked = pd.DataFrame(embarked,columns=lb_embarked.classes_).add_prefix('Embarked_')

In [28]:
train_data = pd.concat([train_data,embarked],axis=1)

In [29]:
train_data.drop(['Cabin','Embarked','Pclass'],axis=1,inplace=True)

In [30]:
train_data.head()

Unnamed: 0,Survived,Sex,Age,Fare,Family_Size,Pclass_1,Pclass_2,Pclass_3,0,Embarked_C,Embarked_Q,Embarked_S
0,0,1,22.0,7.25,1,0,0,1,1,0,0,1
1,1,0,38.0,71.2833,1,1,0,0,0,1,0,0
2,1,0,26.0,7.925,0,0,0,1,1,0,0,1
3,1,0,35.0,53.1,1,1,0,0,0,0,0,1
4,0,1,35.0,8.05,0,0,0,1,1,0,0,1


In [31]:
train_data.rename(columns={0:'Cabin'},inplace=True)

In [32]:
train_data.head()

Unnamed: 0,Survived,Sex,Age,Fare,Family_Size,Pclass_1,Pclass_2,Pclass_3,Cabin,Embarked_C,Embarked_Q,Embarked_S
0,0,1,22.0,7.25,1,0,0,1,1,0,0,1
1,1,0,38.0,71.2833,1,1,0,0,0,1,0,0
2,1,0,26.0,7.925,0,0,0,1,1,0,0,1
3,1,0,35.0,53.1,1,1,0,0,0,0,0,1
4,0,1,35.0,8.05,0,0,0,1,1,0,0,1


In [33]:
y = train_data['Survived']

In [34]:
X = train_data.drop(['Survived'],axis=1)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=0)

In [36]:
LR = LogisticRegression()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(LR, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
LR_fit_time = scores['fit_time'].mean()
LR_score_time = scores['score_time'].mean()
LR_accuracy = scores['test_accuracy'].mean()
LR_precision = scores['test_precision_macro'].mean()
LR_recall = scores['test_recall_macro'].mean()
LR_f1 = scores['test_f1_weighted'].mean()
LR_roc = scores['test_roc_auc'].mean()

In [37]:
decision_tree = DecisionTreeClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(decision_tree, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
dtree_fit_time = scores['fit_time'].mean()
dtree_score_time = scores['score_time'].mean()
dtree_accuracy = scores['test_accuracy'].mean()
dtree_precision = scores['test_precision_macro'].mean()
dtree_recall = scores['test_recall_macro'].mean()
dtree_f1 = scores['test_f1_weighted'].mean()
dtree_roc = scores['test_roc_auc'].mean()

In [38]:
SVM = SVC(probability = True)

scoring = ['accuracy','precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(SVM, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
SVM_fit_time = scores['fit_time'].mean()
SVM_score_time = scores['score_time'].mean()
SVM_accuracy = scores['test_accuracy'].mean()
SVM_precision = scores['test_precision_macro'].mean()
SVM_recall = scores['test_recall_macro'].mean()
SVM_f1 = scores['test_f1_weighted'].mean()
SVM_roc = scores['test_roc_auc'].mean()

In [39]:
LDA = LinearDiscriminantAnalysis()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(LDA, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
LDA_fit_time = scores['fit_time'].mean()
LDA_score_time = scores['score_time'].mean()
LDA_accuracy = scores['test_accuracy'].mean()
LDA_precision = scores['test_precision_macro'].mean()
LDA_recall = scores['test_recall_macro'].mean()
LDA_f1 = scores['test_f1_weighted'].mean()
LDA_roc = scores['test_roc_auc'].mean()

In [40]:
QDA = QuadraticDiscriminantAnalysis()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(QDA, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
QDA_fit_time = scores['fit_time'].mean()
QDA_score_time = scores['score_time'].mean()
QDA_accuracy = scores['test_accuracy'].mean()
QDA_precision = scores['test_precision_macro'].mean()
QDA_recall = scores['test_recall_macro'].mean()
QDA_f1 = scores['test_f1_weighted'].mean()
QDA_roc = scores['test_roc_auc'].mean()

In [41]:
random_forest = RandomForestClassifier(max_depth=5,n_estimators=100)

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(random_forest, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
forest_fit_time = scores['fit_time'].mean()
forest_score_time = scores['score_time'].mean()
forest_accuracy = scores['test_accuracy'].mean()
forest_precision = scores['test_precision_macro'].mean()
forest_recall = scores['test_recall_macro'].mean()
forest_f1 = scores['test_f1_weighted'].mean()
forest_roc = scores['test_roc_auc'].mean()

In [42]:
KNN = KNeighborsClassifier(n_neighbors=3)

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(KNN, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
KNN_fit_time = scores['fit_time'].mean()
KNN_score_time = scores['score_time'].mean()
KNN_accuracy = scores['test_accuracy'].mean()
KNN_precision = scores['test_precision_macro'].mean()
KNN_recall = scores['test_recall_macro'].mean()
KNN_f1 = scores['test_f1_weighted'].mean()
KNN_roc = scores['test_roc_auc'].mean()

In [43]:
bayes = GaussianNB()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(bayes, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
bayes_fit_time = scores['fit_time'].mean()
bayes_score_time = scores['score_time'].mean()
bayes_accuracy = scores['test_accuracy'].mean()
bayes_precision = scores['test_precision_macro'].mean()
bayes_recall = scores['test_recall_macro'].mean()
bayes_f1 = scores['test_f1_weighted'].mean()
bayes_roc = scores['test_roc_auc'].mean()

In [44]:
SGD = SGDClassifier(loss="hinge", penalty="l2", max_iter=45)

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(SGD, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
SGD_fit_time = scores['fit_time'].mean()
SGD_score_time = scores['score_time'].mean()
SGD_accuracy = scores['test_accuracy'].mean()
SGD_precision = scores['test_precision_macro'].mean()
SGD_recall = scores['test_recall_macro'].mean()
SGD_f1 = scores['test_f1_weighted'].mean()
SGD_roc = scores['test_roc_auc'].mean()

In [45]:
ExtraTrees = ExtraTreesClassifier(max_depth=4,n_estimators=200, random_state=0)

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(ExtraTrees, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
ExtraTrees_fit_time = scores['fit_time'].mean()
ExtraTrees_score_time = scores['score_time'].mean()
ExtraTrees_accuracy = scores['test_accuracy'].mean()
ExtraTrees_precision = scores['test_precision_macro'].mean()
ExtraTrees_recall = scores['test_recall_macro'].mean()
ExtraTrees_f1 = scores['test_f1_weighted'].mean()
ExtraTrees_roc = scores['test_roc_auc'].mean()

In [46]:
LGBM = LGBMClassifier(max_depth=6,learning_rate=0.1,n_estimators=100)

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(LGBM, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
LGBM_fit_time = scores['fit_time'].mean()
LGBM_score_time = scores['score_time'].mean()
LGBM_accuracy = scores['test_accuracy'].mean()
LGBM_precision = scores['test_precision_macro'].mean()
LGBM_recall = scores['test_recall_macro'].mean()
LGBM_f1 = scores['test_f1_weighted'].mean()
LGBM_roc = scores['test_roc_auc'].mean()

In [47]:
models_initial = pd.DataFrame({
    'Model'       : ['Logistic Regression', 'Decision Tree', 'Support Vector Machine', 'Linear Discriminant Analysis', 'Quadratic Discriminant Analysis', 'Random Forest', 'K-Nearest Neighbors', 'Bayes','SGD','ExtraTrees','LGBM'],
    'Fitting time': [LR_fit_time, dtree_fit_time, SVM_fit_time, LDA_fit_time, QDA_fit_time, forest_fit_time, KNN_fit_time, bayes_fit_time, SGD_fit_time, ExtraTrees_fit_time,LGBM_fit_time],
    'Scoring time': [LR_score_time, dtree_score_time, SVM_score_time, LDA_score_time, QDA_score_time, forest_score_time, KNN_score_time, bayes_score_time, SGD_score_time, ExtraTrees_score_time,LGBM_score_time],
    'Accuracy'    : [LR_accuracy, dtree_accuracy, SVM_accuracy, LDA_accuracy, QDA_accuracy, forest_accuracy, KNN_accuracy, bayes_accuracy, SGD_accuracy, ExtraTrees_accuracy,LGBM_accuracy],
    'Precision'   : [LR_precision, dtree_precision, SVM_precision, LDA_precision, QDA_precision, forest_precision, KNN_precision, bayes_precision, SGD_precision, ExtraTrees_precision,LGBM_precision],
    'Recall'      : [LR_recall, dtree_recall, SVM_recall, LDA_recall, QDA_recall, forest_recall, KNN_recall, bayes_recall, SGD_recall, ExtraTrees_recall,LGBM_recall],
    'F1_score'    : [LR_f1, dtree_f1, SVM_f1, LDA_f1, QDA_f1, forest_f1, KNN_f1, bayes_f1, SGD_f1, ExtraTrees_f1,LGBM_f1],
    'AUC_ROC'     : [LR_roc, dtree_roc, SVM_roc, LDA_roc, QDA_roc, forest_roc, KNN_roc, bayes_roc, SGD_roc, ExtraTrees_roc,LGBM_roc],
    }, columns = ['Model', 'Fitting time', 'Scoring time', 'Accuracy', 'Precision', 'Recall', 'F1_score', 'AUC_ROC'])

models_initial.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Fitting time,Scoring time,Accuracy,Precision,Recall,F1_score,AUC_ROC
5,Random Forest,0.44122,0.084873,0.814171,0.827349,0.784712,0.8076,0.858687
9,ExtraTrees,0.748349,0.145014,0.812701,0.834501,0.774409,0.803287,0.845652
3,Linear Discriminant Analysis,0.0342,0.024843,0.799465,0.797574,0.779217,0.796461,0.852988
10,LGBM,0.214398,0.044981,0.79336,0.793062,0.771786,0.789148,0.867366
0,Logistic Regression,0.140984,0.028424,0.790463,0.787115,0.769084,0.787282,0.851862
7,Bayes,0.012814,0.026081,0.771034,0.762237,0.75826,0.769617,0.807213
1,Decision Tree,0.015808,0.018601,0.747103,0.742238,0.727953,0.744115,0.724989
4,Quadratic Discriminant Analysis,0.022841,0.021347,0.699064,0.704034,0.666992,0.683601,0.712305
6,K-Nearest Neighbors,0.021791,0.05306,0.669073,0.651639,0.634153,0.65898,0.677962
2,Support Vector Machine,0.137225,0.017859,0.658734,0.645448,0.58451,0.609157,0.716966


In [48]:
# build the lightgbm model
import lightgbm as lgb
clf = lgb.LGBMClassifier(max_depth=6,n_extimators=100,learning_rate=0.1)
clf.fit(X_train, y_train)

LGBMClassifier(max_depth=6, n_extimators=100)

In [49]:
y_pred = clf.predict(X_test)
accuracy_score(y_pred,y_test)

0.8654708520179372

In [50]:
clf.fit(X,y)

LGBMClassifier(max_depth=6, n_extimators=100)

In [51]:
test = pd.read_csv('test.csv')

In [52]:
test.drop(['Name','Ticket'],axis=1,inplace=True)

In [53]:
test['Family_Size'] = test['SibSp']+test['Parch']
test['Cabin']=test['Cabin'].fillna('NA')
for s in range(len(train_data['Cabin'])):
    if train_data['Cabin'][s]=='NA':
        pass
    else:
        train_data['Cabin'][s] = 'C'
test['Embarked'].fillna(method='ffill',inplace=True)
test['Age'].fillna(method='ffill',inplace=True)
test.drop(['PassengerId','SibSp','Parch'],inplace=True,axis=1)
lb_pclass = LabelBinarizer()
pclass=lb_pclass.fit_transform(test['Pclass'])
pclass = pd.DataFrame(pclass,columns=lb_pclass.classes_).add_prefix('Pclass_')
test = pd.concat([test,pclass],axis=1)
lb_gender = LabelBinarizer()
gender=lb_gender.fit_transform(test['Sex'])
test['Sex'] = pd.DataFrame(gender)
lb_cabin = LabelBinarizer()
cabin=lb_cabin.fit_transform(test['Cabin'])
cabin = pd.DataFrame(cabin)
test['Cabin'] = cabin
lb_embarked = LabelBinarizer()
embarked=lb_embarked.fit_transform(test['Embarked'])
embarked = pd.DataFrame(embarked,columns=lb_embarked.classes_).add_prefix('Embarked_')
test = pd.concat([test,embarked],axis=1)
test.drop(['Embarked','Pclass'],axis=1,inplace=True)

In [197]:
submit = clf.predict(test)

In [198]:
submit

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,

In [199]:
submit2 = pd.DataFrame(submit)

In [200]:
test2 = pd.read_csv('test.csv')

In [201]:
submit2 = pd.concat([test2['PassengerId'],submit2],axis=1)

In [202]:
submit2

Unnamed: 0,PassengerId,0
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [203]:
submit2.columns

Index(['PassengerId', 0], dtype='object')

In [204]:
submit2.rename(columns={0:'Survived'},inplace=True)

In [206]:
submit2.to_csv('submission2.csv',index=False)