# TITANIC | ML | CLASSIFICATION

## 0. Imports

In [18]:
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV, KFold

## 1. Load data 

In [93]:
# Read the Kaggle Titanic training split (it contains the Survived label) and preview the first rows.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [94]:
# Read the Kaggle Titanic test split (no Survived column).
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
# Keep the ids now: PassengerId is dropped from test_data during cleaning,
# but the submission frame built at the end of the notebook needs it.
passenger_ids = test_data['PassengerId']
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## 2. Data cleaning
### 2.1 Unnecessary columns

In [21]:
# PassengerId, Name and Ticket are not used as features; remove them from the training frame.
train_data.drop(columns=["PassengerId", "Name", "Ticket"], inplace=True)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [None]:
# Remove the same non-feature columns from the test frame so it matches the training features.
test_data.drop(columns=["PassengerId", "Name", "Ticket"], inplace=True)
test_data.head()

### 2.2 Missing values

In [23]:
train_data.shape, test_data.shape

((891, 9), (418, 8))

In [24]:
# Checking which columns have missing values
train_data.isna().sum(), test_data.isna().sum()

(Survived      0
 Pclass        0
 Sex           0
 Age         177
 SibSp         0
 Parch         0
 Fare          0
 Cabin       687
 Embarked      2
 dtype: int64,
 Pclass        0
 Sex           0
 Age          86
 SibSp         0
 Parch         0
 Fare          1
 Cabin       327
 Embarked      0
 dtype: int64)

In [25]:
# Cabin is missing for 687 of the 891 training rows, so drop the column instead of imputing it.
train_data.drop(columns="Cabin", inplace=True)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [26]:
# Drop Cabin from the test data as well so both frames keep the same feature columns
# (Cabin is missing for 327 of the 418 test rows).
test_data.drop(columns="Cabin", inplace=True)
test_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [27]:
train_data.isna().sum(), test_data.isna().sum()

(Survived      0
 Pclass        0
 Sex           0
 Age         177
 SibSp         0
 Parch         0
 Fare          0
 Embarked      2
 dtype: int64,
 Pclass       0
 Sex          0
 Age         86
 SibSp        0
 Parch        0
 Fare         1
 Embarked     0
 dtype: int64)

## 3. Imputation and encoding
### 3.1 Splitting data - train, val

In [28]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [29]:
# DataFrame.info() prints its report and returns None, so call it as a statement
# and leave the row count as the cell's displayed value (instead of the tuple (891, None)).
train_data.info()
len(train_data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


(891, None)

In [30]:
# Separate the target from the features, then hold out 30% of the rows for validation.
y = train_data["Survived"]
X = train_data.drop(columns="Survived")
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [31]:
X_train.isna().sum(), X_val.isna().sum()

(Pclass        0
 Sex           0
 Age         124
 SibSp         0
 Parch         0
 Fare          0
 Embarked      1
 dtype: int64,
 Pclass       0
 Sex          0
 Age         53
 SibSp        0
 Parch        0
 Fare         0
 Embarked     1
 dtype: int64)

### 3.2 Imputation - missing values

In [32]:
# Feature groups used by the imputation, encoding and scaling steps below.
cat_columns = ["Embarked", "Sex"]
num_columns = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Pclass",
]
(num_columns, cat_columns)

(['Age', 'SibSp', 'Parch', 'Fare', 'Pclass'], ['Embarked', 'Sex'])

In [33]:
# # Imputation: filling missing values of Age (numerical feature) with mean value of column  - without SimpleImputer()
# train_data_cleaned = train_data.copy()
# train_data_cleaned["Age"] = train_data_cleaned["Age"].fillna(train_data_cleaned["Age"].mean())

In [34]:
# Work on copies so the raw split (X_train / X_val) stays untouched for the pipeline section.
X_train_cleaned, X_val_cleaned = X_train.copy(), X_val.copy()

In [35]:
# Numerical features: learn the per-column mean on the training split only,
# then reuse those fill values for the validation split.
imputer_num = SimpleImputer(strategy="mean")
X_train_cleaned[num_columns] = imputer_num.fit_transform(X_train_cleaned[num_columns])
X_val_cleaned[num_columns] = imputer_num.transform(X_val_cleaned[num_columns])

In [36]:
# # Imputation: filling missing values of Embarked (categorical feature) with the most common value - without SimpleImputer
# train_data_cleaned["Embarked"] = train_data_cleaned["Embarked"].fillna(train_data_cleaned["Embarked"].mode()[0])

In [37]:
# Categorical features: fill gaps with the most frequent level seen in the training split.
imputer_cat = SimpleImputer(strategy="most_frequent")
X_train_cleaned[cat_columns] = imputer_cat.fit_transform(X_train_cleaned[cat_columns])
X_val_cleaned[cat_columns] = imputer_cat.transform(X_val_cleaned[cat_columns])

In [38]:
# checking again if there are any missing values left
X_train_cleaned.isna().sum(), X_val_cleaned.isna().sum()

(Pclass      0
 Sex         0
 Age         0
 SibSp       0
 Parch       0
 Fare        0
 Embarked    0
 dtype: int64,
 Pclass      0
 Sex         0
 Age         0
 SibSp       0
 Parch       0
 Fare        0
 Embarked    0
 dtype: int64)

### 3.3 Encoding - One Hot Encoder

for categorical columns

In [39]:
X_train_cleaned[cat_columns] 

Unnamed: 0,Embarked,Sex
445,S,male
650,S,male
172,S,female
450,S,male
314,S,male
...,...,...
106,S,female
270,S,male
860,S,male
435,S,female


In [40]:
## One Hot Encoding on categorical features without OneHotEncoder class
# train_data_cleaned = pd.concat([train_data_cleaned.drop("Sex", axis=1), pd.get_dummies(train_data_cleaned["Sex"], prefix="Sex", drop_first=True)], axis=1)
# train_data_cleaned = pd.concat([train_data_cleaned.drop("Embarked", axis=1), pd.get_dummies(train_data_cleaned["Embarked"], prefix="Embarked", drop_first=True)], axis=1)

In [41]:
# One-hot encode the categorical features. The encoder is fitted on the training
# split only; drop="first" removes one level per feature.
encoder = OneHotEncoder(drop="first", sparse_output=False)
encoder.fit(X_train_cleaned[cat_columns])
encoded_cols = encoder.get_feature_names_out(cat_columns)

encoded_train = encoder.transform(X_train_cleaned[cat_columns])
encoded_val = encoder.transform(X_val_cleaned[cat_columns])

# Reuse the original row indices so the encoded columns line up when concatenated back.
encoded_df_train = pd.DataFrame(encoded_train, index=X_train_cleaned.index, columns=encoded_cols)
encoded_df_val = pd.DataFrame(encoded_val, index=X_val_cleaned.index, columns=encoded_cols)

In [42]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
X_train_cleaned = pd.concat(
    [X_train_cleaned.drop(columns=cat_columns), encoded_df_train],
    axis=1,
)
X_val_cleaned = pd.concat(
    [X_val_cleaned.drop(columns=cat_columns), encoded_df_val],
    axis=1,
)

In [43]:
X_train_cleaned

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Sex_male
445,1.0,4.000000,0.0,2.0,81.8583,0.0,1.0,1.0
650,3.0,29.256353,0.0,0.0,7.8958,0.0,1.0,1.0
172,3.0,1.000000,1.0,1.0,11.1333,0.0,1.0,0.0
450,2.0,36.000000,1.0,2.0,27.7500,0.0,1.0,1.0
314,2.0,43.000000,1.0,1.0,26.2500,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...
106,3.0,21.000000,0.0,0.0,7.6500,0.0,1.0,0.0
270,1.0,29.256353,0.0,0.0,31.0000,0.0,1.0,1.0
860,3.0,41.000000,2.0,0.0,14.1083,0.0,1.0,1.0
435,1.0,14.000000,1.0,2.0,120.0000,0.0,1.0,0.0


In [44]:
X_val_cleaned

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Sex_male
709,3.0,29.256353,1.0,1.0,15.2458,0.0,0.0,1.0
439,2.0,31.000000,0.0,0.0,10.5000,0.0,1.0,1.0
840,3.0,20.000000,0.0,0.0,7.9250,0.0,1.0,1.0
720,2.0,6.000000,0.0,1.0,33.0000,0.0,1.0,0.0
39,3.0,14.000000,1.0,0.0,11.2417,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
821,3.0,27.000000,0.0,0.0,8.6625,0.0,1.0,1.0
633,1.0,29.256353,0.0,0.0,0.0000,0.0,1.0,1.0
456,1.0,65.000000,0.0,0.0,26.5500,0.0,1.0,1.0
500,3.0,17.000000,0.0,0.0,8.6625,0.0,1.0,1.0


### 3.4 Scaling - StandardScaler
for numerical columns

In [45]:
# Standardise the numerical features with statistics learned from the training split only.
scaler = StandardScaler()
X_train_cleaned[num_columns] = scaler.fit_transform(X_train_cleaned[num_columns])
X_val_cleaned[num_columns] = scaler.transform(X_val_cleaned[num_columns])

In [46]:
X_train_cleaned

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Sex_male
445,-1.637881,-1.940356e+00,-0.474161,1.998853,0.980998,0.0,1.0,1.0
650,0.803267,2.729423e-16,-0.474161,-0.479327,-0.469634,0.0,1.0,1.0
172,0.803267,-2.170835e+00,0.348687,0.759763,-0.406136,0.0,1.0,0.0
450,-0.417307,5.180904e-01,0.348687,1.998853,-0.080232,0.0,1.0,1.0
314,-0.417307,1.055876e+00,0.348687,0.759763,-0.109651,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...
106,0.803267,-6.343062e-01,-0.474161,-0.479327,-0.474455,0.0,1.0,0.0
270,-1.637881,2.729423e-16,-0.474161,-0.479327,-0.016489,0.0,1.0,1.0
860,0.803267,9.022226e-01,1.171535,-0.479327,-0.347787,0.0,1.0,1.0
435,-1.637881,-1.172091e+00,0.348687,1.998853,1.729074,0.0,1.0,0.0


In [47]:
X_val_cleaned

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Sex_male
709,0.803267,2.729423e-16,0.348687,0.759763,-0.325478,0.0,0.0,1.0
439,-0.417307,1.339582e-01,-0.474161,-0.479327,-0.418557,0.0,1.0,1.0
840,0.803267,-7.111326e-01,-0.474161,-0.479327,-0.469061,0.0,1.0,1.0
720,-0.417307,-1.786703e+00,-0.474161,0.759763,0.022737,0.0,1.0,0.0
39,0.803267,-1.172091e+00,0.348687,-0.479327,-0.404010,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
821,0.803267,-1.733475e-01,-0.474161,-0.479327,-0.454596,0.0,1.0,1.0
633,-1.637881,2.729423e-16,-0.474161,-0.479327,-0.624495,0.0,1.0,1.0
456,-1.637881,2.746057e+00,-0.474161,-0.479327,-0.103767,0.0,1.0,1.0
500,0.803267,-9.416120e-01,-0.474161,-0.479327,-0.454596,0.0,1.0,1.0


## 4. Modelling

In [48]:
def calculate_metrics(y_true, y_pred):
    """Summarise classification quality as a one-row DataFrame.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``Accuracy``, ``Precision``, ``Recall``
        and ``F1``, each rounded to three decimals.
    """
    # zero_division=0 keeps an undefined score at 0 (the value scikit-learn
    # already falls back to) without emitting UndefinedMetricWarning, e.g. when
    # a model predicts no positive samples at all.
    scores = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1": f1_score(y_true, y_pred, zero_division=0),
    }
    return pd.DataFrame([scores]).round(3)

### 4.1 Logistic Regression 

In [49]:
# Baseline logistic regression with default settings.
# Solvers that could be tried: 'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'
model = LogisticRegression().fit(X_train_cleaned, y_train)
y_pred = model.predict(X_val_cleaned)

In [50]:
calculate_metrics(y_val, y_pred)

Unnamed: 0,Accuracy,Precision,Recall,F1
0,0.81,0.794,0.73,0.761


### 4.2 Decision Tree

In [51]:
# Unconstrained decision tree. random_state pins the tree's internal randomness
# (feature permutation / tie-breaking between equal splits) so reruns give the same metrics.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train_cleaned, y_train)
y_pred = model.predict(X_val_cleaned)

In [52]:
calculate_metrics(y_val, y_pred)

Unnamed: 0,Accuracy,Precision,Recall,F1
0,0.746,0.701,0.676,0.688


In [53]:
# Collect one record of validation metrics per tree depth and build the frame in a
# single step, instead of enlarging an empty DataFrame row by row.
dtc_rows = {}

for max_depth in [2, 3, 4, 5, 6]:
    # Seeded so that the comparison across depths is reproducible between runs.
    model = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    model.fit(X_train_cleaned, y_train)
    y_pred = model.predict(X_val_cleaned)

    # calculate_metrics returns a one-row frame; keep that row as a plain dict.
    dtc_rows[max_depth] = calculate_metrics(y_val, y_pred).iloc[0].to_dict()

dtc_metrics = pd.DataFrame.from_dict(dtc_rows, orient="index")
dtc_metrics.index.name = "max_depth"
dtc_metrics

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1
max_depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,0.772,0.891,0.514,0.651
3,0.817,0.83,0.703,0.761
4,0.828,0.842,0.721,0.777
5,0.802,0.784,0.721,0.751
6,0.787,0.838,0.604,0.702


### 4.3 Naive Bayes

In [54]:
# Gaussian Naive Bayes with default settings, evaluated on the validation split.
model = GaussianNB().fit(X_train_cleaned, y_train)
y_pred = model.predict(X_val_cleaned)

In [55]:
calculate_metrics(y_val, y_pred)

Unnamed: 0,Accuracy,Precision,Recall,F1
0,0.795,0.755,0.748,0.751


### 4.4 Support Vector Machine

In [56]:
# Support vector classifier with default settings, evaluated on the validation split.
model = SVC().fit(X_train_cleaned, y_train)
y_pred = model.predict(X_val_cleaned)

In [57]:
calculate_metrics(y_val, y_pred)

Unnamed: 0,Accuracy,Precision,Recall,F1
0,0.817,0.823,0.712,0.763


### 4.5 K Nearest Neighbours

In [58]:
# k-nearest-neighbours classifier with default settings, evaluated on the validation split.
model = KNeighborsClassifier().fit(X_train_cleaned, y_train)
y_pred = model.predict(X_val_cleaned)

In [59]:
calculate_metrics(y_val, y_pred)

Unnamed: 0,Accuracy,Precision,Recall,F1
0,0.802,0.815,0.676,0.739


In [60]:
# Collect one record of validation metrics per neighbourhood size and build the frame
# in a single step, instead of enlarging an empty DataFrame row by row.
knn_rows = {}

for n_neighbors in [2, 3, 4, 5, 6]:
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(X_train_cleaned, y_train)
    y_pred = model.predict(X_val_cleaned)

    # calculate_metrics returns a one-row frame; keep that row as a plain dict.
    knn_rows[n_neighbors] = calculate_metrics(y_val, y_pred).iloc[0].to_dict()

knn_metrics = pd.DataFrame.from_dict(knn_rows, orient="index")
knn_metrics.index.name = "n_neighbors"
knn_metrics

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1
n_neighbors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,0.776,0.849,0.559,0.674
3,0.795,0.775,0.712,0.742
4,0.776,0.823,0.586,0.684
5,0.802,0.815,0.676,0.739
6,0.791,0.831,0.622,0.711


### 4.6 Random Forest Classifier

In [61]:
# Random forest with default settings. random_state seeds the forest's randomness
# so the reported validation metrics can be reproduced.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_cleaned, y_train)
y_pred = model.predict(X_val_cleaned)

In [62]:
calculate_metrics(y_val, y_pred)

Unnamed: 0,Accuracy,Precision,Recall,F1
0,0.784,0.752,0.712,0.731


In [63]:
# Collect one record of validation metrics per forest size and build the frame in a
# single step, instead of enlarging an empty DataFrame row by row.
rfc_rows = {}

for n_estimators in [10, 50, 100, 500]:
    # Seeded so that differences between rows come from n_estimators, not from run-to-run noise.
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    model.fit(X_train_cleaned, y_train)
    y_pred = model.predict(X_val_cleaned)

    # calculate_metrics returns a one-row frame; keep that row as a plain dict.
    rfc_rows[n_estimators] = calculate_metrics(y_val, y_pred).iloc[0].to_dict()

rfc_metrics = pd.DataFrame.from_dict(rfc_rows, orient="index")
rfc_metrics.index.name = "n_estimators"
rfc_metrics

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1
n_estimators,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,0.791,0.772,0.703,0.736
50,0.795,0.78,0.703,0.739
100,0.787,0.76,0.712,0.735
500,0.787,0.765,0.703,0.732


## 5. Pipeline

In [64]:
num_columns, cat_columns

(['Age', 'SibSp', 'Parch', 'Fare', 'Pclass'], ['Embarked', 'Sex'])

In [65]:
# Numerical branch: fill missing values (SimpleImputer's default strategy), then standardise.
num_pipe = Pipeline([
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler()),
])
# Categorical branch: fill with the most frequent level, then one-hot encode.
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first", sparse_output=False)),
])

# Route each column group to its branch. The step names ("numeric", "categorical",
# "imputer", "scaler") are referenced by the grid-search parameter keys, so keep them stable.
preprocessor = ColumnTransformer(transformers=[
    ("numeric", num_pipe, num_columns),
    ("categorical", cat_pipe, cat_columns),
])

# No separate preprocessor.fit(X_train) call is needed: fitting the pipeline
# fits the preprocessor and the model together on the raw training split.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression()),
])

pipeline.fit(X_train, y_train)

In [66]:
# The pipeline takes the raw (un-imputed, un-encoded) validation features and
# applies its own preprocessing before predicting.
y_pred = pipeline.predict(X_val)
calculate_metrics(y_val, y_pred)

Unnamed: 0,Accuracy,Precision,Recall,F1
0,0.81,0.794,0.73,0.761


## 6. Cross-validation

In [68]:
# Search space for the logistic-regression pipeline. Keys follow the nested
# <step>__<sub-step>__<parameter> naming of the pipeline defined above.
param_grid = {
    "preprocessor__numeric__imputer__strategy": ["mean", "median"],
    "preprocessor__numeric__scaler": [StandardScaler(), MinMaxScaler()],
    "model__C": [0.1, 1, 10],
}
# Seed the shuffle so fold membership, and therefore best_params_ / best_score_,
# are the same on every run.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

optimizer = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)

In [69]:
optimizer.fit(X_train,y_train)

In [70]:
optimizer.best_params_

{'model__C': 0.1,
 'preprocessor__numeric__imputer__strategy': 'median',
 'preprocessor__numeric__scaler': StandardScaler()}

In [71]:
optimizer.best_score_

0.7979262672811059

In [72]:
optimizer.best_estimator_

In [73]:
# Score the best pipeline found by the grid search on the held-out validation split.
y_pred = optimizer.best_estimator_.predict(X_val)
calculate_metrics(y_val, y_pred)

Unnamed: 0,Accuracy,Precision,Recall,F1
0,0.821,0.832,0.712,0.767


In [74]:
# Candidate estimators with their own hyper-parameter ranges. The "model" key swaps
# the final pipeline step; the "model__*" keys tune whichever estimator is plugged in.
model_params_lr = {
    "model": [LogisticRegression(max_iter=100, random_state=42, solver="saga")],
    "model__C": [0.1, 1, 10],
    "model__penalty": ["l1", "l2"],
}

model_params_rf = {
    "model": [RandomForestClassifier(random_state=42)],
    "model__n_estimators": [50, 100, 150],
    "model__max_depth": [None, 5, 10],
}

# (label, scaler) pairs; only the scaler object is used when building the grid.
base_params = [("scaler_std", StandardScaler()), ("scaler_minmax", MinMaxScaler())]

# One sub-grid per (estimator, scaler) combination: first the logistic-regression
# grids for each scaler, then the random-forest grids for each scaler.
param_grid = [
    {"preprocessor__numeric__scaler": [scaler_obj], **estimator_params}
    for estimator_params in (model_params_lr, model_params_rf)
    for _label, scaler_obj in base_params
]

optimizer2 = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)
optimizer2.fit(X_train, y_train)
print(f"Best params: {optimizer2.best_params_}")
print(f"Best score: {optimizer2.best_score_:.3f}")

Best params: {'model': RandomForestClassifier(max_depth=10, random_state=42), 'model__max_depth': 10, 'model__n_estimators': 100, 'preprocessor__numeric__scaler': MinMaxScaler()}
Best score: 0.825


In [77]:
# Score the best pipeline from the multi-model grid search on the held-out validation split.
y_pred = optimizer2.best_estimator_.predict(X_val)

calculate_metrics(y_val, y_pred)

Unnamed: 0,Accuracy,Precision,Recall,F1
0,0.799,0.806,0.676,0.735


## 7. Prediction with the best model

In [86]:
# X_test is another name for the cleaned test frame (no copy is made).
X_test = test_data

In [89]:
# Predict with the best pipeline from the second grid search; it receives the test
# features as prepared in the cleaning section and runs its own preprocessing steps.
preds_test = optimizer2.best_estimator_.predict(X_test)

In [95]:
passenger_ids

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [96]:
# Assemble the submission table: one row per test passenger with its predicted label.
# (The file itself is written in the next cell.)
submission_columns = {"PassengerId": passenger_ids, "Survived": preds_test}
output = pd.DataFrame(submission_columns)
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [97]:
# Write the submission file; index=False keeps the DataFrame's row index out of the CSV.
output.to_csv('submission.csv', index=False)