In [1]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any deprecation/future warnings that
# pandas or scikit-learn calls in later cells may raise.
warnings.filterwarnings('ignore')

In [2]:
# Load the Titanic training split from the Kaggle input directory and preview it.
df = pd.read_csv('/kaggle/input/titanic-dataset/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# (rows, columns) of the training frame as loaded.
df.shape

(891, 12)

In [4]:
# Column labels available in the training frame.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# Remove the per-passenger identifier columns from the training frame (mutates df).
df.drop(['Name', 'PassengerId'], axis=1, inplace=True)

In [6]:
# Names of every object-dtype column, shown together with how many there are.
categorical_cols = df.select_dtypes(include='object').columns
(categorical_cols, len(categorical_cols))

(Index(['Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object'), 4)

In [7]:
# Print the distinct values (NaN included) and their count for each categorical column.
for col in categorical_cols:
    distinct_values = df[col].unique()
    print(f'Uniques values in {col} : {distinct_values}\nNo of Unique values : {len(distinct_values)}\n')

Uniques values in Sex : ['male' 'female']
No of Unique values : 2

Uniques values in Ticket : ['A/5 21171' 'PC 17599' 'STON/O2. 3101282' '113803' '373450' '330877'
 '17463' '349909' '347742' '237736' 'PP 9549' '113783' 'A/5. 2151'
 '347082' '350406' '248706' '382652' '244373' '345763' '2649' '239865'
 '248698' '330923' '113788' '347077' '2631' '19950' '330959' '349216'
 'PC 17601' 'PC 17569' '335677' 'C.A. 24579' 'PC 17604' '113789' '2677'
 'A./5. 2152' '345764' '2651' '7546' '11668' '349253' 'SC/Paris 2123'
 '330958' 'S.C./A.4. 23567' '370371' '14311' '2662' '349237' '3101295'
 'A/4. 39886' 'PC 17572' '2926' '113509' '19947' 'C.A. 31026' '2697'
 'C.A. 34651' 'CA 2144' '2669' '113572' '36973' '347088' 'PC 17605' '2661'
 'C.A. 29395' 'S.P. 3464' '3101281' '315151' 'C.A. 33111' 'S.O.C. 14879'
 '2680' '1601' '348123' '349208' '374746' '248738' '364516' '345767'
 '345779' '330932' '113059' 'SO/C 14885' '3101278' 'W./C. 6608'
 'SOTON/OQ 392086' '343275' '343276' '347466' 'W.E.P. 5734' 'C.A.

In [8]:
# Missing-value count per column before imputation.
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [9]:
from sklearn.impute import SimpleImputer

# Median imputer for Age. The fitted object is kept so the same training median can be
# applied to the Kaggle test file later in the notebook.
age_imputer = SimpleImputer(strategy='median')

df['Age'] = age_imputer.fit_transform(df[['Age']])
# Assign the filled column back instead of calling fillna(inplace=True) on a column
# selection: that form is chained assignment, which pandas warns about and which does
# not update the parent frame once Copy-on-Write is enabled.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])  # most frequent port
df['Cabin'] = df['Cabin'].fillna('Unknown')  # Placeholder value

In [10]:
# Re-check missing values after imputation; every column should now report 0.
df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [11]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot encoder; drop='first' avoids multicollinearity by omitting the first
# category of each feature.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was later removed, so try the current spelling
# first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    encoder = OneHotEncoder(sparse=False, drop='first')

In [12]:
# Restrict one-hot encoding to the two low-cardinality columns. This rebinds
# `categorical_cols` (previously a pandas Index of all object columns) to a plain list.
categorical_cols = ['Sex', 'Embarked']
# Fit the encoder on the training rows and encode them in one step.
encoded_data = encoder.fit_transform(df[categorical_cols])

In [13]:
# Shape of the encoded array: one row per passenger, one column per kept category.
encoded_data.shape

(891, 3)

In [14]:
# Wrap the encoded array in a DataFrame with readable column names. The array is
# positional, so give it df's own index: the column-wise concat that follows aligns on
# index labels and would otherwise misalign rows (and introduce NaNs) whenever df does
# not carry a default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

In [15]:
# Display the one-hot columns produced for Sex and Embarked.
encoded_df

Unnamed: 0,Sex_male,Embarked_Q,Embarked_S
0,1.0,0.0,1.0
1,0.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,1.0,0.0,1.0
...,...,...,...
886,1.0,0.0,1.0
887,0.0,0.0,1.0
888,0.0,0.0,1.0
889,1.0,0.0,0.0


In [16]:
# Swap the raw Sex/Embarked columns for their one-hot counterparts: drop the originals
# and append the encoded columns side by side (rows are matched on index labels).
df_encoded = pd.concat(
    [df.drop(categorical_cols, axis=1), encoded_df],
    axis='columns',
)

In [17]:
# Display the training frame after one-hot encoding (Ticket and Cabin are still raw strings).
df_encoded

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,A/5 21171,7.2500,Unknown,1.0,0.0,1.0
1,1,1,38.0,1,0,PC 17599,71.2833,C85,0.0,0.0,0.0
2,1,3,26.0,0,0,STON/O2. 3101282,7.9250,Unknown,0.0,0.0,1.0
3,1,1,35.0,1,0,113803,53.1000,C123,0.0,0.0,1.0
4,0,3,35.0,0,0,373450,8.0500,Unknown,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,211536,13.0000,Unknown,1.0,0.0,1.0
887,1,1,19.0,0,0,112053,30.0000,B42,0.0,0.0,1.0
888,0,3,28.0,1,2,W./C. 6607,23.4500,Unknown,0.0,0.0,1.0
889,1,1,26.0,0,0,111369,30.0000,C148,1.0,0.0,0.0


In [18]:
from sklearn.preprocessing import OrdinalEncoder

# One ordinal encoder per high-cardinality text column. Both are configured to map
# categories that were not present at fit time to -1 when transforming new data.
# NOTE(review): the resulting integer codes have no inherent order for ticket/cabin
# strings, yet the downstream linear model will treat them as magnitudes — confirm
# this is intended.
ticket_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
cabin_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Fit each encoder on its training column and store the codes in a new "<col>_encoded" column.
for source_col, col_encoder in (('Ticket', ticket_encoder), ('Cabin', cabin_encoder)):
    df_encoded[f'{source_col}_encoded'] = col_encoder.fit_transform(df_encoded[[source_col]])

In [19]:
# The raw string columns are no longer needed once their encoded versions exist.
df_encoded.drop(['Ticket', 'Cabin'], axis=1, inplace=True)

In [20]:
# Preview the fully numeric training frame.
df_encoded.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Ticket_encoded,Cabin_encoded
0,0,3,22.0,1,0,7.25,1.0,0.0,1.0,523.0,147.0
1,1,1,38.0,1,0,71.2833,0.0,0.0,0.0,596.0,81.0
2,1,3,26.0,0,0,7.925,0.0,0.0,1.0,669.0,147.0
3,1,1,35.0,1,0,53.1,0.0,0.0,1.0,49.0,55.0
4,0,3,35.0,0,0,8.05,1.0,0.0,1.0,472.0,147.0


In [21]:
# Confirm that encoding introduced no missing values.
df_encoded.isnull().sum()

Survived          0
Pclass            0
Age               0
SibSp             0
Parch             0
Fare              0
Sex_male          0
Embarked_Q        0
Embarked_S        0
Ticket_encoded    0
Cabin_encoded     0
dtype: int64

In [22]:
# Keep only the first occurrence of each fully identical row. The surviving rows keep
# their original index labels (the index is not reset).
df_encoded = df_encoded.drop_duplicates()

In [23]:
# Shape after duplicate removal.
df_encoded.shape

(875, 11)

In [24]:
# Spot-check ten random rows. The seed is fixed so the same rows are shown every time
# the notebook is re-run (42 matches the seed used for the train/test split).
df_encoded.sample(10, random_state=42)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Ticket_encoded,Cabin_encoded
588,0,3,22.0,0,0,8.05,1.0,0.0,1.0,79.0,147.0
105,0,3,28.0,0,0,7.8958,1.0,0.0,1.0,354.0,147.0
159,0,3,28.0,8,2,69.55,1.0,0.0,1.0,568.0,147.0
750,1,2,4.0,1,1,23.0,0.0,0.0,1.0,234.0,147.0
483,1,3,63.0,0,0,9.5875,0.0,0.0,1.0,489.0,147.0
650,0,3,28.0,0,0,7.8958,1.0,0.0,1.0,366.0,147.0
30,0,1,40.0,0,0,27.7208,1.0,0.0,0.0,598.0,147.0
125,1,3,12.0,1,0,11.2417,1.0,0.0,0.0,186.0,147.0
566,0,3,19.0,0,0,7.8958,1.0,0.0,1.0,352.0,147.0
808,0,2,39.0,0,0,13.0,1.0,0.0,1.0,154.0,147.0


In [25]:
# Final missing-value check before separating features and target.
df_encoded.isnull().sum()

Survived          0
Pclass            0
Age               0
SibSp             0
Parch             0
Fare              0
Sex_male          0
Embarked_Q        0
Embarked_S        0
Ticket_encoded    0
Cabin_encoded     0
dtype: int64

In [26]:
# Target vector: the Survived column.
df_y = df_encoded['Survived']
# Feature matrix: every remaining column.
df_X = df_encoded.drop('Survived', axis=1)

In [27]:
# Preview the feature matrix (Survived removed).
df_X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Ticket_encoded,Cabin_encoded
0,3,22.0,1,0,7.25,1.0,0.0,1.0,523.0,147.0
1,1,38.0,1,0,71.2833,0.0,0.0,0.0,596.0,81.0
2,3,26.0,0,0,7.925,0.0,0.0,1.0,669.0,147.0
3,1,35.0,1,0,53.1,0.0,0.0,1.0,49.0,55.0
4,3,35.0,0,0,8.05,1.0,0.0,1.0,472.0,147.0


In [28]:
# Display the target Series.
df_y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 875, dtype: int64

In [29]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler with default settings; it is fitted on the training features in the
# next cell and reused unchanged for the Kaggle test features later on.
scaler = MinMaxScaler()

In [30]:
# Columns to rescale; the one-hot columns are left out.
numeric_cols = ['Pclass','Age','SibSp','Parch','Fare','Ticket_encoded','Cabin_encoded']

# Fit the scaler on these columns and overwrite them with the scaled values.
# NOTE(review): the scaler is fitted on all of df_X, i.e. before the train/test split
# performed in a later cell, so rows that end up in the held-out fold contribute to the
# min/max used for scaling.
df_X[numeric_cols] = scaler.fit_transform(df_X[numeric_cols])

In [31]:
# Display the scaled feature matrix.
df_X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Ticket_encoded,Cabin_encoded
0,1.0,0.271174,0.125,0.000000,0.014151,1.0,0.0,1.0,0.769118,1.000000
1,0.0,0.472229,0.125,0.000000,0.139136,0.0,0.0,0.0,0.876471,0.551020
2,1.0,0.321438,0.000,0.000000,0.015469,0.0,0.0,1.0,0.983824,1.000000
3,0.0,0.434531,0.125,0.000000,0.103644,0.0,0.0,1.0,0.072059,0.374150
4,1.0,0.434531,0.000,0.000000,0.015713,1.0,0.0,1.0,0.694118,1.000000
...,...,...,...,...,...,...,...,...,...,...
886,0.5,0.334004,0.000,0.000000,0.025374,1.0,0.0,1.0,0.148529,1.000000
887,0.0,0.233476,0.000,0.000000,0.058556,0.0,0.0,1.0,0.020588,0.204082
888,1.0,0.346569,0.125,0.333333,0.045771,0.0,0.0,1.0,0.992647,1.000000
889,0.0,0.321438,0.000,0.000000,0.058556,1.0,0.0,0.0,0.011765,0.408163


In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [33]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Size of the training fold.
X_train.shape

(700, 10)

In [35]:
# Size of the held-out fold.
X_test.shape

(175, 10)

In [104]:
# Baseline classifier: logistic regression with default hyper-parameters, fitted on the
# training fold.
model = LogisticRegression()
model.fit(X_train, y_train)

In [107]:
# Display the held-out features that the baseline model is evaluated on.
X_test

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Ticket_encoded,Cabin_encoded
809,0.0,0.409399,0.125,0.000000,0.103644,0.0,0.0,1.0,0.075000,0.931973
320,1.0,0.271174,0.000,0.000000,0.014151,1.0,0.0,1.0,0.770588,1.000000
384,1.0,0.346569,0.000,0.000000,0.015412,1.0,0.0,1.0,0.545588,1.000000
802,0.0,0.132948,0.125,0.333333,0.234224,1.0,0.0,1.0,0.048529,0.319728
426,0.5,0.346569,0.125,0.000000,0.050749,0.0,0.0,1.0,0.147059,1.000000
...,...,...,...,...,...,...,...,...,...,...
212,1.0,0.271174,0.000,0.000000,0.014151,1.0,0.0,1.0,0.773529,1.000000
703,1.0,0.308872,0.000,0.000000,0.015111,1.0,1.0,0.0,0.660294,1.000000
81,1.0,0.359135,0.000,0.000000,0.018543,1.0,0.0,1.0,0.457353,1.000000
781,0.0,0.208344,0.125,0.000000,0.111257,0.0,0.0,1.0,0.130882,0.129252


In [105]:
# Class predictions of the baseline model for the held-out fold.
y_pred = model.predict(X_test)

In [106]:
# Score the held-out predictions: overall accuracy plus per-class precision/recall/F1.
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

for output_line in (f"Accuracy: {accuracy:.2f}", "Classification Report:", report):
    print(output_line)


Accuracy: 0.79
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.85      0.83       101
           1       0.78      0.72      0.75        74

    accuracy                           0.79       175
   macro avg       0.79      0.78      0.79       175
weighted avg       0.79      0.79      0.79       175



# ***USING SMOTE TECHNIQUE***

In [39]:
# Disabled experiment: every line of this cell is commented out, so nothing here runs.
# NOTE(review): as written it would oversample df_X/df_y *before* train_test_split, so
# synthetic rows could be derived from passengers that later land in the held-out fold.
# If this is re-enabled, resample only the training fold.
# from imblearn.over_sampling import SMOTE

# # Apply SMOTE to balance the dataset
# smote = SMOTE()
# X_resampled, y_resampled = smote.fit_resample(df_X, df_y)

# # Split the resampled dataset
# X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# # Train the model again
# model = LogisticRegression()
# model.fit(X_train, y_train)

# # Make predictions and evaluate
# y_pred = model.predict(X_test)

# # Evaluation
# accuracy = accuracy_score(y_test, y_pred)
# report = classification_report(y_test, y_pred)

# print(f"Accuracy: {accuracy:.2f}")
# print("Classification Report:")
# print(report)


In [40]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates: regularisation strength C and optimisation solver.
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs']
}

# Setup GridSearchCV (5-fold cross-validation on the training fold, scored by accuracy).
# A fresh estimator is passed in directly rather than rebinding `model`: the search
# fits clones of the estimator it receives, so rebinding `model` here would leave that
# name pointing at an unfitted LogisticRegression and make the later
# `model.predict(...)` on the Kaggle test features fail when the notebook is run
# top to bottom.
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and the estimator refitted with them on the whole training fold
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Make predictions on the held-out fold with the tuned model
y_pred = best_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)


Best Parameters: {'C': 1, 'solver': 'liblinear'}
Accuracy: 0.78
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.86      0.82       101
           1       0.78      0.66      0.72        74

    accuracy                           0.78       175
   macro avg       0.78      0.76      0.77       175
weighted avg       0.78      0.78      0.77       175



In [113]:
# Load the Kaggle test split (no Survived column) and preview it.
test_df = pd.read_csv('/kaggle/input/titanic-dataset/test.csv')
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [80]:
# Look for rows whose Cabin is an empty string. The saved output has no rows: per the
# null counts in a following cell, missing cabins are NaN rather than ''.
test_df[test_df['Cabin']=='']

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [81]:
# (rows, columns) of the test frame.
test_df.shape

(418, 11)

In [82]:
# Missing-value count per column in the test frame before imputation.
test_df.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [83]:
# Fill missing ages with the median learned from the training data.
test_df['Age'] = age_imputer.transform(test_df[['Age']])
# Assign the filled column back instead of calling fillna(inplace=True) on a column
# selection: that form is chained assignment, which pandas warns about and which does
# not update the parent frame once Copy-on-Write is enabled.
test_df['Cabin'] = test_df['Cabin'].fillna('Unknown')  # Placeholder value

# Fare has no missing values in the training frame but does in the test file. Learn the
# median from the training Fare column (df) and only *apply* it to the test file, so
# that, like Age, no statistic is estimated from the test data itself.
fare_imputer = SimpleImputer(strategy='median')
fare_imputer.fit(df[['Fare']])
test_df['Fare'] = fare_imputer.transform(test_df[['Fare']])

In [84]:
# Re-check missing values in the test frame after imputation.
test_df.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [85]:
# Display the imputed test frame.
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Unknown,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,Unknown,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Unknown,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,Unknown,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,Unknown,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,28.0,0,0,A.5. 3236,8.0500,Unknown,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,Unknown,S
416,1308,3,"Ware, Mr. Frederick",male,28.0,0,0,359309,8.0500,Unknown,S


In [86]:
# Mirror the training preprocessing: remove Name and PassengerId from the test frame
# (mutates test_df).
test_df.drop(['Name', 'PassengerId'], axis=1, inplace=True)

In [87]:
# Preview the test frame after the identifier columns were removed.
test_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,male,34.5,0,0,330911,7.8292,Unknown,Q
1,3,female,47.0,1,0,363272,7.0,Unknown,S
2,2,male,62.0,0,0,240276,9.6875,Unknown,Q
3,3,male,27.0,0,0,315154,8.6625,Unknown,S
4,3,female,22.0,1,1,3101298,12.2875,Unknown,S


In [88]:
# Same two columns that the one-hot encoder was fitted on.
categorical_cols = ['Sex', 'Embarked']
# transform (not fit_transform): reuse the categories learned from the training data.
# This overwrites the `encoded_data` array produced for the training frame.
encoded_data = encoder.transform(test_df[categorical_cols])

In [89]:
# Display the one-hot array for the test rows.
encoded_data

array([[1., 1., 0.],
       [0., 0., 1.],
       [1., 1., 0.],
       ...,
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 0., 0.]])

In [90]:
# Wrap the encoded test array in a DataFrame with readable column names. The array is
# positional, so give it test_df's own index: the column-wise concat that follows
# aligns on index labels and would otherwise misalign rows (and introduce NaNs)
# whenever test_df does not carry a default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=test_df.index,
)

In [91]:
# Build the encoded test frame: drop raw Sex/Embarked and append the one-hot columns.
# This rebinds `df_encoded`, which until now referred to the encoded training frame.
df_encoded = pd.concat(
    [test_df.drop(categorical_cols, axis=1), encoded_df],
    axis='columns',
)

In [92]:
# Shape of the encoded test frame.
df_encoded.shape

(418, 10)

In [93]:
# Preview the encoded test frame (Ticket and Cabin are still raw strings).
df_encoded.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
0,3,34.5,0,0,330911,7.8292,Unknown,1.0,1.0,0.0
1,3,47.0,1,0,363272,7.0,Unknown,0.0,0.0,1.0
2,2,62.0,0,0,240276,9.6875,Unknown,1.0,1.0,0.0
3,3,27.0,0,0,315154,8.6625,Unknown,1.0,0.0,1.0
4,3,22.0,1,1,3101298,12.2875,Unknown,0.0,0.0,1.0


In [94]:
# Encode Ticket and Cabin with the encoders fitted on the training data. Values those
# encoders never saw come out as -1, as configured when the encoders were created.
for raw_col, fitted_encoder in (('Ticket', ticket_encoder), ('Cabin', cabin_encoder)):
    df_encoded[f'{raw_col}_encoded'] = fitted_encoder.transform(test_df[[raw_col]])

In [95]:
# Preview the test frame with the new Ticket_encoded / Cabin_encoded columns.
df_encoded.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S,Ticket_encoded,Cabin_encoded
0,3,34.5,0,0,330911,7.8292,Unknown,1.0,1.0,0.0,-1.0,147.0
1,3,47.0,1,0,363272,7.0,Unknown,0.0,0.0,1.0,-1.0,147.0
2,2,62.0,0,0,240276,9.6875,Unknown,1.0,1.0,0.0,-1.0,147.0
3,3,27.0,0,0,315154,8.6625,Unknown,1.0,0.0,1.0,-1.0,147.0
4,3,22.0,1,1,3101298,12.2875,Unknown,0.0,0.0,1.0,251.0,147.0


In [96]:
# Remove the raw string columns now that their encoded versions are present.
df_encoded.drop(['Ticket', 'Cabin'], axis=1, inplace=True)

In [97]:
# Preview the fully numeric test frame.
df_encoded.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Ticket_encoded,Cabin_encoded
0,3,34.5,0,0,7.8292,1.0,1.0,0.0,-1.0,147.0
1,3,47.0,1,0,7.0,0.0,0.0,1.0,-1.0,147.0
2,2,62.0,0,0,9.6875,1.0,1.0,0.0,-1.0,147.0
3,3,27.0,0,0,8.6625,1.0,0.0,1.0,-1.0,147.0
4,3,22.0,1,1,12.2875,0.0,0.0,1.0,251.0,147.0


In [100]:
# Select numeric columns (same list that the scaler was fitted on)
numeric_cols = ['Pclass','Age','SibSp','Parch','Fare','Ticket_encoded','Cabin_encoded']

# Transform only — the scaler is NOT refitted here; it applies the min/max learned from
# the training features, so test values outside the training range fall outside [0, 1]
# (e.g. the -1 "unknown" codes become slightly negative).
df_encoded[numeric_cols] = scaler.transform(df_encoded[numeric_cols])

In [108]:
# Display the scaled test features.
df_encoded

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Ticket_encoded,Cabin_encoded
0,1.0,0.428248,0.000,0.000000,0.015282,1.0,1.0,0.0,-0.001471,1.000000
1,1.0,0.585323,0.125,0.000000,0.013663,0.0,0.0,1.0,-0.001471,1.000000
2,0.5,0.773813,0.000,0.000000,0.018909,1.0,1.0,0.0,-0.001471,1.000000
3,1.0,0.334004,0.000,0.000000,0.016908,1.0,0.0,1.0,-0.001471,1.000000
4,1.0,0.271174,0.125,0.166667,0.023984,0.0,0.0,1.0,0.369118,1.000000
...,...,...,...,...,...,...,...,...,...,...
413,1.0,0.346569,0.000,0.000000,0.015713,1.0,0.0,1.0,-0.001471,1.000000
414,0.0,0.484795,0.000,0.000000,0.212559,0.0,0.0,0.0,0.898529,-0.006803
415,1.0,0.478512,0.000,0.000000,0.014151,1.0,0.0,1.0,-0.001471,1.000000
416,1.0,0.346569,0.000,0.000000,0.015713,1.0,0.0,1.0,-0.001471,1.000000


In [109]:
# Bind the preprocessed test features to X (same object as df_encoded, not a copy)
# and display it.
X = df_encoded
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Ticket_encoded,Cabin_encoded
0,1.0,0.428248,0.000,0.000000,0.015282,1.0,1.0,0.0,-0.001471,1.000000
1,1.0,0.585323,0.125,0.000000,0.013663,0.0,0.0,1.0,-0.001471,1.000000
2,0.5,0.773813,0.000,0.000000,0.018909,1.0,1.0,0.0,-0.001471,1.000000
3,1.0,0.334004,0.000,0.000000,0.016908,1.0,0.0,1.0,-0.001471,1.000000
4,1.0,0.271174,0.125,0.166667,0.023984,0.0,0.0,1.0,0.369118,1.000000
...,...,...,...,...,...,...,...,...,...,...
413,1.0,0.346569,0.000,0.000000,0.015713,1.0,0.0,1.0,-0.001471,1.000000
414,0.0,0.484795,0.000,0.000000,0.212559,0.0,0.0,0.0,0.898529,-0.006803
415,1.0,0.478512,0.000,0.000000,0.014151,1.0,0.0,1.0,-0.001471,1.000000
416,1.0,0.346569,0.000,0.000000,0.015713,1.0,0.0,1.0,-0.001471,1.000000


In [110]:
# Predict survival for the Kaggle test features; `model` must be a fitted estimator
# at this point.
# NOTE(review): this uses `model`, not the grid-searched `best_model` — confirm which
# one the submission is meant to come from.
y = model.predict(X)

In [111]:
# Display the predicted labels for the test file.
y

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [114]:
# Peek at the first row of test_df.
# NOTE(review): the saved output shows Name, PassengerId and a missing Cabin, which an
# earlier cell drops/fills in place; the output appears to come from re-running the
# loading cell out of order — confirm with Restart & Run All.
test_df.head(1)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q


In [126]:
import numpy as np
# Read the passenger IDs straight from the test file rather than from test_df: the
# preprocessing above drops PassengerId from test_df in place, so indexing it here
# raises KeyError when the notebook is run top to bottom. No test rows are removed
# during preprocessing, so the file order matches the order of the predictions.
ids = np.array(
    pd.read_csv('/kaggle/input/titanic-dataset/test.csv', usecols=['PassengerId'])['PassengerId']
)

In [127]:
# Display the passenger IDs that will label the submission rows.
ids

array([ 892,  893,  894,  895,  896,  897,  898,  899,  900,  901,  902,
        903,  904,  905,  906,  907,  908,  909,  910,  911,  912,  913,
        914,  915,  916,  917,  918,  919,  920,  921,  922,  923,  924,
        925,  926,  927,  928,  929,  930,  931,  932,  933,  934,  935,
        936,  937,  938,  939,  940,  941,  942,  943,  944,  945,  946,
        947,  948,  949,  950,  951,  952,  953,  954,  955,  956,  957,
        958,  959,  960,  961,  962,  963,  964,  965,  966,  967,  968,
        969,  970,  971,  972,  973,  974,  975,  976,  977,  978,  979,
        980,  981,  982,  983,  984,  985,  986,  987,  988,  989,  990,
        991,  992,  993,  994,  995,  996,  997,  998,  999, 1000, 1001,
       1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012,
       1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
       1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034,
       1035, 1036, 1037, 1038, 1039, 1040, 1041, 10

In [128]:
# Display the predictions again before pairing them with the IDs.
y

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [129]:
# Sanity check: the two lengths must match for a one-to-one ID/prediction pairing.
len(ids),len(y)

(418, 418)

In [138]:
# Column-name -> values mapping for the submission file: passenger IDs and predictions.
sub = dict(PassengerId=ids, Survived=y)

In [139]:
# Turn the mapping into a DataFrame; `sub` is rebound from a dict to a DataFrame.
sub = pd.DataFrame(sub)

In [140]:
# Display the submission table.
sub

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [141]:
# Write the submission to the Kaggle working directory; index=False keeps the row
# index out of the file so only PassengerId and Survived are written.
sub.to_csv('/kaggle/working/submission.csv', index=False)