In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px 
from sklearn.impute import SimpleImputer



import os
# Recursively list every file available under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Load the training split (titanic_df) and the test split (test_df).
titanic_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv')
)

In [3]:
# Encode Sex as a binary column on both splits: female -> 0, male -> 1.
for frame in (titanic_df, test_df):
    frame['Sex_bin'] = frame['Sex'].map({'female':0, 'male':1})

In [4]:
# The raw Sex column is redundant once Sex_bin exists; remove it from both splits.
for frame in (titanic_df, test_df):
    del frame['Sex']

In [5]:
# name_bin is 1 when the passenger's Name contains an opening parenthesis, else 0
# (missing names count as 0 via na=False).
# The regex is written as a raw string: '\(' inside a plain string literal is an
# invalid escape sequence, which recent Python versions flag with a warning.
titanic_df['name_bin'] = titanic_df['Name'].str.contains(r'\(', na=False).astype(int)
test_df['name_bin'] = test_df['Name'].str.contains(r'\(', na=False).astype(int)

In [6]:
# Name has been reduced to the name_bin flag; remove the raw column from both splits.
for frame in (titanic_df, test_df):
    del frame['Name']

# Imputing missing Age values

In [7]:
# Learn the median Age from the training split only, then fill missing ages
# in both splits with that same training median.
imputer = SimpleImputer(strategy='median')
imputer.fit(titanic_df[['Age']])
titanic_df['Age'] = imputer.transform(titanic_df[['Age']])
test_df['Age'] = imputer.transform(test_df[['Age']])

In [8]:
# cabin_bin is a missing-value indicator: 1 when Cabin is NaN, 0 when a cabin is recorded.
for frame in (titanic_df, test_df):
    cabin_missing = frame['Cabin'].isna()
    frame['cabin_bin'] = cabin_missing.astype(int)

In [9]:
# Deck is the first character of the Cabin string; rows with no Cabin get 'Unknown'.
for frame in (titanic_df, test_df):
    deck_letter = frame['Cabin'].str[0]
    frame['Deck'] = deck_letter.fillna('Unknown')

In [10]:
# Cabin has been distilled into cabin_bin and Deck; drop the raw column from both splits.
titanic_df, test_df = (
    frame.drop(columns=['Cabin'])
    for frame in (titanic_df, test_df)
)

In [11]:
# Remove training rows whose Embarked value is missing; only the training frame
# is filtered here, test_df is left untouched.
titanic_df = titanic_df.dropna(subset=['Embarked'])

In [12]:
titanic_df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Sex_bin        0
name_bin       0
cabin_bin      0
Deck           0
dtype: int64

In [13]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Age          889 non-null    float64
 4   SibSp        889 non-null    int64  
 5   Parch        889 non-null    int64  
 6   Ticket       889 non-null    object 
 7   Fare         889 non-null    float64
 8   Embarked     889 non-null    object 
 9   Sex_bin      889 non-null    int64  
 10  name_bin     889 non-null    int64  
 11  cabin_bin    889 non-null    int64  
 12  Deck         889 non-null    object 
dtypes: float64(2), int64(8), object(3)
memory usage: 97.2+ KB


In [14]:
titanic_df.Ticket.value_counts()

Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 680, dtype: int64

In [15]:
titanic_df[['SibSp','Parch','Survived', 'Pclass', 'Fare', 'Age',]].corr()

Unnamed: 0,SibSp,Parch,Survived,Pclass,Fare,Age
SibSp,1.0,0.414542,-0.03404,0.081656,0.160887,-0.232543
Parch,0.414542,1.0,0.083151,0.016824,0.217532,-0.171485
Survived,-0.03404,0.083151,1.0,-0.335549,0.25529,-0.069822
Pclass,0.081656,0.016824,-0.335549,1.0,-0.548193,-0.336512
Fare,0.160887,0.217532,0.25529,-0.548193,1.0,0.093707
Age,-0.232543,-0.171485,-0.069822,-0.336512,0.093707,1.0


In [16]:
px.histogram(titanic_df, x='SibSp', color='Survived')

In [17]:
titanic_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Embarked', 'Sex_bin', 'name_bin', 'cabin_bin', 'Deck'],
      dtype='object')

In [18]:
# Candidate feature columns: every column of titanic_df except the target.
input_cols = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Embarked', 'Sex_bin', 'name_bin', 'cabin_bin', 'Deck']
# Column the model will learn to predict.
target_col = 'Survived'

In [19]:
# Partition the candidate features by dtype: int64/float64 columns are numeric,
# object columns are categorical.
feature_frame = titanic_df[input_cols]
numeric_cols = list(feature_frame.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(feature_frame.select_dtypes(include='object').columns)

In [20]:
titanic_df[numeric_cols +['Survived']].corr()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_bin,name_bin,cabin_bin,Survived
PassengerId,1.0,-0.03533,0.031319,-0.057686,-0.001657,0.012703,0.043136,0.01658,-0.020045,-0.005028
Pclass,-0.03533,1.0,-0.336512,0.081656,0.016824,-0.548193,0.127741,-0.162846,0.723815,-0.335549
Age,0.031319,-0.336512,1.0,-0.232543,-0.171485,0.093707,0.086506,0.159285,-0.23555,-0.069822
SibSp,-0.057686,0.081656,-0.232543,1.0,0.414542,0.160887,-0.116348,0.043327,0.038657,-0.03404
Parch,-0.001657,0.016824,-0.171485,0.414542,1.0,0.217532,-0.247508,0.208265,-0.039101,0.083151
Fare,0.012703,-0.548193,0.093707,0.160887,0.217532,1.0,-0.179958,0.088768,-0.480425,0.25529
Sex_bin,0.043136,0.127741,0.086506,-0.116348,-0.247508,-0.179958,1.0,-0.502852,0.135589,-0.541585
name_bin,0.01658,-0.162846,0.159285,0.043327,0.208265,0.088768,-0.502852,1.0,-0.137273,0.345536
cabin_bin,-0.020045,0.723815,-0.23555,0.038657,-0.039101,-0.480425,0.135589,-0.137273,1.0,-0.313435
Survived,-0.005028,-0.335549,-0.069822,-0.03404,0.083151,0.25529,-0.541585,0.345536,-0.313435,1.0


In [21]:
px.histogram(titanic_df, x='Fare', color='Survived')

In [22]:
titanic_df[numeric_cols].skew()

PassengerId    0.000000
Pclass        -0.636998
Age            0.508010
SibSp          3.691058
Parch          2.745160
Fare           4.801440
Sex_bin       -0.625625
name_bin       1.860736
cabin_bin     -1.304132
dtype: float64

# Dealing with the Fare column

In [23]:
titanic_df['Fare'].describe()

count    889.000000
mean      32.096681
std       49.697504
min        0.000000
25%        7.895800
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [24]:
# Median-impute Fare: the median is learned from the training split and applied
# to both splits. The training result is now assigned back (it was previously
# computed and thrown away), so both frames go through the same imputation step,
# mirroring how Age is handled.
imputer = SimpleImputer(strategy='median')
titanic_df['Fare'] = imputer.fit_transform(titanic_df[['Fare']])
test_df['Fare'] = imputer.transform(test_df[['Fare']])

In [25]:
test_df.isna().sum()

PassengerId    0
Pclass         0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Sex_bin        0
name_bin       0
cabin_bin      0
Deck           0
dtype: int64

In [26]:
titanic_df[numeric_cols].skew()

PassengerId    0.000000
Pclass        -0.636998
Age            0.508010
SibSp          3.691058
Parch          2.745160
Fare           4.801440
Sex_bin       -0.625625
name_bin       1.860736
cabin_bin     -1.304132
dtype: float64

In [27]:
# Apply log1p (log(1 + x)) to Fare in both splits to reduce its strong right skew;
# log1p is used because Fare can be exactly 0.
for frame in (titanic_df, test_df):
    frame['Fare'] = np.log1p(frame['Fare'])

In [28]:
test_df[numeric_cols].skew()

PassengerId    0.000000
Pclass        -0.534170
Age            0.618776
SibSp          4.168337
Parch          4.654462
Fare           0.864955
Sex_bin       -0.568991
name_bin       1.614646
cabin_bin     -1.373031
dtype: float64

In [29]:
test_df['Fare'].describe()

count    418.000000
mean       3.015421
std        0.966932
min        0.000000
25%        2.185579
50%        2.737881
75%        3.480373
max        6.240917
Name: Fare, dtype: float64

In [30]:
px.histogram(test_df, x='Fare')

# SibSp & Parch

In [31]:
titanic_df[['SibSp','Parch','Survived']].corr()

Unnamed: 0,SibSp,Parch,Survived
SibSp,1.0,0.414542,-0.03404
Parch,0.414542,1.0,0.083151
Survived,-0.03404,0.083151,1.0


In [32]:
titanic_df[['SibSp','Parch','Survived']].skew()

SibSp       3.691058
Parch       2.745160
Survived    0.484568
dtype: float64

In [33]:
px.histogram(titanic_df, x='SibSp', color='Survived')

In [34]:
px.histogram(titanic_df, x='Parch', color='Survived')

In [35]:
titanic_df['Parch'].value_counts(normalize=True)

Parch
0    0.760405
1    0.132733
2    0.089989
5    0.005624
3    0.005624
4    0.004499
6    0.001125
Name: proportion, dtype: float64

In [36]:
titanic_df.groupby('Parch')['Survived'].sum()

Parch
0    231
1     65
2     40
3      3
4      0
5      1
6      0
Name: Survived, dtype: int64

In [37]:
titanic_df['Parch'].describe()

count    889.000000
mean       0.382452
std        0.806761
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        6.000000
Name: Parch, dtype: float64

In [38]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Embarked,Sex_bin,name_bin,cabin_bin,Deck
0,1,0,3,22.0,1,0,A/5 21171,2.110213,S,1,0,1,Unknown
1,2,1,1,38.0,1,0,PC 17599,4.280593,C,0,1,0,C
2,3,1,3,26.0,0,0,STON/O2. 3101282,2.188856,S,0,0,1,Unknown
3,4,1,1,35.0,1,0,113803,3.990834,S,0,1,0,C
4,5,0,3,35.0,0,0,373450,2.202765,S,1,0,1,Unknown


In [39]:
# is_alone is 1 when both the SibSp and Parch counts are zero, otherwise 0.
for frame in (titanic_df, test_df):
    no_sibsp = frame['SibSp'] == 0
    no_parch = frame['Parch'] == 0
    frame['is_alone'] = (no_sibsp & no_parch).astype(int)

In [40]:
# SibSp and Parch are summarised by is_alone; drop the raw counts from both splits.
titanic_df, test_df = (
    frame.drop(columns=['SibSp', 'Parch'])
    for frame in (titanic_df, test_df)
)

In [41]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Age,Ticket,Fare,Embarked,Sex_bin,name_bin,cabin_bin,Deck,is_alone
0,892,3,34.5,330911,2.178064,Q,1,0,1,Unknown,1
1,893,3,47.0,363272,2.079442,S,0,1,1,Unknown,0
2,894,2,62.0,240276,2.369075,Q,1,0,1,Unknown,1
3,895,3,27.0,315154,2.268252,S,1,0,1,Unknown,1
4,896,3,22.0,3101298,2.586824,S,0,1,1,Unknown,0


In [42]:
titanic_df[['is_alone', 'Survived']].corr()

Unnamed: 0,is_alone,Survived
is_alone,1.0,-0.206207
Survived,-0.206207,1.0


In [43]:
titanic_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'Ticket', 'Fare',
       'Embarked', 'Sex_bin', 'name_bin', 'cabin_bin', 'Deck', 'is_alone'],
      dtype='object')

In [44]:
# Candidate feature columns after SibSp/Parch were replaced by is_alone.
input_cols = ['PassengerId','Pclass', 'Age', 'Ticket', 'Fare',
       'Embarked', 'Sex_bin', 'name_bin', 'cabin_bin', 'Deck', 'is_alone']
# Use the same `target_col` name as every other cell (the previous `target_cols`
# spelling created a second, never-read variable).
target_col = 'Survived'

In [45]:
# Recompute the numeric / categorical column lists for the updated feature set.
current_features = titanic_df[input_cols]
numeric_cols = list(current_features.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(current_features.select_dtypes(include='object').columns)

In [46]:
cat_cols

['Ticket', 'Embarked', 'Deck']

# Categorical Columns

In [47]:
titanic_df['Ticket'].value_counts()

Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 680, dtype: int64

In [48]:
titanic_df['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [49]:
titanic_df.groupby('Embarked')['Survived'].sum()

Embarked
C     93
Q     30
S    217
Name: Survived, dtype: int64

In [50]:
titanic_df.groupby('Embarked')['Survived'].mean()

Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64

In [51]:
px.histogram(titanic_df, x='Embarked', color='Pclass')

In [52]:
titanic_df['Deck'].nunique()

9

In [53]:
titanic_df['Deck'].value_counts()

Deck
Unknown    687
C           59
B           45
D           33
E           32
A           15
F           13
G            4
T            1
Name: count, dtype: int64

In [54]:
px.histogram(titanic_df, x='Deck', color='Pclass')

In [55]:
titanic_df.groupby('Deck')['Survived'].mean()

Deck
A          0.466667
B          0.733333
C          0.593220
D          0.757576
E          0.750000
F          0.615385
G          0.500000
T          0.000000
Unknown    0.299854
Name: Survived, dtype: float64

In [56]:
from sklearn.preprocessing import OneHotEncoder

In [57]:
# Fit a dense one-hot encoder on the training Embarked values and capture the
# generated column names (displayed as the cell output).
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(titanic_df[['Embarked']])
encoded_cols = encoder.get_feature_names_out(['Embarked'])
encoded_cols

array(['Embarked_C', 'Embarked_Q', 'Embarked_S'], dtype=object)

In [58]:
# Add the Embarked one-hot columns to both splits using the encoder fitted on training data.
for frame in (titanic_df, test_df):
    frame[encoded_cols] = encoder.transform(frame[['Embarked']])

In [59]:
# The raw Embarked column is now represented by its one-hot columns; drop it from both splits.
titanic_df, test_df = (
    frame.drop(columns=['Embarked'])
    for frame in (titanic_df, test_df)
)

In [60]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,Ticket,Fare,Sex_bin,name_bin,cabin_bin,Deck,is_alone,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,A/5 21171,2.110213,1,0,1,Unknown,0,0.0,0.0,1.0
1,2,1,1,38.0,PC 17599,4.280593,0,1,0,C,0,1.0,0.0,0.0
2,3,1,3,26.0,STON/O2. 3101282,2.188856,0,0,1,Unknown,1,0.0,0.0,1.0
3,4,1,1,35.0,113803,3.990834,0,1,0,C,0,0.0,0.0,1.0
4,5,0,3,35.0,373450,2.202765,1,0,1,Unknown,1,0.0,0.0,1.0


In [61]:
titanic_df[['Embarked_C', 'Embarked_Q', 'Embarked_S', 'Survived']].corr()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S,Survived
Embarked_C,1.0,-0.148646,-0.782613,0.169966
Embarked_Q,-0.148646,1.0,-0.499261,0.004536
Embarked_S,-0.782613,-0.499261,1.0,-0.151777
Survived,0.169966,0.004536,-0.151777,1.0


In [62]:
titanic_df[['Embarked_C', 'Embarked_Q', 'Embarked_S']].skew()

Embarked_C    1.591610
Embarked_Q    2.944406
Embarked_S   -1.006192
dtype: float64

In [63]:
titanic_df['Deck'].value_counts(normalize=True)

Deck
Unknown    0.772778
C          0.066367
B          0.050619
D          0.037120
E          0.035996
A          0.016873
F          0.014623
G          0.004499
T          0.001125
Name: proportion, dtype: float64

In [64]:
titanic_df.groupby('Deck')['Survived'].mean()

Deck
A          0.466667
B          0.733333
C          0.593220
D          0.757576
E          0.750000
F          0.615385
G          0.500000
T          0.000000
Unknown    0.299854
Name: Survived, dtype: float64

In [65]:
px.histogram(titanic_df, x='Deck', color='Survived')

In [66]:
# Fit a fresh dense one-hot encoder on the training Deck values and capture the
# generated column names (displayed as the cell output).
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(titanic_df[['Deck']])
encoded_cols = encoder.get_feature_names_out(['Deck'])
encoded_cols

array(['Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F',
       'Deck_G', 'Deck_T', 'Deck_Unknown'], dtype=object)

In [67]:
# Add the Deck one-hot columns to both splits using the encoder fitted on training data.
for frame in (titanic_df, test_df):
    frame[encoded_cols] = encoder.transform(frame[['Deck']])

In [68]:
# The raw Deck column is now represented by its one-hot columns; drop it from both splits.
titanic_df, test_df = (
    frame.drop(columns=['Deck'])
    for frame in (titanic_df, test_df)
)

In [69]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Age,Ticket,Fare,Sex_bin,name_bin,cabin_bin,is_alone,Embarked_C,...,Embarked_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown
0,892,3,34.5,330911,2.178064,1,0,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,893,3,47.0,363272,2.079442,0,1,1,0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,894,2,62.0,240276,2.369075,1,0,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,895,3,27.0,315154,2.268252,1,0,1,1,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,896,3,22.0,3101298,2.586824,0,1,1,0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [70]:
titanic_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'Ticket', 'Fare', 'Sex_bin',
       'name_bin', 'cabin_bin', 'is_alone', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E',
       'Deck_F', 'Deck_G', 'Deck_T', 'Deck_Unknown'],
      dtype='object')

In [71]:
# Feature list after one-hot encoding Embarked and Deck.
# PassengerId and Ticket are present in titanic_df but are left out of the features here.
input_cols = ['Pclass', 'Age','Fare', 'Sex_bin',
       'name_bin', 'cabin_bin', 'is_alone', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E',
       'Deck_F', 'Deck_G', 'Deck_T', 'Deck_Unknown']
# Column the model will learn to predict.
target_col = 'Survived'

In [72]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,Ticket,Fare,Sex_bin,name_bin,cabin_bin,is_alone,...,Embarked_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown
0,1,0,3,22.0,A/5 21171,2.110213,1,0,1,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,1,1,38.0,PC 17599,4.280593,0,1,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,3,26.0,STON/O2. 3101282,2.188856,0,0,1,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,1,1,35.0,113803,3.990834,0,1,0,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0,3,35.0,373450,2.202765,1,0,1,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [73]:
# Fit a fresh one-hot encoder on the training Pclass values and capture the
# generated column names (displayed as the cell output).
# The new instance is bound to `encoder`; it was previously bound to a misspelled
# `encode`, so the cell silently re-fitted the encoder left over from the Deck step.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(titanic_df[['Pclass']])
encoded_cols = encoder.get_feature_names_out(['Pclass'])
encoded_cols

array(['Pclass_1', 'Pclass_2', 'Pclass_3'], dtype=object)

In [74]:
# Add the Pclass one-hot columns to the training frame; the original Pclass column is kept.
titanic_df[encoded_cols] = encoder.transform(titanic_df[['Pclass']])

In [75]:
# Add the Pclass one-hot columns to the test frame using the encoder fitted on training data.
test_df[encoded_cols] = encoder.transform(test_df[['Pclass']])

In [76]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Age,Ticket,Fare,Sex_bin,name_bin,cabin_bin,is_alone,Embarked_C,...,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown,Pclass_1,Pclass_2,Pclass_3
0,892,3,34.5,330911,2.178064,1,0,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,893,3,47.0,363272,2.079442,0,1,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,894,2,62.0,240276,2.369075,1,0,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,895,3,27.0,315154,2.268252,1,0,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,896,3,22.0,3101298,2.586824,0,1,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [77]:
# Final feature list: the integer Pclass column is replaced by its one-hot columns.
input_cols = ['Pclass_1','Pclass_2','Pclass_3', 'Age','Fare', 'Sex_bin',
       'name_bin', 'cabin_bin', 'is_alone', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E',
       'Deck_F', 'Deck_G', 'Deck_T', 'Deck_Unknown']
# Column the model will learn to predict.
target_col = 'Survived'

# Scaling the Age Column

In [78]:
from sklearn.preprocessing import MinMaxScaler

In [79]:
scaler = MinMaxScaler()

In [80]:
# Learn the Age min/max from the training split only.
scaler.fit(titanic_df[['Age']])

In [81]:
scaler.data_max_

array([80.])

In [82]:
titanic_df.Age

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: Age, Length: 889, dtype: float64

In [83]:
# Rescale training Age with the fitted min/max.
# NOTE: this overwrites Age in place, so re-running the cell without re-running
# the earlier cells would rescale already-scaled values.
titanic_df['Age'] = scaler.transform(titanic_df[['Age']])

In [84]:
titanic_df.Age

0      0.271174
1      0.472229
2      0.321438
3      0.434531
4      0.434531
         ...   
886    0.334004
887    0.233476
888    0.346569
889    0.321438
890    0.396833
Name: Age, Length: 889, dtype: float64

In [85]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 25 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   889 non-null    int64  
 1   Survived      889 non-null    int64  
 2   Pclass        889 non-null    int64  
 3   Age           889 non-null    float64
 4   Ticket        889 non-null    object 
 5   Fare          889 non-null    float64
 6   Sex_bin       889 non-null    int64  
 7   name_bin      889 non-null    int64  
 8   cabin_bin     889 non-null    int64  
 9   is_alone      889 non-null    int64  
 10  Embarked_C    889 non-null    float64
 11  Embarked_Q    889 non-null    float64
 12  Embarked_S    889 non-null    float64
 13  Deck_A        889 non-null    float64
 14  Deck_B        889 non-null    float64
 15  Deck_C        889 non-null    float64
 16  Deck_D        889 non-null    float64
 17  Deck_E        889 non-null    float64
 18  Deck_F        889 non-null    float

In [86]:
# Rescale test Age with the min/max learned from the training split (no refit on test data).
test_df['Age'] = scaler.transform(test_df[['Age']])

In [87]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Age,Ticket,Fare,Sex_bin,name_bin,cabin_bin,is_alone,Embarked_C,...,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown,Pclass_1,Pclass_2,Pclass_3
0,892,3,0.428248,330911,2.178064,1,0,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,893,3,0.585323,363272,2.079442,0,1,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,894,2,0.773813,240276,2.369075,1,0,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,895,3,0.334004,315154,2.268252,1,0,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,896,3,0.271174,3101298,2.586824,0,1,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [88]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Age,Ticket,Fare,Sex_bin,name_bin,cabin_bin,is_alone,Embarked_C,...,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown,Pclass_1,Pclass_2,Pclass_3
0,892,3,0.428248,330911,2.178064,1,0,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,893,3,0.585323,363272,2.079442,0,1,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,894,2,0.773813,240276,2.369075,1,0,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,895,3,0.334004,315154,2.268252,1,0,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,896,3,0.271174,3101298,2.586824,0,1,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


# Train Model

In [89]:
# Keep the test PassengerId values aside; they are needed for the submission file
# but are not part of the model features.
test_id = test_df['PassengerId']
test_id

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [90]:
test_df.isna().sum()

PassengerId     0
Pclass          0
Age             0
Ticket          0
Fare            0
Sex_bin         0
name_bin        0
cabin_bin       0
is_alone        0
Embarked_C      0
Embarked_Q      0
Embarked_S      0
Deck_A          0
Deck_B          0
Deck_C          0
Deck_D          0
Deck_E          0
Deck_F          0
Deck_G          0
Deck_T          0
Deck_Unknown    0
Pclass_1        0
Pclass_2        0
Pclass_3        0
dtype: int64

In [91]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [92]:
titanic_df[input_cols]

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Age,Fare,Sex_bin,name_bin,cabin_bin,is_alone,Embarked_C,...,Embarked_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown
0,0.0,0.0,1.0,0.271174,2.110213,1,0,1,0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.472229,4.280593,0,1,0,0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.321438,2.188856,0,0,1,1,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.434531,3.990834,0,1,0,0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.434531,2.202765,1,0,1,1,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,1.0,0.0,0.334004,2.639057,1,0,1,1,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
887,1.0,0.0,0.0,0.233476,3.433987,0,0,0,1,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
888,0.0,0.0,1.0,0.346569,3.196630,0,0,1,0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
889,1.0,0.0,0.0,0.321438,3.433987,1,0,0,1,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
# Fit a logistic regression on the full training frame using the final feature list.
X_train = titanic_df[input_cols]
y_train = titanic_df[target_col]
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)

In [94]:
# Estimate accuracy with 5-fold cross-validation and report the mean over the folds.
fold_scores = cross_val_score(model, titanic_df[input_cols], titanic_df[target_col], cv=5)
cv_score = fold_scores.mean()
print("Cross-validation accuracy:", cv_score)

Cross-validation accuracy: 0.8042722021202311


In [95]:
# Predict the target for every test row from the same feature columns used in training.
test_features = test_df[input_cols]
test_predict = model.predict(test_features)
test_predict

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [96]:
# Pair each test PassengerId with its predicted label and write the result as CSV
# (without the DataFrame index).
submission = pd.DataFrame({'PassengerId': test_id}).assign(Survived=test_predict)
submission.to_csv('/kaggle/working/2nd_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
