### Kaggle Titanic Machine Learning Competition

In [123]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [124]:
train_df = pd.read_csv('titanic-data/train.csv')
test_df = pd.read_csv('titanic-data/test.csv')

In [125]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [126]:
train_df.drop(columns = ['PassengerId', 'Name', 'Ticket'], inplace = True)
test_df.drop(columns = ['PassengerId', 'Name', 'Ticket'], inplace = True)

In [127]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     204 non-null    object 
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [128]:
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
test_df['Embarked'] = test_df['Embarked'].fillna(test_df['Embarked'].mode()[0])

In [129]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     204 non-null    object 
 8   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [130]:
age_imputer = SimpleImputer(strategy = 'median')
train_df[['Age']] = age_imputer.fit_transform(train_df[['Age']])
test_df[['Age']] = age_imputer.transform(test_df[['Age']])

In [131]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     204 non-null    object 
 8   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [132]:
train_df['Cabin'].fillna('Unknown', inplace = True)
test_df['Cabin'].fillna('Unknown', inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Cabin'].fillna('Unknown', inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Cabin'].fillna('Unknown', inplace = True)


In [133]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     891 non-null    object 
 8   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [134]:
train_df['Cabin'].unique()

array(['Unknown', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [135]:
train_df['CabinDeck'] = train_df['Cabin'].str[0]
train_df['CabinDeck'] = train_df['CabinDeck'].replace('U', 'Unknown')

test_df['CabinDeck'] = test_df['Cabin'].str[0]
test_df['CabinDeck'] = test_df['CabinDeck'].replace('U', 'Unknown')

In [136]:
train_df.drop(columns = ['Cabin'], inplace = True)
test_df.drop(columns = ['Cabin'], inplace = True)

In [137]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinDeck
0,0,3,male,22.0,1,0,7.25,S,Unknown
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,Unknown
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,Unknown


In [138]:
train_df.rename(columns = {'SibSp': 'Siblings/Spouses', 'Parch': 'Parents/Children'}, inplace = True)
test_df.rename(columns = {'SibSp': 'Siblings/Spouses', 'Parch': 'Parents/Children'}, inplace = True)

In [139]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses,Parents/Children,Fare,Embarked,CabinDeck
0,0,3,male,22.0,1,0,7.25,S,Unknown
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,Unknown
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,Unknown


In [140]:
train_df['FamilySize'] = train_df['Siblings/Spouses'] + train_df['Parents/Children'] + 1
test_df['FamilySize'] = test_df['Siblings/Spouses'] + test_df['Parents/Children'] + 1

In [141]:
train_df['Sex'] = train_df['Sex'].map({'male': 0, 'female': 1})
test_df['Sex'] = test_df['Sex'].map({'male': 0, 'female': 1})

In [142]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses,Parents/Children,Fare,Embarked,CabinDeck,FamilySize
0,0,3,0,22.0,1,0,7.25,S,Unknown,2
1,1,1,1,38.0,1,0,71.2833,C,C,2
2,1,3,1,26.0,0,0,7.925,S,Unknown,1
3,1,1,1,35.0,1,0,53.1,S,C,2
4,0,3,0,35.0,0,0,8.05,S,Unknown,1


In [143]:
numerical_columns = ['Pclass', 'Sex', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare', 'FamilySize']
categorical_columns = ['CabinDeck', 'Embarked']

In [144]:
encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore').fit(train_df[categorical_columns])

In [145]:
encoder.categories_

[array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'Unknown'], dtype=object),
 array(['C', 'Q', 'S'], dtype=object)]

In [146]:
encoded_columns = encoder.get_feature_names_out(categorical_columns).tolist()

In [153]:
train_df[encoded_columns] = encoder.transform(train_df[categorical_columns])
test_df[encoded_columns] = encoder.transform(test_df[categorical_columns])

In [154]:
fare_imputer = SimpleImputer(strategy = 'median').fit(train_df[['Fare']])
test_df[['Fare']] = fare_imputer.transform(test_df[['Fare']])

In [155]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Pclass             418 non-null    int64  
 1   Sex                418 non-null    int64  
 2   Age                418 non-null    float64
 3   Siblings/Spouses   418 non-null    int64  
 4   Parents/Children   418 non-null    int64  
 5   Fare               418 non-null    float64
 6   Embarked           418 non-null    object 
 7   CabinDeck          418 non-null    object 
 8   FamilySize         418 non-null    int64  
 9   CabinDeck_A        418 non-null    float64
 10  CabinDeck_B        418 non-null    float64
 11  CabinDeck_C        418 non-null    float64
 12  CabinDeck_D        418 non-null    float64
 13  CabinDeck_E        418 non-null    float64
 14  CabinDeck_F        418 non-null    float64
 15  CabinDeck_G        418 non-null    float64
 16  CabinDeck_T        418 non

In [156]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Survived           891 non-null    int64  
 1   Pclass             891 non-null    int64  
 2   Sex                891 non-null    int64  
 3   Age                891 non-null    float64
 4   Siblings/Spouses   891 non-null    int64  
 5   Parents/Children   891 non-null    int64  
 6   Fare               891 non-null    float64
 7   Embarked           891 non-null    object 
 8   CabinDeck          891 non-null    object 
 9   FamilySize         891 non-null    int64  
 10  CabinDeck_A        891 non-null    float64
 11  CabinDeck_B        891 non-null    float64
 12  CabinDeck_C        891 non-null    float64
 13  CabinDeck_D        891 non-null    float64
 14  CabinDeck_E        891 non-null    float64
 15  CabinDeck_F        891 non-null    float64
 16  CabinDeck_G        891 non

In [157]:
input_columns = numerical_columns + encoded_columns
target_column = ['Survived']

In [158]:
input_columns

['Pclass',
 'Sex',
 'Age',
 'Siblings/Spouses',
 'Parents/Children',
 'Fare',
 'FamilySize',
 'CabinDeck_A',
 'CabinDeck_B',
 'CabinDeck_C',
 'CabinDeck_D',
 'CabinDeck_E',
 'CabinDeck_F',
 'CabinDeck_G',
 'CabinDeck_T',
 'CabinDeck_Unknown',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [159]:
target_column

['Survived']

In [161]:
X_train = train_df[input_columns]
y_train = train_df[target_column]
X_test = test_df[input_columns]

In [164]:
model = RandomForestClassifier(n_jobs = -1, 
                               random_state = 7,
                               n_estimators = 250,
                               max_depth = 5,
                               min_samples_split = 5,
                               min_samples_leaf = 2,
                               max_features = 'sqrt'
                               )

In [165]:
model.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


0,1,2
,n_estimators,250
,criterion,'gini'
,max_depth,5
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
from sklearn.model_selection import GridSearchCV
rf = RandomForestClassifier(random_state = 7)

parameter_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

grid = GridSearchCV(
    estimator = rf,
    param_grid = parameter_grid,
    cv = 3,              
    scoring = 'accuracy',
    n_jobs = -1,
    verbose = 3
)

grid.fit(X_train, y_train)

print("Best CV Accuracy:", grid.best_score_)
print("Best Parameters:", grid.best_params_)

Best Parameters: {'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}

In [167]:
model = RandomForestClassifier(n_jobs = -1, 
                               random_state = 7,
                               n_estimators = 250,
                               max_depth = 7,
                               min_samples_split = 8,
                               min_samples_leaf = 2,
                               max_features = 'sqrt'
                               )

In [168]:
model.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


0,1,2
,n_estimators,250
,criterion,'gini'
,max_depth,7
,min_samples_split,8
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [169]:
y_pred = model.predict(X_test)

In [172]:
test = pd.read_csv('titanic-data/test.csv')
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],  
    'Survived': y_pred.astype(int) 
})

In [174]:
submission.to_csv('titanic-data/submission.csv', index = False)