In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_data = pd.read_csv("train.csv")

In [3]:
print('Train Data')
train_data.head()

Train Data


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
def impute_age(dataset):
    data = dataset.copy()
    for i in range(len(data)):
        if pd.isnull(data['Age'][i]):
            if data['Pclass'][i] == 1:
                data.loc[i, 'Age'] = data[data['Pclass'] == 1]['Age'].mean()
            elif data['Pclass'][i] == 2:
                data.loc[i, 'Age'] = data[data['Pclass'] == 2]['Age'].mean()
            elif data['Pclass'][i] == 3:
                data.loc[i, 'Age'] = data[data['Pclass'] == 3]['Age'].mean()
    return data

def impute_cols(dataset):
    data =  dataset.copy()
    data = impute_age(data)
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())
    data['Cabin'] = data['Cabin'].fillna('None')
    return data

def handle_categorical_cols(dataset):
    data = dataset.copy()
    dummy_sex = pd.get_dummies(data['Sex'], drop_first=True, dtype=int)
    dummy_embarked = pd.get_dummies(data['Embarked'], drop_first=True, dtype=int, prefix='Embarked', prefix_sep='_')
    data = pd.concat([data, dummy_sex, dummy_embarked], axis=1)
    data = data.drop(columns=['Sex', 'Embarked'])
    return data

def  modify_names(Name):
    Name = Name.split(',')
    Surname = Name[0].strip()
    return Surname

def handle_names(dataset):
    data = dataset.copy()
    data['Name'] = data['Name'].apply(modify_names)
    dummy_names = pd.get_dummies(data['Name'], dtype=int, prefix='Surname', prefix_sep='_')
    data = pd.concat([data, dummy_names], axis=1)
    data = data.drop('Name', axis=1)
    return data

def handle_cabins(dataset):
    data = dataset.copy()
    dummy_cabins = pd.get_dummies(data['Cabin'], dtype=int, prefix='Cabin', prefix_sep='_')
    data = pd.concat([data, dummy_cabins], axis=1)
    data = data.drop('Cabin', axis=1)
    return data

def feature_engineering(dataset):
    data = dataset.copy()
    data['FamilySize']=data['SibSp'] + data['Parch']
    data = data.drop(columns=['SibSp', 'Parch'])
    data = handle_names(data)
    data = handle_cabins(data)
    data = data.drop('Ticket', axis=1)
    return data

def pipeline(dataset):
    data = dataset.copy()
    data = impute_cols(data)
    data = handle_categorical_cols(data)
    data = feature_engineering(data)
    return data

In [6]:
train_data = pipeline(train_data)

In [7]:
train_data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Age            0
Fare           0
              ..
Cabin_F38      0
Cabin_F4       0
Cabin_G6       0
Cabin_None     0
Cabin_T        0
Length: 824, dtype: int64

In [8]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,Fare,male,Embarked_Q,Embarked_S,FamilySize,Surname_Abbing,...,Cabin_F E69,Cabin_F G63,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_None,Cabin_T
0,1,0,3,22.0,7.25,1,0,1,1,0,...,0,0,0,0,0,0,0,0,1,0
1,2,1,1,38.0,71.2833,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,3,26.0,7.925,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,4,1,1,35.0,53.1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,3,35.0,8.05,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [9]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Columns: 824 entries, PassengerId to Cabin_T
dtypes: float64(2), int32(818), int64(4)
memory usage: 2.8 MB


In [10]:
X = train_data.drop(columns=['PassengerId', 'Survived'])
y = train_data['Survived']
all_features = X.columns

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)


model = RandomForestClassifier()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy_score(y_pred, y_test)

0.7898305084745763

In [12]:
confusion_matrix(y_pred, y_test)

array([[159,  52],
       [ 10,  74]], dtype=int64)

In [13]:
from sklearn.model_selection import KFold, cross_val_score

k_folds = KFold(n_splits = 5)

scores = cross_val_score(model, X, y, cv = k_folds)
scores*100

array([82.12290503, 79.7752809 , 88.20224719, 79.21348315, 86.51685393])

In [14]:
avg_score = np.average(scores*100)
avg_score

83.16615403929445

In [15]:
model.fit(X,y)

#  -----------------------------------------------------------------------------------------

# Prediction for test data and Submission

In [16]:
test_data = pd.read_csv("test.csv")

In [17]:
print('Test Data')
test_data.head()

Test Data


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [18]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [19]:
test_data = pipeline(test_data)

In [20]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Age,Fare,male,Embarked_Q,Embarked_S,FamilySize,Surname_Abbott,Surname_Abelseth,...,Cabin_E60,Cabin_F,Cabin_F E46,Cabin_F E57,Cabin_F G63,Cabin_F2,Cabin_F33,Cabin_F4,Cabin_G6,Cabin_None
0,892,3,34.5,7.8292,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,893,3,47.0,7.0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,894,2,62.0,9.6875,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,895,3,27.0,8.6625,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,896,3,22.0,12.2875,0,0,1,2,0,0,...,0,0,0,0,0,0,0,0,0,1


In [21]:
test_data.isnull().sum()

PassengerId    0
Pclass         0
Age            0
Fare           0
male           0
              ..
Cabin_F2       0
Cabin_F33      0
Cabin_F4       0
Cabin_G6       0
Cabin_None     0
Length: 437, dtype: int64

In [22]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Columns: 437 entries, PassengerId to Cabin_None
dtypes: float64(2), int32(432), int64(3)
memory usage: 721.8 KB


In [23]:
test_X = test_data.drop(columns=['PassengerId'])
test_X

Unnamed: 0,Pclass,Age,Fare,male,Embarked_Q,Embarked_S,FamilySize,Surname_Abbott,Surname_Abelseth,Surname_Abrahamsson,...,Cabin_E60,Cabin_F,Cabin_F E46,Cabin_F E57,Cabin_F G63,Cabin_F2,Cabin_F33,Cabin_F4,Cabin_G6,Cabin_None
0,3,34.500000,7.8292,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,3,47.000000,7.0000,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2,62.000000,9.6875,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,3,27.000000,8.6625,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,3,22.000000,12.2875,0,0,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,24.027945,8.0500,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
414,1,39.000000,108.9000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
415,3,38.500000,7.2500,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
416,3,24.027945,8.0500,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [24]:
missing_features = [feature for feature in all_features if feature not in test_X.columns]
missing_data = pd.DataFrame(0, index=test_X.index, columns=missing_features)
test_X = pd.concat([test_X, missing_data], axis=1)

for feature in test_X.columns:
    if feature not in all_features:
        test_X.drop(feature, axis=1, inplace=True)

In [25]:
List = []
for feature in all_features:
    index = test_X.columns.get_loc(feature)
    List.append(index)
test_X = test_X.iloc[:, List]
test_X

Unnamed: 0,Pclass,Age,Fare,male,Embarked_Q,Embarked_S,FamilySize,Surname_Abbing,Surname_Abbott,Surname_Abelson,...,Cabin_F E69,Cabin_F G63,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_None,Cabin_T
0,3,34.500000,7.8292,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,3,47.000000,7.0000,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2,62.000000,9.6875,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,3,27.000000,8.6625,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,3,22.000000,12.2875,0,0,1,2,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,24.027945,8.0500,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
414,1,39.000000,108.9000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
415,3,38.500000,7.2500,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
416,3,24.027945,8.0500,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [26]:
test_y = model.predict(test_X)

In [27]:
test_data['Survived'] = test_y

In [28]:
submission_data = test_data[['PassengerId', 'Survived']]
submission_data

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [29]:
submission_data.to_csv('submission_random_forest.csv', index=False)