# ***Titanic - Machine Learning from Disaster***

# **Introduction**

#### Importing the Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings(action = 'ignore')
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


### Load Dataset

In [2]:
train_df= pd.read_csv("train.csv")
test_df= pd.read_csv("test.csv")
sub_df =pd.read_csv("gender_submission.csv")

In [3]:
#print 1st 5 rows of train
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_df.head(5)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
sub_df.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [6]:
sub_df.describe().round

<bound method DataFrame.round of        PassengerId    Survived
count   418.000000  418.000000
mean   1100.500000    0.363636
std     120.810458    0.481622
min     892.000000    0.000000
25%     996.250000    0.000000
50%    1100.500000    0.000000
75%    1204.750000    1.000000
max    1309.000000    1.000000>

In [7]:
sub_df.shape

(418, 2)

In [8]:
sub_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB


In [9]:
train_df.describe().round

<bound method DataFrame.round of        PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  >

In [10]:
train_df.shape

(891, 12)

In [11]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


#### Data Preprocessing of Train dataset

In [12]:
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [13]:
# Fill missing Age values with the median age
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())


In [14]:
# replace missing values in the "cabin" column with "missing"
train_df["Cabin"] = train_df["Cabin"].fillna("missing")


In [15]:
# find the most frequent non-missing value in the "Embarked" column
most_frequent = train_df["Embarked"].mode()[0]

# replace missing values in the "Embarked" column with the most frequent non-missing value
train_df["Embarked"] = train_df["Embarked"].fillna(most_frequent)


In [16]:
train_df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

Now, the train data has no missing value left, let's do the encoding the data  

In [17]:
# create dummy variables for the "Embarked" column
Embarked_dummy = pd.get_dummies(train_df['Embarked'])
# concatenate the dataframes into original dataframes.
train_df = pd.concat([train_df,Embarked_dummy], axis=1)
train_df.drop('Embarked', axis =1, inplace = True)

In [18]:
# create dummy variables for the "Embarked" column
Sex_dummy = pd.get_dummies(train_df['Sex'])
# concatenate the dataframes into original dataframes.
train_df = pd.concat([train_df,Sex_dummy], axis=1)
train_df.drop('Sex', axis =1, inplace = True)

In [19]:
# convert the "Age" column from float to integer
train_df["Age"] = train_df["Age"].astype(int)


In [20]:
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,C,Q,S,female,male
0,1,0,3,"Braund, Mr. Owen Harris",22,1,0,A/5 21171,7.25,missing,0,0,1,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38,1,0,PC 17599,71.2833,C85,1,0,0,1,0
2,3,1,3,"Heikkinen, Miss. Laina",26,0,0,STON/O2. 3101282,7.925,missing,0,0,1,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35,1,0,113803,53.1,C123,0,0,1,1,0
4,5,0,3,"Allen, Mr. William Henry",35,0,0,373450,8.05,missing,0,0,1,0,1


### Data Preprocessing of Test dataset

In [21]:
test_df.describe().round

<bound method DataFrame.round of        PassengerId      Pclass         Age       SibSp       Parch        Fare
count   418.000000  418.000000  332.000000  418.000000  418.000000  417.000000
mean   1100.500000    2.265550   30.272590    0.447368    0.392344   35.627188
std     120.810458    0.841838   14.181209    0.896760    0.981429   55.907576
min     892.000000    1.000000    0.170000    0.000000    0.000000    0.000000
25%     996.250000    1.000000   21.000000    0.000000    0.000000    7.895800
50%    1100.500000    3.000000   27.000000    0.000000    0.000000   14.454200
75%    1204.750000    3.000000   39.000000    1.000000    0.000000   31.500000
max    1309.000000    3.000000   76.000000    8.000000    9.000000  512.329200>

In [22]:
test_df.shape

(418, 11)

In [23]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [24]:
test_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [25]:
# Fill missing Age values with the median age
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())

In [26]:
# convert the "Age" column from float to integer
test_df["Age"] = test_df["Age"].astype(int)


In [27]:
# replace missing values in the "cabin" column with "missing"
test_df["Cabin"] = test_df["Cabin"].fillna("missing")

In [28]:
# find the most frequent non-missing value in the "Embarked" column
most_frequent = test_df["Embarked"].mode()[0]

# replace missing values in the "Embarked" column with the most frequent non-missing value
test_df["Embarked"] = test_df["Embarked"].fillna(most_frequent)


In [29]:
# find the mean value of the "Fare" column in the training dataset
fare_mean = train_df["Fare"].mean()
# replace the missing value in the "Fare" column of the test dataset with the mean value from the training dataset
test_df["Fare"].fillna(fare_mean, inplace=True)

In [30]:
test_df.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

Now, the test data has no missing value left, let's do the encoding the data



In [31]:
# create dummy variables for the "Embarked" column
Embarked_dummy = pd.get_dummies(test_df['Embarked'])
# concatenate the dataframes into original dataframes.
test_df = pd.concat([test_df,Embarked_dummy], axis=1)
test_df.drop('Embarked', axis =1, inplace = True)



In [32]:
# create dummy variables for the "Embarked" column
Sex_dummy = pd.get_dummies(test_df['Sex'])
# concatenate the dataframes into original dataframes.
test_df = pd.concat([test_df,Sex_dummy], axis=1)
test_df.drop('Sex', axis =1, inplace = True)

In [33]:
test_df.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,C,Q,S,female,male
0,892,3,"Kelly, Mr. James",34,0,0,330911,7.8292,missing,0,1,0,0,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47,1,0,363272,7.0,missing,0,0,1,1,0
2,894,2,"Myles, Mr. Thomas Francis",62,0,0,240276,9.6875,missing,0,1,0,0,1
3,895,3,"Wirz, Mr. Albert",27,0,0,315154,8.6625,missing,0,0,1,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22,1,1,3101298,12.2875,missing,0,0,1,1,0


In [34]:
# drop the "Name" column from the train_df and test_df dataframes
train_df = train_df.drop("Name", axis=1)
test_df = test_df.drop("Name", axis=1)


In [35]:

# extract the ticket prefix from the "Ticket" column and create a new "TicketPrefix" feature
train_df["TicketPrefix"] = train_df["Ticket"].str.extract('([A-Za-z]+)', expand=False)
test_df["TicketPrefix"] = test_df["Ticket"].str.extract('([A-Za-z]+)', expand=False)

# drop the "Ticket" column
train_df = train_df.drop("Ticket", axis=1)
test_df = test_df.drop("Ticket", axis=1)

# convert the "TicketPrefix" column into numerical values
ticket_prefix_mapping = {"A": 1, "W": 2, "F": 3, "L": 4, "S": 5, "P": 6, "C": 7, "M": 8}
train_df["TicketPrefix"] = train_df["TicketPrefix"].map(ticket_prefix_mapping).fillna(0)
test_df["TicketPrefix"] = test_df["TicketPrefix"].map(ticket_prefix_mapping).fillna(0)

In [36]:
# extract the deck level from the "Cabin" column and create a new "DeckLevel" feature
train_df["DeckLevel"] = train_df["Cabin"].str.extract('([A-Za-z])', expand=False)
test_df["DeckLevel"] = test_df["Cabin"].str.extract('([A-Za-z])', expand=False)

# drop the "Cabin" column
train_df = train_df.drop("Cabin", axis=1)
test_df = test_df.drop("Cabin", axis=1)

# convert the "DeckLevel" column into numerical values
deck_level_mapping = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
train_df["DeckLevel"] = train_df["DeckLevel"].map(deck_level_mapping).fillna(0)
test_df["DeckLevel"] = test_df["DeckLevel"].map(deck_level_mapping).fillna(0)

## Modal Development

In [37]:
# split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_df.drop("Survived", axis=1), train_df["Survived"], test_size=0.2, random_state=42)

In [38]:
# train a Random Forest classifier
rfc = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rfc.fit(X_train, y_train)

In [39]:
# make predictions on the validation set
y_pred = rfc.predict(X_val)


In [40]:
# evaluate the accuracy of the model
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7932960893854749


In [41]:
# make predictions on the test set
test_df["Survived"] = rfc.predict(test_df)

In [42]:
# save the predictions to a CSV file
test_df[["PassengerId", "Survived"]].to_csv("submission.csv", index=False)

# **Thanks**