# AHUMUZA ARIYO NIMUSIIMA
# 21/U/1657
# 2100701657

In [33]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import category_encoders as ce

In [34]:
df = pd.read_csv("titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [35]:
df.drop(['Name', 'Ticket', 'PassengerId'], 
       axis = 1, inplace = True)

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     204 non-null    object 
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [37]:
df['Cabin'].nunique()

147

In [38]:
df['Embarked'].nunique()

3

In [39]:
df['Sex'].nunique()

2

In [40]:
missing_values = df.isnull().sum()
print(missing_values)

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64


Since the Cabin column has some missing values and yet it will still be crucial to the ML model we are creating, let us create a category for Passengers with a missing value in the Cabin column

To handle the age, since it is not very significant we shall find the mean value of all the ages and fill the missing values with that

For embarked however, we shall delete the rows with missing values since they are insignificant (2)

In [41]:
df['Cabin'].fillna('Unknown', inplace=True)
average_age = df['Age'].mean()
df['Age'].fillna(average_age, inplace=True)
df.dropna(subset=['Embarked'], inplace=True)

In [42]:
missing_values = df.isnull().sum()
print(missing_values)

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64


Since there are multiple categories for the Cabin column, we shall use hashing encoding

In [43]:
hashing_encoder = ce.HashingEncoder(n_components=5)
hashed_features = hashing_encoder.fit_transform(df['Cabin'])
df = pd.concat([df.drop(columns=['Cabin']), hashed_features], axis=1)

In [44]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,col_0,col_1,col_2,col_3,col_4
0,0,3,male,22.0,1,0,7.25,S,0,0,0,0,1
1,1,1,female,38.0,1,0,71.2833,C,0,0,0,0,1
2,1,3,female,26.0,0,0,7.925,S,0,0,0,0,1
3,1,1,female,35.0,1,0,53.1,S,1,0,0,0,0
4,0,3,male,35.0,0,0,8.05,S,0,0,0,0,1


In [45]:
X = df.iloc[:, 1:-1].values
y = df.iloc[:, 0].values

In [46]:
print(X)

[[3 'male' 22.0 ... 0 0 0]
 [1 'female' 38.0 ... 0 0 0]
 [3 'female' 26.0 ... 0 0 0]
 ...
 [3 'female' 29.69911764705882 ... 0 0 0]
 [1 'male' 26.0 ... 0 1 0]
 [3 'male' 32.0 ... 0 0 0]]


In [47]:
print(y)

[0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1
 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0
 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0
 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 1 0 0
 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1 1
 0 1 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 0 0
 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0 0
 1 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 1 1 1
 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1
 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1
 1 1 1 1 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0
 0 1 1 0 1 0 0 1 0 0 0 0 

Encoding Categorical Data (Gender and Embarked column)

In [48]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
X[:, 1] = label_encoder.fit_transform(X[:, 1])

In [51]:
ct = ColumnTransformer(transformers=[('encoder2', OneHotEncoder(), [6])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [52]:
print(X)

[[0.0 0.0 1.0 ... 0 0 0]
 [1.0 0.0 0.0 ... 0 0 0]
 [0.0 0.0 1.0 ... 0 0 0]
 ...
 [0.0 0.0 1.0 ... 0 0 0]
 [1.0 0.0 0.0 ... 0 1 0]
 [0.0 1.0 0.0 ... 0 0 0]]


In [53]:
df_X = pd.DataFrame(X)
df_X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,0.0,1.0,3,1,22.0,1,0,7.25,0,0,0,0
1,1.0,0.0,0.0,1,0,38.0,1,0,71.2833,0,0,0,0
2,0.0,0.0,1.0,3,0,26.0,0,0,7.925,0,0,0,0
3,0.0,0.0,1.0,1,0,35.0,1,0,53.1,1,0,0,0
4,0.0,0.0,1.0,3,1,35.0,0,0,8.05,0,0,0,0


In [58]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

Feature Scaling

In [59]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
columns_to_scale = [5, 8]
sc = StandardScaler()

# Fit the scaler on the training data and transform the specified columns
X_train[:, columns_to_scale] = sc.fit_transform(X_train[:, columns_to_scale])

# Transform the same columns in the test data using the scaler fitted on the training data
X_test[:, columns_to_scale] = sc.transform(X_test[:, columns_to_scale])

Using Random Forest Regression to build a model

In [76]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

In [77]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[84 25]
 [16 53]]


0.7696629213483146