<a href="https://colab.research.google.com/github/88kHw88/Titanic-ML/blob/main/Titanic_ML_advanced_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [33]:
df = pd.read_csv('/content/drive/MyDrive/Kaggle DataSets/Datasets/Titanic/train.csv')
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# **Descriptive analysis and dealing with missing values**

In [34]:
#types of columns in this dataset
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [35]:
#shape of the model
df.shape
print(f'The dataset has {df.shape[0]} rows and {df.shape[1]} columns')

The dataset has 891 rows and 12 columns


In [36]:
#count missing values per column
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [37]:
#Show missing values and count
#df['Cabin'].value_counts(dropna=False)
#df['Embarked'].value_counts(dropna=False)
df['Age'].value_counts(dropna=False)

NaN      177
24.00     30
22.00     27
18.00     26
28.00     25
        ... 
36.50      1
55.50      1
66.00      1
23.50      1
0.42       1
Name: Age, Length: 89, dtype: int64

In [38]:
#replace missing values
df['Cabin'] = df['Cabin'].replace({np.nan: 'Unknown'})
df['Embarked'] = df['Embarked'].replace({np.nan: 'Unknown'})

#Impute column 'Age' with median value
df['Age'] = df['Age'].fillna(df['Age'].median()) #let op .fillna(df['Age'].median())

In [39]:
#missing values are filled 
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [40]:
#Copying into a new dataset for advanced analysis
df2 = df.copy()
df2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Unknown,S


# **Determine X (features) and y (predictor)**

In [41]:
#determine y (predictor) 
y = df2['Survived']

#and X (features)
X =  df2.drop(['Survived'], axis =1 )

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [42]:
#Test show X_train
print(X_train)
print(X_test)

     PassengerId  Pclass  ...    Cabin Embarked
120          121       2  ...  Unknown        S
296          297       3  ...  Unknown        C
1              2       1  ...      C85        C
38            39       3  ...  Unknown        S
26            27       3  ...  Unknown        C
..           ...     ...  ...      ...      ...
398          399       2  ...  Unknown        S
764          765       3  ...  Unknown        S
508          509       3  ...  Unknown        S
145          146       2  ...  Unknown        S
699          700       3  ...    F G63        S

[668 rows x 11 columns]
     PassengerId  Pclass  ...    Cabin Embarked
140          141       3  ...  Unknown        C
98            99       2  ...  Unknown        S
596          597       2  ...  Unknown        S
574          575       3  ...  Unknown        S
660          661       1  ...  Unknown        S
..           ...     ...  ...      ...      ...
456          457       1  ...      E38        S
28            2

In [43]:
#function to calculate MAE score
def score_dataset(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)

# **Categorical columns**

**Delete Categorical Columns**

In [44]:
#drop categorical values from X_train and X_test
num_X_train = X_train.select_dtypes(exclude='object')
num_X_test = X_test.select_dtypes(exclude='object')

#calculate MAE
print("MAE value by removing categorical values is:")
score_dataset(num_X_train, num_X_test, y_train, y_test)

MAE value by removing categorical values is:


0.3695964125560539

**Ordinal Encoding**

In [45]:
#select categorical columns
objects = X_train.dtypes == 'object'
#list of categorical columns
obj_columns = list(objects[objects].index)
obj_columns

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [46]:
#show df2 before OrdinalEncoding
df2

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,Unknown,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,Unknown,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,Unknown,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,Unknown,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,Unknown,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [47]:
#setting up Ordinal Encoding
#categorical values are replaced with integer values 
X_ord_train = X_train.copy()
X_ord_test = X_test.copy()

#determine good columns (i.e. values in columns exist in X_test and X_train
good_cols = [col for col in obj_columns if set(X_test[col]).issubset(set(X_train[col]))]
          

#use good cols to 
ordinal_encoding = OrdinalEncoder()
X_ord_train[good_cols] = ordinal_encoding.fit_transform(X_train[good_cols])
X_ord_test[good_cols] = ordinal_encoding.transform(X_test[good_cols])

#use score_dataset to determine MAE value
print('MAE value of Ordinal Encoding:')
score_dataset(X_ord_train[good_cols], X_ord_test[good_cols], y_train, y_test)

MAE value of Ordinal Encoding:


0.3337585849353