In [None]:
!unzip datasets.zip
!unzip models.zip

In [1]:
# Load in some packages
import pandas as pd
from sklearn.model_selection import train_test_split
from models.supervised.classification.random_forest_classifier import RandomForestClassifierModel

# Constant values
# Shared seed: passed as `random_state` to train_test_split and as the last
# argument of the model wrapper further down (presumably its seed as well).
RANDOM = 0

# **Prepare Dataset**

In order to prepare the dataset, we need to load the cleaned processed dataset that we created.

In [2]:
# Read the processed Titanic table from disk and show it as the cell output.
data = pd.read_csv(
    filepath_or_buffer='datasets/processed/titanic_final.csv',
)
data

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Family
0,Died,Class3,"Braund, Mr. Owen Harris",male,22.000000,1,0,-0.897655,n,S,1
1,Survived,Class1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,1.343783,C,C,1
2,Survived,Class3,"Heikkinen, Miss. Laina",female,26.000000,0,0,-0.816437,n,S,0
3,Survived,Class1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,1.044537,C,S,1
4,Died,Class3,"Allen, Mr. William Henry",male,35.000000,0,0,-0.802073,n,S,0
...,...,...,...,...,...,...,...,...,...,...,...
1304,,Class3,"Spector, Mr. Woolf",male,29.881138,0,0,-0.802073,n,S,0
1305,,Class1,"Oliva y Ocana, Dona. Fermina",female,39.000000,0,0,1.776478,C,C,0
1306,,Class3,"Saether, Mr. Simon Sivertsen",male,38.500000,0,0,-0.897655,n,S,0
1307,,Class3,"Ware, Mr. Frederick",male,29.881138,0,0,-0.802073,n,S,0


Drop every row that contains a `NaN` value, remove the columns that would cause errors during training, and replace the categorical features with numeric codes.

In [3]:
# Remove every row that has a missing value in any column, then drop the
# free-text 'Name' column.
data = data.dropna().drop(columns=['Name'])

# Encode the known categories with explicit mappings and assign the result back.
# Calling `data[col].replace(..., inplace=True)` operates on a temporary Series
# and does not update `data` once pandas Copy-on-Write is active, and it relies
# on implicit object -> int downcasting; `.map` + assignment avoids both.
known_encodings = {
    'Survived': {'Died': 0, 'Survived': 1},
    'Sex': {'male': 0, 'female': 1},
    'Pclass': {'Class1': 1, 'Class2': 2, 'Class3': 3},
}
for column, mapping in known_encodings.items():
    encoded = data[column].map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail loudly rather than let an unknown label turn into NaN.
        unexpected = data.loc[unmapped, column].unique().tolist()
        raise ValueError(f"Unexpected values in '{column}': {unexpected}")
    data[column] = encoded.astype('int64')

# Every column that is still of object dtype is replaced by integer codes
# assigned in order of first appearance.
categorical = data.dtypes[data.dtypes == "object"].index
for column in categorical:
    data[column] = pd.factorize(data[column])[0]

data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Family
0,0,3,0,22.000000,1,0,-0.897655,0,0,1
1,1,1,1,38.000000,1,0,1.343783,1,1,1
2,1,3,1,26.000000,0,0,-0.816437,0,0,0
3,1,1,1,35.000000,1,0,1.044537,1,0,1
4,0,3,0,35.000000,0,0,-0.802073,0,0,0
...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.000000,0,0,-0.351496,0,0,0
887,1,1,1,19.000000,0,0,0.469459,6,0,0
888,0,3,1,29.881138,1,2,0.224331,0,0,3
889,1,1,0,26.000000,0,0,0.469459,1,1,0


We need to separate the independent features from the dependent one, i.e. the target:

In [4]:
# Features: everything except the target and 'Unnamed: 0'. The latter is the
# row index that was written into the CSV, not a passenger attribute, so it must
# not be used as a predictor. `errors='ignore'` keeps the cell working if that
# column has already been removed upstream.
X = data.drop(columns=['Survived', 'Unnamed: 0'], errors='ignore')

# Target: the encoded survival label.
y = data['Survived']

To train and evaluate the model, we need to split the data into a training set and a testing set. We do this with scikit-learn's `train_test_split` function, which splits arrays or matrices into random train and test subsets:

In [5]:
# Seeded random partition of features and target. No test size is given, so
# scikit-learn's default applies (the recorded run shows 666 train / 223 test rows).
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=RANDOM)

# **Load Model**

Load a classification model in order to train and predict.

In [9]:
# Hand both partitions and the shared seed to the project's random-forest wrapper.
# NOTE(review): 'Titanic' is presumably a dataset label — confirm against the class.
model = RandomForestClassifierModel(
    'Titanic',
    X_train, y_train,
    X_test, y_test,
    RANDOM,
)

# Train first, then evaluate; the recorded output shows a metrics report for
# each phase. The return values are kept for later use.
results_train = model.train()
results_test = model.test()


----------- Training on 666 samples with Random Forest Classifier model -----------

{'accuracy_score': '98.8 %',
 'f1_score': '98.37 %',
 'precision_score': '99.59 %',
 'recall_score': '97.19 %'}

----------- Testing on 223 samples with Random Forest Classifier model -----------

{'accuracy_score': '79.37 %',
 'f1_score': '73.56 %',
 'precision_score': '77.11 %',
 'recall_score': '70.33 %'}
