# Random Forest VS SVC

Trains a random forest and a linear-kernel SVC on the Titanic dataset and saves each model's test-set predictions.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer

# Read the Titanic train/test splits from the local dataset folder
TRAIN_CSV = "./EP_datasets/titanic/train.csv"
TEST_CSV = "./EP_datasets/titanic/test.csv"
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

# Summarise the size of the training frame, then list each column's dtype
n_rows, n_columns = df_train.shape
print(f"#rows={n_rows} #columns={n_columns}")
print(df_train.dtypes)

#rows=891 #columns=12
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [2]:
# Flag fully duplicated rows in a helper column of the training frame
df_train["is_duplicate"] = df_train.duplicated()

# Count up front so the f-strings below do not need nested quote characters
n_total = len(df_train)
n_duplicated = int(df_train["is_duplicate"].sum())
print(f"#total= {n_total}")
print(f"#duplicated= {n_duplicated}")

#total= 891
#duplicated= 0


In [3]:
# Drop the features that are not used for modelling:
# ticket number, cabin number, port of embarkation and passenger name
unused_features = ["Ticket", "Cabin", "Embarked", "Name"]

# The training frame additionally carries the helper column "is_duplicate"
df_train = df_train.drop(columns=unused_features + ["is_duplicate"])
df_test = df_test.drop(columns=unused_features)

In [4]:
# Report, per column, whether the test set contains any missing value
df_test.isna().any()

PassengerId    False
Pclass         False
Sex            False
Age             True
SibSp          False
Parch          False
Fare            True
dtype: bool

In [5]:
# Impute missing values with statistics computed on the TRAINING data only.
# Filling the test set with its own means would leak test-set information
# into preprocessing and make the two splits inconsistent with each other.
mean_age = df_train["Age"].mean()
mean_fare = df_train["Fare"].mean()

# Replace nulls in Age with the mean training age - Training Data
df_train["Age"] = df_train["Age"].fillna(mean_age)

# Replace nulls in Age and Fare with the training means - Test Data
df_test["Age"] = df_test["Age"].fillna(mean_age)
df_test["Fare"] = df_test["Fare"].fillna(mean_fare)

# Confirm that no missing values remain in the test set
df_test.isnull().any()

PassengerId    False
Pclass         False
Sex            False
Age            False
SibSp          False
Parch          False
Fare           False
dtype: bool

In [6]:
# List the dtype of every remaining training column to spot features that are
# still non-numeric (dtype object)
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
dtype: object

In [7]:
# Print the distinct levels of every object-typed column in the test set, to
# see whether some marker value is being used for a missing level
object_columns = [name for name in df_test.columns if df_test[name].dtype == object]
for name in object_columns:
    print(name, df_test[name].unique())

Sex ['male' 'female']


In [8]:
# one hot encode values in data
def encode_onehot(_df, f):
    """Return a copy of ``_df`` with column ``f`` replaced by one-hot columns.

    Each distinct value of ``_df[f]`` becomes an indicator column named
    ``"<f> - <value>"``. The indicator columns are appended on the right and
    the original column ``f`` is removed. ``_df`` itself is not modified.

    Args:
        _df: Source DataFrame containing column ``f``.
        f: Name (a string) of the column to encode.

    Returns:
        A new DataFrame with the indicator columns in place of ``f``.
    """
    indicators = pd.get_dummies(_df[f], prefix="", prefix_sep="")
    # Collapse indicator columns that share a label. Grouping the transposed
    # frame by its index replaces the deprecated ``groupby(level=0, axis=1)``.
    indicators = indicators.T.groupby(level=0).max().T
    indicators = indicators.add_prefix(f + " - ")
    return pd.concat([_df.drop(columns=[f]), indicators], axis=1)

In [9]:
# one hot encode Sex of passengers
df_o = encode_onehot(df_train, "Sex")
df_o_test = encode_onehot(df_test, "Sex")

# Names of the generated indicator columns, used to preview the encoding
cols = [name for name in df_o_test.columns if "Sex" in name]

# Both splits must expose the same indicator columns, otherwise the models
# would be fitted and applied on different feature sets
train_cols = [name for name in df_o.columns if "Sex" in name]
assert train_cols == cols, f"one-hot columns differ: train={train_cols} test={cols}"

  pd.get_dummies(_df[f], prefix="", prefix_sep="")
  pd.get_dummies(_df[f], prefix="", prefix_sep="")


In [10]:
# Preview the first ten rows of the one-hot encoded columns of the test set
display(df_o_test[cols].head(10))

Unnamed: 0,Sex - female,Sex - male
0,False,True
1,True,False
2,False,True
3,False,True
4,True,False
5,False,True
6,True,False
7,False,True
8,True,False
9,False,True


In [11]:
# Preview the first rows of the encoded test frame
df_o_test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex - female,Sex - male
0,892,3,34.5,0,0,7.8292,False,True
1,893,3,47.0,1,0,7.0,True,False
2,894,2,62.0,0,0,9.6875,False,True
3,895,3,27.0,0,0,8.6625,False,True
4,896,3,22.0,1,1,12.2875,True,False


In [12]:
# "Sex - male" is the complement of "Sex - female", so it carries no extra
# information; remove it from both splits
redundant_columns = ["Sex - male"]
df_o = df_o.drop(columns=redundant_columns)
df_o_test = df_o_test.drop(columns=redundant_columns)

In [13]:
# Split the encoded frames into the feature matrix X and the target vector y.
# The target and the passenger identifier are excluded from the features.
non_feature_columns = ["Survived", "PassengerId"]

X = df_o.loc[:, ~df_o.columns.isin(non_feature_columns)]
y = df_o["Survived"].to_numpy().ravel()
X_test = df_o_test.loc[:, ~df_o_test.columns.isin(non_feature_columns)]

In [14]:
# Verify that the test feature matrix has no missing values left
X_test.isna().any()

Pclass          False
Age             False
SibSp           False
Parch           False
Fare            False
Sex - female    False
dtype: bool

In [15]:
def save_preds(_fn, _y_pred, _df):
    """Write predictions and their passenger ids to a CSV file.

    The file starts with the header row ``Survived,PassengerId`` followed by
    one row per prediction, in that column order.

    Args:
        _fn: Destination file path. Missing parent directories are created.
        _y_pred: Iterable of predicted labels.
        _df: Object whose ``"PassengerId"`` entry yields one id per prediction
            (for example a DataFrame).

    Raises:
        ValueError: If the number of predictions and passenger ids differ.
    """
    import csv
    import os

    predictions = list(_y_pred)
    passenger_ids = list(_df["PassengerId"])
    # zip() would silently truncate to the shorter input and hide the mismatch
    if len(predictions) != len(passenger_ids):
        raise ValueError(
            f"got {len(predictions)} predictions for {len(passenger_ids)} passengers"
        )

    # Create the output folder on demand so a fresh checkout can run the notebook
    parent = os.path.dirname(_fn)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # newline="" leaves line endings entirely to the csv writer
    with open(_fn, "w", newline="") as fout:
        writer = csv.writer(fout, delimiter=",", lineterminator="\n")
        writer.writerow(["Survived", "PassengerId"])
        writer.writerows(zip(predictions, passenger_ids))

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [17]:
%%time

forest = RandomForestClassifier(random_state=1)
forest.fit(X, y)

y_pred = forest.predict(X_test)
save_preds("results/predictions_forest.csv", y_pred, df_test)

CPU times: user 114 ms, sys: 113 µs, total: 115 ms
Wall time: 114 ms


In [18]:
%%time

svc = SVC(kernel="linear")
svc.fit(X, y)

y_pred = svc.predict(X_test)
save_preds("results/predictions_svc.csv", y_pred, df_test)

CPU times: user 4.25 s, sys: 0 ns, total: 4.25 s
Wall time: 4.25 s
