In [14]:
import joblib
import numpy as np
import os
import pandas as pd
import sklearn
import sys

from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Library versions
print("python version:", sys.version)
print("joblib version:", joblib.__version__)
print("pandas version:", pd.__version__)
print("sklearn version:", sklearn.__version__)

# Data version
version = 20230228
data_file = "train.csv"

# More variables
prefix = "titanic_survival"
data_path = Path("data")
data_path.mkdir(exist_ok=True)
model_path = Path(f"../app/{prefix}") / f"model-{version}.joblib"
dep_col = "Survived"
important_feature_threshold = 0.005

python version: 3.11.2 (main, Feb 13 2023, 03:31:04) [Clang 14.0.0 (clang-1400.0.29.202)]
joblib version: 1.2.0
pandas version: 1.5.3
sklearn version: 1.2.1


In [15]:
df = pd.read_csv(data_path / "train.csv")
modes = df.mode().iloc[0]

In [16]:
def preprocess_data(df, test=False):
    df["NameLength"] = df.Name.apply(len)
    df["HasCabin"] = df.Cabin.apply(lambda x: 0 if type(x) == float else 1)
    df["IsFemale"] = df.Sex == "female"
    drop_cols = ["Name", "Ticket", "Cabin", "Embarked", "PassengerId", "Sex"]
    df = df.drop(drop_cols, axis=1)

    df["Fare"] = df.Fare.fillna(0)
    df.fillna(modes, inplace=True)
    # Reorder columns alphabetically
    df = df.reindex(sorted(df.columns), axis=1)
    return df

In [17]:
df = preprocess_data(df)

# Inspect data
print("Data rows:", df.shape[0])
print("Features:", df.shape[1])
print("Columns:", df.columns.sort_values())
df.head()

Data rows: 891
Features: 9
Columns: Index(['Age', 'Fare', 'HasCabin', 'IsFemale', 'NameLength', 'Parch', 'Pclass',
       'SibSp', 'Survived'],
      dtype='object')


Unnamed: 0,Age,Fare,HasCabin,IsFemale,NameLength,Parch,Pclass,SibSp,Survived
0,22.0,7.25,0,False,23,0,3,1,0
1,38.0,71.2833,1,True,51,0,1,1,1
2,26.0,7.925,0,True,22,0,3,0,1
3,35.0,53.1,1,True,44,0,1,1,1
4,35.0,8.05,0,False,24,0,3,0,0


In [18]:
# Train model
X_train, X_test, y_train, y_test = train_test_split(
    df.drop([dep_col], axis=1), df[dep_col], test_size=0.2
)
clf = RandomForestRegressor(n_estimators=100)
clf.fit(X_train, y_train)

# Test model
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred.round()))
# Simply rounding the predictions to 0 or 1
pd.DataFrame(
    confusion_matrix(y_test, y_pred.round()),
    columns=["Predicted Not Survived", "Predicted Survived"],
    index=["Actual Not Survived", "Actual Survived"],
)

Accuracy: 0.8268156424581006


Unnamed: 0,Predicted Not Survived,Predicted Survived
Actual Not Survived,105,12
Actual Survived,19,43


In [19]:
# Feature importance
def rf_feat_importance(m, df):
    return pd.DataFrame(
        {"cols": df.columns, "imp": m.feature_importances_}
    ).sort_values("imp", ascending=False)


fi = rf_feat_importance(clf, X_train)
# All greater than 0.005
important_features = fi[fi.imp > important_feature_threshold]
important_features

Unnamed: 0,cols,imp
3,IsFemale,0.298716
0,Age,0.196312
1,Fare,0.189658
4,NameLength,0.146443
6,Pclass,0.085766
7,SibSp,0.039348
2,HasCabin,0.027237
5,Parch,0.01652


In [20]:
unimportant_features = fi[fi.imp <= important_feature_threshold]
unimportant_features

Unnamed: 0,cols,imp


In [21]:
# test_df = pd.read_csv(data_path / "test.csv")
# test_df = preprocess_data(test_df)
# test_df

In [22]:
# test_df["Survived"] = clf.predict(test_df).round().astype(int)
# test_df

In [23]:
joblib.dump(clf, model_path)

['../app/titanic_survival/model-20230228.joblib']

In [24]:
# Call model from file, same as in app/<prefix>/model.py for FastAPI
with open(model_path, "rb") as f:
    model = joblib.load(f)

test_df = pd.read_csv(data_path / "test.csv")

payload = test_df.iloc[0].to_dict()
payload

{'PassengerId': 892,
 'Pclass': 3,
 'Name': 'Kelly, Mr. James',
 'Sex': 'male',
 'Age': 34.5,
 'SibSp': 0,
 'Parch': 0,
 'Ticket': '330911',
 'Fare': 7.8292,
 'Cabin': nan,
 'Embarked': 'Q'}

In [25]:
df_from_payload = pd.DataFrame([payload])
# print(df_from_payload)
df_from_payload = preprocess_data(df_from_payload, test=True)
print(df_from_payload)
y_pred = model.predict(df_from_payload)
y_pred

    Age    Fare  HasCabin  IsFemale  NameLength  Parch  Pclass  SibSp
0  34.5  7.8292         0     False          16      0       3      0


array([0.11666667])