In [1]:
# %%sh
# pip install joblib -U
# pip install pandas -U
# pip install scikit-learn -U

In [2]:
import joblib
import numpy as np
import os
import pandas as pd
import sklearn
import sys

from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Library versions
print("python version:", sys.version)
print("joblib version:", joblib.__version__)
print("pandas version:", pd.__version__)
print("sklearn version:", sklearn.__version__)

# Data version
data_file = "train.csv"

# More variables
data_path = Path("data")
data_path.mkdir(exist_ok=True)
dep_col = "Survived"

python version: 3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:25:29) [Clang 14.0.6 ]
joblib version: 1.2.0
pandas version: 1.5.3
sklearn version: 1.2.1


In [3]:
competition = "titanic"  # Change this to any Kaggle competition name
iskaggle = os.environ.get("KAGGLE_KERNEL_RUN_TYPE", "")

if iskaggle:
    path = Path(f"../input/{competition}")
else:
    import kaggle

    # Save data in `data/` directories, that are gitignored
    path = Path("data")

    if not path.exists():
        import zipfile

        kaggle.api.competition_download_cli(competition=competition, path=str(path))
        zipfile.ZipFile(f"{path}/{competition}.zip").extractall(path)

print(f"Ready for competition: {competition}")

Ready for competition: titanic


In [4]:
df = pd.read_csv(path / "train.csv")
test_df = pd.read_csv(path / "test.csv")
modes = df.mode().iloc[0]

In [5]:
def preprocess_data(df):
    df["NameLength"] = df.Name.apply(len)
    df["HasCabin"] = df.Cabin.apply(lambda x: 0 if type(x) == float else 1)
    drop_cols = ["Name", "Ticket", "Cabin"]
    df = df.drop(drop_cols, axis=1)

    cat_cols = ["Sex", "Embarked"]
    df = pd.get_dummies(df, columns=cat_cols)

    df["Fare"] = df.Fare.fillna(0)
    df.fillna(modes, inplace=True)
    return df

In [6]:
df = preprocess_data(df)

# Inspect data
print("Data rows:", df.shape[0])
print("Features:", df.shape[1])
print("Columns:", df.columns.sort_values())
df.head()

Data rows: 891
Features: 14
Columns: Index(['Age', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Fare', 'HasCabin',
       'NameLength', 'Parch', 'PassengerId', 'Pclass', 'Sex_female',
       'Sex_male', 'SibSp', 'Survived'],
      dtype='object')


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,NameLength,HasCabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,23,0,0,1,0,0,1
1,2,1,1,38.0,1,0,71.2833,51,1,1,0,1,0,0
2,3,1,3,26.0,0,0,7.925,22,0,1,0,0,0,1
3,4,1,1,35.0,1,0,53.1,44,1,1,0,0,0,1
4,5,0,3,35.0,0,0,8.05,24,0,0,1,0,0,1


In [13]:
# Train model
X_train, X_test, y_train, y_test = train_test_split(
    df.drop([dep_col], axis=1), df[dep_col], test_size=0.2
)
clf = RandomForestRegressor(n_estimators=100)
clf.fit(X_train, y_train)

# Test model
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred.round()))
# Simply rounding the predictions to 0 or 1
pd.DataFrame(
    confusion_matrix(y_test, y_pred.round()),
    columns=["Predicted Not Survived", "Predicted Survived"],
    index=["Actual Not Survived", "Actual Survived"],
)


Accuracy: 0.7932960893854749


Unnamed: 0,Predicted Not Survived,Predicted Survived
Actual Not Survived,106,6
Actual Survived,31,36


## Example: what a goal was to limit false positives?

Instead of 0.5 threshold, can adjust it like this:

In [16]:
threshold = 0.93
y_pred_adj = [1 if y >= threshold else 0 for y in y_pred]
print("Accuracy:", accuracy_score(y_test, y_pred_adj))
# print(classification_report(y_test, y_pred_adj))
pd.DataFrame(
    confusion_matrix(y_test, y_pred_adj),
    columns=["Predicted Not Survived", "Predicted Survived"],
    index=["Actual Not Survived", "Actual Survived"],
)

Accuracy: 0.7318435754189944


Unnamed: 0,Predicted Not Survived,Predicted Survived
Actual Not Survived,112,0
Actual Survived,48,19


In [9]:
# False positives
X_test[y_pred_adj > y_test]

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,NameLength,HasCabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S


In [10]:
# False negatives
# Add regression score from y_pred to X_test
# X_test["score"] = y_pred
# X_test[y_pred_adj < y_test].sort_values("score", ascending=False)

In [18]:
# Feature importance
def rf_feat_importance(m, df):
    return pd.DataFrame(
        {"cols": df.columns, "imp": m.feature_importances_}
    ).sort_values("imp", ascending=False)


fi = rf_feat_importance(clf, X_train)
# All greater than 0.005
important_features = fi[fi.imp > 0.005]
important_features

Unnamed: 0,cols,imp
8,Sex_female,0.157695
2,Age,0.150152
5,Fare,0.147137
0,PassengerId,0.142218
9,Sex_male,0.120346
6,NameLength,0.116588
1,Pclass,0.091201
3,SibSp,0.025489
7,HasCabin,0.012296
10,Embarked_C,0.011254


In [19]:
unimportant_features = fi[fi.imp <= 0.005]
unimportant_features

Unnamed: 0,cols,imp
