In [None]:
%pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121


In [None]:
%pip install numpy pandas matplotlib seaborn scikit-learn graphviz ipywidgets fastai

In [None]:
from fastai.imports import *
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
import graphviz

# Let NumPy print up to 130 characters per line before wrapping array output.
np.set_printoptions(linewidth=130)

In [None]:
# Directory that holds train.csv and test.csv. Set the TITANIC_DATA_DIR environment
# variable to point somewhere else; without it the original default location is used.
path = Path(os.environ.get('TITANIC_DATA_DIR', '/notebooks/Titanic'))

In [None]:
# Read the training and test CSVs from the data directory, then keep the most
# frequent value of every training column (the first row of DataFrame.mode());
# proc_data uses `modes` to fill missing values.
df, tst_df = (pd.read_csv(path/fname) for fname in ('train.csv', 'test.csv'))
modes = df.mode().iloc[0]

In [None]:
def proc_data(df):
    """Clean and augment a passenger frame in place.

    Steps, in order:
      1. Missing ``Fare`` values become 0.
      2. Every other missing value is filled from the module-level ``modes`` series.
      3. A ``LogFare`` column is added as ``log1p(Fare)``.
      4. ``Embarked`` and ``Sex`` are converted to pandas categoricals.

    The frame is modified in place and nothing is returned.
    """
    # Fare must be filled before the generic fill so it gets 0 rather than a mode value.
    df['Fare'] = df['Fare'].fillna(0)
    df.fillna(modes, inplace=True)
    df['LogFare'] = np.log1p(df['Fare'])
    for name in ('Embarked', 'Sex'):
        df[name] = pd.Categorical(df[name])

# Preprocess both frames in place: the training data first, then the test data.
proc_data(df)
proc_data(tst_df)

In [None]:
# Column groups used to build the model inputs, plus the column to predict.
cats = ['Sex', 'Embarked']                              # replaced by category codes in xs_y
conts = ['Age', 'SibSp', 'Parch', 'LogFare', 'Pclass']  # passed through unchanged
dep = 'Survived'                                        # dependent variable

In [None]:
# Hold out 25% of the rows for validation.
# `random` here is Python's stdlib module, which scikit-learn does not draw from, so
# seeding it alone leaves the split different on every run. Passing random_state
# makes the split itself reproducible.
random.seed(42)
trn_df, val_df = train_test_split(df, test_size=0.25, random_state=42)

In [None]:
def score(col, y, split):
    """Score a binary split of ``y`` on ``col`` at threshold ``split``.

    Rows with ``col <= split`` form the left group and rows with ``col > split``
    the right group. The result is the average of the two groups' standard
    deviations of ``y``, weighted by group size. Smaller values mean less spread
    of ``y`` inside the groups.

    A group with fewer than two rows has no measurable spread and contributes 0,
    so a split that leaves one side empty or with a single row yields a finite
    score instead of NaN. An empty ``y`` returns NaN.
    """
    total = len(y)
    if total == 0:
        return float('nan')

    def _weighted_std(side):
        # Std of y on one side of the split, multiplied by that side's row count.
        n_side = side.sum()
        # The sample standard deviation is NaN below two rows; use 0 so one
        # degenerate side does not turn the whole score into NaN.
        return n_side * y[side].std() if n_side > 1 else 0.0

    return (_weighted_std(col <= split) + _weighted_std(col > split)) / total

# Example: score the binary split on 'Sex' — rows whose category code is <= 0.5 go
# to one side and the remaining rows to the other.
score(col=trn_df["Sex"].cat.codes, y=trn_df[dep], split=0.5)


In [None]:
def xs_y(df):
    """Split a frame into model inputs and target.

    Works on a copy of ``df``: each column listed in ``cats`` is replaced by its
    integer category codes, then the ``cats + conts`` columns are selected as the
    feature frame.

    Returns a ``(xs, y)`` tuple, where ``y`` is the ``dep`` column, or ``None``
    when the frame has no such column.
    """
    encoded = df.copy()
    codes_by_column = {name: df[name].astype('category').cat.codes for name in cats}
    for name, codes in codes_by_column.items():
        encoded[name] = codes

    features = encoded[cats + conts]
    if dep in encoded:
        return features, encoded[dep]
    return features, None

# Build the feature frames and targets for the training and validation splits.
(trn_xs, trn_y), (val_xs, val_y) = xs_y(trn_df), xs_y(val_df)

# Fit a decision tree capped at four leaf nodes on the training split.
m = DecisionTreeClassifier(max_leaf_nodes=4).fit(trn_xs, trn_y)


In [None]:
def draw_tree(t, df, size=10, ratio=0.6, precision=2):
    """Render a fitted tree as a Graphviz diagram.

    Args:
        t: fitted tree estimator passed to ``export_graphviz``.
        df: frame whose ``columns`` supply the feature names shown in the nodes.
        size: value for the Graphviz ``size`` graph attribute.
        ratio: value for the Graphviz ``ratio`` graph attribute.
        precision: number of digits ``export_graphviz`` prints for floats.

    Returns:
        A ``graphviz.Source`` built from the exported DOT text.
    """
    dot = export_graphviz(
        t,
        out_file=None,
        # Pass a plain list: some scikit-learn releases reject a pandas Index here.
        feature_names=list(df.columns),
        filled=True,
        rounded=True,
        special_characters=True,
        rotate=False,
        precision=precision,
    )
    # Insert the size/ratio attributes right after the graph's opening 'Tree {'.
    # A literal str.replace avoids treating '{' as regex syntax.
    return graphviz.Source(dot.replace('Tree {', f'Tree {{ size={size}; ratio={ratio}'))

# Show the fitted four-leaf tree, labelling nodes with the training feature names.
draw_tree(t=m, df=trn_xs, size=10)

In [None]:
# Train a 100-tree random forest with at least 5 samples per leaf. random_state is
# fixed so the validation metric and feature importances are reproducible from
# run to run.
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=5, random_state=42)
rf.fit(trn_xs, trn_y)

In [None]:
# Mean absolute error of the forest's predictions on the validation split.
# NOTE(review): if Survived is coded 0/1 this equals the misclassification rate — confirm.
mean_absolute_error(y_true=val_y, y_pred=rf.predict(val_xs))


In [None]:
# Horizontal bar chart of the random forest's importance for each input column.
pd.DataFrame({'cols': trn_xs.columns, 'imp': rf.feature_importances_}).plot(x='cols', y='imp', kind='barh')


In [None]:
# Preprocess the test frame in place with proc_data.
proc_data(tst_df)

# Encode the test features with xs_y; the second element (the target) is not needed.
tst_xs, _ = xs_y(tst_df)

# Predict with the fitted decision tree `m`.
# NOTE(review): this uses the tree `m`, not the random forest `rf` — confirm that is intended.
test_predictions = m.predict(tst_xs)

# No true labels for the test set are loaded in this notebook. If you have them as
# a Series `actuals`, evaluate with mean_absolute_error(actuals, test_predictions).

# Attach the predictions to the test frame, then write the two-column submission file.
tst_df['Predicted'] = test_predictions
submission = tst_df.loc[:, ['PassengerId', 'Predicted']]
submission.to_csv('submission.csv', index=False)

In [None]:
# Read the submission file back from disk into a DataFrame.
submission_df = pd.read_csv('submission.csv')


In [None]:
# Merge the test data with the saved predictions on PassengerId.
merged_df = tst_df.merge(submission_df, on='PassengerId')
print(merged_df.sample())

# When tst_df already carries a 'Predicted' column the merge suffixes the two copies
# ('Predicted_x' from tst_df, 'Predicted_y' from the file); otherwise there is just
# 'Predicted'. Pick whichever exists so the plot does not depend on that collision.
hue_col = 'Predicted_x' if 'Predicted_x' in merged_df else 'Predicted'

# Count passengers of each sex, split by predicted survival. Sex is categorical, so
# no KDE curve is drawn, and the y axis shows counts (histplot's default statistic).
sns.histplot(data=merged_df, x='Sex', hue=hue_col, element='step')
plt.title('Sex Distribution by Predicted Survival')
plt.xlabel('Sex')
plt.ylabel('Count')
plt.show()
