In [1]:
import pandas as pd
from pathlib import Path
from constants import NOMINAL_COLUMNS, DISCRETE_COLUMNS, CONTINOUS_COLUMNS, TEXT_COLUMNS, BINARY_COLUMNS
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from scipy import stats
import utils

In [2]:
# --- Load -------------------------------------------------------------------
data_path = "data/train.csv"
df = pd.read_csv(data_path)

# Treat literal '?' cells as missing values.
df = df.replace('?', np.nan)

# Project helper applied to the "Ticket" column
# (presumably derives ticket-based features -- see utils.extract_features).
df = utils.extract_features(df, "Ticket")

# Coerce continuous columns to numeric; unparseable entries become NaN and are
# imputed below.
for col in CONTINOUS_COLUMNS:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# --- Imputation values --------------------------------------------------------
# All fill values are computed before any column is filled:
#   nominal    -> most frequent value
#   discrete   -> median
#   continuous -> mean (NaN skipped)
fill_values_nominal = {col: df[col].mode()[0] for col in NOMINAL_COLUMNS}
fill_values_discrete = {col: df[col].median() for col in DISCRETE_COLUMNS}
fill_values_continuous = {col: df[col].mean(skipna=True) for col in CONTINOUS_COLUMNS}

# --- Impute -------------------------------------------------------------------
# Assign the filled Series back instead of `df[col].fillna(..., inplace=True)`:
# the in-place form mutates a temporary Series, which emits a FutureWarning on
# pandas 2.x and leaves `df` unchanged once copy-on-write is enabled.
for col in NOMINAL_COLUMNS:
    df[col] = df[col].fillna(fill_values_nominal[col])

for col in DISCRETE_COLUMNS:
    df[col] = df[col].fillna(fill_values_discrete[col])
# Safe to cast only after the NaNs are gone.
df[DISCRETE_COLUMNS] = df[DISCRETE_COLUMNS].astype(int)

for col in CONTINOUS_COLUMNS:
    df[col] = df[col].fillna(fill_values_continuous[col])

# Free-text columns are not used as model inputs.
df = df.drop(columns=TEXT_COLUMNS)

df[BINARY_COLUMNS] = df[BINARY_COLUMNS].astype(int)

# Replace extreme values in every continuous column and remember what was done.
# outlier_info[col] -> {'outlier_replacement': <value written>,
#                       'outlier_indices': <index labels that were overwritten>}
outlier_info = {}
zscore_info = {}  # never populated in this cell; kept so the name stays defined
for col in CONTINOUS_COLUMNS:
    # Z-scores are held in a local array rather than a temporary
    # `<col>_zscore` column, so `df` is never left with a stray helper column.
    zscores = np.asarray(stats.zscore(df[col]))

    # A value counts as an outlier when its absolute z-score exceeds 3.
    outlier_indices = df.index[np.abs(zscores) > 3]

    # Outliers are replaced with the column MEAN (not the median). The mean is
    # computed while the outliers are still present in the column.
    mean_value = df[col].mean()
    outlier_info[col] = {'outlier_replacement': mean_value, 'outlier_indices': list(outlier_indices)}

    df.loc[outlier_indices, col] = mean_value

# One-hot encode every nominal column.
#   onehot_encoders[col] -> the fitted OneHotEncoder for that column
#   new_columns          -> names of all generated indicator columns, in order
onehot_encoders = {}
new_columns = []
encoded_frames = []

for col in NOMINAL_COLUMNS:
    # handle_unknown='ignore': categories unseen at fit time encode as all zeros
    # instead of raising when the encoder is reused on new data.
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    new_data = encoder.fit_transform(df[col].to_numpy().reshape(-1, 1))

    feature_names = encoder.get_feature_names_out([col])
    new_columns.extend(feature_names)

    # Carry over df's index so the column-wise concat below aligns row-for-row
    # even when df's index is not a default 0..n-1 range (a fresh RangeIndex
    # would otherwise misalign and introduce NaN rows).
    new_df = pd.DataFrame(new_data, columns=feature_names, index=df.index)
    encoded_frames.append(new_df)

    onehot_encoders[col] = encoder

# Concatenate once, after the loop, rather than re-copying the whole frame on
# every iteration. Resulting column order is unchanged: remaining original
# columns first, then the indicator columns in NOMINAL_COLUMNS order.
df = pd.concat([df.drop(columns=NOMINAL_COLUMNS), *encoded_frames], axis=1)

# Scale every column of df to [0, 1] and keep the fitted scaler for each one.
# NOTE(review): this loop covers all columns, including the "Survived" target,
# and the scalers are fitted before the train/test split -- confirm both are
# intended.
min_max_scaler_dict = {}
for col in df.columns:
    # A new scaler per column. Re-fitting one shared instance would make every
    # dict entry reference the same object, holding only the last column's
    # min/max, so the stored scalers could not be reused per column.
    min_max_scaler = MinMaxScaler()
    df[col] = min_max_scaler.fit_transform(df[[col]])
    min_max_scaler_dict[col] = min_max_scaler

# Split the frame into feature matrix and target.
X, y = df.drop(columns="Survived"), df["Survived"]

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

# Gradient-boosted classifier with 10 estimators of depth at most 4.
xgb = XGBClassifier(max_depth=4, n_estimators=10)
xgb.fit(X_train, y_train)

# Predict on both partitions so train and test accuracy can be compared.
y_train_pred, y_test_pred = xgb.predict(X_train), xgb.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

for label, score in (
    ("Train Accuracy: ", train_accuracy),
    ("Test Accuracy: ", test_accuracy),
):
    print(label, score)

Train Accuracy:  0.8626716604244694
Test Accuracy:  0.8444444444444444


In [3]:
# Project-local class. NOTE(review): this import sits mid-notebook; consider
# moving it to the import cell at the top so dependencies are visible up front.
from MLModel import MLModel

# Re-read the CSV so the pipeline receives unprocessed data, not the `df` that
# the previous cell modified.
df_preprocess = pd.read_csv(data_path)

# NOTE(review): the "... .pkl does not exist" lines in this cell's output come
# from either this constructor or the pipeline call below -- confirm in MLModel.
model_handler = MLModel()

# Last expression of the cell, so its return value is rendered as the output.
# NOTE(review): presumably mirrors the manual preprocessing above -- verify.
model_handler.preprocessing_pipeline(df_preprocess)

fill_values_nominal.pkl does not exist
fill_values_discrete.pkl does not exist
fill_values_continuous.pkl does not exist
min_max_scaler_dict.pkl does not exist
onehot_encoders_dict.pkl does not exist


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Ticket_length,Sex_female,Sex_male,...,Ticket_23101283,Ticket_23101285,Ticket_23101286,Ticket_23101287,Ticket_23101288,Ticket_23101289,Ticket_23101290,Ticket_23101292,Ticket_23101293,Ticket_23101294
0,0.000000,0.0,1.0,0.2750,0.125,0.000000,0.043975,0.400000,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.001124,1.0,0.0,0.4750,0.125,0.000000,0.432369,0.333333,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.002247,1.0,1.0,0.3250,0.000,0.000000,0.048069,0.866667,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.003371,1.0,0.0,0.4375,0.125,0.000000,0.322078,0.200000,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.004494,0.0,1.0,0.4375,0.000,0.000000,0.048827,0.200000,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.995506,0.0,0.5,0.3375,0.000,0.000000,0.078852,0.200000,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
887,0.996629,1.0,0.0,0.2375,0.000,0.000000,0.181965,0.200000,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
888,0.997753,0.0,1.0,0.3500,0.125,0.333333,0.142236,0.466667,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
889,0.998876,1.0,0.0,0.3250,0.000,0.000000,0.181965,0.200000,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
