In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
import joblib
from scipy.stats import skew, sem, t
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc,roc_auc_score
import xgboost as xgb
import warnings
import pickle
from datetime import datetime

In [48]:
# Silence the noisy warning categories for the whole notebook session.
# NOTE(review): RuntimeWarning is included, so numeric issues (e.g. invalid
# values passed to a log) will not be reported while this filter is active.
for _ignored_category in (UserWarning, FutureWarning, DeprecationWarning, RuntimeWarning):
    warnings.filterwarnings('ignore', category=_ignored_category)

# Load data

In [49]:
# Read both CSV files; paths are relative to the notebook's working directory.
# NOTE(review): `train` is not referenced by any of the cells visible in this
# notebook - confirm whether it is still needed.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/titanic_train.csv', '../data/titanic_test.csv')
)

In [50]:
# Preview the first rows of the frame that will be scored.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [51]:
# Work on an independent copy: later steps modify `df` in place (drop/dropna
# with inplace=True), and a plain alias would silently alter `test` as well,
# making the preview above stale and this cell unsafe to re-run.
df = test.copy()

In [52]:
# (rows, columns) of the working frame before any preprocessing.
df.shape

(418, 11)

# Drop unused features

In [53]:
# Drop features that were not used in the final version of the model.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError on the already-removed columns.
df = df.drop(columns=['Embarked', 'Ticket'], errors='ignore')

# Missing data imputation

In [54]:
# Median 'Age' per passenger class, used as fill values by impute_age.
# (Original note: "Age (20% of data)" - presumably the share of rows with a
# missing Age; verify.)
# NOTE(review): these medians are computed from the frame being scored, not
# from the training data - confirm this matches how the model was trained.
median_age_Pclass1, median_age_Pclass2, median_age_Pclass3 = (
    df.loc[df['Pclass'] == passenger_class, 'Age'].median()
    for passenger_class in (1, 2, 3)
)

In [55]:
def impute_age(data):
    """
    Return a usable 'Age' for one passenger row.

    A non-missing 'Age' is passed through unchanged. A missing 'Age' is
    replaced by the module-level median for the row's 'Pclass':
    median_age_Pclass1 for class 1, median_age_Pclass2 for class 2, and
    median_age_Pclass3 for every other value.

    Parameters:
    data (Series): A row with 'Age' and 'Pclass' entries (intended for
        DataFrame.apply(..., axis=1)).

    Returns:
    float: The original or imputed 'Age' value.
    """
    passenger_age = data['Age']
    passenger_class = data['Pclass']

    # Guard clause: nothing to impute when the age is present.
    if not pd.isnull(passenger_age):
        return passenger_age

    if passenger_class == 1:
        return median_age_Pclass1
    if passenger_class == 2:
        return median_age_Pclass2
    # Any class other than 1 or 2 falls back to the third-class median.
    return median_age_Pclass3

In [56]:
def impute_missing_values_titanic(data):
    """
    Handle missing values in the Titanic frame, modifying it in place.

    Steps, in order:
    1. 'Age' is recomputed row by row with impute_age (median by 'Pclass'
       for missing entries).
    2. Missing 'Cabin' entries become 'U' (unknown).
    3. Every row that still has a missing value in any column is removed,
       so the returned frame can have fewer rows than the input.

    Parameters:
    data (DataFrame): Titanic dataset with at least 'Age', 'Pclass' and 'Cabin'.

    Returns:
    DataFrame: The same object that was passed in, after modification.
    """
    age_filled = data.apply(impute_age, axis='columns')
    data['Age'] = age_filled

    cabin_filled = data['Cabin'].fillna('U')
    data['Cabin'] = cabin_filled

    # Rows with gaps in any remaining column are discarded, not imputed.
    data.dropna(inplace=True)
    return data

In [57]:
# Fill 'Age'/'Cabin' gaps and drop rows that still contain missing values.
# The function modifies `df` in place and returns the same object.
df = impute_missing_values_titanic(df)

# Feature Engineering

In [58]:
def _extract_title(name):
    """Return the text between the first comma and the next period of a name, or 'Unknown'."""
    # Non-string values (e.g. NaN/None) and names without a comma have no
    # parsable title; return a sentinel instead of raising.
    if not isinstance(name, str):
        return 'Unknown'
    pieces = name.split(',')
    if len(pieces) < 2:
        return 'Unknown'
    return pieces[1].split('.')[0].strip()


def _extract_deck(cabin):
    """Return the first character of a cabin string, or 'U' when it is missing or empty."""
    if isinstance(cabin, str) and cabin:
        return cabin[0]
    return 'U'


def generate_features(data):
    """
    Generate the engineered features 'Title', 'FamilySize' and 'Deck'.

    - 'Title' is the text between the first comma and the following period in
      'Name' (e.g. 'Kelly, Mr. James' -> 'Mr'). Names that are not strings or
      contain no comma yield 'Unknown' instead of raising.
    - 'FamilySize' is 'SibSp' + 'Parch' + 1 (the passenger themself).
    - 'Deck' is the first character of 'Cabin'; missing or empty cabins
      yield 'U' (unknown).

    The source columns 'SibSp', 'Parch', 'Cabin' and 'Name' are removed
    afterwards. The input frame is modified in place.

    Parameters:
    data (DataFrame): The Titanic dataset with 'Name', 'SibSp', 'Parch' and 'Cabin'.

    Returns:
    DataFrame: The same object that was passed in, with the new features
    added and the source columns dropped.
    """
    # Extract 'Title' from 'Name'
    data['Title'] = data['Name'].map(_extract_title)

    # Create 'FamilySize' from 'SibSp' + 'Parch' + 1 (including the individual)
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

    # First letter of 'Cabin' becomes 'Deck'; 'U' marks an unknown deck
    data['Deck'] = data['Cabin'].map(_extract_deck)

    # Drop original columns now that the derived features exist
    data.drop(['SibSp', 'Parch', 'Cabin', 'Name'], axis=1, inplace=True)

    return data

In [59]:
# Add 'Title', 'FamilySize' and 'Deck'; the source columns are dropped inside the function.
df = generate_features(df)

# Process Categorical features

In [60]:
# Encode sex as a single 'Sex_male' indicator column.
# An explicit comparison is used instead of pd.get_dummies(..., drop_first=True)
# because get_dummies derives its columns from the values present in the batch:
# a batch containing only one sex would produce no 'Sex_male' column at all and
# the model input would be missing a feature.
df = df.assign(Sex_male=df['Sex'].eq('male')).drop(columns='Sex')

# Ordinal Encoding

In [61]:
# Ordinal codes for the engineered categorical features (candidates for a config file).

# Titles are numbered 1-6 in the order listed.
title_mapping = dict(zip(["Master", "Miss", "Mr", "Mrs", "Dr", "Rev"], range(1, 7)))

# Decks A-G and T are numbered 1-8 in that order; 'U' (unknown) takes code 0.
deck_mapping = {deck_letter: deck_code for deck_code, deck_letter in enumerate("ABCDEFGT", start=1)}
deck_mapping['U'] = 0

In [62]:
def ordinal_encoding(data):
    """
    Replace 'Title' and 'Deck' with integer-coded '<name>_encoded' columns.

    Each value is looked up in the module-level title_mapping / deck_mapping;
    values absent from the mapping are encoded as 0. The original column is
    dropped once its encoded counterpart has been added. The input frame is
    modified in place.

    Parameters:
    data : pandas.DataFrame
        Input DataFrame with 'Title' and 'Deck' columns.

    Returns:
    pandas.DataFrame
        The same object that was passed in, now holding 'Title_encoded' and
        'Deck_encoded' instead of 'Title' and 'Deck'.
    """
    lookup_tables = {
        'Title': title_mapping,
        'Deck': deck_mapping
    }

    for source_column, lookup in lookup_tables.items():
        # dict.get supplies the 0 fallback for values missing from the mapping.
        encoded_values = data[source_column].map(lambda raw_value: lookup.get(raw_value, 0))
        data[source_column + '_encoded'] = encoded_values
        data.drop(columns=source_column, inplace=True)

    return data

In [63]:
# Swap 'Title' and 'Deck' for their integer-coded '_encoded' columns.
df = ordinal_encoding(df)

# Processing Continuous Variables

In [64]:
def log_transform_fare(data):
    """
    Replace 'Fare' with its log(1 + x) transform, stored as 'Fare_log'.

    The input frame is modified in place: 'Fare' is removed and 'Fare_log'
    is appended as the last column.

    Parameters:
    data (DataFrame): Titanic dataset containing a 'Fare' column.

    Returns:
    DataFrame: The same object that was passed in, with 'Fare_log' and without 'Fare'.
    """
    # pop removes the column from the frame and hands back its values.
    raw_fare = data.pop('Fare')
    data['Fare_log'] = np.log1p(raw_fare)
    return data

In [65]:
# Replace 'Fare' with 'Fare_log' = log1p(Fare).
df = log_transform_fare(df)

# Scale Age and Fare_log

In [68]:
def scale_continuous_features(data, scaler=None, scaler_path='../models/scaler.pkl'):
    """
    Scale the 'Age' and 'Fare_log' columns with a pre-fitted scaler.

    The input frame is modified in place. The function is not idempotent:
    calling it twice on the same frame applies the scaling twice.

    Parameters:
    data : pandas.DataFrame
        Input data containing the 'Age' and 'Fare_log' columns.
    scaler : object, optional
        An already-loaded, fitted scaler exposing ``transform``. When given,
        nothing is read from disk. Defaults to None.
    scaler_path : str, optional
        Location of the joblib-serialized scaler, used only when ``scaler``
        is None. Defaults to '../models/scaler.pkl'.

    Returns:
    pandas.DataFrame
        The same object that was passed in, with scaled 'Age' and 'Fare_log'.
    """
    columns_to_scale = ['Age', 'Fare_log']

    if scaler is None:
        # SECURITY: joblib.load unpickles the file, which can execute arbitrary
        # code - only load scaler artifacts from a trusted location.
        scaler = joblib.load(scaler_path)

    # NOTE(review): assumes the scaler was fitted on exactly these two columns
    # in this order - confirm against the training notebook.
    data[columns_to_scale] = scaler.transform(data[columns_to_scale])

    return data

In [69]:
# Scale 'Age' and 'Fare_log' with the saved scaler.
# Run this cell only once per preprocessing pass: a second run would scale the
# already-scaled values again.
df = scale_continuous_features(df)

# Preprocess function

# Forecast

In [70]:
# Inspect the column set that will be handed to the prediction step.
df.columns

Index(['PassengerId', 'Pclass', 'Age', 'FamilySize', 'Sex_male',
       'Title_encoded', 'Deck_encoded', 'Fare_log'],
      dtype='object')

In [71]:
def generate_predictions(data):
    """
    Score `data` with the saved Random Forest model and return 0/1 labels.

    NOTE: a later cell defines another function with this same name; once that
    cell has run, this version is shadowed and no longer reachable.

    The whole frame is passed to the model as-is, so it must contain exactly
    the columns the model expects.

    Parameters:
    data : pandas.DataFrame
        Feature frame passed directly to ``predict_proba``.

    Returns:
    numpy.ndarray
        Integer array, 1 where the column-1 probability is at or above the
        cutoff and 0 otherwise.
    """
    # Get from config
    cutoff = 0.4012526976185753

    # SECURITY: pickle.load can execute arbitrary code - only load model files
    # from a trusted location.
    with open('../models/titanic_RandomForestClassifier_full.pkl', 'rb') as model_file:
        classifier = pickle.load(model_file)

    # Column 1 of predict_proba (presumably the "survived" class - verify).
    positive_probability = classifier.predict_proba(data)[:, 1]

    # Apply the custom threshold instead of the default 0.5
    return (positive_probability >= cutoff).astype(int)

In [75]:
def generate_predictions(data, model=None, threshold=0.4012526976185753,
                         model_features=None,
                         model_path='../models/titanic_RandomForestClassifier_full.pkl'):
    """
    Generate thresholded predictions and return them with 'PassengerId' and the run date.

    The model features are selected from `data`, scored with ``predict_proba``,
    and the column-1 probability is compared against `threshold` to produce a
    0/1 prediction.

    In a production setting (e.g. results stored in S3 or a database table),
    the returned rows would need to be appended to the existing table rather
    than overwriting it; this function only builds the DataFrame.

    Parameters:
    -----------
    data : pandas.DataFrame
        Preprocessed frame containing 'PassengerId' and every model feature.
    model : object, optional
        An already-loaded classifier exposing ``predict_proba``. When given,
        nothing is read from disk. Defaults to None.
    threshold : float, optional
        Probability cutoff; rows with probability >= threshold are predicted
        as 1. Defaults to 0.4012526976185753.
    model_features : sequence of str, optional
        Column names passed to the model, in order. Defaults to
        ['Pclass', 'Age', 'FamilySize', 'Sex_male', 'Title_encoded',
        'Deck_encoded', 'Fare_log'].
    model_path : str, optional
        Location of the pickled model, used only when `model` is None.

    Returns:
    --------
    pandas.DataFrame
        A DataFrame with three columns:
        - 'PassengerId': The ID of each passenger.
        - 'Prediction': The 0/1 outcome based on the threshold.
        - 'RunDate': The date the pipeline was run ('YYYY-MM-DD').

    Raises:
    -------
    KeyError
        If 'PassengerId' or any model feature is missing from `data`.
    """
    if model_features is None:
        model_features = ['Pclass', 'Age', 'FamilySize', 'Sex_male',
                          'Title_encoded', 'Deck_encoded', 'Fare_log']
    else:
        model_features = list(model_features)

    # Fail early with one clear message instead of a pandas KeyError raised
    # after the model has already been loaded.
    missing_columns = [column for column in ['PassengerId', *model_features]
                       if column not in data.columns]
    if missing_columns:
        raise KeyError(f"generate_predictions: input is missing required column(s): {missing_columns}")

    passenger_ids = data['PassengerId']

    if model is None:
        # SECURITY: pickle.load can execute arbitrary code - only load model
        # files from a trusted location.
        with open(model_path, 'rb') as file:
            model = pickle.load(file)

    # Column 1 of predict_proba (presumably the "survived" class - verify).
    y_pred_prob = model.predict_proba(data[model_features])[:, 1]

    # Apply the custom threshold
    y_pred_custom = (y_pred_prob >= threshold).astype(int)

    # Date on which the pipeline is run
    current_date = datetime.now().strftime('%Y-%m-%d')

    # The output keeps the index of `data` via the PassengerId Series
    output_df = pd.DataFrame({
        'PassengerId': passenger_ids,
        'Prediction': y_pred_custom,
        'RunDate': current_date
    })

    return output_df


In [76]:
# Score the preprocessed frame. generate_predictions selects the model features
# itself and uses 'PassengerId' only to label the output, so nothing needs to
# be dropped here.
# NOTE(review): every row visible in the saved output is predicted as 1 - worth
# a sanity check of the preprocessing and threshold.
output = generate_predictions(df)
output

Unnamed: 0,PassengerId,Prediction,RunDate
0,892,1,2024-08-26
1,893,1,2024-08-26
2,894,1,2024-08-26
3,895,1,2024-08-26
4,896,1,2024-08-26
...,...,...,...
413,1305,1,2024-08-26
414,1306,1,2024-08-26
415,1307,1,2024-08-26
416,1308,1,2024-08-26
