In [1]:
import pandas as pd
from keras.models import load_model
import joblib

2024-09-03 16:41:45.870310: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def align_columns(df, model_columns):
    """
    Return a new DataFrame whose columns exactly match ``model_columns``.

    Columns listed in ``model_columns`` but absent from ``df`` are added and
    filled with 0, columns of ``df`` that are not listed are dropped, and the
    result follows the order of ``model_columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is left untouched: no columns are inserted into it.
    model_columns : iterable of column labels
        Column labels in the order the model expects. Any iterable is
        accepted (list, tuple, pandas.Index, ...).

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with columns ``list(model_columns)``.
    """
    # Materialise once so that pandas.Index objects and generators work too.
    expected_columns = list(model_columns)

    # reindex adds the missing columns, drops the extra ones and applies the
    # requested order in a single step, without mutating ``df``. The fill value
    # is only used for newly created columns; NaNs already present are kept.
    return df.reindex(columns=expected_columns, fill_value=0)


def make_predictions(file_path, model, scaler, model_columns, threshold=0.5):
    """
    Load a zipped CSV, align and scale its columns, and append binary predictions.

    Parameters
    ----------
    file_path : str or path-like
        Location of the zip-compressed CSV file to score.
    model : object
        Must expose ``predict(X)``. The result is compared against
        ``threshold`` and must support ``astype``/``ravel`` (a NumPy array).
        Assumes one score per input row, e.g. shape ``(n, 1)`` -- TODO confirm
        against the saved model's output layer.
    scaler : object
        Must expose ``transform(X)``; it is applied to the aligned features.
    model_columns : iterable of column labels
        Feature columns, in the order the model expects.
    threshold : float, optional
        Scores strictly greater than this value are labelled 1, all others 0.
        Defaults to 0.5.

    Returns
    -------
    pandas.DataFrame
        The aligned (unscaled) features plus a ``Predicted_Outcome`` column
        holding the 0/1 class as int32.
    """
    # Load the data from CSV
    raw_data = pd.read_csv(file_path, compression='zip')

    # Align the data columns with the model's expected columns
    features = align_columns(raw_data, model_columns)

    # Scale the features, then score them
    scaled_features = scaler.transform(features)
    scores = model.predict(scaled_features)

    # Convert scores to binary classes and flatten to 1-D so that each row
    # receives exactly one label, whether predict() returned (n,) or (n, 1).
    predicted_class = (scores > threshold).astype("int32").ravel()

    # assign() builds a new frame instead of writing into the aligned one,
    # which avoids SettingWithCopyWarning on a frame produced by column selection.
    return features.assign(Predicted_Outcome=predicted_class)


In [3]:
# Collect the model's feature names: every column of the processed dataset
# except 'outcome' (presumably the training label -- confirm with the training notebook).
preprocessed_df = pd.read_csv(
    '../data/processed/processed_data.csv.zip',
    compression='zip',
).drop(columns=['outcome'])
columns = list(preprocessed_df.columns)

In [4]:
# Inputs for the prediction run
file_path = '../data/interim/testing_data.csv.zip'  # zipped CSV to score
model_columns = columns  # feature names collected from the processed dataset

# Restore the saved artifacts from disk
model = load_model('../models/red_hat_model.keras')
# NOTE: joblib.load unpickles the file -- only load artifacts from a trusted source.
scaler = joblib.load('../models/standard_scaler.pkl')

# Score the test file; the result holds the aligned features plus 'Predicted_Outcome'
predicted_data = make_predictions(file_path, model, scaler, model_columns)



In [5]:
# Lift the column display limit for the rest of the session so the wide frame
# is rendered in full, then preview 20 random rows (unseeded: differs per run).
pd.options.display.max_columns = None
predicted_data.sample(n=20)

Unnamed: 0,char_1,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,activity_type_labeled,group_1_labeled,activity_category_type 2,activity_category_type 3,activity_category_type 4,activity_category_type 5,activity_category_type 6,activity_category_type 7,char_2_type 2,char_2_type 3,char_3_type 10,char_3_type 11,char_3_type 12,char_3_type 13,char_3_type 14,char_3_type 15,char_3_type 16,char_3_type 17,char_3_type 18,char_3_type 19,char_3_type 2,char_3_type 20,char_3_type 21,char_3_type 22,char_3_type 23,char_3_type 24,char_3_type 25,char_3_type 26,char_3_type 27,char_3_type 28,char_3_type 29,char_3_type 3,char_3_type 30,char_3_type 31,char_3_type 32,char_3_type 33,char_3_type 34,char_3_type 35,char_3_type 36,char_3_type 37,char_3_type 38,char_3_type 39,char_3_type 4,char_3_type 40,char_3_type 41,char_3_type 42,char_3_type 44,char_3_type 5,char_3_type 6,char_3_type 7,char_3_type 8,char_3_type 9,char_4_type 10,char_4_type 11,char_4_type 12,char_4_type 13,char_4_type 14,char_4_type 15,char_4_type 16,char_4_type 17,char_4_type 18,char_4_type 19,char_4_type 2,char_4_type 20,char_4_type 21,char_4_type 22,char_4_type 23,char_4_type 24,char_4_type 25,char_4_type 3,char_4_type 4,char_4_type 5,char_4_type 6,char_4_type 7,char_4_type 8,char_4_type 9,char_5_type 2,char_5_type 3,char_5_type 4,char_5_type 5,char_5_type 6,char_5_type 7,char_5_type 8,char_5_type 9,char_6_type 2,char_6_type 3,char_6_type 4,char_6_type 5,char_6_type 6,char_6_type 7,char_7_type 10,char_7_type 11,char_7_type 12,char_7_type 13,char_7_type 14,char_7_type 15,char_7_type 16,char_7_type 17,char_7_type 18,char_7_type 19,char_7_type 2,char_7_type 20,char_7_type 21,char_7_type 22,char_7_type 23,char_7_type 24,char_7_type 25,char_7_type 3,char_7_type 4,char_7_type 5,char_7_type 6,char_7_type 7,char_7_type 8,char_7_type 9,char_8_type 
2,char_8_type 3,char_8_type 4,char_8_type 5,char_8_type 6,char_8_type 7,char_8_type 8,char_9_type 2,char_9_type 3,char_9_type 4,char_9_type 5,char_9_type 6,char_9_type 7,char_9_type 8,char_9_type 9,activity_day_of_week,activity_month,activity_year,day_of_week,month,year,Predicted_Outcome
193301,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,1,0,1,0,1,1,89,2422,479,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3,1,2023,4,11,2022,1
417828,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,500,6240,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,4,4,2023,4,6,2022,0
439498,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,72,2685,10908,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,2,8,2022,5,8,2022,1
25889,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,87,0,10062,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,4,1,2023,1,6,2022,1
443701,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,83,0,11239,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,11,2022,3,11,2021,1
127708,1,1,0,1,1,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,1,0,32,0,2024,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,4,7,2023,2,9,2020,0
463299,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,76,0,274,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,3,10,2022,1,6,2022,1
476259,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,62,2318,7554,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,5,10,2022,4,10,2022,1
341400,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,77,0,5499,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,2,10,2022,1,3,2022,1
214122,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,70,0,9960,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,8,2023,0,8,2023,1


In [6]:
# Show the scored frame as a plain NumPy array; the last column is 'Predicted_Outcome'.
predicted_data.to_numpy()

array([[   1,    1,    1, ...,    7, 2022,    0],
       [   1,    1,    1, ...,    7, 2022,    0],
       [   1,    1,    1, ...,   10, 2022,    1],
       ...,
       [   1,    0,    0, ...,    3, 2022,    0],
       [   1,    0,    0, ...,    3, 2022,    0],
       [   1,    0,    0, ...,    3, 2022,    0]])