In [1]:
# first draft of modeling workflow

# Import Libraries

In [2]:
import os
import sys
from dotenv import load_dotenv

import pandas as pd
import numpy as np
import random 

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [4]:
# get variables from .env file
load_dotenv()
project_path = os.getenv('PROJECT_PATH')
if project_path is None:
    # fail early with a readable message instead of a TypeError on the path join below
    raise EnvironmentError("PROJECT_PATH is not set; define it in the .env file")

# add path to load own functions from .py files in Scripts folder
# os.path.join picks the right separator for the current OS and avoids the
# invalid '\S' escape sequence of the former '\Scripts' literal
sys.path.insert(0, os.path.join(project_path, 'Scripts'))

# NOTE(review): star imports hide which helper names this notebook actually uses;
# consider importing the needed functions explicitly.
from plotting import *
from preprocessing import *

# Import data

In [5]:
# import from df (just to check)
# load the modeling table from CSV; the path is relative to the notebook's working directory
df = pd.read_csv('../data/files/df_model.csv')
# preview the first rows
df.head()

Unnamed: 0,client_id,target,difference_acc_dur,elec_1_mon_1_mean,elec_1_mon_2_mean,elec_1_mon_3_mean,elec_1_mon_4_mean,elec_1_mon_5_mean,elec_1_mon_6_mean,elec_1_mon_7_mean,...,risk_client_category,risk_acc_creation_year,risk_acc_creation_weekday,risk_elec_tarif_type_mode,risk_elec_tarif_type_count,gas_tarif_type_mode,energy_types,risk_counter_status_mode,risk_counter_code_count,risk_counter_number_count
0,0,0,4901,296.0,120.333333,436.428571,,,201.0,361.222222,...,1,1,0,1,1,,1,1,1,0
1,1,0,4913,408.5,549.0,771.666667,713.0,534.285714,514.5,465.5,...,1,2,1,1,1,,1,1,0,0
2,10,0,4921,,579.0,815.0,,974.75,920.333333,,...,1,1,1,1,1,,1,1,1,0
3,100,0,2664,0.0,2.0,0.0,,0.0,0.0,,...,1,1,1,1,1,,1,1,0,0
4,1000,0,1585,,541.0,,736.0,,644.0,,...,1,1,1,1,1,,1,1,0,0


In [6]:
# check dtypes and non-null counts of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135493 entries, 0 to 135492
Data columns (total 89 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   client_id                   135493 non-null  int64  
 1   target                      135493 non-null  int64  
 2   difference_acc_dur          135493 non-null  int64  
 3   elec_1_mon_1_mean           64923 non-null   float64
 4   elec_1_mon_2_mean           70163 non-null   float64
 5   elec_1_mon_3_mean           71835 non-null   float64
 6   elec_1_mon_4_mean           72023 non-null   float64
 7   elec_1_mon_5_mean           73860 non-null   float64
 8   elec_1_mon_6_mean           70891 non-null   float64
 9   elec_1_mon_7_mean           72693 non-null   float64
 10  elec_1_mon_8_mean           69999 non-null   float64
 11  elec_1_mon_9_mean           65520 non-null   float64
 12  elec_1_mon_10_mean          68189 non-null   float64
 13  elec_1_mon_11_

In [7]:
# Columns treated as categorical features; every other column except
# client_id and target is handled as numerical further down.
CAT_FEATURES = [
    # client / account descriptors
    'risk_region',
    'risk_district',
    'risk_client_category',
    'risk_acc_creation_year',
    'risk_acc_creation_weekday',
    # tariff and energy type descriptors
    'risk_elec_tarif_type_mode',
    'risk_elec_tarif_type_count',
    'gas_tarif_type_mode',
    'energy_types',
    # counter descriptors
    'risk_counter_status_mode',
    'risk_counter_code_count',
    'risk_counter_number_count',
]

In [8]:
# Numerical features = every column that is neither a categorical feature
# nor one of the two bookkeeping columns (client_id, target).
# Column order of df is preserved.
NUM_FEATURES = df.columns[
    ~df.columns.isin([*CAT_FEATURES, 'client_id', 'target'])
]
print(NUM_FEATURES)

Index(['difference_acc_dur', 'elec_1_mon_1_mean', 'elec_1_mon_2_mean',
       'elec_1_mon_3_mean', 'elec_1_mon_4_mean', 'elec_1_mon_5_mean',
       'elec_1_mon_6_mean', 'elec_1_mon_7_mean', 'elec_1_mon_8_mean',
       'elec_1_mon_9_mean', 'elec_1_mon_10_mean', 'elec_1_mon_11_mean',
       'elec_1_mon_12_mean', 'elec_2_mon_1_mean', 'elec_2_mon_2_mean',
       'elec_2_mon_3_mean', 'elec_2_mon_4_mean', 'elec_2_mon_5_mean',
       'elec_2_mon_6_mean', 'elec_2_mon_7_mean', 'elec_2_mon_8_mean',
       'elec_2_mon_9_mean', 'elec_2_mon_10_mean', 'elec_2_mon_11_mean',
       'elec_2_mon_12_mean', 'elec_3_mon_1_mean', 'elec_3_mon_2_mean',
       'elec_3_mon_3_mean', 'elec_3_mon_4_mean', 'elec_3_mon_5_mean',
       'elec_3_mon_6_mean', 'elec_3_mon_7_mean', 'elec_3_mon_8_mean',
       'elec_3_mon_9_mean', 'elec_3_mon_10_mean', 'elec_3_mon_11_mean',
       'elec_3_mon_12_mean', 'elec_4_mon_1_mean', 'elec_4_mon_2_mean',
       'elec_4_mon_3_mean', 'elec_4_mon_4_mean', 'elec_4_mon_5_mean',
       'el

# Baseline models
* a simple model that can be used as comparison to the more complex models

In [9]:
# distinct values of the risk_region column
df.risk_region.unique()

array([0, 1, 2], dtype=int64)

In [10]:
# define some baseline models

def model_prediction_never_fraud(data):
    """Baseline model that labels every sample as 'genuine' (0).

    Args:
        data: Any object exposing ``shape``; only ``shape[0]`` (the number
            of samples) is read.

    Returns:
        numpy.ndarray: 1-D float array of zeros, one entry per sample.
    """
    n_samples = data.shape[0]
    return np.zeros(n_samples)

def model_prediction_always_fraud(data):
    """Baseline model that labels every sample as 'fraud' (1).

    Args:
        data: Any object exposing ``shape``; only ``shape[0]`` (the number
            of samples) is read.

    Returns:
        numpy.ndarray: 1-D float array of ones, one entry per sample.
    """
    n_samples = data.shape[0]
    return np.ones(n_samples)

def model_prediction_random_fraud(data, seed=None):
    """Baseline model. Randomly predict 'fraud' = 1 or 'genuine' = 0.

    Each label is drawn independently with equal probability.

    Args:
        data: Any object exposing ``shape``; only ``shape[0]`` (the number
            of samples) is read.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            generator is used so the same seed always yields the same
            predictions. When ``None`` (default) the module-level ``random``
            generator is used, exactly as before.

    Returns:
        list: One int (0 or 1) per sample.
    """
    # a private generator keeps seeded calls from disturbing the global RNG state
    rng = random if seed is None else random.Random(seed)
    return [rng.randint(0, 1) for _ in range(data.shape[0])]

def model_predictions_risky_region(data, risk_values)->list:
    """Baseline model. Predict fraud (1) for clients whose region value is
    considered risky, otherwise genuine (0).

    Args:
        data: Iterable of per-client region values (e.g. the ``risk_region``
            column).
        risk_values: The region value that counts as risky, or a
            list/tuple/set/frozenset of such values.

    Returns:
        list: One int per client, 1 if the client's region is risky else 0.
    """
    # a collection of risky values is matched by membership, a single value by equality
    if isinstance(risk_values, (list, tuple, set, frozenset)):
        return [1 if client_risk in risk_values else 0 for client_risk in data]
    return [1 if client_risk == risk_values else 0 for client_risk in data]

In [11]:
# define data: one feature column (risk_region) and the label column (target)
X = df['risk_region']
y = df['target']

# Split into train and test set (20% test, stratified on the label, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# report the shape of each split, one per line
print(
    '\n'.join(
        f'{name} shape: {part.shape}'
        for name, part in (
            ('X_train', X_train),
            ('X_test', X_test),
            ('y_train', y_train),
            ('y_test', y_test),
        )
    )
)

X_train shape: (108394,)
X_test shape: (27099,)
y_train shape: (108394,)
y_test shape: (27099,)


In [13]:
# make predictions
# Seed Python's global RNG first: the random baseline draws from it, and
# without a fixed seed the metrics below change on every run.
random.seed(42)
train_prediction = model_prediction_random_fraud(X_train)
test_prediction = model_prediction_random_fraud(X_test)

# Evaluation

In [14]:
def model_evaluation(y_test, y_pred):
    """Print a quick evaluation summary and return the macro-averaged F1.

    Prints the confusion matrix, the text classification report and the
    macro-average F1 score (in percent, rounded for display).

    Args:
        y_test: True labels of the evaluated split.
        y_pred: Predicted labels for the same samples.

    Returns:
        The macro-average F1 score multiplied by 100 (not rounded).
    """
    # quick and dirty evaluation output for now
    separator = "-" * 60

    print(separator)
    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix: \n", matrix)

    print(separator)
    text_report = classification_report(y_test, y_pred)
    print("Classification Report: \n", text_report)

    print(separator)
    # second call with output_dict=True to read the macro-average F1 as a number
    report = classification_report(y_test, y_pred, output_dict=True)
    macro_f1_percent = report['macro avg']['f1-score'] * 100
    print(f"F1_Score: {round(macro_f1_percent, 0)}")
    return macro_f1_percent

In [15]:
# evaluate the training-split predictions; model_evaluation prints the report
# and returns the macro-average F1 in percent
train_evaluation  = model_evaluation(y_train, train_prediction)

------------------------------------------------------------
Confusion Matrix: 
 [[51264 51077]
 [ 3024  3029]]
------------------------------------------------------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.50      0.65    102341
           1       0.06      0.50      0.10      6053

    accuracy                           0.50    108394
   macro avg       0.50      0.50      0.38    108394
weighted avg       0.89      0.50      0.62    108394

------------------------------------------------------------
F1_Score: 38.0


In [16]:
# evaluate the test-split predictions; model_evaluation prints the report
# and returns the macro-average F1 in percent
test_evaluation  = model_evaluation(y_test, test_prediction)

------------------------------------------------------------
Confusion Matrix: 
 [[12779 12807]
 [  745   768]]
------------------------------------------------------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.50      0.65     25586
           1       0.06      0.51      0.10      1513

    accuracy                           0.50     27099
   macro avg       0.50      0.50      0.38     27099
weighted avg       0.90      0.50      0.62     27099

------------------------------------------------------------
F1_Score: 38.0


Model Evaluation - Choice and interpretation of classification measures 

*   Fraud detection is a binary Classification task: fraud vs genuine.
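*   Fraud cases occur much less frequently (minority class) than genuine cases (majority class).
*   In imbalanced data sets accuracy is not a reliable evaluation metric: 
    e.g. always predicting 'genuine' would already reach about 94% accuracy on the test set (25586 of 27099 clients) while detecting no fraud at all.
    (demonstrate why using the baseline models)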

Other metrics:

---
Precision: 
*   Correctness = Quality of positive predictions 
*   TP/(TP+FP)
*   Proportion of fraud detections that were actually correct
*   does not account for the correct detection of the negative class (no fraud)

Recall:
*   Sensitivity = the models ability to detect positive events correctly 
*   TP/(TP+FN)
*   Proportion of actual fraud cases that could be correctly identified
*   does not take into account FP (false alarms)

Precision-recall tradeoff:
*   ideally both Precision and Recall of a model are high.
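*   However, raising one usually lowers the other: flagging more clients as fraud catches more actual fraud cases (higher recall) but also produces more false alarms (lower precision).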

F1_score: 
*   harmonic mean of precision and recall scores (both scores contribute equally)
*   indicates reliability of the model 
*   preferred metric when both recall and precision need to be optimized (and esp in imbalanced data sets) 

Macro average:
*    mean of individual class scores (0: no fraud and 1: fraud)
*    the class we want to detect (fraud) occurs less frequently but is important, so the macro average (which weights both classes equally) is the best metric for overall model evaluation





In [17]:
# TODO: Compare all baseline models
# TODO: Discuss model performance

## Class imbalance
* can affect model performance when more features of the majority class are learned compared to the minority class (bias)
* solutions:
1) Under-sampling
2) Over-sampling 