In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from lightgbm import LGBMClassifier
import os
import pickle
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier

# Procesamiento

In [17]:

# Preprocessing cell: loads the balanced training set and the raw dataset,
# rebuilds the engineered features on the raw data, splits the raw data into
# test/validation sets, and fits the scaling / one-hot preprocessor on the
# training features.

# Label encoder instance (not used further down in this cell)
le = LabelEncoder()

# Read the preprocessed and balanced data for model training
data = pd.read_csv('./data/silver/data.csv')
y_train = data['fraude']  # Target variable
X_train = data.drop('fraude', axis=1)  # Features only

# Read the raw data for testing and validation
# NOTE(review): if the balanced training set was derived from this same raw
# file, the test rows overlap with the training rows — confirm.
data_raw = pd.read_csv('./data/bronze/dataset.csv')
y_raw = data_raw['fraude']  # Target variable from the raw data
X_raw = data_raw.drop('fraude', axis=1)  # Features only

# Columns treated as categorical ('fraude' is the target and is skipped below)
categorical_columns = ['a', 'd', 'score', 'fraude', 'g', 'n', 'o', 'p']

# Continuous numerical columns: numeric columns of the training data that are
# not listed as categorical at this point (the date features appended to
# `categorical_columns` further down are therefore NOT excluded here).
continuous_numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns.difference(categorical_columns)

# Encode categorical feature columns as integer category codes
# (missing values become -1 with `.cat.codes`).
# NOTE(review): codes are derived from the raw data alone; presumably they
# match the encoding used to build the training set — verify.
for column in [col for col in categorical_columns if col not in ['fraude']]:
    X_raw[column] = X_raw[column].astype('category').cat.codes

# Generate date-related features for the raw data as was done for the training data
X_raw['fecha'] = pd.to_datetime(X_raw['fecha'])

# Feature 1: Day of the week (0 = Monday ... 6 = Sunday)
X_raw['day_of_week'] = X_raw['fecha'].dt.dayofweek
categorical_columns.append('day_of_week')

# Feature 2: Part of the day. Bins are right-closed (pandas default) with the
# lowest edge included, so the actual mapping is:
#   hours 0-6 -> 1, hours 7-12 -> 2, hours 13-18 -> 3, hours 19-23 -> 4.
# NOTE(review): hour 6/12/18 fall in the earlier bucket; kept unchanged so the
# feature stays consistent with the training data — confirm that is intended.
X_raw['part_of_day'] = pd.cut(X_raw['fecha'].dt.hour,
                              bins=[0, 6, 12, 18, 24],
                              include_lowest=True,
                              labels=[1, 2, 3, 4])
categorical_columns.append('part_of_day')

# Feature 3: ISO week of the year
X_raw['week_of_year'] = X_raw['fecha'].dt.isocalendar().week
categorical_columns.append('week_of_year')

# Feature 4: Weekend flag (1 for Saturday/Sunday, 0 otherwise)
X_raw['is_weekend'] = (X_raw['fecha'].dt.dayofweek >= 5).astype(int)
categorical_columns.append('is_weekend')

# The original timestamp is no longer needed once the features are derived
X_raw.drop('fecha', axis=1, inplace=True)

# Fill missing values in numerical columns with the mean of each column.
# This stays best-effort (columns that cannot be imputed are skipped), but the
# skipped columns are reported instead of being silently swallowed.
# NOTE(review): the mean is computed before the test/validation split.
imputation_skipped = {}
for column in continuous_numerical_columns:
    if column not in X_raw.columns:
        imputation_skipped[column] = 'not present in raw data'
        continue
    try:
        X_raw[column] = X_raw[column].fillna(X_raw[column].mean())
    except (TypeError, ValueError) as exc:
        # e.g. a categorical/object column for which a mean is undefined
        imputation_skipped[column] = f'{type(exc).__name__}: {exc}'
if imputation_skipped:
    print(f'Mean imputation skipped for: {imputation_skipped}')

# Split raw data into testing (80%) and validation (20%) sets
X_test, X_val, y_test, y_val = train_test_split(X_raw, y_raw, test_size=0.2, random_state=42)


def _numeric_columns_to_float(frame):
    """Return a copy of ``frame`` with every integer/float column cast to float."""
    converted = frame.copy()  # copy so the split result is never modified through a view
    for col in converted.columns:
        if pd.api.types.is_integer_dtype(converted[col]) or pd.api.types.is_float_dtype(converted[col]):
            converted[col] = converted[col].astype(float)
    return converted


# Ensure that all numerical data is of float type for consistency
X_train = _numeric_columns_to_float(X_train)
X_test = _numeric_columns_to_float(X_test)
X_val = _numeric_columns_to_float(X_val)

# Define numerical and categorical columns after type conversion
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Preprocessing pipeline: standardize numeric features, one-hot encode object features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

# Fit the preprocessor on the training data only, then apply it to the test data
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)
y_train_encoded = y_train
y_test_encoded = y_test

# Modelado

In [18]:
# Registry of evaluation results, keyed by model display name
model_results_encoded = dict()

## Regresión Logística

In [19]:
# Logistic regression: load the cached model if it exists, otherwise run a
# grid search, keep the best estimator and cache it on disk.
pickle_file_logit = './models/logreg_model.pkl'

if os.path.isfile(pickle_file_logit):
    # NOTE: unpickling can execute arbitrary code — only load files you created yourself.
    with open(pickle_file_logit, 'rb') as file:
        best_logit_model = pickle.load(file)
else:
    # Grid search parameters for logistic regression
    param_grid_logit = {
        'C': [0.1, 1.0, 10.0],
        'solver': ['liblinear', 'saga'],
        'penalty': ['l1', 'l2']
    }

    # Both solvers are stochastic, so the seed is fixed for reproducible fits
    grid_search_logit = GridSearchCV(
        estimator=LogisticRegression(max_iter=1000, random_state=42),
        param_grid=param_grid_logit,
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=3
    )

    # Execute grid search
    grid_search_logit.fit(X_train_encoded, y_train_encoded)

    # Keep the best logistic regression model
    best_logit_model = grid_search_logit.best_estimator_

    # Make sure the target directory exists so the training effort is not
    # lost on a missing './models' folder, then cache the model.
    os.makedirs(os.path.dirname(pickle_file_logit), exist_ok=True)
    with open(pickle_file_logit, 'wb') as file:
        pickle.dump(best_logit_model, file)

# Make predictions and evaluate the logistic regression model
y_pred_logit = best_logit_model.predict(X_test_encoded)
model_results_encoded['Logistic Regression'] = {
    'Classification Report': classification_report(y_test_encoded, y_pred_logit),
    # ROC AUC uses the predicted probability of the positive class (column 1)
    'ROC AUC Score': roc_auc_score(y_test_encoded, best_logit_model.predict_proba(X_test_encoded)[:, 1])
}


## Random Forest

In [20]:
# Random forest: load the cached model if it exists, otherwise run a grid
# search, keep the best estimator and cache it on disk.
pickle_file_rf = './models/random_forest_model.pkl'

if os.path.isfile(pickle_file_rf):
    # NOTE: unpickling can execute arbitrary code — only load files you created yourself.
    with open(pickle_file_rf, 'rb') as file:
        best_rf_model = pickle.load(file)
else:
    # Grid search parameters for random forest (3^4 = 81 candidates)
    param_grid_rf = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Initialize grid search for random forest
    grid_search_rf = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=param_grid_rf,
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=3
    )

    # Execute grid search
    grid_search_rf.fit(X_train_encoded, y_train_encoded)

    # Keep the best random forest model
    best_rf_model = grid_search_rf.best_estimator_

    # Make sure the target directory exists so the training effort is not
    # lost on a missing './models' folder, then cache the model.
    os.makedirs(os.path.dirname(pickle_file_rf), exist_ok=True)
    with open(pickle_file_rf, 'wb') as file:
        pickle.dump(best_rf_model, file)

# Make predictions and evaluate the random forest model
y_pred_rf = best_rf_model.predict(X_test_encoded)
model_results_encoded['Random Forest'] = {
    'Classification Report': classification_report(y_test_encoded, y_pred_rf),
    # ROC AUC uses the predicted probability of the positive class (column 1)
    'ROC AUC Score': roc_auc_score(y_test_encoded, best_rf_model.predict_proba(X_test_encoded)[:, 1])
}


## SVM

In [21]:
# SVM: load the cached model if it exists, otherwise train it with fixed
# hyperparameters and cache it on disk.
pickle_file_svm = './models/svm_model.pkl'

if os.path.isfile(pickle_file_svm):
    # NOTE: unpickling can execute arbitrary code — only load files you created yourself.
    with open(pickle_file_svm, 'rb') as file:
        svm_model = pickle.load(file)
else:
    # Linear-kernel SVM; probability=True is required for predict_proba below
    svm_model = SVC(
        kernel='linear',
        C=0.1,
        cache_size=700,
        tol=1e-3,
        probability=True,
        random_state=42
    )

    # Train the SVM
    svm_model.fit(X_train_encoded, y_train_encoded)

    # Make sure the target directory exists so the training effort is not
    # lost on a missing './models' folder, then cache the model.
    os.makedirs(os.path.dirname(pickle_file_svm), exist_ok=True)
    with open(pickle_file_svm, 'wb') as file:
        pickle.dump(svm_model, file)

# Make predictions and evaluate the SVM model
y_pred_svm = svm_model.predict(X_test_encoded)
model_results_encoded['Support Vector Machine'] = {
    'Classification Report': classification_report(y_test_encoded, y_pred_svm),
    # ROC AUC uses the predicted probability of the positive class (column 1)
    'ROC AUC Score': roc_auc_score(y_test_encoded, svm_model.predict_proba(X_test_encoded)[:, 1])
}


## Artificial Neural Networks

In [22]:
# ANN (multi-layer perceptron): load the cached model if it exists, otherwise
# train it with fixed hyperparameters and cache it on disk.
pickle_file_ann = './models/ann_model.pkl'

if os.path.isfile(pickle_file_ann):
    # NOTE: unpickling can execute arbitrary code — only load files you created yourself.
    with open(pickle_file_ann, 'rb') as file:
        ann_model = pickle.load(file)
else:
    # Initialize the ANN with specified hyperparameters (seeded for reproducibility)
    ann_model = MLPClassifier(alpha=0.01, random_state=42, max_iter=1000)

    # Train the ANN
    ann_model.fit(X_train_encoded, y_train_encoded)

    # Make sure the target directory exists so the training effort is not
    # lost on a missing './models' folder, then cache the model.
    os.makedirs(os.path.dirname(pickle_file_ann), exist_ok=True)
    with open(pickle_file_ann, 'wb') as file:
        pickle.dump(ann_model, file)

# Make predictions and evaluate the ANN model
y_pred_ann = ann_model.predict(X_test_encoded)
model_results_encoded['Artificial Neural Network'] = {
    'Classification Report': classification_report(y_test_encoded, y_pred_ann),
    # ROC AUC uses the predicted probability of the positive class (column 1)
    'ROC AUC Score': roc_auc_score(y_test_encoded, ann_model.predict_proba(X_test_encoded)[:, 1])
}

## LGBM

In [23]:
# LightGBM: load the cached model if it exists, otherwise run a grid search,
# keep the best estimator and cache it on disk.
pickle_file_lgbm = './models/lgbm_model.pkl'
max_depth = 7  # only used to derive the single `num_leaves` value below

# Grid search parameters for LightGBM (3*3*3*1*3*3 = 243 candidates).
# NOTE(review): `num_leaves` is fixed at 2**7 - 1 = 127 while `max_depth` is
# also searched over 6..8; a depth-6 tree cannot reach 127 leaves — confirm
# this combination is intended.
param_grid_lgbm = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'min_data_in_leaf': [10, 15, 20],
    'num_leaves': [2**i - 1 for i in range(max_depth, max_depth+1)],
    'reg_lambda': [0.01, 0.1, 1],
    'max_depth': [6, 7, 8],
}

if os.path.isfile(pickle_file_lgbm):
    # NOTE: unpickling can execute arbitrary code — only load files you created yourself.
    with open(pickle_file_lgbm, 'rb') as file:
        best_lgbm_model = pickle.load(file)
else:
    # Initialize LightGBM with forced row-wise computation
    lgbm_classifier = LGBMClassifier(force_row_wise=True)

    # Initialize grid search for LightGBM
    grid_search_lgbm = GridSearchCV(
        estimator=lgbm_classifier,
        param_grid=param_grid_lgbm,
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=3
    )

    # Execute grid search
    grid_search_lgbm.fit(X_train_encoded, y_train_encoded)

    # Keep the best LightGBM model
    best_lgbm_model = grid_search_lgbm.best_estimator_

    # Make sure the target directory exists so the training effort is not
    # lost on a missing './models' folder, then cache the model.
    os.makedirs(os.path.dirname(pickle_file_lgbm), exist_ok=True)
    with open(pickle_file_lgbm, 'wb') as file:
        pickle.dump(best_lgbm_model, file)

# Make predictions and evaluate the LightGBM model
y_pred_lgbm = best_lgbm_model.predict(X_test_encoded)
model_results_encoded['LightGBM'] = {
    'Classification Report': classification_report(y_test_encoded, y_pred_lgbm),
    # ROC AUC uses the predicted probability of the positive class (column 1)
    'ROC AUC Score': roc_auc_score(y_test_encoded, best_lgbm_model.predict_proba(X_test_encoded)[:, 1])
}


Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[LightGBM] [Info] Number of positive: 76786, number of negative: 76786
[LightGBM] [Info] Total Bins 1800
[LightGBM] [Info] Number of data points in the train set: 153572, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


## Results

In [25]:
# Report the stored evaluation of every model: the text classification
# report followed by its ROC AUC score.
for name, summary in model_results_encoded.items():
    report_text = summary['Classification Report']
    auc_value = summary['ROC AUC Score']
    print(f"Model: {name}")
    print("Classification Report:")
    print(report_text)
    print(f"ROC AUC Score: {auc_value}\n")

Modelo: Logistic Regression
Informe de clasificación:
              precision    recall  f1-score   support

           0       0.98      0.76      0.86    114024
           1       0.13      0.68      0.22      5976

    accuracy                           0.76    120000
   macro avg       0.55      0.72      0.54    120000
weighted avg       0.94      0.76      0.82    120000

ROC AUC Score: 0.7848785662775519

Modelo: Random Forest
Informe de clasificación:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95    114024
           1       0.30      0.60      0.40      5976

    accuracy                           0.91    120000
   macro avg       0.64      0.76      0.68    120000
weighted avg       0.94      0.91      0.92    120000

ROC AUC Score: 0.8705662018146723

Modelo: Support Vector Machine
Informe de clasificación:
              precision    recall  f1-score   support

           0       0.98      0.77      0.86    114024
        

## Ensemble
La precisión y el recall se vuelven un problema en nuestro caso: algunos modelos tienen buen recall pero mala precisión, y otros tienen una mejor precisión pero un recall terrible. Teniendo esto en cuenta, podríamos llegar a mejorar los resultados en general con ayuda de un modelo de ensamble (ensemble), que nos permita tener varios modelos funcionando a la vez y tomar decisiones a partir de sus predicciones combinadas. Esto nos podría ayudar a suavizar la diferencia entre la precisión y el recall que tenemos en este momento.

In [31]:
# Soft-voting ensemble of logistic regression, random forest and LightGBM.
# (name, model) tuples for the models to be included in the ensemble
estimators = [
    ('logistic', best_logit_model),
    ('random_forest', best_rf_model),
    ('lightgbm', best_lgbm_model)
]

# 'soft' voting averages the predicted class probabilities of the members
ensemble = VotingClassifier(estimators=estimators, voting='soft')

# Fit the ensemble on the training data (VotingClassifier fits clones of the
# given estimators, so the members are trained again here)
ensemble.fit(X_train_encoded, y_train_encoded)

# Predict labels and positive-class probabilities once on the test data
ensemble_predictions = ensemble.predict(X_test_encoded)
ensemble_probabilities = ensemble.predict_proba(X_test_encoded)[:, 1]

# Save the trained ensemble; create the target directory first so a missing
# './models' folder does not discard the fitted model
ensemble_pres_path = './models/ensemble_model_pres.pkl'
os.makedirs(os.path.dirname(ensemble_pres_path), exist_ok=True)
with open(ensemble_pres_path, 'wb') as file:
    pickle.dump(ensemble, file)

# Performance metrics for the ensemble model
accuracy = accuracy_score(y_test_encoded, ensemble_predictions)
precision = precision_score(y_test_encoded, ensemble_predictions)
recall = recall_score(y_test_encoded, ensemble_predictions)
f1 = f1_score(y_test_encoded, ensemble_predictions)
roc_auc = roc_auc_score(y_test_encoded, ensemble_probabilities)

# `ensemble.score` is the mean accuracy of `predict`, i.e. the same value as
# `accuracy`; reuse it instead of predicting on the whole test set again
ensemble_score = accuracy

# Print out the performance metrics for the ensemble model
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1:.4f}')
print(f'ROC AUC Score: {roc_auc:.4f}')


Accuracy: 0.9036
Precision: 0.2854
Recall: 0.6223
F1-score: 0.3914
ROC AUC Score: 0.8728


# Análisis Final
Teniendo en cuenta la relación entre la precisión y el recall, nuestro modelo está generando muchos falsos positivos: con una precisión de 0.29, aproximadamente el 71% de las transacciones que predice como fraude son falsos positivos. Aun así, el recall nos dice que, de todas las transacciones marcadas como fraudulentas en el dataset, el modelo logró identificar el 62%. Esto podría mejorarse después de hacer un proceso de fine-tuning adecuado y, a pesar de tener un ROC AUC de 87%, el modelo todavía tiene mucho margen de mejora, en especial con el recall.

In [33]:
# Soft-voting ensemble of logistic regression, random forest, LightGBM and ANN.
# (name, model) tuples for the models to be included in the ensemble
estimators = [
    ('logistic', best_logit_model),
    ('random_forest', best_rf_model),
    ('lightgbm', best_lgbm_model),
    ('ann', ann_model)
]

# 'soft' voting averages the predicted class probabilities of the members
ensemble = VotingClassifier(estimators=estimators, voting='soft')

# Fit the ensemble on the training data (VotingClassifier fits clones of the
# given estimators, so the members are trained again here)
ensemble.fit(X_train_encoded, y_train_encoded)

# Predict labels and positive-class probabilities once on the test data
ensemble_predictions = ensemble.predict(X_test_encoded)
ensemble_probabilities = ensemble.predict_proba(X_test_encoded)[:, 1]

# Save the trained ensemble; create the target directory first so a missing
# './models' folder does not discard the fitted model
ensemble_bal_path = './models/ensemble_model_bal.pkl'
os.makedirs(os.path.dirname(ensemble_bal_path), exist_ok=True)
with open(ensemble_bal_path, 'wb') as file:
    pickle.dump(ensemble, file)

# Performance metrics for the ensemble model
accuracy = accuracy_score(y_test_encoded, ensemble_predictions)  # Overall accuracy
precision = precision_score(y_test_encoded, ensemble_predictions)  # Precision of positive predictions
recall = recall_score(y_test_encoded, ensemble_predictions)  # Recall (true positive rate)
f1 = f1_score(y_test_encoded, ensemble_predictions)  # Harmonic mean of precision and recall
roc_auc = roc_auc_score(y_test_encoded, ensemble_probabilities)  # Uses positive-class probabilities

# `ensemble.score` is the mean accuracy of `predict`, i.e. the same value as
# `accuracy`; reuse it instead of predicting on the whole test set again
ensemble_score = accuracy

# Print out the calculated performance metrics, rounded to 4 decimal places
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1:.4f}')
print(f'ROC AUC Score: {roc_auc:.4f}')


Accuracy: 0.9421
Precision: 0.4182
Recall: 0.4145
F1-score: 0.4163
ROC AUC Score: 0.8730
