# Credit Card Fraud: Model With Principal Parameter.

## Previous Tasks

### Import Libraries

In [1]:
# Generic Libraries
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  RobustScaler
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML  

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import  f1_score, fbeta_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [2]:
# Define general path:
path_general = r'C:\TFM'
path_total = os.path.join(path_general,'01_total_models') 

In [3]:
# Model Libraries.

# Cross validation
from sklearn.model_selection import cross_val_score 

#------------- / Regresion Logistica /--------------
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression

#--------------/ XGBoost /--------------------------
from xgboost import XGBClassifier
import xgboost as xgb

#-------------/ AdaBoost /--------------------------
from sklearn.ensemble import AdaBoostClassifier

#-------------/ CatBoost /--------------------------
from catboost import CatBoostClassifier

#------------/ Decission Tree /----------------------
from sklearn.tree import DecisionTreeClassifier

#------------/ Random Forest /-----------------------
from sklearn.ensemble import RandomForestClassifier

#-----------/   MLP /--------------------------------
from sklearn.neural_network import MLPClassifier

#------------/ KNN /----------------------------------
from sklearn.neighbors import KNeighborsClassifier

#------------/ Naive - Bayes /-------------------------
from sklearn.naive_bayes import GaussianNB


### Load Dataset

In [4]:
# Load dataset.
df = pd.read_csv('creditcard.csv')
df = df.drop("Time", axis = 1)

y= df["Class"]
X = df.drop("Class", axis = 1)
y.shape,X.shape

((284807,), (284807, 29))

In [5]:
# Separation of the dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42,stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((227845, 29), (56962, 29), (227845,), (56962,))

In [6]:
# Check dataset composition

print(" Fraudulent Count for Full data :  ",np.sum(y))
print(" Fraudulent Count for Train data : ",np.sum(y_train))
print(" Fraudulent Count for Test data :   ",np.sum(y_test))

 Fraudulent Count for Full data :   492
 Fraudulent Count for Train data :  394
 Fraudulent Count for Test data :    98


In [7]:
# Save the testing set for evaluation
X_test_saved = X_test.copy()
y_test_saved = y_test.copy()
print("Saved X_test & y_test")

Saved X_test & y_test


In [8]:
# As PCA is already performed on the dataset from V1 to V28 features, we are scaling only Amount field
scaler = RobustScaler()

# Scaling the train data
X_train[["Amount"]] = scaler.fit_transform(X_train[["Amount"]])

# Transforming the test data
X_test[["Amount"]] = scaler.transform(X_test[["Amount"]])

## 1.- Transformaciones de datos.

## Dataset Original

### Smote

In [9]:
# Import of specific libraries
from collections import Counter
from imblearn.over_sampling import SMOTE

# Initial situation
print('Original dataset shape %s' % Counter(y_train))

# Calculate OverSampling model
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_train_smote))

Original dataset shape Counter({0: 227451, 1: 394})
Resampled dataset shape Counter({0: 227451, 1: 227451})


### Adasyn

In [10]:
# Import of specific libraries
from imblearn.over_sampling import ADASYN

# Initial situation
print('Original dataset shape %s' % Counter(y_train))

# Calculate OverSampling model
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_train_adasyn))

Original dataset shape Counter({0: 227451, 1: 394})
Resampled dataset shape Counter({1: 227458, 0: 227451})


## Power Transformation

### Original

In [11]:
# - Apply : preprocessing.PowerTransformer(copy=False) to fit & transform the train & test data

from sklearn import metrics 
from sklearn import preprocessing


from sklearn.preprocessing import PowerTransformer

pt= preprocessing.PowerTransformer(method='yeo-johnson', copy=True)  # creates an instance of the PowerTransformer class.
pt.fit(X_train)

X_train_pt = pt.transform(X_train)
X_test_pt = pt.transform(X_test)

y_train_pt = y_train
y_test_pt = y_test

### Smote

In [12]:
# Import of specific libraries
from collections import Counter
from imblearn.over_sampling import SMOTE

# Initial situation
print('Original dataset shape %s' % Counter(y_train_pt))

# Calculate OverSampling model
smote = SMOTE(random_state=42)
X_train_smote_pt, y_train_smote_pt = smote.fit_resample(X_train_pt, y_train_pt)

print('Resampled dataset shape %s' % Counter(y_train_smote_pt))

Original dataset shape Counter({0: 227451, 1: 394})
Resampled dataset shape Counter({0: 227451, 1: 227451})


### Adasyn

In [13]:
# Import of specific libraries
from imblearn.over_sampling import ADASYN

# Initial situation
print('Original dataset shape %s' % Counter(y_train))

# Calculate OverSampling model
adasyn = ADASYN(random_state=42)
X_train_adasyn_pt, y_train_adasyn_pt = adasyn.fit_resample(X_train_pt, y_train_pt)

print('Resampled dataset shape %s' % Counter(y_train_adasyn_pt))

Original dataset shape Counter({0: 227451, 1: 394})
Resampled dataset shape Counter({1: 227459, 0: 227451})


### Load Model: Libraries and Functions.

In [14]:
# LOAD OF MODELS.
# perfom cross validation on the X_train & y_train 
from sklearn.model_selection import StratifiedKFold

# Initialize StratifiedKFold cross-validator
# perform cross validation
skf = StratifiedKFold(n_splits=3, random_state=None, shuffle=False)
#  Shuffle is False because we need a constant best model when we use GridSearchCV


In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

### Create dataset_list

In [16]:
# Original distribution
OR_origin = ['OR origin',X_train, y_train, X_test, y_test]
OR_smote =['OR smote',X_train_smote, y_train_smote, X_test, y_test]
OR_adasyn = ['OR adasyn', X_train_adasyn, y_train_adasyn, X_test, y_test]

# Power Transformation
PT_origin = ['PT origin',X_train_pt, y_train_pt, X_test_pt, y_test_pt]
PT_smote = ['PT smote',X_train_smote_pt, y_train_smote_pt, X_test_pt, y_test_pt ]
PT_adasyn = ['PT adasyn', X_train_adasyn_pt, y_train_adasyn_pt, X_test_pt, y_test_pt]

### Create models

In [17]:
model_list = ['regression_logistic', 'adaboost', 'xgboost', 'catboost', 'decision_tree', 'random_forest', 'mlp', 'knn']
parameters = [
    [0.1, 0.5, 1, 1.5, 2, 2.5, 3],      # For 'regression_logistic'
    [5, 7, 9],                          # For 'adaboost'
    [0.001, 0.01, 0.1, 0.5, 1, 3],      # For 'xgboost'
    [100, 200, 300, 400, 500, 600],      # For 'catboost'
    [1, 2, 3, 4, 5],                    # For 'decision_tree'
    [100, 200, 400, 600, 800, 1000, 1200],  # For 'random_forest'
    [(50,), (100,), (120,), (150,)],     # For 'mlp'
    [3, 5, 7]  #knn
    ]

In [19]:
distributions =[OR_origin, OR_smote, OR_adasyn, PT_origin, PT_smote, PT_adasyn]
complete_model = zip(model_list, parameters)
complete_model_list = list(complete_model )
complete_model_list

[('regression_logistic', [0.1, 0.5, 1]), ('knn', [3, 5])]

In [22]:
def gen_models(complete_model_list, distributions, save_directory_complete_model=None):
    
    # test if directory is None and add path
    if save_directory_complete_model is None:
        save_directory_complete_model = os.path.join(os.getcwd(), 'total')
        
    #Create folder if not exits
    os.makedirs(save_directory_complete_model, exist_ok=True)

    
    # 1.- Iterate model
    for model_name, param_values in complete_model_list:
        #print(f"Processing model: {model_name}")  
        
        resultados_totales = []  
        
        # 2.- Iterate over distributions:
        for distribution in distributions:
            try:
                # Unpack distribution
                name = distribution[0]  # Nombre de la distribución
                X_train, y_train, X_val, y_val = distribution[1:]

                #print(f"  Distribution: {name}")  # Log distribución

                # Verify if data is valid:
                if X_train is None or y_train is None or X_val is None or y_val is None:
                    print(f"    Skipping due to missing data in {name}")
                    continue
                
                # 3.- Iterate over params:
                for param in param_values:
                    #print(f"    Training {model_name} with parameter {param}")  # Log parámetro

                    # Inicializate model and parameters
                    model_instance = None
                    if model_name == 'regression_logistic':
                        model_instance = LogisticRegression(C=param)
                        parameter_name ='C='
                    elif model_name == 'adaboost':
                        #model_instance = AdaBoostClassifier(DecisionTreeClassifier(max_depth=param))
                        model_instance = AdaBoostClassifier(DecisionTreeClassifier(iterations=param))
                        #parameter_name = 'max_depth='
                        parameter_name = 'iterations='
                    elif model_name == 'xgboost':
                        model_instance = XGBClassifier(learning_rate=param)
                        parameter_name  = 'learning_rate='
                    elif model_name == 'catboost':
                        model_instance = CatBoostClassifier(iterations=param, verbose=0)
                        parameter_name  = 'iterations='
                    elif model_name == 'decision_tree':
                        model_instance = DecisionTreeClassifier(max_depth=param)
                        parameter_name  = 'max_depth='
                    elif model_name == 'random_forest':
                        model_instance = RandomForestClassifier(n_estimators=param)
                        parameter_name  = 'n_estimators'
                    elif model_name == 'mlp':
                        model_instance = MLPClassifier(hidden_layer_sizes=param)
                        parameter_name  = 'hidden_layer_sizes'
                    elif model_name == 'knn':
                        model_instance = KNeighborsClassifier(n_neighbors=param)
                        parameter_name  = 'n_neighbors'
                    else:
                        print(f"    Invalid model name: {model_name}")
                        continue

                    
                    # Train model
                    model_instance.fit(X_train, y_train)

                    # Get predictions
                    y_pred = model_instance.predict(X_val)

                    # Calculate metrics
                    roc_auc = roc_auc_score(y_val, y_pred)
                    accuracy = accuracy_score(y_val, y_pred)
                    precision = precision_score(y_val, y_pred)
                    recall = recall_score(y_val, y_pred)
                    f1 = f1_score(y_val, y_pred)
                    f2 = fbeta_score(y_val, y_pred, beta=2)
                    confusion = confusion_matrix(y_val, y_pred)

                    # Save results in DataFrame:
                    results_df = pd.DataFrame({
                        'Model': [model_name],
                        'Description': [name],
                        'Parameter':[parameter_name + str(param)],
                        'ROC-AUC': [roc_auc],
                        'Accuracy': [accuracy],
                        'Precision': [precision],
                        'Recall': [recall],
                        'F1 Score': [f1],
                        'F2 Score': [f2],
                        'Confusion Matrix': [confusion],
                    })

                    resultados_totales.append(results_df)

            except Exception as e:
                print(f"  Error processing distribution {name}: {str(e)}")
                continue

        # Save results:
        if resultados_totales:
            df_resultados_final = pd.concat(resultados_totales, ignore_index=True)
            save_path = os.path.join(save_directory_complete_model, f"{model_name}_total.csv")
            df_resultados_final.to_csv(save_path, index=False)
            display(HTML(f"<h2 style='text-align: center;font-size:60px;'> Modelo: {model_name}</h2>")) 
            display(df_resultados_final)
            print(f"\n\n\nResults for {model_name} saved to {save_path}")
        else:
            print(f"No results generated for {model_name}")



Unnamed: 0,Model,Description,Parameter,ROC-AUC,Accuracy,Precision,Recall,F1 Score,F2 Score,Confusion Matrix
0,random_forest,OR origin,n_estimators=100,0.908119,0.999596,0.941176,0.816327,0.874317,0.838574,"[[56859 5], [ 18 80]]"
1,random_forest,OR origin,n_estimators=200,0.913221,0.999614,0.94186,0.826531,0.880435,0.84728,"[[56859 5], [ 17 81]]"
2,random_forest,OR origin,n_estimators=300,0.908128,0.999614,0.952381,0.816327,0.879121,0.840336,"[[56860 4], [ 18 80]]"
3,random_forest,OR origin,n_estimators=400,0.913221,0.999614,0.94186,0.826531,0.880435,0.84728,"[[56859 5], [ 17 81]]"
4,random_forest,OR origin,n_estimators=500,0.913221,0.999614,0.94186,0.826531,0.880435,0.84728,"[[56859 5], [ 17 81]]"
5,random_forest,OR origin,n_estimators=600,0.908119,0.999596,0.941176,0.816327,0.874317,0.838574,"[[56859 5], [ 18 80]]"
6,random_forest,OR smote,n_estimators=100,0.908058,0.999473,0.869565,0.816327,0.842105,0.826446,"[[56852 12], [ 18 80]]"
7,random_forest,OR smote,n_estimators=200,0.913151,0.999473,0.861702,0.826531,0.84375,0.833333,"[[56851 13], [ 17 81]]"
8,random_forest,OR smote,n_estimators=300,0.908067,0.999491,0.879121,0.816327,0.846561,0.828157,"[[56853 11], [ 18 80]]"
9,random_forest,OR smote,n_estimators=400,0.902965,0.999473,0.877778,0.806122,0.840426,0.819502,"[[56853 11], [ 19 79]]"


Unnamed: 0,Model,Description,Parameter,ROC-AUC,Accuracy,Precision,Recall,F1 Score,F2 Score,Confusion Matrix
0,knn,OR origin,n_neighbors=3,0.908093,0.999544,0.909091,0.816327,0.860215,0.833333,"[[56856 8], [ 18 80]]"
1,knn,OR origin,n_neighbors=5,0.892787,0.999491,0.905882,0.785714,0.84153,0.807128,"[[56856 8], [ 21 77]]"
2,knn,OR origin,n_neighbors=7,0.877489,0.999456,0.91358,0.755102,0.826816,0.782241,"[[56857 7], [ 24 74]]"
3,knn,OR smote,n_neighbors=3,0.92793,0.998473,0.535032,0.857143,0.658824,0.765027,"[[56791 73], [ 14 84]]"
4,knn,OR smote,n_neighbors=5,0.937764,0.99777,0.427861,0.877551,0.575251,0.725126,"[[56749 115], [ 12 86]]"
5,knn,OR smote,n_neighbors=7,0.94752,0.99691,0.346457,0.897959,0.5,0.681115,"[[56698 166], [ 10 88]]"
6,knn,OR adasyn,n_neighbors=3,0.92793,0.998473,0.535032,0.857143,0.658824,0.765027,"[[56791 73], [ 14 84]]"
7,knn,OR adasyn,n_neighbors=5,0.937764,0.99777,0.427861,0.877551,0.575251,0.725126,"[[56749 115], [ 12 86]]"
8,knn,OR adasyn,n_neighbors=7,0.947502,0.996875,0.34375,0.897959,0.497175,0.679012,"[[56696 168], [ 10 88]]"
9,knn,PT origin,n_neighbors=3,0.918288,0.999561,0.901099,0.836735,0.867725,0.848861,"[[56855 9], [ 16 82]]"


Unnamed: 0,Model,Description,Parameter,ROC-AUC,Accuracy,Precision,Recall,F1 Score,F2 Score,Confusion Matrix
0,mlp,OR origin,"hidden_layer_sizes=(50,)",0.908058,0.999473,0.869565,0.816327,0.842105,0.826446,"[[56852 12],[ 18 80]]"
1,mlp,OR origin,"hidden_layer_sizes=(100,)",0.918218,0.999421,0.828283,0.836735,0.832487,0.835031,"[[56847 17],[ 16 82]]"
2,mlp,OR origin,"hidden_layer_sizes=(120,)",0.91316,0.999491,0.870968,0.826531,0.848168,0.835052,"[[56852 12],[ 17 81]]"
3,mlp,OR origin,"hidden_layer_sizes=(150,)",0.928343,0.999298,0.763636,0.857143,0.807692,0.836653,"[[56838 26],[ 14 84]]"
4,mlp,OR smote,"hidden_layer_sizes=(50,)",0.907864,0.999087,0.701754,0.816327,0.754717,0.790514,"[[56830 34],[ 18 80]]"
5,mlp,OR smote,"hidden_layer_sizes=(100,)",0.91813,0.999245,0.752294,0.836735,0.792271,0.818363,"[[56837 27],[ 16 82]]"
6,mlp,OR smote,"hidden_layer_sizes=(120,)",0.918104,0.999192,0.732143,0.836735,0.780952,0.813492,"[[56834 30],[ 16 82]]"
7,mlp,OR smote,"hidden_layer_sizes=(150,)",0.897783,0.999298,0.795918,0.795918,0.795918,0.795918,"[[56844 20],[ 20 78]]"
8,mlp,OR adasyn,"hidden_layer_sizes=(50,)",0.913037,0.999245,0.757009,0.826531,0.790244,0.811623,"[[56838 26],[ 17 81]]"
9,mlp,OR adasyn,"hidden_layer_sizes=(100,)",0.902771,0.999087,0.705357,0.806122,0.752381,0.78373,"[[56831 33],[ 19 79]]"
