In [8]:
import pandas as pd
import numpy as np
import pickle

from scipy.stats import chi2_contingency, f_oneway, pointbiserialr

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.metrics import precision_score, classification_report, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample

In [9]:
df = pd.read_csv('balance.csv')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16426 entries, 0 to 16425
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            16426 non-null  int64  
 1   type            16426 non-null  object 
 2   amount          16426 non-null  float64
 3   nameOrig        16426 non-null  object 
 4   oldbalanceOrg   16426 non-null  float64
 5   newbalanceOrig  16426 non-null  float64
 6   nameDest        16426 non-null  object 
 7   oldbalanceDest  16426 non-null  float64
 8   newbalanceDest  16426 non-null  float64
 9   isFlaggedFraud  16426 non-null  int64  
 10  isFraud         16426 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 1.4+ MB


In [11]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFlaggedFraud,isFraud
0,163,CASH_OUT,66061.94,C1444177449,0.0,0.0,C1433830539,153788.33,219850.28,0,0
1,378,PAYMENT,14598.76,C638641690,10779.0,0.0,M786508240,0.0,0.0,0,0
2,333,PAYMENT,10767.1,C1473859208,0.0,0.0,M204648638,0.0,0.0,0,0
3,307,PAYMENT,9342.7,C1660884816,180485.84,171143.14,M182225889,0.0,0.0,0,0
4,334,PAYMENT,6553.42,C843452443,20099.0,13545.58,M1268307159,0.0,0.0,0,0


In [12]:
df['isFraud'].value_counts()

0    8213
1    8213
Name: isFraud, dtype: int64

In [13]:
df_majority = df[df['isFraud'] == 0]  # Non-fraud transactions
df_minority = df[df['isFraud'] == 1]  # Fraud transactions

# Undersample the majority class
df_majority_undersampled = resample(df_majority, 
                                    replace=False,  # No replacement (random selection)
                                    n_samples=len(df_minority),  # Match minority class count
                                    random_state=42)  # For reproducibility

# Combine undersampled majority class with minority class
df_undersampled = pd.concat([df_majority_undersampled, df_minority])

# Shuffle dataset
df_undersampled = df_undersampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify new class distribution
df_undersampled['isFraud'].value_counts()


1    8213
0    8213
Name: isFraud, dtype: int64

# Split Data

In [14]:
X = df_undersampled.drop(columns=['isFraud'])
y = df_undersampled['isFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=26)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

Training set size: (13140, 10)
Testing set size: (3286, 10)


In [15]:
num_cols = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
cat_cols = ['step','type','nameOrig','nameDest']

In [16]:
# get the categorical features
p_values = []
results = []

for feature in cat_cols:
    # Create a contingency table (cross-tabulation between target and feature)
    contingency_table = pd.crosstab(X_train[feature], y_train)
    
    # Perform chi-squared test
    chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)
    
    p_values.append(p_value)
    
    # Interpret the result based on p-value
    if p_value < 0.05:
        results.append(f'{feature} is correlated with isFraud')
    else:
        results.append(f'{feature} is not correlated with isFraud')

# Display results
correlation_results = pd.DataFrame({
    'Feature': cat_cols,
    'P-Value': p_values,
    'Interpretation': results
})

correlation_results

Unnamed: 0,Feature,P-Value,Interpretation
0,step,0.0,step is correlated with isFraud
1,type,0.0,type is correlated with isFraud
2,nameOrig,0.495898,nameOrig is not correlated with isFraud
3,nameDest,0.428791,nameDest is not correlated with isFraud


In [17]:
p_values = []
results = []

# Point Biserial Correlation (for binary categorical variables)
for feature in num_cols:
    corr, p_value = pointbiserialr(X_train[feature], y_train)  # Compute correlation

    p_values.append(p_value)
    
    # Interpret result
    if p_value < 0.05:
        results.append(f'{feature} is correlated with isFraud')
    else:
        results.append(f'{feature} is not correlated with isFraud')

# Display results
correlation_results = pd.DataFrame({
    'Feature': num_cols,
    'P-Value': p_values,
    'Interpretation': results
})

correlation_results

Unnamed: 0,Feature,P-Value,Interpretation
0,amount,0.0,amount is correlated with isFraud
1,oldbalanceOrg,7.191392999999999e-48,oldbalanceOrg is correlated with isFraud
2,newbalanceOrig,8.026305e-51,newbalanceOrig is correlated with isFraud
3,oldbalanceDest,3.229188e-24,oldbalanceDest is correlated with isFraud
4,newbalanceDest,0.7386456,newbalanceDest is not correlated with isFraud


### Features Selection

In [18]:
select_num_cols = ['step','amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest']
select_cat_cols = ['type']

# Pipeline Creation

## Preprocessing

In [19]:
onehot_encoder = OneHotEncoder()
robust_scaler = RobustScaler()
# create a preprocessing pipeline, using column transformer
preprocessing = ColumnTransformer(
    transformers=[
        # encode the low cardinality features using onehot encoder
        ('onehot', onehot_encoder, select_cat_cols),
        # scale the numerical column using standard scaler
        ('num', robust_scaler, select_num_cols)
    ],
    # as for the features we are not selected, we will drop it
    remainder='drop'
)

In [20]:
# checking the preprocessing if it's working
X_train_preprocess = preprocessing.fit_transform(X_train)
X_test_preprocess = preprocessing.transform(X_test)

## Model Random Forest

In [21]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",  # Automatically adjusts weights for fraud cases
    random_state=26
)

In [22]:
# creating pipeline with preprocessing and Random Forest classifier
pipeline_RandFor = Pipeline(steps=[
    ('preprocessor', preprocessing),
    ('classifier', rf_model)
])

In [23]:
# fit the pipeline
pipeline_RandFor.fit(X_train, y_train)

In [24]:
# Define a custom scorer for macro precision (for multiclass problems)
custom_precision_scorer = make_scorer(precision_score, average='macro')

cv_scores = cross_val_score(
    estimator=pipeline_RandFor,  # Your Random Forest pipeline
    X=X_train,                   # Training features
    y=y_train,            # Encoded training target
    cv=5,                         # Number of folds
    scoring=custom_precision_scorer  # Macro precision metric
)

# Print cross-validation results
print('Precision Score - All - Cross Validation  : ', cv_scores)
print('Precision Score - Mean - Cross Validation : ', cv_scores.mean())
print('Precision Score - Std - Cross Validation  : ', cv_scores.std())
print('Precision Score - Range of Test-Set       : ',
      (cv_scores.mean() - cv_scores.std()), '-', (cv_scores.mean() + cv_scores.std()))


Precision Score - All - Cross Validation  :  [0.99206094 0.99091187 0.99239135 0.98864381 0.98944234]
Precision Score - Mean - Cross Validation :  0.990690059876545
Precision Score - Std - Cross Validation  :  0.0014537418707562218
Precision Score - Range of Test-Set       :  0.9892363180057888 - 0.9921438017473012


## Model XGBoost

In [4]:
xgb_model = XGBClassifier(
    n_estimators=100,         # Number of trees, same as RF
    scale_pos_weight=10,      # Adjust based on class imbalance ratio (to be tuned)
    random_state=26,
    use_label_encoder=False,  # Avoids unnecessary warnings
    eval_metric="logloss"     # Standard evaluation metric for classification
)


In [25]:
# creating pipeline with preprocessing and Random Forest classifier
pipeline_XG = Pipeline(steps=[
    ('preprocessor', preprocessing),
    ('classifier', xgb_model)
])

In [26]:
# Define a custom scorer for macro precision (for multiclass problems)
custom_precision_scorer = make_scorer(precision_score, average='macro')

cv_scores = cross_val_score(
    estimator=pipeline_XG,  # Your Random Forest pipeline
    X=X_train,                   # Training features
    y=y_train,            # Encoded training target
    cv=5,                         # Number of folds
    scoring=custom_precision_scorer  # Macro precision metric
)

# Print cross-validation results
print('Precision Score - All - Cross Validation  : ', cv_scores)
print('Precision Score - Mean - Cross Validation : ', cv_scores.mean())
print('Precision Score - Std - Cross Validation  : ', cv_scores.std())
print('Precision Score - Range of Test-Set       : ',
      (cv_scores.mean() - cv_scores.std()), '-', (cv_scores.mean() + cv_scores.std()))

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Precision Score - All - Cross Validation  :  [0.99397175 0.99055516 0.99201765 0.99131631 0.99170599]
Precision Score - Mean - Cross Validation :  0.9919133738765067
Precision Score - Std - Cross Validation  :  0.0011394739529610675
Precision Score - Range of Test-Set       :  0.9907738999235456 - 0.9930528478294678


Parameters: { "use_label_encoder" } are not used.



## Hyperparameter Tuning

In [28]:
# Define the hyperparameter grid for XGBoost
parameters = {
    'classifier__max_depth': [3, 6, 9],  # Depth of each tree
    'classifier__learning_rate': [0.01, 0.1, 0.2],  # Learning rate
    'classifier__n_estimators': [100, 300, 500],  # Number of trees
    'classifier__min_child_weight': [1, 3, 5],  # Minimum sum of instance weight (hessian)
    'classifier__gamma': [0, 0.1, 0.3],  # Minimum loss reduction
    'classifier__subsample': [0.8, 1.0],  # Fraction of samples used per tree
    'classifier__colsample_bytree': [0.8, 1.0]  # Fraction of features used per tree
}

# Using GridSearchCV
grid_search = GridSearchCV(
    estimator=pipeline_XG,  # Assuming you have an XGBoost pipeline
    param_grid=parameters,
    cv=5,  # 5-fold cross-validation is typically sufficient
    n_jobs=-1,  # Use all available cores
    scoring=custom_precision_scorer,  # Ensure this scorer is defined properly
    verbose=2
)

# Fit the GridSearchCV
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 972 candidates, totalling 4860 fits


Parameters: { "use_label_encoder" } are not used.



In [29]:
# check the parameter and check the recall score
print("Best Parameters:", grid_search.best_params_)
print("Best Recall:", grid_search.best_score_)

Best Parameters: {'classifier__colsample_bytree': 0.8, 'classifier__gamma': 0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 9, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 300, 'classifier__subsample': 1.0}
Best Recall: 0.9922147366102909


In [30]:
# Get the best model
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_test)

# Evaluate performance
print(classification_report(y_test, y_pred, target_names=['0','1']))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1648
           1       0.99      1.00      0.99      1638

    accuracy                           0.99      3286
   macro avg       0.99      0.99      0.99      3286
weighted avg       0.99      0.99      0.99      3286



In [31]:
# Export the model using pickle
with open('model_xgb.pkl','wb') as file:
    pickle.dump(best_model, file)

In [32]:
num_samples = 10  # Define how many new data points you want
sampled_data = df.sample(n=num_samples, random_state=42)  # Ensuring reproducibility

# Create the new_data dictionary with randomly sampled values
new_data = {
    'step': sampled_data['step'].tolist(),
    'type': sampled_data['type'].tolist(),
    'amount': sampled_data['amount'].tolist(),
    'nameOrig': sampled_data['nameOrig'].tolist(),
    'oldbalanceOrg': sampled_data['oldbalanceOrg'].tolist(),
    'newbalanceOrig': sampled_data['newbalanceOrig'].tolist(),
    'nameDest': sampled_data['nameDest'].tolist(),
    'oldbalanceDest': sampled_data['oldbalanceDest'].tolist(),
    'newbalanceDest': sampled_data['newbalanceDest'].tolist(),
}


In [33]:
new_data_df = pd.DataFrame(new_data)
new_data_df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest
0,496,TRANSFER,766151.45,C2014325113,766151.45,0.0,C432435900,0.0,0.0
1,365,TRANSFER,31315.51,C1174332140,31315.51,0.0,C1252332991,0.0,0.0
2,133,TRANSFER,31278.61,C1860933755,0.0,0.0,C1036237514,2891651.16,2922929.77
3,396,CASH_OUT,247496.02,C729076925,277153.61,29657.58,C1944739729,1071480.81,1318976.84
4,400,CASH_IN,392013.87,C1355524373,3376358.53,3768372.41,C496853610,2522431.71,2130417.84
5,186,PAYMENT,4122.96,C1323353801,0.0,0.0,M1214051062,0.0,0.0
6,106,CASH_OUT,1409248.64,C1173095546,1409248.64,0.0,C1935607545,914125.46,2323374.1
7,131,CASH_OUT,373278.27,C1942184645,3059.0,0.0,C844038853,0.0,373278.27
8,468,CASH_OUT,155926.42,C19001916,155926.42,0.0,C654239643,369481.23,525407.65
9,721,TRANSFER,23639.56,C594313973,23639.56,0.0,C1343815682,0.0,0.0


In [34]:
# make a prediction with new_data_df
prediction = best_model.predict(new_data_df)

# for loop to predict each rows
for i in prediction:
    if i == 0:
        print('This is not A Fraud')
    elif i == 1:
        print('This is A Fraud')

This is A Fraud
This is A Fraud
This is not A Fraud
This is not A Fraud
This is not A Fraud
This is not A Fraud
This is A Fraud
This is not A Fraud
This is A Fraud
This is A Fraud
