In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import sys
import networkx as nx

#sys.path.append('../prepro/')

# Import the variables from prepro_trans_smote
#from prepro_trans_smote import X_res, y_res




from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV


In [4]:
file_path = '../../data/transactions.pkl'

with open(file_path, 'rb') as file:
    data = pickle.load(file)

# Now `data` holds the contents of the loaded .pkl file
print(data)

        step  customer  age  gender  merchant  category    amount  fraud
0          0       210    4       2        30        12 -0.299276      0
1          0      2753    2       2        30        12  0.016067      0
2          0      2285    4       1        18        12 -0.098742      0
3          0      1650    3       2        30        12 -0.185275      0
4          0      3585    5       2        30        12 -0.019480      0
...      ...       ...  ...     ...       ...       ...       ...    ...
594638   179      1639    3       1        18        12 -0.155832      0
594639   179      3369    4       1        18        12  0.115256      0
594640   179       529    2       1        31         2 -0.138687      0
594641   179      1083    5       2        18        12 -0.210319      0
594642   179      3304    4       1        18        12 -0.098383      0

[594643 rows x 8 columns]


In [5]:
data = pd.read_pickle("../../data/transactions.pkl")

X = data.drop(['fraud'],axis=1)
y = data['fraud']

In [6]:
sm = SMOTE(random_state = 42)

In [7]:
X_res, y_res = sm.fit_resample(X, y)
y_res = pd.DataFrame(y_res)

In [8]:
y_res.head()

Unnamed: 0,fraud
0,0
1,0
2,0
3,0
4,0


In [19]:
classifier = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42,
                                verbose=1, class_weight="balanced")

# Reshape y_res if it's not already a 1D array
y_res = y_res.squeeze() 

classifier.fit(X_res, y_res)
y_pred = classifier.predict(X_res)

print("Classification Report for Random Forest Classifier: \n", classification_report(y_res, y_pred))
print("Confusion Matrix of Random Forest Classifier: \n", confusion_matrix(y_res,y_pred))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    4.0s finished


Classification Report for Random Forest Classifier: 
               precision    recall  f1-score   support

           0       0.99      0.97      0.98    587443
           1       0.97      0.99      0.98    587443

    accuracy                           0.98   1174886
   macro avg       0.98      0.98      0.98   1174886
weighted avg       0.98      0.98      0.98   1174886

Confusion Matrix of Random Forest Classifier: 
 [[570824  16619]
 [  3787 583656]]


In [21]:
# Creating a graph
G = nx.from_pandas_edgelist(data, 'customer', 'merchant', create_using=nx.Graph())

# Calculating centrality measures
degree_centrality = nx.degree_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)

# Adding these features back to your DataFrame
data['degree_centrality'] = data['customer'].map(degree_centrality)
data['closeness_centrality'] = data['customer'].map(closeness_centrality)
data['betweenness_centrality'] = data['customer'].map(betweenness_centrality)

In [22]:
X = data.drop(['fraud'], axis=1)
y = data['fraud']

# Assuming SMOTE and other preprocessing are applied as before
X_res, y_res = sm.fit_resample(X, y)


In [None]:


param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [8, 10, 12],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42, class_weight="balanced", verbose=1),
                           param_grid, cv=3, scoring='recall')
grid_search.fit(X_res, y_res)
print(grid_search.best_params_)

# Using the best estimator
rf_clf = grid_search.best_estimator_
y_pred = rf_clf.predict(X_res)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j