In [1]:
import pandas as pd
import pickle
import seaborn as sns
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

In [2]:
#df = pd.read_csv("C:/Users/GRETA/Documents/GitHub/Projet_Classification_THBS/database/transactions.csv", index_col='transactionID')
df = pd.read_csv("C:/Users/GRETA/Documents/GitHub/Projet_Classification_THBS/database/credit_card_fraud_corr.csv", sep=";")
df.head()

Unnamed: 0,transactionId,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,0,1,PAYMENT,983964,C1231006815,170136,16029636,M1979787155,0,0,0
1,1,1,PAYMENT,186428,C1666544295,21249,1938472,M2044282225,0,0,0
2,2,1,TRANSFER,181,C1305486145,181,0,C553264065,0,0,1
3,3,1,CASH_OUT,181,C840083671,181,0,C38997010,21182,0,1
4,4,1,PAYMENT,1166814,C2048537720,41554,2988586,M1230701703,0,0,0


In [3]:
df['type'] = df['type'].replace({
    'CASH_IN': 1,
    'CASH_OUT': 2,
    'DEBIT': 3,
    'PAYMENT': 4,
    'TRANSFER': 5
})

colonne = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']

def stringtofloat(dataframe, colonne):
    for item in colonne :
        dataframe[item] = dataframe[item].str.replace(',', '.').astype(float)

stringtofloat(df, colonne)

def replace_first_letter(value):
    if value.startswith('C'):
        return '1' + value[1:]
    elif value.startswith('M'):
        return '2' + value[1:]
    else:
        return value


df['nameOrig'] = df['nameOrig'].apply(replace_first_letter)
df['nameDest'] = df['nameDest'].apply(replace_first_letter)

In [4]:
df.head()

Unnamed: 0,transactionId,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,0,1,4,9839.64,11231006815,170136.0,160296.36,21979787155,0.0,0.0,0
1,1,1,4,1864.28,11666544295,21249.0,19384.72,22044282225,0.0,0.0,0
2,2,1,5,181.0,11305486145,181.0,0.0,1553264065,0.0,0.0,1
3,3,1,2,181.0,1840083671,181.0,0.0,138997010,21182.0,0.0,1
4,4,1,4,11668.14,12048537720,41554.0,29885.86,21230701703,0.0,0.0,0


In [5]:
X = df.drop('isFraud', axis='columns')
y = df.isFraud

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
acc_scorer = make_scorer(accuracy_score)

In [23]:
random_forest = RandomForestClassifier()

parameters = {'n_estimators': [100],'criterion': ['entropy'], 'max_depth': [20]}

grid_obj = GridSearchCV(random_forest, parameters, scoring=acc_scorer,cv=3)
grid_obj = grid_obj.fit(X_train, y_train)
clf = grid_obj.best_estimator_
print(grid_obj.best_estimator_)

clf.fit(X_train, y_train)
print(f"Accuracy Apprentissage: {clf.score(X_train,y_train)}")
print(f"Accuracy Test: {clf.score(X_test,y_test)}")

RandomForestClassifier(criterion='entropy', max_depth=20)
Accuracy Apprentissage: 0.9999858306565081
Accuracy Test: 0.9998214663563267


In [24]:
filename = 'RandomForest.pkl'
pickle.dump(clf, open(filename, 'wb'))

In [25]:
smote = SMOTE(random_state=42, k_neighbors=5)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print(f"Accuracy SMOTE: {clf.score(X_train_resampled,y_train_resampled)}")

Accuracy SMOTE: 0.994776844391108


In [26]:
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1] # Probabilités pour AUC

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"AUC: {auc}")

Precision: 0.9991575400168492
Recall: 0.9785478547854786
F1 Score: 0.9887453105460609
AUC: 0.9993789881921133


In [27]:
filename = 'RandomForest.pkl'
pickle.dump(clf, open(filename, 'wb'))