## Analysis of the dataset

In [None]:
import pandas as pd

# Load the card-transaction CSV (path is relative to the working directory)
# into a DataFrame.
df = pd.read_csv(filepath_or_buffer='card_transdata.csv')

# Quick sanity check: print the first five rows.
print(df.head(n=5))

In [None]:
# Descriptive statistics for the DataFrame's columns; as the last expression
# of the cell, the result is rendered as the cell output.
df.describe()

In [None]:
# Missing values: count the null entries in each column.
df.isnull().sum()

In [None]:
# Sanity check on the label column: collect every row whose 'fraud' value is
# neither 0 nor 1. An empty result means the label is strictly binary.
non_binary_fraud_rows = df.loc[~((df['fraud'] == 0) | (df['fraud'] == 1))]

# Display the offending rows (if any) as the cell output.
non_binary_fraud_rows

## Test with models

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [6]:
# Balance the classes: keep every fraud row and draw an equally sized,
# seeded random sample of non-fraud rows so both classes have the same count.
fraud_df = df.loc[df['fraud'].eq(1)]
non_fraud_df = df.loc[df['fraud'].eq(0)].sample(n=fraud_df.shape[0], random_state=13)

# Stack the two subsets row-wise into the balanced working dataset.
dfc = pd.concat([fraud_df, non_fraud_df], axis=0)

In [7]:
# Feature matrix: every column of the balanced dataset except the label.
X = dfc.drop('fraud', axis=1)
# Target vector: the 'fraud' label column.
y = dfc.loc[:, 'fraud']

In [8]:
# Hold out 20% of the balanced data for evaluation; the seed makes the
# shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=13,
)

In [9]:
# Learn the standardization parameters from the training split only, so no
# information from the test split leaks into the preprocessing step.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### Logistic Regression

In [10]:
# Fit a seeded logistic-regression classifier on the standardized training
# data, then predict labels for the held-out split.
logistic_model = LogisticRegression(random_state=13).fit(X_train_scaled, y_train)
y_pred_logistic = logistic_model.predict(X_test_scaled)

# Classification metrics returned as a nested dict instead of formatted text.
logistic_report = classification_report(
    y_true=y_test,
    y_pred=y_pred_logistic,
    output_dict=True,
)

In [None]:
# Display the logistic-regression metrics dict as the cell output.
logistic_report

In [None]:
# Signed coefficients of the fitted logistic model, one per input column.
# The first row of coef_ is used; for a two-class target that is the only row.
importance_logistic = logistic_model.coef_[0]

logistic_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importance_logistic
})

# Rank by absolute magnitude rather than by signed value: a large negative
# coefficient is just as influential as a large positive one, and sorting on
# the raw value would push it to the bottom of the ranking. The column keeps
# the signed coefficient so the direction of each effect stays visible.
# Magnitudes are comparable because the model was fit on standardized features.
logistic_importance_df = logistic_importance_df.sort_values(
    by='Importance',
    key=lambda coefficients: coefficients.abs(),
    ascending=False,
)

print("Logistic Regression Feature Importance:\n", logistic_importance_df)

#### Decision Tree

In [12]:
# Fit a seeded decision tree on the standardized training data, then predict
# labels for the held-out split.
decision_tree_model = DecisionTreeClassifier(random_state=13).fit(X_train_scaled, y_train)
y_pred_tree = decision_tree_model.predict(X_test_scaled)

# Classification metrics returned as a nested dict instead of formatted text.
tree_report = classification_report(
    y_true=y_test,
    y_pred=y_pred_tree,
    output_dict=True,
)

In [None]:
# Display the decision-tree metrics dict as the cell output.
tree_report

In [None]:
# Per-feature importance scores exposed by the fitted decision tree.
importance_tree = decision_tree_model.feature_importances_

# Pair each score with its column name and list the highest scores first.
tree_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': importance_tree}
).sort_values(by='Importance', ascending=False)

print("Decision Tree Feature Importance:\n", tree_importance_df)

#### Random Forest

In [14]:
# Fit a seeded random forest on the standardized training data, then predict
# labels for the held-out split.
rf_model = RandomForestClassifier(random_state=13).fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)

# Classification metrics returned as a nested dict instead of formatted text.
rf_report = classification_report(
    y_true=y_test,
    y_pred=y_pred_rf,
    output_dict=True,
)

In [None]:
# Display the random-forest metrics dict as the cell output.
rf_report

In [None]:
# Per-feature importance scores exposed by the fitted random forest.
importance_rf = rf_model.feature_importances_

# Pair each score with its column name and list the highest scores first.
rf_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': importance_rf}
).sort_values(by='Importance', ascending=False)

print("Random Forest Feature Importance:\n", rf_importance_df)