In [None]:
import sqlite3
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, Normalizer

# Render matplotlib figures directly in the notebook output.
%matplotlib inline
# Apply seaborn's default theme to every subsequent plot.
sns.set()

In [None]:
# Open the SQLite warehouse that stores the fraud-detection tables.
conn = sqlite3.connect('../Data/db/fraude_detection_warehouse_.db')


def _read_table(query):
    """Run `query` on the open warehouse connection and return a DataFrame."""
    return pd.read_sql_query(query, conn)


# One DataFrame per warehouse table, each loaded in full.
alert = _read_table("SELECT * FROM alerts")
customers = _read_table("SELECT * FROM customers")
devices = _read_table("SELECT * FROM devices")
transaction_history = _read_table("SELECT * FROM transaction_history")
transaction_patterns = _read_table("SELECT * FROM transaction_patterns")
transactions = _read_table("SELECT * FROM transactions")

# Preview the transactions table.
transactions.head()

In [None]:
# Join the tables step by step, keeping every transaction (left joins).
# 1) transactions + customers, without the name columns and without transaction_date
df1 = pd.merge(transactions, customers, on='customer_id', how='left')
df1 = df1.drop(columns=['first_name', 'last_name', 'transaction_date'])

# 2) add the device attributes
df2 = pd.merge(df1, devices, on='device_id', how='left')

# 3) add the per-transaction history rows
df3 = pd.merge(df2, transaction_history, on=['customer_id', 'transaction_id'], how='left')

df3.head()

In [None]:
# Summarise the merged frame: column names, dtypes and non-null counts.
df3.info()

In [None]:
# Columns of df3 that must be parsed as datetimes.
date_columns = ["registration_date", "date_of_birth", "transaction_date", "last_used"]

# Parse each listed column, then write the parsed values back into df3.
parsed_dates = {name: pd.to_datetime(df3[name]) for name in date_columns}
for name, parsed in parsed_dates.items():
    df3[name] = parsed

# Confirm the dtypes after conversion.
df3.info()

In [None]:
# Build the modelling frame as a new object (df3 stays untouched), leaving out
# the identifier and contact columns.
identifier_columns = [
    "transaction_id",
    "customer_id",
    "device_id",
    "email",
    "phone_number",
    "address",
    "history_id",
]
data = df3.drop(columns=identifier_columns)
data.head()

## Feature engineering

In [None]:
# Standardise every float64 column; the scaler is re-fitted on each column in turn.
scaler = StandardScaler()
num_columns = data.select_dtypes(["float64"]).columns.tolist()
for numeric_name in num_columns:
    single_column = data[[numeric_name]]
    data[numeric_name] = scaler.fit_transform(single_column)

In [None]:
# Encode the categorical (object-dtype) columns as integer codes.
le = LabelEncoder()
categorical_columns = data.select_dtypes(include=['object']).columns
for categorical_name in categorical_columns:
    # Cast to str first so every value in the column is encoded as text.
    as_text = data[categorical_name].astype(str)
    data[categorical_name] = le.fit_transform(as_text)

In [None]:
# Check column dtypes and non-null counts after scaling and encoding.
data.info()

In [None]:
# Helper that expands a datetime column into calendar components
def extract_date_features(data, column):
    """Add day/month/year/weekday/quarter columns derived from ``data[column]``.

    The new columns are named ``<column>_day``, ``<column>_month``,
    ``<column>_year``, ``<column>_weekday`` and ``<column>_quarter``.

    Parameters:
        data: DataFrame holding ``column``; it is modified in place.
        column: name of a datetime-typed column (the ``.dt`` accessor is used).

    Returns:
        The same ``data`` object, with the five extra columns appended.
    """
    dates = data[column].dt
    for part in ("day", "month", "year", "weekday", "quarter"):
        data[f'{column}_{part}'] = getattr(dates, part)
    return data

# Expand each date column into calendar features, then discard the raw column.
# date_of_birth is not in this list; a later cell uses it to compute year_since_birth.
date_columns = ['transaction_date','last_used', 'registration_date'] # 'date_of_birth'
for date_name in date_columns:
    data = extract_date_features(data, date_name).drop(columns=[date_name])

In [None]:
# List the columns of `data` after the date columns were expanded and dropped.
print(data.columns)

In [None]:
# Keep only the columns whose name contains one of the derived-date markers
date_markers = ('_day', '_month', '_year', '_weekday', '_quarter')
date_columns = []
for column_name in data.columns:
    if any(marker in column_name for marker in date_markers):
        date_columns.append(column_name)

# Show the first rows of those columns
print(data[date_columns].head(10))

In [None]:
# Reference date for the age computation (evaluated when the cell runs).
date_of_today = datetime.now()

# Mean calendar-year length, leap years included. The earlier divisor of 360
# overstated ages by roughly one year for every 70 years elapsed.
DAYS_PER_YEAR = 365.25


def calculate_and_assign(row):
    """Return the number of whole years between ``row['date_of_birth']`` and today.

    Parameters:
        row: mapping or Series exposing a datetime-like ``'date_of_birth'`` entry.

    Returns:
        A one-element ``pd.Series`` keyed by ``'year_since_birth'``.
    """
    elapsed_days = (date_of_today - row['date_of_birth']).days
    years_since_birth = int(elapsed_days / DAYS_PER_YEAR)
    return pd.Series({'year_since_birth': years_since_birth})

# Compute the age row by row; apply() yields a one-column frame, from which the
# single column is taken explicitly.
ages = data.apply(calculate_and_assign, axis=1)
data["year_since_birth"] = ages["year_since_birth"]

# The raw birth date is no longer needed once the age is available.
del data["date_of_birth"]
data.head()

In [None]:
# Rows of df3 that contain at least one missing value
masque_manquantes = df3.isnull().any(axis=1)
lignes_avec_manquantes = df3.loc[masque_manquantes]

# How many such rows there are
nombre_lignes_manquantes = len(lignes_avec_manquantes)

print("Nombre de lignes avec des valeurs manquantes :", nombre_lignes_manquantes)

In [None]:
# Correlation analysis: pairwise correlations of all columns, shown as a heatmap
corr_matrix = data.corr()

fig, ax = plt.subplots(figsize=(20, 16))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', ax=ax)
ax.set_title('Matrice de correlation')
plt.show()

In [None]:
# Rank every column by the absolute value of its correlation with is_fraud.
# The full ranking (is_fraud included) is kept under the same name for later cells.
correlations_with_fraud = corr_matrix['is_fraud'].abs().sort_values(ascending=False)

# Display: leave out is_fraud itself (it correlates perfectly with itself) and
# make the header state the number of rows actually printed.
n_top_corr = 5
print(f"Top {n_top_corr} variables les plus corrélées avec is_fraud:")
print(correlations_with_fraud.drop(labels='is_fraud').head(n_top_corr))

Nous utiliserons les régions comme localisation par la suite.

In [None]:
# Mutual information between each feature and the target
X = data.drop(columns=['is_fraud'])  # explanatory (independent) variables
y = data['is_fraud']  # target (dependent) variable

# is_fraud is a class label, so the classification estimator is the right one;
# the regression variant treats the target as continuous. The estimator is
# randomized, hence the fixed seed for reproducible rankings.
mi_scores = mutual_info_classif(X, y, random_state=42)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)

# Header states the number of rows actually printed.
n_top_mi = 5
print(f"\nTop {n_top_mi} variables selon l'information mutuelle:")
print(mi_scores.head(n_top_mi))

In [None]:
# Final selection: union of the top-5 features from each ranking.
# is_fraud is removed *before* taking the top 5 so it does not use up one of
# the correlation slots (it always ranks first against itself).
top_corr_features = correlations_with_fraud.drop(labels='is_fraud', errors='ignore').head(5).index
top_mi_features = mi_scores.head(5).index

# dict.fromkeys removes duplicates while keeping first-seen order. A set would
# make the column order depend on string hashing, which can differ between
# kernel sessions and therefore change the seeded model's results.
important_features = list(dict.fromkeys([*top_corr_features, *top_mi_features]))
important_features = [f for f in important_features if f != 'is_fraud']

print("\nVariables importantes sélectionnées:")
print(important_features)
print(len(important_features))

fusionner les dataframes

In [None]:
# Final dataset: target vector and the selected feature columns
y_final = data['is_fraud']
X_final = data.loc[:, important_features]

In [None]:
# Fit an Isolation Forest on the selected features and label every row
# (-1 = anomaly, 1 = normal).
# NOTE(review): contamination=0.5 flags half of the rows as anomalies - confirm
# this matches the expected share of fraud.
model = IsolationForest(random_state=42, contamination=0.5)
y_pred = model.fit(X_final).predict(X_final)
y_pred

In [None]:
# Evaluate the Isolation Forest predictions against the known labels
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Isolation Forest outputs -1 for anomalies and 1 for normal points. Convert to
# 1 = anomaly, 0 = normal so predictions line up with is_fraud (assumes
# is_fraud uses 1 for fraud). The previous mapping sent -1 to 0 and left 1
# unchanged, which labelled every anomaly as "not fraud" and every inlier as
# "fraud" - the opposite of the intended encoding.
y_pred_binary = np.where(y_pred == -1, 1, 0)

# Per-class precision / recall / F1
print(classification_report(y_final, y_pred_binary))

# AUC-ROC computed on the hard 0/1 predictions
auc_roc = roc_auc_score(y_final, y_pred_binary)
print(f"AUC-ROC: {auc_roc}")

In [None]:
# Compare the first 20 binary labels with the raw Isolation Forest outputs.
for preview in (y_pred_binary, y_pred):
    print(preview[:20])

In [None]:
# Show the confusion matrix of true labels vs. binary predictions
print("Matrice de confusion:")
matrice_confusion = confusion_matrix(y_final, y_pred_binary)
print(matrice_confusion)

In [None]:
# Report how many rows the model flagged as anomalies (-1)
nombre_anomalies = (y_pred == -1).sum()
print("\nNombre d'anomalies détectées:", nombre_anomalies)

In [None]:
# Display the feature matrix that was fed to the Isolation Forest.
X_final

**Feature selection**

In [None]:
# Score every feature against the target with the chi-squared statistic.
# chi2 rejects negative inputs, so the absolute value of the features is used.
chi2_inputs = data.drop(columns="is_fraud").abs()
chi2_target = list(y_final)

test = SelectKBest(score_func=chi2)
fit = test.fit(chi2_inputs, chi2_target)

In [None]:
# Print arrays with three decimals from here on.
np.set_printoptions(precision=3)

# Keep only the columns retained by the selector and preview the first rows.
candidate_features = data.drop(columns="is_fraud")
features = fit.transform(candidate_features)
print(features[0:5, :])

# Table of chi2 scores per feature, best first, with a fresh 0..n-1 index.
feat = (
    pd.DataFrame({"num_features": candidate_features.columns, "score": fit.scores_})
    .sort_values(by="score", ascending=False)
    .reset_index(drop=True)
)
feat

In [None]:

def model_fit_transform(features):
    """Fit an Isolation Forest on ``data[features]`` and print its AUC against ``y_final``.

    Parameters:
        features: list of column names of the notebook-level ``data`` frame.

    Prints the first ten binary predictions (1 = anomaly, 0 = normal) followed
    by the ROC AUC of those predictions against ``y_final``. Returns nothing.
    """
    subset = data[features]
    forest = IsolationForest(contamination=0.5, random_state=42)
    forest.fit(subset)

    # predict() yields -1 for anomalies and 1 for normal points; recode as 1 / 0
    raw_labels = forest.predict(subset)
    flags = (raw_labels == -1).astype(raw_labels.dtype)
    print(f"{flags[:10]}\n\n")

    # evaluation
    print(roc_auc_score(y_final, flags))

In [None]:
# Names of the four best-scoring features in the chi2 table
feature_imp = feat["num_features"].head(4).tolist()

# Size of each candidate feature set
for selection in (important_features, feature_imp):
    print(len(selection))

In [None]:
# Score the Isolation Forest on the correlation / mutual-information feature set.
model_fit_transform(important_features)

In [None]:
# Score the Isolation Forest on the top chi2 feature set, for comparison.
model_fit_transform(feature_imp)

In [None]:
# Total number of columns currently in `data`.
len(data.columns)

## Classification des customers par cluster

In [None]:
# Preview the raw customers table.
# NOTE(review): this output may show personal fields (names, contact details) -
# clear it before sharing the notebook.
customers.head()

In [None]:
# Integer-encode the customer region, re-fitting the shared LabelEncoder on it.
region_codes = le.fit_transform(customers["region"])
customers["region_e"] = region_codes
customers.head()

In [None]:
# Preview the engineered feature frame before dimensionality reduction.
data.head()

In [None]:
from sklearn.decomposition import PCA

# Columns that must not enter the projection. Several of them are no longer in
# `data` at this point (the identifier/contact columns and transaction_date
# were dropped in earlier cells, and no cell of this notebook creates
# "anomalie"), so missing labels are ignored rather than raising KeyError on a
# fresh Restart & Run All.
columns_to_exclude = ["anomalie", "transaction_id", "customer_id", "email", "transaction_date", "phone_number"]
pca_inputs = data.drop(columns=columns_to_exclude, errors="ignore")

# Project onto the first two principal components; the seed keeps the result
# reproducible if a randomized SVD solver is selected.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(pca_inputs)

In [None]:
# Cluster the 2-D PCA projection with K-Means (default number of clusters).
kmeans = KMeans(random_state=42)
# Only the fitted estimator is needed afterwards (labels_ / cluster_centers_),
# so fit() replaces fit_transform(), whose distance matrix was just dumped
# into the cell output.
kmeans.fit(X_pca)

In [None]:
# Cluster index assigned to each row, and the coordinates of the cluster centres
labels, centers = kmeans.labels_, kmeans.cluster_centers_

In [None]:
# Scatter the two principal components, coloured by K-Means cluster, with a
# title and axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans.labels_, cmap='viridis')
ax.set_xlabel("Composante principale 1")
ax.set_ylabel("Composante principale 2")
ax.set_title("Clusters K-Means dans l'espace PCA")
plt.show()