In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
from datetime import datetime, timedelta

%matplotlib inline
sns.set()

In [None]:
# Open the SQLite warehouse; the same connection is reused by the next cells.
conn = sqlite3.connect('../Data/db/fraude_detection_warehouse_.db')
transactions = pd.read_sql(sql="SELECT * FROM transactions", con=conn)
transactions.head()

In [None]:
# Load the whole customers table through the already-open connection.
customers = pd.read_sql(sql="SELECT * FROM customers", con=conn)
customers.head()

In [None]:
# Load the whole devices table through the already-open connection.
devices = pd.read_sql(sql="SELECT * FROM devices", con=conn)
devices.head()

In [None]:
# City reference table read from a CSV file.
locations = pd.read_csv(filepath_or_buffer="../Data/worldcities.csv")

## Feature engineering

Détermination des features à utiliser pour chaque table.

In [None]:
# Columns kept from each source table.
trans_features = [
    "transaction_id",
    "device_id",
    "customer_id",
    "transaction_date",
    "amount",
    "transaction_type",
    "status",
    "location",
    "is_fraud",
]
dev_features = [
    "device_id",
    "device_type",
    "os",
]
cust_features = [
    "customer_id",
    "date_of_birth",
    "registration_date",
    "region",
    "email",
    "phone_number",
]

Extraction des features retenues dans les différentes tables.

In [None]:
# Keep only the selected columns of each table. The explicit .copy() makes
# every result an independent frame, so the column assignments performed on
# `customers` in later cells are unambiguous and do not raise
# SettingWithCopyWarning.
transactions = transactions[trans_features].copy()
devices = devices[dev_features].copy()
customers = customers[cust_features].copy()

display(transactions.head())
print("\n")
display(devices.head())
print("\n")
display(customers.head())

Calcul de l'âge des customers et de leur ancienneté depuis l'inscription (en années).

In [None]:
# Convert the date-of-birth column to datetime values.
customers["date_of_birth"] = pd.to_datetime(customers["date_of_birth"])

In [None]:
# Convert the registration-date column to datetime values.
customers["registration_date"] = pd.to_datetime(customers["registration_date"])

In [None]:
# "Today" is captured once so that every row is measured against the same instant.
date_of_today = datetime.now()


def _full_years_between(start, end):
    """Return the number of completed calendar years from `start` to `end`.

    When `start` lies after `end` the result is the negated count of completed
    years from `end` to `start` (so a date one day in the future yields 0).
    """
    if end < start:
        return -_full_years_between(end, start)
    # One year less when the anniversary has not been reached yet in end's year.
    anniversary_pending = (end.month, end.day) < (start.month, start.day)
    return int(end.year - start.year - anniversary_pending)


def calcul_date(registration_date, date_of_birth, reference_date=None):
    """Return (years since registration, years since birth) as integers.

    Years are counted as completed calendar years. The previous
    `days / 360` approximation overstated durations (roughly one extra year
    every 69 years) and was wrong around anniversaries.

    Parameters
    ----------
    registration_date, date_of_birth : datetime-like
        Objects exposing `.year`, `.month` and `.day` (datetime, pandas Timestamp).
    reference_date : datetime-like, optional
        Instant the durations are measured against; defaults to the
        module-level `date_of_today`.

    Returns
    -------
    tuple of (int, int)

    Raises
    ------
    ValueError
        When a date is missing (NaT), because the year count is then NaN and
        cannot be converted to int.
    """
    if reference_date is None:
        reference_date = date_of_today
    return (
        _full_years_between(registration_date, reference_date),
        _full_years_between(date_of_birth, reference_date),
    )

In [None]:
def _years_from_row(row):
    """Feed the two date columns of one customer row to `calcul_date`."""
    return calcul_date(row['registration_date'], row['date_of_birth'])


# result_type='expand' spreads the returned pair over the two new columns.
customers[['year_since_registration', 'year_since_birth']] = customers.apply(
    _years_from_row, axis=1, result_type='expand'
)

In [None]:
# The raw date columns are no longer needed once the year counts exist.
raw_date_columns = ["date_of_birth", "registration_date"]
customers = customers.drop(columns=raw_date_columns)
customers.head()

Nous utiliserons par la suite les régions (colonne `admin_name`) comme localisation des transactions.

In [None]:
# Restrict the city table to the three countries of interest and expose the
# city name under the same column name as in `transactions`.
target_countries = ["Cameroon", "Nigeria", "Gabon"]
locations = locations[["city", "country", "admin_name"]]
in_scope = locations["country"].isin(target_countries)
locations = (
    locations[in_scope]
    .drop(columns=["country"])
    .rename(columns={"city": "location"})
)
locations.head()

In [None]:
# `locations` can list the same city name more than once (homonymous cities);
# joining on it directly would repeat each matching transaction once per
# duplicate. De-duplicate the lookup first and let pandas verify the
# many-to-one relationship.
# NOTE(review): the first row per city name is kept — confirm this is the
# intended admin_name for homonymous cities.
unique_locations = locations.drop_duplicates(subset="location")
transactions_ = pd.merge(
    transactions,
    unique_locations,
    on="location",
    how="inner",  # transactions whose location is not in the lookup are dropped
    validate="many_to_one",
)
transactions_.head()

Fusion des DataFrames (transactions, customers et devices).

In [None]:
# Attach the customer attributes to every transaction (inner join on customer_id).
data_1 = transactions_.merge(customers, on="customer_id")
data_1.head()

In [None]:
# Attach the device attributes, drop the join key and the raw city name, then
# give the two region columns explicit names.
data_2 = (
    data_1
    .merge(devices, on="device_id")
    .drop(columns=["device_id", "location"])
    .rename(columns={"admin_name": "transaction_location", "region": "region_customer"})
)
data_2.head()

In [None]:
# Convert the transaction date column to datetime values.
data_2["transaction_date"] = pd.to_datetime(data_2["transaction_date"])

In [None]:
# Print the column dtypes and non-null counts of the merged frame.
data_2.info()

In [None]:
# Derive calendar features from the transaction timestamp.
transaction_dt = data_2['transaction_date'].dt
data_2['week'] = transaction_dt.isocalendar().week  # ISO week number
data_2['month_number'] = transaction_dt.month
data_2['year'] = transaction_dt.year
data_2.head()

In [None]:
# Work on an independent copy so that preprocessing leaves data_2 untouched.
data = data_2.copy()

## Data preprocessing

In [None]:
# `le` stays defined because a later cell refits it on customers.region.
le = LabelEncoder()
columns = ["transaction_type", "status", "transaction_location", "region_customer", "device_type", "os", "year"]

# One encoder per column: refitting a single shared encoder would discard the
# fitted classes of every column except the last one, making it impossible to
# map the integer codes back to their labels (inverse_transform).
label_encoders = {}
for column in columns:
    encoder = LabelEncoder()
    data[f"{column}_e"] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

# Drop the original columns in a single pass once all codes are in place.
data = data.drop(columns=columns)
display(data.head())

In [None]:
# Standardize the transaction amount and replace the raw column by the scaled one.
scaler = StandardScaler()
amount_scaled = scaler.fit_transform(data[["amount"]])
data["amount_sc"] = amount_scaled
data = data.drop(columns=["amount"])
data.head()

In [None]:
# num_features: names of the int64/float64 columns.
# cat_features: sub-frame holding the object-typed columns.
numeric_frame = data.select_dtypes(include=["int64", "float64"])
num_features = numeric_frame.columns.tolist()
cat_features = data.select_dtypes(include=["object"])
num_features

In [None]:
# Plot the pairwise correlation matrix of the numeric features. Passing the raw
# rows to heatmap would draw and annotate one cell per observation, which is
# unreadable and very slow on a full transaction table.
sns.heatmap(data[num_features].corr(), annot=True)

In [None]:
# Encoded categorical columns plus the scaled amount feed the anomaly detector.
final_features = [
    "transaction_type_e",
    "status_e",
    "transaction_location_e",
    "region_customer_e",
    "device_type_e",
    "os_e",
    "amount_sc",
]
X = data.loc[:, final_features]

# contamination=0.2: one observation in five is expected to be flagged.
model = IsolationForest(random_state=42, contamination=0.2)
model.fit(X)

In [None]:
# Store the model's prediction and keep the rows predicted as -1.
data["anomalie"] = model.predict(X)
is_outlier = data["anomalie"] == -1
anomalie = data.loc[is_outlier]
print(anomalie.shape)
anomalie.head()

## Segmentation des customers par clustering

In [None]:
# Integer-encode the customer region (the shared encoder is refit here).
customers["region_e"] = le.fit_transform(customers["region"])
customers.head()

In [None]:
from sklearn.decomposition import PCA

# Project the three customer features onto two principal components.
customer_features = customers[["year_since_birth", "year_since_registration", "region_e"]]
pca = PCA(n_components=2)
X_pca = pca.fit_transform(customer_features)

In [None]:
# n_init is set explicitly: its default changed between scikit-learn releases
# (10 -> 'auto'), so leaving it implicit makes the clustering depend on the
# installed version and triggers a FutureWarning on some of them.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)

# fit() is enough here: only labels_ and cluster_centers_ are used afterwards,
# and fit_transform() would print the full sample-to-centroid distance matrix
# as the cell output.
kmeans.fit(customers[["year_since_birth", "year_since_registration", "region_e"]])

In [None]:
# Cluster index of every customer and coordinates of the cluster centroids.
labels, centers = kmeans.labels_, kmeans.cluster_centers_

In [None]:
# Customers in the 2-D PCA plane, coloured by their k-means cluster.
plt.scatter(x=X_pca[:, 0], y=X_pca[:, 1], c=kmeans.labels_, cmap='viridis')

In [None]:
# Despite its name, the 'center' column stores the k-means cluster label of
# each customer (kmeans.labels_), not centroid coordinates.
customers['center'] = kmeans.labels_
customers.head()