# PROJET MACH_BDA_DATAVIZ

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
from datasets import load_dataset
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Download the customer-churn dataset and materialise its "train" split as a DataFrame.
dataset = load_dataset("d0r1h/customer_churn")

df = pd.DataFrame(dataset['train'])

# Keep an independent copy of the raw data so it can be restored without
# reloading the dataset. A plain `df_reset = df` would only alias the same
# object, so any in-place change to `df` would also alter the backup.
df_reset = df.copy()

In [None]:
def reset_dataframe():
    """Return a fresh copy of the raw dataset saved in ``df_reset``.

    A copy is returned (rather than the saved object itself) so that callers
    can modify the result freely without corrupting the backup.
    """
    return df_reset.copy()

In [None]:
# Display the first row (as a Series) for a quick look at the columns and sample values.
df.iloc[0]

## Traitement des Données

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
def clean_dataframe(df):
    """Return a cleaned copy of the raw customer frame.

    Steps, in order:
      1. drop every row that contains a missing value;
      2. drop the ``security_no`` and ``referral_id`` columns;
      3. replace the ``'?'`` placeholder in ``medium_of_operation`` with
         ``'Unknown'``;
      4. keep only rows whose ``days_since_last_login``, ``avg_time_spent``
         and ``points_in_wallet`` are all strictly positive;
      5. split ``joining_date`` (``%d-%m-%Y``) into year/month/day columns and
         ``last_visit_time`` (``%H:%M:%S``) into hour/minute columns, then
         drop the two source columns.

    Args:
        df: Raw DataFrame containing the columns named above. It is left
            untouched; all work happens on a copy.

    Returns:
        A new DataFrame that keeps the original index labels of the
        surviving rows.
    """
    # `dropna()` / `drop()` without `inplace=True` return new objects, so the
    # caller's frame is never modified.
    cleaned = df.dropna().drop(columns=['security_no', 'referral_id'])

    cleaned['medium_of_operation'] = cleaned['medium_of_operation'].replace('?', 'Unknown')

    strictly_positive = (
        (cleaned['days_since_last_login'] > 0)
        & (cleaned['avg_time_spent'] > 0)
        & (cleaned['points_in_wallet'] > 0)
    )
    # Explicit copy: columns are added below, and assigning into a filtered
    # slice would otherwise be a chained assignment.
    cleaned = cleaned.loc[strictly_positive].copy()

    joined_on = pd.to_datetime(cleaned['joining_date'], format='%d-%m-%Y')
    visited_at = pd.to_datetime(cleaned['last_visit_time'], format='%H:%M:%S')

    # The `.dt` field accessors return int32 on recent pandas versions. Cast to
    # int64 so the derived features have the same dtype on every version and
    # are picked up by code that selects int64/float64 columns.
    cleaned['joining_year'] = joined_on.dt.year.astype('int64')
    cleaned['joining_month'] = joined_on.dt.month.astype('int64')
    cleaned['joining_day'] = joined_on.dt.day.astype('int64')

    cleaned['last_visit_hour'] = visited_at.dt.hour.astype('int64')
    cleaned['last_visit_minute'] = visited_at.dt.minute.astype('int64')

    return cleaned.drop(columns=['joining_date', 'last_visit_time'])

Normalisation min-max des données numériques (ramener toutes les valeurs entre 0 et 1 ; pas sûr que ce soit vraiment nécessaire) :

In [None]:
def normalize_numeric_columns(df, exclude=()):
    """Return a copy of ``df`` with its int64/float64 columns min-max scaled.

    Args:
        df: Input DataFrame. It is not modified.
        exclude: Optional column name, or iterable of column names, to leave
            untouched even when numeric (for example a target column). The
            default scales every int64/float64 column.

    Returns:
        A new DataFrame in which each selected column has been rescaled with
        ``MinMaxScaler`` (default range, i.e. 0 to 1). Other columns are
        copied unchanged.
    """
    scaled = df.copy()

    # A bare string would otherwise be split into single characters by set().
    if isinstance(exclude, str):
        exclude = [exclude]
    skipped = set(exclude)

    numeric_columns = [
        col
        for col in scaled.select_dtypes(include=['float64', 'int64']).columns
        if col not in skipped
    ]

    # The scaler raises on an empty selection (no column or no row), so
    # return the untouched copy in that case.
    if not numeric_columns or scaled.empty:
        return scaled

    scaled[numeric_columns] = MinMaxScaler().fit_transform(scaled[numeric_columns])
    return scaled

Encodage des données non numériques:

In [None]:
def encode_columns(df, binary_columns):
    """Return an encoded copy of ``df``.

    Columns listed in ``binary_columns`` are label-encoded in place of their
    original values. Every other ``object`` column is one-hot encoded. All
    remaining columns are passed through unchanged.

    Args:
        df: Input DataFrame. It is not modified.
        binary_columns: Names of the columns to label-encode.

    Returns:
        A new DataFrame whose columns are the one-hot features (named by the
        encoder's ``get_feature_names_out``) followed by the remaining columns
        in their original order. The row index of ``df`` is preserved.
    """
    # Work on a copy so the label encoding below does not alter the caller's frame.
    df = df.copy()

    non_numeric_columns = df.select_dtypes(include=['object']).columns
    one_hot_encoder_cols = [col for col in non_numeric_columns if col not in binary_columns]

    for col in binary_columns:
        df[col] = LabelEncoder().fit_transform(df[col])

    # Nothing to one-hot encode: the transformer would have no fitted encoder
    # to ask for feature names, so return the label-encoded copy directly.
    if not one_hot_encoder_cols:
        return df

    # sparse_threshold=0 forces a dense array. With the default threshold the
    # stacked output can come back as a SciPy sparse matrix, which does not fit
    # the plain DataFrame constructor used below.
    ct = ColumnTransformer(
        transformers=[('onehot', OneHotEncoder(), one_hot_encoder_cols)],
        remainder='passthrough',
        sparse_threshold=0,
    )
    encoded_values = ct.fit_transform(df)

    onehot_feature_names = ct.named_transformers_['onehot'].get_feature_names_out(one_hot_encoder_cols)
    # The transformer emits its own features first, then the passthrough
    # columns in their original order.
    passthrough_names = [col for col in df.columns if col not in one_hot_encoder_cols]
    all_feature_names = list(onehot_feature_names) + passthrough_names

    # Keep the original index so the rows still line up with the source frame.
    return pd.DataFrame(encoded_values, columns=all_feature_names, index=df.index)

In [None]:
# Run the preprocessing steps: clean the raw rows, then rescale the numeric columns.
df = clean_dataframe(df)
# NOTE(review): this rescales every int64/float64 column, which would include the
# `churn_risk_score` target if it is stored as a number - confirm this is intended.
df = normalize_numeric_columns(df)
# Categorical encoding is currently disabled:
# encoded_df = encode_columns(df, binary_columns=['gender', 'joined_through_referral', 'used_special_discount', 'offer_application_preference', 'past_complaint'])

Je n'ai pas encore réussi à encoder les colonnes non numériques.

## Analyse et visualisation exploratoire des Données

## Évaluation de plusieurs modèles avec les bonnes métriques

In [None]:
# Keep only the float64/int64 columns; every other dtype is discarded.
df = df.select_dtypes(include=['float64', 'int64'])

In [None]:
class PrevisionChurn:
    """Thin wrapper around an estimator exposing ``fit`` and ``predict``."""

    def __init__(self, model):
        """Store the estimator that will be trained and queried."""
        self.model = model

    def train(self, X_train, y_train):
        """Fit the wrapped estimator on the training features and labels."""
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the wrapped estimator's predictions for ``X_test``."""
        predictions = self.model.predict(X_test)
        return predictions

    def evaluate(self, X_test, y_test):
        """Return the accuracy score of the predictions for ``X_test`` against ``y_test``."""
        return accuracy_score(y_test, self.predict(X_test))

In [None]:
# Separate the features from the target column.
X = df.drop(columns=['churn_risk_score'])
y = df['churn_risk_score']

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest as well. With only the split seeded, the reported accuracy
# and the feature importances would change from one run to the next.
model = RandomForestClassifier(random_state=42)

predictive_model = PrevisionChurn(model)

predictive_model.train(X_train, y_train)

# Accuracy on the held-out rows, printed as a percentage rounded to two decimals.
accuracy = predictive_model.evaluate(X_test, y_test)
print("Accuracy:", np.round(accuracy * 100,2))


In [None]:
# Importance of each feature as reported by the fitted model.
feature_importances = model.feature_importances_

# Pair each feature name with its importance, most important first.
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
)
feature_importance_df = feature_importance_df.sort_values(
    by='Importance', ascending=False
)

print(feature_importance_df)

## Visualiser et interpréter les résultats

In [None]:
# Horizontal bar chart of the feature importances, one bar per feature.
fig, ax = plt.subplots(figsize=(6, 3))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()