# PROJET MACH_BDA_DATAVIZ

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import altair as alt
from pathlib import Path
from datasets import load_dataset
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Fetch the "d0r1h/customer_churn" dataset through `datasets.load_dataset`.
dataset = load_dataset("d0r1h/customer_churn")

# Turn the 'train' split into a pandas DataFrame.
df = pd.DataFrame(dataset['train'])

df_reset = df.copy() # snapshot kept so the whole dataset does not have to be reloaded

In [None]:
def reset_dataframe():
    """Return the notebook-level ``df_reset`` snapshot.

    NOTE(review): the very same object is returned (no copy is made), so any
    in-place change a caller applies to the result is also applied to
    ``df_reset`` and is visible on every later call.
    """
    return df_reset

In [None]:
# Show the first row to get a feel for the columns and their values.
df.iloc[0]

## Traitement des Données

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Summary statistics produced by DataFrame.describe() with its defaults.
df.describe()

In [None]:
def clean_dataframe(df):
    """Clean the raw churn frame and derive date/time features.

    Steps, in order:
      1. drop rows containing missing values and the ``security_no`` /
         ``referral_id`` columns, and replace ``'?'`` with ``'Unknown'`` in
         ``medium_of_operation``;
      2. keep only rows whose ``days_since_last_login``, ``avg_time_spent``
         and ``points_in_wallet`` are strictly positive and whose
         ``avg_frequency_login_days`` is not the literal ``"Error"``;
      3. convert ``avg_frequency_login_days`` to a numeric column;
      4. replace ``joining_date`` (``%d-%m-%Y``) by ``joining_year`` /
         ``joining_month`` / ``joining_day`` and ``last_visit_time``
         (``%H:%M:%S``) by ``last_visit_hour`` / ``last_visit_minute``.

    Args:
        df: raw DataFrame holding the columns named above.

    Returns:
        A new DataFrame with the cleaned rows (original index labels kept).

    NOTE(review): step 1 still modifies ``df`` in place, exactly as before;
    the rest of the notebook reuses the same object afterwards. Pass
    ``df.copy()`` if the caller's frame must stay untouched.
    """
    # Step 1 - performed in place on the caller's frame (see NOTE above).
    df.dropna(inplace=True)
    df.drop(columns=['security_no', 'referral_id'], inplace=True)
    df['medium_of_operation'] = df['medium_of_operation'].replace('?', 'Unknown')

    # Step 2 - one combined mask instead of four successive filters.
    valid_rows = (
        (df['days_since_last_login'] > 0)
        & (df['avg_time_spent'] > 0)
        & (df['points_in_wallet'] > 0)
        & (df['avg_frequency_login_days'] != "Error")
    )
    # Explicit copy: the columns added below must not be written through a
    # slice of `df` (source of SettingWithCopyWarning).
    cleaned = df[valid_rows].copy()

    # Step 3
    cleaned['avg_frequency_login_days'] = pd.to_numeric(cleaned['avg_frequency_login_days'])

    # Step 4 - parse each column once and read the components from `.dt`.
    joined_on = pd.to_datetime(cleaned['joining_date'], format='%d-%m-%Y')
    visited_at = pd.to_datetime(cleaned['last_visit_time'], format='%H:%M:%S')

    cleaned['joining_year'] = joined_on.dt.year
    cleaned['joining_month'] = joined_on.dt.month
    cleaned['joining_day'] = joined_on.dt.day

    # Cast to int64 so these columns keep the dtype they had when they were
    # built from Python ints.
    cleaned['last_visit_hour'] = visited_at.dt.hour.astype('int64')
    cleaned['last_visit_minute'] = visited_at.dt.minute.astype('int64')

    return cleaned.drop(columns=['joining_date', 'last_visit_time'])

Normalisation min-max des données numériques (ramène toutes les valeurs entre 0 et 1 ; il reste à vérifier que c'est vraiment nécessaire) :

In [None]:
def normalize_numeric_columns(df):
    """Rescale every numeric column of ``df`` to the [0, 1] range.

    The scaling is a min-max normalisation done with scikit-learn's
    ``MinMaxScaler``, fitted on the frame itself.

    Args:
        df: DataFrame to rescale. It is modified in place.

    Returns:
        The same ``df`` object, with its numeric columns replaced by their
        scaled values. The frame is returned unchanged when it has no rows
        or no numeric column.
    """
    # 'number' covers every integer/float width (int32, float32, ...), not
    # only int64/float64.
    numeric_columns = df.select_dtypes(include='number').columns

    # MinMaxScaler refuses empty input, so there is nothing to do here.
    if df.empty or len(numeric_columns) == 0:
        return df

    df[numeric_columns] = MinMaxScaler().fit_transform(df[numeric_columns])
    return df

Encodage des données non numériques:

In [None]:
def encode_columns(df):
    """Return a copy of ``df`` whose object-dtype columns are integer-coded.

    For each object column, the distinct values are numbered 0, 1, 2, ... in
    the order given by ``Series.unique()`` and every cell is replaced by the
    number of its value. Other columns are copied unchanged, and the input
    frame itself is not modified.
    """
    encoded_df = df.copy()

    for col in df.select_dtypes(include=['object']).columns.tolist():
        distinct = df[col].unique()
        codes = dict(zip(distinct, range(len(distinct))))
        # Bind the table as a default argument so the lookup does not depend
        # on the loop variable.
        encoded_df[col] = encoded_df[col].apply(lambda value, table=codes: table[value])

    return encoded_df

In [None]:
# Start again from the stored snapshot (reset_dataframe returns df_reset itself, not a copy).
df = reset_dataframe()

In [None]:
# Clean the frame; from here on `df` is the value returned by clean_dataframe.
df = clean_dataframe(df)

## Analyse et visualisation exploratoire des Données

#### Matrice de corrélation pour voir si certaines features sont redondantes

In [None]:
# Encode the object columns as integers so that every column takes part in
# the correlation matrix.
df = encode_columns(df)

# Long format: one row per (Variable1, Variable2) pair, as Altair expects.
corr_matrix = df.corr().reset_index().melt('index')
corr_matrix.columns = ['Variable1', 'Variable2', 'Correlation']
corr_matrix['Correlation'] = corr_matrix['Correlation'].round(2)

heatmap = alt.Chart(corr_matrix).mark_rect().encode(
    x='Variable1:O',
    y='Variable2:O',
    color=alt.Color('Correlation:Q', scale=alt.Scale(scheme='turbo')),
    tooltip=['Variable1', 'Variable2', 'Correlation']
).properties(
    width=1000,
    height=1000
)

# Cell labels: white text above 0.5, black otherwise. Both colours are
# constants, so no scale is attached to the condition.
text = heatmap.mark_text(baseline='middle').encode(
    text='Correlation:Q',
    color=alt.condition(
        alt.datum.Correlation > 0.5,
        alt.value('white'),
        alt.value('black')
    )
)
heatmap + text

In [None]:
# Disable Altair's row limit
alt.data_transformers.disable_max_rows()

# Go back to the stored snapshot returned by reset_dataframe.
df = reset_dataframe()

#### Age des clients

In [None]:
# Age distribution: ages binned on x (at most 30 bins), row count on y,
# coloured by age with the viridis scheme.
hist_age = (
    alt.Chart(df)
    .mark_rect()
    .encode(
        x=alt.X('age:Q', bin=alt.Bin(maxbins=30), title='Âge'),
        y=alt.Y('count()', title='Fréquence'),
        color=alt.Color('age', scale=alt.Scale(scheme='viridis')),
    )
    .properties(title="Distribution de l'âge des clients", width=600, height=400)
)

hist_age.display()

#### Distribution des genres

In [None]:
# Gender split as a pie chart: arc angle = number of rows per gender.
pie_gender = (
    alt.Chart(df)
    .mark_arc(outerRadius=120)
    .encode(
        theta='count()',
        color='gender:N',
        tooltip=['gender', 'count()'],
    )
    .properties(title='Répartition des genres')
)

pie_gender

# the "unknown" slice is very small; it sits at the top, with 35 rows

#### Catégorie de région

In [None]:
# Horizontal bars: number of rows for each region category.
bar_region = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        alt.X('count()', title='Fréquence'),
        alt.Y('region_category:N', title='Catégorie de région'),
        color='region_category:N',
    )
    .properties(title='Distribution par catégorie de région', width=600, height=400)
)

bar_region.display()

#### Catégorie d'abonnement

In [None]:
# Horizontal bars: number of rows for each membership category, coloured by
# age with the turbo scheme.
bar_membership = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        alt.X('count()', title='Fréquence'),
        alt.Y('membership_category:N', title="Catégorie d'abonnement"),
        color=alt.Color('age', scale=alt.Scale(scheme='turbo')),
    )
    .properties(title="Distribution par catégorie d'abonnement", width=600, height=400)
)

bar_membership.display()

#### Feedback

In [None]:
# Two-column table: one row per feedback label with its number of rows.
feedback_counts = (
    df['feedback']
    .value_counts()
    .rename_axis('feedback')
    .reset_index(name='count')
)

# Fixed colours: dark blue for the first label, red for the next four,
# green for the last four.
color_scale = alt.Scale(
    domain=[
        'No reason specified',
        'Poor Customer Service',
        'Poor Product Quality',
        'Poor Website',
        'Too many ads',
        'Products always in Stock',
        'Quality Customer Care',
        'Reasonable Price',
        'User Friendly Website',
    ],
    range=['#00008b'] + ['#ff0000'] * 4 + ['#00ff00'] * 4,
)

bar_feedback = (
    alt.Chart(feedback_counts)
    .mark_bar()
    .encode(
        x='feedback:N',
        y='count:Q',
        tooltip=['feedback', 'count'],
        color=alt.Color('feedback:N', scale=color_scale),
    )
    .properties(title='Répartition des feedbacks', width=600, height=400)
)

bar_feedback.display()

#### Points dans le portefeuille

In [None]:
# Wallet points distribution: points binned on x (at most 30 bins), row count
# on y, coloured by the points value with the blues scheme.
hist_points_wallet = (
    alt.Chart(df)
    .mark_rect()
    .encode(
        x=alt.X('points_in_wallet:Q', bin=alt.Bin(maxbins=30), title='Points dans le portefeuille'),
        y=alt.Y('count()', title='Nombre de clients'),
        color=alt.Color('points_in_wallet', scale=alt.Scale(scheme='blues')),
    )
    .properties(title='Distribution des points dans le portefeuille', width=600, height=400)
)

hist_points_wallet.display()

## Évaluation de plusieurs modèles avec les bonnes métriques

In [None]:
# Integer-code the object columns, then rescale the numeric columns to [0, 1].
df = normalize_numeric_columns(encode_columns(df))

In [None]:
# Features / target split: churn_risk_score is the value to predict.
X = df.drop(columns=['churn_risk_score'])
y = df['churn_risk_score']

# Hold out 20% of the rows for testing (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate classifiers. The randomised estimators get the same seed as the
# split so that the reported accuracies are identical from one run to the next.
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(kernel='linear'),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'AdaBoost': AdaBoostClassifier(random_state=42),
}

# Fit every model on the training split and record its test-set accuracy.
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    results[model_name] = accuracy

# Report once every model has been trained.
for model_name, accuracy in results.items():
    print(f"{model_name} Accuracy: {accuracy:.2f}")


Optimisation des meilleurs modèles (Gradient Boosting et Random Forest) pour améliorer leur accuracy :

In [None]:
# Randomised hyper-parameter search for GradientBoostingClassifier
# (100 sampled candidates, 5-fold cross-validation, accuracy scoring).
# NOTE(review): left commented out - presumably because the search is slow;
# confirm before re-enabling.
# model = GradientBoostingClassifier()

# param_dist = {
#     'n_estimators': [100, 200, 300],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 4, 5],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'subsample': [0.8, 0.9, 1.0]
# }

# random_search_gdb = RandomizedSearchCV(estimator=model, param_distributions=param_dist, 
#                                    n_iter=100, cv=5, n_jobs=-1, verbose=2, random_state=42, scoring='accuracy')

# random_search_gdb.fit(X_train, y_train)

In [None]:
# Report for the Gradient Boosting search: best parameters, best
# cross-validation score and test-set accuracy of the best estimator.
# Disabled together with the search itself.
# print("Best parameters found: ", random_search_gdb.best_params_)

# print("Best cross-validation score: {:.2f}".format(random_search_gdb.best_score_))

# best_model_gdb = random_search_gdb.best_estimator_
# y_pred_gdb = best_model_gdb.predict(X_test)
# accuracy_gdb = accuracy_score(y_test, y_pred_gdb)

# print("Test set accuracy: {:.2f}".format(accuracy_gdb))

Best parameters found: {'subsample': 1.0, 'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 5, 'learning_rate': 0.1}
Best cross-validation score: 0.94
Test set accuracy: 0.95

In [None]:
# Randomised hyper-parameter search for RandomForestClassifier
# (100 sampled candidates, 5-fold cross-validation, accuracy scoring).
# NOTE(review): unlike the Gradient Boosting search, no random_state is
# passed here, so the sampled candidates differ between runs.
# NOTE(review): left commented out - presumably because the search is slow;
# confirm before re-enabling.
# model = RandomForestClassifier()

# param_distributions = {
#     'n_estimators': np.arange(100, 501, 50),
#     'max_depth': np.arange(3, 21, 2),
#     'min_samples_split': np.arange(2, 21, 2),
#     'min_samples_leaf': np.arange(1, 11, 1),
#     'bootstrap': [True, False]
# }

# random_search_rf = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, n_iter=100, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
# random_search_rf.fit(X_train, y_train)

In [None]:
# Report for the Random Forest search: best parameters, best
# cross-validation score and test-set accuracy of the best estimator.
# Disabled together with the search itself.
# print("Best parameters found: ", random_search_rf.best_params_)

# print("Best cross-validation score: {:.2f}".format(random_search_rf.best_score_))

# best_model_rf = random_search_rf.best_estimator_
# y_pred_rf = best_model_rf.predict(X_test)
# accuracy_rf = accuracy_score(y_test, y_pred_rf)

# print("Test set accuracy: {:.2f}".format(accuracy_rf))

Best parameters found:  {'n_estimators': 250, 'min_samples_split': 16, 'min_samples_leaf': 3, 'max_depth': 19, 'bootstrap': False}
Best cross-validation score: 0.94
Test set accuracy: 0.94

In [None]:
def features_importance(model, coeff_method=None):
    """Build a table of per-feature importances for a fitted model.

    Args:
        model: fitted estimator.
        coeff_method: pass ``'arbres'`` (French for "trees") to read
            ``model.feature_importances_``; with any other value the first
            row of ``model.coef_`` is used instead.

    Returns:
        DataFrame with the columns ``Feature`` and ``Importance``, sorted by
        decreasing ``Importance`` (signed values, not absolute ones).

    Feature names are taken from ``model.feature_names_in_`` when the model
    exposes it, so the labels always match the columns the model was trained
    on; otherwise the notebook-level ``X.columns`` is used.
    """
    if coeff_method == 'arbres':
        importances = model.feature_importances_
    else:
        importances = model.coef_[0]

    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        # Fallback: the model was not fitted on a DataFrame (or predates
        # feature_names_in_), so rely on the notebook's feature matrix.
        feature_names = X.columns

    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    return feature_importance_df

In [None]:
# Importances of the Random Forest stored in `models` ('arbres' selects feature_importances_).
feature_importance_df = features_importance(models['Random Forest'], 'arbres')

In [None]:
# Display the importance table.
feature_importance_df

## Visualiser et interpréter les résultats

In [None]:
# Horizontal bar chart of the importances, largest value at the top
# (features sorted by decreasing x).
feature_importance_chart = alt.Chart(feature_importance_df).mark_bar().encode(
    x=alt.X('Importance:Q', title='Importance'),
    y=alt.Y('Feature:O', title='Feature', sort='-x'),
    tooltip=['Feature', 'Importance']
).properties(
    title='Feature Importances',
    width=600,
    height=400
)

# Rendered with display(), like every other chart in this notebook.
feature_importance_chart.display()