In [220]:
import pandas as pd
import tensorflow as tf
import numpy as np

from tensorflow.keras.layers import StringLookup, CategoryEncoding, Normalization, Lambda, Concatenate
from tensorflow.keras.models import Model

In [221]:
data = pd.read_csv('../data/cleaned_datas.csv')
df = data.copy()
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,female,0,yes,no,1,no,no phone service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic check,29.85,29.85,no
1,5575-GNVDE,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one year,no,mailed check,56.95,1889.50,no
2,3668-QPYBK,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed check,53.85,108.15,yes
3,7795-CFOCW,male,0,no,no,45,no,no phone service,dsl,yes,...,yes,yes,no,no,one year,no,bank transfer (automatic),42.30,1840.75,no
4,9237-HQITU,female,0,no,no,2,yes,no,fiber optic,no,...,no,no,no,no,month-to-month,yes,electronic check,70.70,151.65,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,male,0,yes,yes,24,yes,yes,dsl,yes,...,yes,yes,yes,yes,one year,yes,mailed check,84.80,1990.50,no
7039,2234-XADUH,female,0,yes,yes,72,yes,yes,fiber optic,no,...,yes,no,yes,yes,one year,yes,credit card (automatic),103.20,7362.90,no
7040,4801-JZAZL,female,0,yes,yes,11,no,no phone service,dsl,yes,...,no,no,no,no,month-to-month,yes,electronic check,29.60,346.45,no
7041,8361-LTMKD,male,1,yes,no,4,yes,yes,fiber optic,no,...,no,no,no,no,month-to-month,yes,mailed check,74.40,306.60,yes


In [222]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [223]:
unique_values = pd.DataFrame({
    'Variable': df.columns,
    'Unique Values': [list(df[col].unique()) for col in df.columns]
})


In [224]:
unique_values

Unnamed: 0,Variable,Unique Values
0,customerID,"[7590-VHVEG, 5575-GNVDE, 3668-QPYBK, 7795-CFOC..."
1,gender,"[female, male]"
2,SeniorCitizen,"[0, 1]"
3,Partner,"[yes, no]"
4,Dependents,"[no, yes]"
5,tenure,"[1, 34, 2, 45, 8, 22, 10, 28, 62, 13, 16, 58, ..."
6,PhoneService,"[no, yes]"
7,MultipleLines,"[no phone service, no, yes]"
8,InternetService,"[dsl, fiber optic, no]"
9,OnlineSecurity,"[no, yes, no internet service]"


In [225]:
# Variables
id_col = "customerID"
target_col = "Churn"

# Colonnes numériques (à normaliser)
numeric_features = ["tenure", "MonthlyCharges", "TotalCharges"]
for col in df[numeric_features]:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    df.fillna({col: 0}, inplace=True)

# Colonnes binaires (yes/no à encoder comme int)
binary_features = [
    "gender", "Partner", "Dependents", "PhoneService", "PaperlessBilling"
]
binary_numeric_features = ['SeniorCitizen']

# Colonnes catégorielles avec 3+ modalités (one-hot encoding)
multiclass_features = [
    "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
    "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
    "Contract", "PaymentMethod"
]

In [226]:
def get_preprocessing_layers(df):
    inputs = {}
    encoded_features = []
    encoded_column_names = []

    # Variables numériques
    for feature in numeric_features:
        inputs[feature] = tf.keras.Input(shape=(1,), name=feature)
        normalizer = Normalization()
        normalizer.adapt(np.array(df[feature]).reshape(-1, 1))
        encoded = normalizer(inputs[feature])
        encoded_features.append(encoded)
        encoded_column_names.append(feature)
        # print(f"Numeric feature '{feature}' -> 1 column")

    binary_vocab = {
    "gender": ["female", "male"],
    "Partner": ["yes", "no"],
    "Dependents": ["yes", "no"],
    "PhoneService": ["yes", "no"],
    "PaperlessBilling": ["yes", "no"],
    }
    # Variables binaires texte (ex: yes/no)
    for feature in binary_features:
        inputs[feature] = tf.keras.Input(shape=(1,), dtype=tf.string, name=feature)
        vocab = binary_vocab.get(feature, ["no", "yes"])  # fallback si pas dans dict
        lookup = StringLookup(vocabulary=vocab, output_mode="int", oov_token=None)
        encoded_int = lookup(inputs[feature])
        encoded = Lambda(lambda x: tf.cast(x - 1, tf.float32))(encoded_int)
        encoded_features.append(encoded)
        encoded_column_names.append(feature)
        # print(f"Binary feature '{feature}' -> 1 column")

    # Variables binaires numériques (0/1)
    for feature in binary_numeric_features:
        inputs[feature] = tf.keras.Input(shape=(1,), dtype=tf.int32, name=feature)
        encoded = Lambda(lambda x: tf.cast(x, tf.float32))(inputs[feature])
        encoded_features.append(encoded)
        encoded_column_names.append(feature)
        # print(f"Binary numeric feature '{feature}' -> 1 column")

    # Variables multiclasses (one-hot)
    for feature in multiclass_features:
        inputs[feature] = tf.keras.Input(shape=(1,), dtype=tf.string, name=feature)
        lookup = StringLookup(output_mode="int", oov_token="[UNK]")
        lookup.adapt(df[feature])
        encoded_int = lookup(inputs[feature])
        one_hot = CategoryEncoding(num_tokens=lookup.vocabulary_size(), output_mode="one_hot")
        encoded = one_hot(encoded_int)
        encoded_features.append(encoded)

        vocab = lookup.get_vocabulary()
        # Exclusion possible du token OOV si souhaité ici
        # Exemple : col_names = [f"{feature}_{cat}" for cat in vocab if cat != "[UNK]"]
        col_names = [f"{feature}_{cat}" for cat in vocab]
        encoded_column_names.extend(col_names)
        # print(f"Multiclass feature '{feature}' -> {len(col_names)} columns")

    concatenated = Concatenate()(encoded_features)
    preprocessing_model = Model(inputs=inputs, outputs=concatenated, name="preprocessing")

    return preprocessing_model, inputs, encoded_features, encoded_column_names


In [227]:
# Traitement à part pour la cible
target_lookup = tf.keras.layers.StringLookup(vocabulary=["no", "yes"], output_mode="int")
y = target_lookup(df[target_col].values) - 1  # tenseur ou numpy array

In [228]:
preprocessing_model, inputs, encoded_features, encoded_column_names = get_preprocessing_layers(df)

In [229]:
preprocessing_model

<Functional name=preprocessing, built=True>

In [230]:
# Utilise le preprocessing_model déjà construit et les variables existantes
df_inputs = {name: df[name].values for name in inputs}
encoded_array = preprocessing_model.predict(df_inputs, verbose=0)

# Si encoded_array est une liste de tableaux, concatène-les
if isinstance(encoded_array, list):
    encoded_concat = np.concatenate(encoded_array, axis=1)
else:
    encoded_concat = encoded_array

# Crée le DataFrame avec les bons noms de colonnes
encoded_df = pd.DataFrame(encoded_concat, columns=encoded_column_names)


In [231]:
encoded_df

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender,Partner,Dependents,PhoneService,PaperlessBilling,SeniorCitizen,MultipleLines_[UNK],...,StreamingMovies_no internet service,Contract_[UNK],Contract_month-to-month,Contract_two year,Contract_one year,PaymentMethod_[UNK],PaymentMethod_electronic check,PaymentMethod_mailed check,PaymentMethod_bank transfer (automatic),PaymentMethod_credit card (automatic)
0,-1.277444,-1.160323,-0.992611,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.066327,-0.259629,-0.172165,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,-1.236724,-0.362660,-0.958066,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.514251,-0.746536,-0.193672,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-1.236724,0.197365,-0.938875,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,-0.340876,0.665992,-0.127605,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
7039,1.613701,1.277533,2.242606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
7040,-0.870241,-1.168632,-0.852932,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7041,-1.155283,0.320338,-0.870513,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [232]:
encoded_df.to_csv('../data/encoded_df.csv', index=False)

### **Keras Normalization Layer**
La couche tf.keras.layers.Normalization est une couche de prétraitement qui permet de normaliser les données numériques en apprenant les statistiques (moyenne, écart-type) du jeu de données.

***Fonctionnement***
- Adaptation : la méthode .adapt(data) calcule la moyenne et l’écart-type à partir des données fournies.
- Transformation : lors de l’appel de la couche, elle normalise les données en appliquant la formule classique de normalisation z-score.

***Pourquoi utiliser cette couche ?***
- Robustesse : la normalisation est un prérequis important pour de nombreux algorithmes d’apprentissage, notamment les réseaux neuronaux, car elle facilite la convergence et stabilise l’entraînement.
- Automatisation : Keras propose une couche intégrée qui peut être insérée directement dans un pipeline, ce qui rend la normalisation réutilisable et intégrée dans un modèle TensorFlow/Keras.

***Limites / Points d’attention***
La normalisation dépend des données d’adaptation : si les statistiques changent (par exemple en production), il faudra re-adapter ou gérer dynamiquement.

Adaptée pour des données numériques continues, pas pour les données catégorielles.


### **Keras StringLookup + CategoryEncoding**
**StringLookup**
Sert à transformer des chaînes de caractères en indices entiers (encodage numérique).
Permet de gérer un vocabulaire dynamique (adapté au jeu de données).

Peut gérer un token hors-vocabulaire (OOV) pour les valeurs inconnues.

**CategoryEncoding**
Transforme les indices entiers en représentations one-hot ou multi-hot.
Utile pour représenter des variables catégorielles dans un format exploitable par les modèles de machine learning.

***Pourquoi utiliser cette combinaison ?***
- Modularité : chaque étape (lookup puis encodage) est claire, paramétrable et adaptée à la manipulation de données catégorielles dans TensorFlow.
- Performance : intégré à TensorFlow, ce processus est optimisé pour le calcul sur GPU et la compatibilité avec les modèles Keras.
- Sécurité : gestion explicite des tokens inconnus évite les erreurs liées aux catégories non vues en entraînement.

Alternatives classiques
pandas.get_dummies() ou sklearn.OneHotEncoder : hors TensorFlow, plus simple pour un usage hors pipeline TensorFlow.
tf.keras.layers.experimental.preprocessing.StringLookup et CategoryEncoding sont privilégiés dans un workflow Keras/TF pour garder la cohérence dans la pipeline et le modèle.