## Feature Engineering : part.2

**Ce script a pour but de traiter les données et de créer des features en vue de l'élaboration d'un algorithme de détection du churn parmi les clients.**
Nous nous servons ici du dataset déjà traité dans le premier script de feature engineering.

#### Importation des librairies

In [1]:
import pandas as pd
import numpy as np
import os

# Do not cap the number of columns shown when a DataFrame is displayed
pd.options.display.max_columns = None

  return f(*args, **kwds)
  return f(*args, **kwds)


#### Chargement des données

In [2]:
# Path to the directory containing the data files
PATH_DATA = "../data"

In [40]:
# Lecture des données
def read_regular_donator(data_filepath):
    """
    Load the regular-donation records saved by the first feature-engineering script.

    Reads 'dons_reguliers_cleaned.csv' from the `data_filepath` directory and
    returns a DataFrame restricted to the donation columns listed below, in
    that order.
    """
    csv_path = os.path.join(data_filepath, 'dons_reguliers_cleaned.csv')
    kept_columns = [
        'DATE_DE_RECEPTION',
        'ID_CONTACT',
        'CAM_CODE',
        'OFT_CODE',
        'MONTANT_VENTILATION',
    ]
    # Column selection is done after reading so the output order follows the list
    return pd.read_csv(csv_path)[kept_columns]

def read_contacts(data_filepath):
    """
    Load the 'contacts' table saved by the first feature-engineering script.

    Reads 'dons_contact_cleaned.csv' from the `data_filepath` directory, keeps
    the contact columns listed below, replaces a missing CYCLE_PA_ACTIF by 'M'
    and removes fully duplicated rows.

    Returns the resulting DataFrame.
    """
    contact_cols = ['ID_CONTACT', 'TYPE_CONTACT', 'TRANCHE_AGE', 'ADRESSE_POSTALE', 'EMAIL',
                    'TELEPHONE', 'OPTIN', 'STOP_TEL', 'STOP_MAILING', 'STOP_GENERAL', 'PA_ACTIF',
                    'CYCLE_PA_ACTIF', 'CANAL_ACQUISITION_AGG']
    contact = pd.read_csv(os.path.join(data_filepath, 'dons_contact_cleaned.csv'))
    # Work on an explicit copy: assigning into a column subset of another frame
    # is the chained-assignment pattern pandas warns about (SettingWithCopyWarning).
    contact = contact[contact_cols].copy()
    contact['CYCLE_PA_ACTIF'] = contact['CYCLE_PA_ACTIF'].fillna('M')
    return contact.drop_duplicates()

def read_mails(data_filepath):
    """
    Load the 'mail' table saved by the first feature-engineering script.

    Reads 'mails_cleaned.csv' from the `data_filepath` directory and returns
    it unchanged as a DataFrame.
    """
    csv_path = os.path.join(data_filepath, 'mails_cleaned.csv')
    return pd.read_csv(csv_path)

# Load the three cleaned tables (regular donations, contacts, mails) from PATH_DATA
dons_reg = read_regular_donator(PATH_DATA)
contacts = read_contacts(PATH_DATA)
mails = read_mails(PATH_DATA)
# Preview the first rows of the donations table
dons_reg.head()

  if (yield from self.run_code(code, result)):


Unnamed: 0,DATE_DE_RECEPTION,ID_CONTACT,CAM_CODE,OFT_CODE,MONTANT_VENTILATION
0,2014-01-10,713843,PA,DON,10.0
1,2014-01-10,818435,PA,DON,7.0
2,2014-01-10,811465,PA,DON,10.0
3,2014-01-10,818439,PA,DON,10.0
4,2014-01-10,818443,PA,DON,0.0


#### Traitement des données

In [4]:
def process_mails(df_mails):
    """
    Build the mailing features: open ratio and click ratio.

    RATIO_OPEN is OPEN / SENT and RATIO_CLICK is CLICK / SENT. A ratio is 0
    when the numerator is 0 or when nothing was sent (SENT == 0), so that a
    zero denominator never produces an infinite value.

    The input frame is left untouched; a new DataFrame with the columns
    'ID', 'RATIO_OPEN' and 'RATIO_CLICK' is returned.

    NOTE(review): the result is keyed by 'ID' while add_feature_mail joins on
    'ID_CONTACT' -- confirm which key the mails table really carries.
    """
    sent = df_mails['SENT']
    has_sent = sent != 0

    def _ratio(counts):
        # Zero denominators are masked before dividing, then replaced by 0
        return np.where((counts != 0) & has_sent, counts / sent.where(has_sent), 0)

    # Build the result on a fresh frame instead of adding columns to the caller's one
    features = df_mails[['ID']].copy()
    features['RATIO_OPEN'] = _ratio(df_mails['OPEN'])
    features['RATIO_CLICK'] = _ratio(df_mails['CLICK'])
    return features

In [41]:
def adjust_scope(df_dons, df_contacts, date_filtre):
    """
    Restrict the donations to the period during which each donor is a regular donor.

    Steps:
      1. compute, per donor, the first and last reception date of the 'PA'
         donations (FIRST_TRANSACTION / LAST_TRANSACTION);
      2. keep only the donations received between those two dates (inclusive);
      3. keep only donors whose LAST_TRANSACTION is on or after `date_filtre`;
      4. keep only donors whose CYCLE_PA_ACTIF in `df_contacts` is 'M'.

    Returns the filtered donations, with the FIRST_TRANSACTION and
    LAST_TRANSACTION columns added.
    """
    # Step 1: first / last 'PA' reception date for every donor
    pa_rows = df_dons[df_dons['CAM_CODE'] == 'PA']
    pa_bounds = (
        pa_rows.groupby('ID_CONTACT')['DATE_DE_RECEPTION']
        .agg(['min', 'max'])
        .rename(columns={'min': 'FIRST_TRANSACTION', 'max': 'LAST_TRANSACTION'})
        .reset_index()
    )

    # Step 2: donations falling inside the [first, last] 'PA' window
    with_bounds = df_dons.merge(pa_bounds, how='left', on='ID_CONTACT')
    after_first = with_bounds['FIRST_TRANSACTION'] <= with_bounds['DATE_DE_RECEPTION']
    before_last = with_bounds['DATE_DE_RECEPTION'] <= with_bounds['LAST_TRANSACTION']
    in_window = with_bounds[after_first & before_last]

    # Step 3: donors still present at the filter date
    still_present = in_window[in_window['LAST_TRANSACTION'] >= date_filtre]

    # Step 4: monthly donors only; the cycle column is used for filtering, then removed
    cycles = df_contacts[['ID_CONTACT', 'CYCLE_PA_ACTIF']].drop_duplicates()
    with_cycle = still_present.merge(cycles, on='ID_CONTACT', how='left')
    monthly = with_cycle[with_cycle['CYCLE_PA_ACTIF'] == 'M']
    return monthly.drop(columns='CYCLE_PA_ACTIF')

# Keep the donations of monthly donors whose last 'PA' transaction is on or after 2018-11-01
dons_reg = adjust_scope(dons_reg, contacts, date_filtre='2018-11-01' )

In [42]:
def compute_churn(dons, date_churn):
    """
    Label each donor as churner (1) or non-churner (0).

    A donor is a churner when their LAST_TRANSACTION is strictly earlier than
    `date_churn`. Both sides are converted to datetimes before comparing, so
    the column may hold ISO date strings or datetimes; an unparseable or
    missing LAST_TRANSACTION is labelled 0.

    Parameters:
        dons: donations with at least the columns 'ID_CONTACT' and
            'LAST_TRANSACTION'.
        date_churn: cut-off date (string or datetime-like).

    Returns a DataFrame with one row per donor (first occurrence in `dons`)
    and the columns 'ID_CONTACT' and 'CHURN'.
    """
    # One row per donor, carrying its own LAST_TRANSACTION: no reliance on
    # index alignment, which fails when `dons` has a duplicated index.
    per_donor = dons.loc[:, ['ID_CONTACT', 'LAST_TRANSACTION']].drop_duplicates(subset='ID_CONTACT')
    last_transaction = pd.to_datetime(per_donor['LAST_TRANSACTION'], errors='coerce')
    cutoff = pd.to_datetime(date_churn)

    dona_churn = per_donor[['ID_CONTACT']].copy()
    # .values: positional assignment, both objects have exactly the same rows
    dona_churn['CHURN'] = (last_transaction < cutoff).astype(int).values

    return dona_churn

# Label as churners the donors whose last transaction is before 2018-12-01
data = compute_churn(dons_reg, '2018-12-01')
data.head()

Unnamed: 0,ID_CONTACT,CHURN
0,715647,0
1,818451,0
2,818455,0
3,813658,0
4,716312,0


## Création de nouvelles features
##### Features intensives

In [43]:
def _donation_stats(dons, mask, suffix):
    """Return per-donor mean/std of MONTANT_VENTILATION over the rows selected by `mask`."""
    stats = dons.loc[mask].groupby('ID_CONTACT')['MONTANT_VENTILATION'].agg(['mean', 'std'])
    stats = stats.rename(columns={'mean': 'MEAN_' + suffix, 'std': 'STD_' + suffix})
    return stats.reset_index()


def add_feature_info_dons(data, dons):
    """
    Add, for each donor, the mean and standard deviation of the amounts given
    per type of donation.

    Three groups are built from CAM_CODE:
      - 'PA'    : CAM_CODE == 'PA', non-zero amounts only   -> MEAN_PA / STD_PA
      - 'OCC'   : any other code except EVENT / TW / WTR,
                  non-zero amounts only                     -> MEAN_OCC / STD_OCC
      - 'EVENT' : CAM_CODE in EVENT / TW / WTR, all amounts -> MEAN_EVENT / STD_EVENT

    A group is merged only if its MEAN_* column is not already in `data`, so
    the function can be applied again to an already enriched frame.
    Missing values of the six created columns (donor without donation in the
    group, or a single donation, whose standard deviation is undefined) are
    set to 0; other columns of `data` are not modified.

    Returns the enriched DataFrame.
    """
    is_pa = dons['CAM_CODE'] == 'PA'
    is_event = dons['CAM_CODE'].isin(['EVENT', 'TW', 'WTR'])
    non_zero = dons['MONTANT_VENTILATION'] != 0

    groups = [
        ('PA', is_pa & non_zero),
        ('OCC', ~(is_pa | is_event) & non_zero),
        ('EVENT', is_event),
    ]

    fill_values = {}
    for suffix, mask in groups:
        mean_col = 'MEAN_' + suffix
        fill_values[mean_col] = 0
        fill_values['STD_' + suffix] = 0
        if mean_col not in data.columns:
            data = data.merge(_donation_stats(dons, mask, suffix), how='left', on='ID_CONTACT')

    # Fill only the features created here; a blanket fillna(0) would also
    # overwrite missing values of unrelated columns.
    data = data.fillna(fill_values)

    return data
    
# Add the per-donor mean / std of the amounts for each type of donation
data = add_feature_info_dons(data, dons_reg)
data.head()

Unnamed: 0,ID_CONTACT,CHURN,MEAN_PA,STD_PA,MEAN_OCC,STD_OCC,MEAN_EVENT,STD_EVENT
0,715647,0,9.0,0.0,0.0,0.0,0.0,0.0
1,818451,0,11.152542,0.99678,0.0,0.0,0.0,0.0
2,818455,0,10.0,0.0,0.0,0.0,0.0,0.0
3,813658,0,11.152542,0.99678,0.0,0.0,0.0,0.0
4,716312,0,10.932203,1.127469,0.0,0.0,0.0,0.0


In [44]:
def add_feature_contact(data, contact):
    """
    Join the contact table on ID_CONTACT and clean the resulting features.

    The merge is skipped when 'TYPE_CONTACT' is already a column of `data`, so
    the function can be applied again to an already enriched frame. Missing
    values are then replaced by the defaults listed below, and the columns
    'CYCLE_PA_ACTIF' and 'PA_ACTIF' are removed when present.

    The input frame is not modified; the enriched DataFrame is returned.
    """
    if 'TYPE_CONTACT' not in data.columns:
        data = data.merge(contact, how='left', on='ID_CONTACT')
    else:
        # No merge: copy so that the fills below do not alter the caller's frame
        data = data.copy()

    # Default value used for each feature when the information is missing
    defaults = [
        ('TYPE_CONTACT', 'PHYSIQUE'),
        ('TRANCHE_AGE', 'NO_INFO'),
        ('ADRESSE_POSTALE', 1),
        ('EMAIL', 1),
        ('TELEPHONE', 1),
        ('OPTIN', 1),
        ('STOP_TEL', 0),
        ('STOP_MAILING', 0),
        ('STOP_GENERAL', 0),
        ('CANAL_ACQUISITION_AGG', 'Street'),
    ]
    for column, default in defaults:
        data[column] = data[column].fillna(default)

    # These columns are already gone on a second pass, hence errors='ignore'
    data = data.drop(['CYCLE_PA_ACTIF', 'PA_ACTIF'], axis=1, errors='ignore')

    return data

# Add the contact features (joined on ID_CONTACT)
data = add_feature_contact(data, contacts)
data.head()

Unnamed: 0,ID_CONTACT,CHURN,MEAN_PA,STD_PA,MEAN_OCC,STD_OCC,MEAN_EVENT,STD_EVENT,TYPE_CONTACT,TRANCHE_AGE,ADRESSE_POSTALE,EMAIL,TELEPHONE,OPTIN,STOP_TEL,STOP_MAILING,STOP_GENERAL,CANAL_ACQUISITION_AGG
0,715647,0,9.0,0.0,0.0,0.0,0.0,0.0,PHYSIQUE,25-45,1.0,1.0,1.0,1.0,0.0,0.0,0.0,Street
1,818451,0,11.152542,0.99678,0.0,0.0,0.0,0.0,PHYSIQUE,45-65,1.0,1.0,1.0,1.0,1.0,0.0,0.0,Street
2,818455,0,10.0,0.0,0.0,0.0,0.0,0.0,PHYSIQUE,45-65,1.0,1.0,1.0,1.0,0.0,0.0,0.0,Street
3,813658,0,11.152542,0.99678,0.0,0.0,0.0,0.0,PHYSIQUE,25-45,1.0,1.0,1.0,1.0,0.0,0.0,0.0,Street
4,716312,0,10.932203,1.127469,0.0,0.0,0.0,0.0,PHYSIQUE,25-45,1.0,1.0,1.0,1.0,0.0,0.0,0.0,Street


In [45]:
def add_feature_mail(data, mail):
    """
    Join the mails table on ID_CONTACT and fill the missing ratios.

    The merge is skipped when 'RATIO_OPEN' is already a column of `data`.
    Missing RATIO_OPEN / RATIO_CLICK values are replaced by the mean of the
    corresponding column.

    Returns the enriched DataFrame.
    """
    already_merged = 'RATIO_OPEN' in data.columns
    if not already_merged:
        data = data.merge(mail, how='left', on='ID_CONTACT')

    # Mean imputation, one ratio at a time
    for ratio in ('RATIO_OPEN', 'RATIO_CLICK'):
        column_mean = data[ratio].mean()
        data[ratio] = data[ratio].fillna(column_mean)

    return data

# Add the mail open / click ratios (joined on ID_CONTACT)
data = add_feature_mail(data, mails)
data.head()

Unnamed: 0,ID_CONTACT,CHURN,MEAN_PA,STD_PA,MEAN_OCC,STD_OCC,MEAN_EVENT,STD_EVENT,TYPE_CONTACT,TRANCHE_AGE,ADRESSE_POSTALE,EMAIL,TELEPHONE,OPTIN,STOP_TEL,STOP_MAILING,STOP_GENERAL,CANAL_ACQUISITION_AGG,RATIO_CLICK,RATIO_OPEN
0,715647,0,9.0,0.0,0.0,0.0,0.0,0.0,PHYSIQUE,25-45,1.0,1.0,1.0,1.0,0.0,0.0,0.0,Street,0.0,0.0
1,818451,0,11.152542,0.99678,0.0,0.0,0.0,0.0,PHYSIQUE,45-65,1.0,1.0,1.0,1.0,1.0,0.0,0.0,Street,0.009259,0.240741
2,818455,0,10.0,0.0,0.0,0.0,0.0,0.0,PHYSIQUE,45-65,1.0,1.0,1.0,1.0,0.0,0.0,0.0,Street,0.009259,0.546296
3,813658,0,11.152542,0.99678,0.0,0.0,0.0,0.0,PHYSIQUE,25-45,1.0,1.0,1.0,1.0,0.0,0.0,0.0,Street,0.0,0.057143
4,716312,0,10.932203,1.127469,0.0,0.0,0.0,0.0,PHYSIQUE,25-45,1.0,1.0,1.0,1.0,0.0,0.0,0.0,Street,0.0,0.027523


##### Features extensives
On calcule ces valeurs sur les 3 derniers mois (on peut faire l'hypothèse que le comportement récent des clients a un impact sur leur taux de churn).

In [67]:
def add_feature_last_dons(data, dons):
    """
    Add the mean amount and the number of donations over the last 3 months.

    For each donor, the donations kept are those whose reception date plus
    3 months is strictly later than the donor's LAST_TRANSACTION. Their mean
    amount and their count are stored in MEAN_3LAST and NB_3LAST.

    The merge is skipped when 'MEAN_3LAST' is already a column of `data`.
    Missing MEAN_3LAST / NB_3LAST values are set to 0; other columns of `data`
    are not modified. `dons` is left untouched (no column added or converted).

    Returns the enriched DataFrame.
    """
    # Dates are parsed into local Series rather than written back into `dons`,
    # so the caller's frame keeps its original columns and dtypes.
    reception = pd.to_datetime(dons['DATE_DE_RECEPTION'], format='%Y-%m-%d', errors='coerce')
    last_transaction = pd.to_datetime(dons['LAST_TRANSACTION'], format='%Y-%m-%d', errors='coerce')
    is_recent = reception + pd.DateOffset(months=3) > last_transaction

    last_dons = (
        dons.loc[is_recent]
        .groupby('ID_CONTACT')['MONTANT_VENTILATION']
        .agg(['mean', 'count'])
        .rename(columns={'mean': 'MEAN_3LAST', 'count': 'NB_3LAST'})
        .reset_index()
    )

    if 'MEAN_3LAST' not in data.columns:
        data = data.merge(last_dons, how='left', on='ID_CONTACT')

    # Fill only the two features created here
    data = data.fillna({'MEAN_3LAST': 0, 'NB_3LAST': 0})
    return data

# Add the mean amount and number of donations over the last 3 months
data = add_feature_last_dons(data, dons_reg)
data.head()

Unnamed: 0,ID_CONTACT,CHURN,MEAN_PA,STD_PA,MEAN_OCC,STD_OCC,MEAN_EVENT,STD_EVENT,TYPE_CONTACT,TRANCHE_AGE,ADRESSE_POSTALE,EMAIL,TELEPHONE,OPTIN,STOP_TEL,STOP_MAILING,STOP_GENERAL,CANAL_ACQUISITION_AGG,RATIO_CLICK,RATIO_OPEN,MEAN_3LAST,NB_3LAST
0,715647,0,9.0,0.0,0.0,0.0,0.0,0.0,PHYSIQUE,25-45,1.0,1.0,1.0,1.0,0.0,0.0,0.0,Street,0.0,0.0,9.0,3
1,818451,0,11.152542,0.99678,0.0,0.0,0.0,0.0,PHYSIQUE,45-65,1.0,1.0,1.0,1.0,1.0,0.0,0.0,Street,0.009259,0.240741,12.0,3
2,818455,0,10.0,0.0,0.0,0.0,0.0,0.0,PHYSIQUE,45-65,1.0,1.0,1.0,1.0,0.0,0.0,0.0,Street,0.009259,0.546296,10.0,3
3,813658,0,11.152542,0.99678,0.0,0.0,0.0,0.0,PHYSIQUE,25-45,1.0,1.0,1.0,1.0,0.0,0.0,0.0,Street,0.0,0.057143,12.0,3
4,716312,0,10.932203,1.127469,0.0,0.0,0.0,0.0,PHYSIQUE,25-45,1.0,1.0,1.0,1.0,0.0,0.0,0.0,Street,0.0,0.027523,13.0,3


In [68]:
def save_data(data_filepath, data):
    """
    Write the processed modelling dataset to the `data_filepath` directory.

    The frame is saved as 'data_modeles.csv' without its index, then a
    confirmation message is printed.
    """
    target = os.path.join(data_filepath, 'data_modeles.csv')
    data.to_csv(target, index=False)
    print('data saved !')
    
# Save the final dataset as 'data_modeles.csv' in PATH_DATA
save_data(PATH_DATA, data)

data saved !
