### Importation des librairies

In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from dateutil.relativedelta import relativedelta
pd.options.display.max_columns = None

import warnings
warnings.filterwarnings("ignore")

### Lecture des données

In [2]:
# Root directory holding the input CSV files and receiving the generated
# dataset. It can be overridden with the CAS_EKIMETRICS_DATA environment
# variable so the notebook is not tied to one machine; the historical
# location remains the default.
PATH_DATA = os.environ.get("CAS_EKIMETRICS_DATA", "/home/erwan/Bureau/Cas_Ekimetrics/data")

#### Dons

In [3]:
def read_dons():
    """Load the donations table, restricted to the columns used downstream.

    Reads ``dons_reguliers.csv`` from ``PATH_DATA`` and prints the number of
    distinct contacts having at least one row with ``CAM_CODE == 'PA'``.

    Returns:
        DataFrame with the columns DATE_DE_RECEPTION, ID_CONTACT, CAM_CODE,
        OFT_CODE and MONTANT_VENTILATION.

    Raises:
        FileNotFoundError: if the CSV file does not exist.
    """
    csv_path = os.path.join(PATH_DATA, 'dons_reguliers.csv')
    if not os.path.exists(csv_path):
        raise FileNotFoundError("path for dons {} doesn't exist".format(csv_path))

    kept_columns = ['DATE_DE_RECEPTION', 'ID_CONTACT', 'CAM_CODE', 'OFT_CODE', 'MONTANT_VENTILATION']
    dons = pd.read_csv(csv_path)[kept_columns]

    is_pa = dons['CAM_CODE'] == 'PA'
    nb_client = dons.loc[is_pa, 'ID_CONTACT'].nunique()
    print("nombre de clients réguliers : {}".format(nb_client))

    return dons

#### Contacts

In [4]:
def read_contacts():
    """Load the contacts table, restricted to the columns used downstream.

    Reads ``data_contact.csv`` from ``PATH_DATA``. Missing values of
    ``CYCLE_PA_ACTIF`` are replaced by ``'M'``.

    Returns:
        DataFrame with the selected contact columns, keyed by ``ID_CTC``.

    Raises:
        FileNotFoundError: if the CSV file does not exist.
    """
    csv_path = os.path.join(PATH_DATA, 'data_contact.csv')
    if not os.path.exists(csv_path):
        raise FileNotFoundError("path for contacts {} doesn't exist".format(csv_path))

    kept_columns = [
        'ID_CTC', 'TYPE_CONTACT', 'TRANCHE_AGE', 'ADRESSE_POSTALE', 'EMAIL',
        'TELEPHONE', 'OPTIN', 'STOP_TEL', 'STOP_MAILING', 'STOP_GENERAL', 'PA_ACTIF',
        'CYCLE_PA_ACTIF', 'CANAL_ACQUISITION_AGG',
    ]
    contact = pd.read_csv(csv_path)[kept_columns]
    return contact.assign(CYCLE_PA_ACTIF=contact['CYCLE_PA_ACTIF'].fillna('M'))

#### Mails

In [5]:
def read_mails():
    """Load the e-mail statistics table.

    Reads ``stats_mail_reguliers.csv`` from ``PATH_DATA``, keeps the open and
    click ratios together with ``ID_CONTACT``, renames the ratios to
    ``RATIO_OPEN`` / ``RATIO_CLICK`` and prints the number of distinct
    contacts found in the file.

    Returns:
        DataFrame with the columns RATIO_OPEN, RATIO_CLICK and ID_CONTACT.

    Raises:
        FileNotFoundError: if the CSV file does not exist.
    """
    csv_path = os.path.join(PATH_DATA, 'stats_mail_reguliers.csv')
    if not os.path.exists(csv_path):
        raise FileNotFoundError("path for mails {} doesn't exist".format(csv_path))

    new_names = {'ratio_ouvert': 'RATIO_OPEN', 'ratio_click': 'RATIO_CLICK'}
    raw = pd.read_csv(csv_path)
    mail = raw[['ratio_ouvert', 'ratio_click', 'ID_CONTACT']].rename(columns=new_names)

    print("nombre clients mails : {}".format(mail['ID_CONTACT'].nunique()))
    return mail

In [6]:
def save_files(train):
    """Write the training dataset to ``train_churn.csv`` under ``PATH_DATA``.

    The DataFrame index is not written. A confirmation message is printed
    once the file has been saved.
    """
    destination = os.path.join(PATH_DATA, 'train_churn.csv')
    train.to_csv(destination, index=False)
    print("files saved correctly !")

### Traitement des données
#### Définition du périmètre de client, split train/test et labellisation

In [7]:
def adjust_scope(dons_av, dons_ap, date_seuil):
    """
    Restrict both donation frames to the contacts that are still regular
    donors at ``date_seuil``.

    A contact is kept when the date of their last 'PA' row in ``dons_av``
    equals ``date_seuil``. For those contacts, only the rows of ``dons_av``
    dated between their first and last 'PA' rows (inclusive) are returned.

    Args:
        dons_av: donations up to the threshold date.
        dons_ap: donations after the threshold date.
        date_seuil: threshold date, compared as-is with DATE_DE_RECEPTION.

    Returns:
        Tuple ``(dons_scope, dons_ap)``: the filtered ``dons_av`` (same
        columns as the input) and ``dons_ap`` limited to the kept contacts.
    """
    # First and last 'PA' date per contact. reset_index() turns ID_CONTACT
    # into a plain column: keeping it as both index name and column makes the
    # merge key ambiguous and raises a ValueError on recent pandas versions.
    pa_rows = dons_av.loc[dons_av.CAM_CODE == 'PA']
    date_pa = (
        pa_rows.groupby('ID_CONTACT')['DATE_DE_RECEPTION']
        .agg(['min', 'max'])
        .rename(columns={'min': 'FIRST_TRANSACTION', 'max': 'LAST_TRANSACTION'})
        .reset_index()
    )

    # Keep only the donations received between those two dates.
    filtre_don = dons_av.merge(date_pa, how='left', on='ID_CONTACT')
    in_window = (
        (filtre_don.FIRST_TRANSACTION <= filtre_don.DATE_DE_RECEPTION)
        & (filtre_don.DATE_DE_RECEPTION <= filtre_don.LAST_TRANSACTION)
    )
    dons_scope = filtre_don.loc[in_window]

    # Contacts whose last 'PA' row falls exactly on the threshold date.
    list_client_reg = dons_scope.loc[dons_scope.LAST_TRANSACTION == date_seuil].ID_CONTACT.unique()

    # Drop the helper columns and restrict both frames to those contacts.
    dons_scope = dons_scope[dons_av.columns]
    dons_scope = dons_scope.loc[dons_scope.ID_CONTACT.isin(list_client_reg)]
    dons_ap = dons_ap.loc[dons_ap.ID_CONTACT.isin(list_client_reg)]

    return dons_scope, dons_ap

In [8]:
def split_dons(dons, date_seuil):
    """
    Split the donations in two around a threshold date.

    Rows whose DATE_DE_RECEPTION is lower than or equal to ``date_seuil`` go
    to the first frame, rows strictly greater go to the second one. The
    number of distinct contacts with a 'PA' row in the first frame is printed.

    Returns:
        Tuple ``(dons_avant, dons_apres)``.
    """
    reception = dons.DATE_DE_RECEPTION
    dons_avant, dons_apres = dons.loc[reception <= date_seuil], dons.loc[reception > date_seuil]

    pa_avant = dons_avant.loc[dons_avant.CAM_CODE == 'PA']
    print("nombre de clients avant la date de seuil : {}".format(pa_avant.ID_CONTACT.nunique()))

    return dons_avant, dons_apres

In [9]:
def compute_info_date(dons, contacts):
    """
    Compute per-contact date information from the 'PA' rows of ``dons``.

    Args:
        dons: donations with ID_CONTACT, CAM_CODE and DATE_DE_RECEPTION
            (dates formatted as '%Y-%m-%d').
        contacts: contacts table providing ID_CTC and CYCLE_PA_ACTIF.

    Returns:
        DataFrame with one row per contact and the columns:
        - FIRST_TRANSACTION: date of the first 'PA' row
        - LAST_TRANSACTION: date of the last 'PA' row
        - ID_CONTACT
        - DUREE_FIDELE: whole number of average-length months between the two
        - CYCLE_PA_ACTIF: value taken from ``contacts`` (missing when the
          contact is absent from that table)
    """
    # First and last 'PA' date per contact. reset_index() exposes ID_CONTACT
    # as a plain column only; having it as both index name and column makes
    # the merge below fail with an ambiguity error on recent pandas versions.
    pa_rows = dons.loc[dons.CAM_CODE == 'PA']
    bornes = (
        pa_rows.groupby('ID_CONTACT')['DATE_DE_RECEPTION']
        .agg(['min', 'max'])
        .rename(columns={'min': 'FIRST_TRANSACTION', 'max': 'LAST_TRANSACTION'})
        .reset_index()
    )

    first = pd.to_datetime(bornes['FIRST_TRANSACTION'], format='%Y-%m-%d')
    last = pd.to_datetime(bornes['LAST_TRANSACTION'], format='%Y-%m-%d')

    # An average month of 30.436875 days (365.2425 / 12) is what
    # np.timedelta64(1, 'M') used to stand for; that unit is rejected by
    # recent pandas versions, so the duration is spelled out explicitly.
    # The ratio is truncated, not rounded.
    average_month = pd.Timedelta(days=30.436875)

    # Column order matches the historical output of this function.
    info_date = pd.DataFrame({
        'FIRST_TRANSACTION': bornes['FIRST_TRANSACTION'],
        'LAST_TRANSACTION': bornes['LAST_TRANSACTION'],
        'ID_CONTACT': bornes['ID_CONTACT'],
        'DUREE_FIDELE': ((last - first) / average_month).astype(int),
    })

    # Attach the debit cycle of each contact.
    info_date = info_date.merge(contacts[['ID_CTC', 'CYCLE_PA_ACTIF']],
                                left_on='ID_CONTACT', right_on='ID_CTC', how='left')
    info_date = info_date.drop(['ID_CTC'], axis=1)

    return info_date

In [10]:
def compute_churn(info_dates_av, info_dates_ap, date_max, verbose=False):
    """
    Label each contact of ``info_dates_av`` as churned (1) or not (0).

    A contact is churned when they have no transaction in ``info_dates_ap``,
    or when their last transaction there is strictly earlier than a threshold
    that depends on their debit cycle:
    - 'M': ``date_max``
    - 'T': ``date_max`` minus 2 months
    - 'S': ``date_max`` minus 5 months
    - 'A': ``date_max`` minus 1 year
    Any other cycle value (including a missing one) is labelled 0 as long as
    a later transaction exists.

    Args:
        info_dates_av: per-contact dates before the threshold (ID_CONTACT,
            FIRST_TRANSACTION, LAST_TRANSACTION, DUREE_FIDELE, CYCLE_PA_ACTIF).
        info_dates_ap: per-contact dates after the threshold (ID_CONTACT,
            LAST_TRANSACTION).
        date_max: reference date, formatted as '%Y-%m-%d'.
        verbose: when True, print the churn / non-churn counts.

    Returns:
        DataFrame indexed like ``info_dates_av`` with the columns ID_CONTACT,
        FIRST_TRANSACTION, LAST_TRANSACTION, DUREE_FIDELE and CHURN.
    """
    reference = pd.to_datetime(date_max, format='%Y-%m-%d')
    # Dates are compared as '%Y-%m-%d' strings, like the input columns.
    seuil_par_cycle = {
        'M': date_max,
        'T': (reference - pd.DateOffset(months=2)).strftime('%Y-%m-%d'),
        'S': (reference - pd.DateOffset(months=5)).strftime('%Y-%m-%d'),
        'A': (reference - pd.DateOffset(years=1)).strftime('%Y-%m-%d'),
    }

    def churn(cycle, derniere_transaction):
        '''Return 1 when the contact churned, 0 otherwise.'''
        if pd.isnull(derniere_transaction):
            return 1
        seuil = seuil_par_cycle.get(cycle)
        return int(seuil is not None and derniere_transaction < seuil)

    # Last transaction after the threshold, looked up per contact and aligned
    # on the index of info_dates_av. Merging and then assigning the result
    # would align a fresh RangeIndex against that index and silently produce
    # wrong or missing labels whenever info_dates_av is not 0..n-1 indexed.
    derniere_par_client = info_dates_ap.groupby('ID_CONTACT')['LAST_TRANSACTION'].max()
    derniere = info_dates_av['ID_CONTACT'].map(derniere_par_client)

    # Initialise the output frame and compute the 'CHURN' column.
    data = pd.DataFrame({'ID_CONTACT' : info_dates_av.ID_CONTACT,
                         'FIRST_TRANSACTION' : info_dates_av.FIRST_TRANSACTION,
                         'LAST_TRANSACTION' : info_dates_av.LAST_TRANSACTION,
                         'DUREE_FIDELE' : info_dates_av.DUREE_FIDELE
                        })
    labels = [churn(cycle, date) for cycle, date in zip(info_dates_av['CYCLE_PA_ACTIF'], derniere)]
    data['CHURN'] = pd.Series(labels, index=info_dates_av.index, dtype=int)

    if verbose:
        nb_churn = len(data.loc[data.CHURN == 1].index)
        nb_no_churn = len(data.loc[data.CHURN == 0].index)
        print("nombre de churn : {}".format(nb_churn))
        print("nombre de fidèles : {}".format(nb_no_churn))
        print("total : {}".format(nb_churn + nb_no_churn))

    return data

In [11]:
def compute_test_scope(dons_apres, dons_avant, train):
    """
    Rebuild the donation history of the test-set contacts.

    The kept contacts are the union of:
    - contacts labelled ``CHURN == 0`` in ``train``;
    - contacts having at least one 'PA' row in ``dons_apres`` (this may
      include contacts labelled as churned in ``train``).

    Returns:
        The rows of ``dons_apres`` followed by the rows of ``dons_avant`` for
        those contacts, with exact duplicate rows removed. The number of
        distinct contacts with a 'PA' row in the result is printed.
    """
    fideles = train.loc[train.CHURN == 0, 'ID_CONTACT'].unique()
    reguliers_apres = dons_apres.loc[dons_apres.CAM_CODE == 'PA', 'ID_CONTACT'].unique()
    perimetre = np.concatenate((fideles, reguliers_apres))

    # Same filter applied to both periods, later period first.
    morceaux = [frame.loc[frame.ID_CONTACT.isin(perimetre)] for frame in (dons_apres, dons_avant)]
    dons_apres = pd.concat(morceaux).drop_duplicates()

    pa_rows = dons_apres.loc[dons_apres.CAM_CODE == 'PA']
    print("nb de clients apres la date de seuil : {}".format(pa_rows.ID_CONTACT.nunique()))

    return dons_apres

#### Création de nouvelles features

In [12]:
def nb_signature(dons, data):
    """Add ``NB_SIGN``: the number of rows with ``OFT_CODE == 'SIGN'`` per contact.

    Contacts without such a row get 0. ``data`` is returned untouched when it
    already has an ``NB_SIGN`` column.
    """
    signatures = dons.loc[dons.OFT_CODE == 'SIGN', ['ID_CONTACT', 'OFT_CODE']]
    nb_sign = signatures.groupby('ID_CONTACT').count().rename(columns={'OFT_CODE': 'NB_SIGN'})

    if 'NB_SIGN' in data.columns:
        return data

    # Left join keeps every contact; the unmatched ones have no signature.
    enriched = data.merge(nb_sign, how='left', on='ID_CONTACT')
    enriched['NB_SIGN'] = enriched['NB_SIGN'].fillna(0)
    return enriched

In [13]:
def info_type_dons(dons, data):
    """Add the amount and number of donations per contact for each donation type.

    Three families are aggregated on MONTANT_VENTILATION:
    - 'PA' rows with a non-zero amount: SUM_PA, COUNT_PA, STD_PA
    - rows whose CAM_CODE is none of 'EVENT', 'TW', 'WTR', 'PA', with a
      non-zero amount: SUM_OCC, COUNT_OCC
    - 'EVENT', 'TW' and 'WTR' rows (any amount): SUM_EVENT, COUNT_EVENT

    Each family is joined only if its SUM_* column is not already in
    ``data``; missing values of the joined columns are replaced by 0.
    """
    code = dons.CAM_CODE
    montant_non_nul = dons.MONTANT_VENTILATION != 0
    codes_event = ['EVENT', 'TW', 'WTR']

    def _aggregate(selection, suffixe, fonctions):
        # One row per contact, one flat column per aggregation function.
        montants = dons[selection][['ID_CONTACT', 'MONTANT_VENTILATION']]
        stats = montants.groupby('ID_CONTACT').agg(fonctions)
        stats.columns = ['{}_{}'.format(fonction.upper(), suffixe) for fonction in fonctions]
        return stats

    familles = [
        _aggregate((code == 'PA') & montant_non_nul, 'PA', ['sum', 'count', 'std']),
        _aggregate((~code.isin(codes_event + ['PA'])) & montant_non_nul, 'OCC', ['sum', 'count']),
        _aggregate(code.isin(codes_event), 'EVENT', ['sum', 'count']),
    ]

    for stats in familles:
        # The SUM_* column is the marker telling the family is already joined.
        if stats.columns[0] in data.columns:
            continue
        data = data.merge(stats, how='left', on='ID_CONTACT')
        for colonne in stats.columns:
            data[colonne] = data[colonne].fillna(0)

    return data

In [14]:
def nb_mvmnt_nuls(dons, data):
    """Add ``NB_MVMNT_NULS``: the number of 'PA' rows with a zero amount per contact.

    Contacts without such a row get 0. ``data`` is returned untouched when it
    already has an ``NB_MVMNT_NULS`` column.
    """
    est_pa_nul = (dons.CAM_CODE == 'PA') & (dons.MONTANT_VENTILATION == 0)
    nb_nuls = (
        dons.loc[est_pa_nul, ['ID_CONTACT', 'MONTANT_VENTILATION']]
        .groupby('ID_CONTACT')
        .count()
        .rename(columns={'MONTANT_VENTILATION': 'NB_MVMNT_NULS'})
    )

    if 'NB_MVMNT_NULS' in data.columns:
        return data

    # Left join keeps every contact; the unmatched ones have no zero-amount row.
    enriched = data.merge(nb_nuls, how='left', on='ID_CONTACT')
    enriched['NB_MVMNT_NULS'] = enriched['NB_MVMNT_NULS'].fillna(0)
    return enriched

In [15]:
def join_info_contact(contact, data):
    """Add the contact attributes to ``data`` and fill their missing values.

    The ``contact`` table is left-joined on ``ID_CONTACT`` / ``ID_CTC``
    unless ``data`` already has a ``TYPE_CONTACT`` column. Missing values are
    then replaced column by column with fixed defaults, except ``PA_ACTIF``
    which receives the mean of its observed values.

    Args:
        contact: contacts table keyed by ``ID_CTC``.
        data: per-contact frame keyed by ``ID_CONTACT``.

    Returns:
        A new DataFrame; the ``data`` argument is not modified.
    """
    # Join with the contacts table.
    if 'TYPE_CONTACT' not in data.columns:
        data = data.merge(contact, how='left', left_on='ID_CONTACT', right_on='ID_CTC')
        data = data.drop(['ID_CTC'], axis=1)

    # Mean over the observed values only. Dividing the sum by the total
    # number of rows counts the missing rows as zeros and biases the fill
    # value downwards exactly when a fill is needed.
    mean_PA = data['PA_ACTIF'].mean()
    if pd.isnull(mean_PA):
        # No observed value at all: fall back to 0.
        mean_PA = 0

    defaults = {
        'TYPE_CONTACT': 'PHYSIQUE',
        'TRANCHE_AGE': 'NO_INFO',
        'ADRESSE_POSTALE': 1,
        'EMAIL': 1,
        'TELEPHONE': 1,
        'OPTIN': 1,
        'STOP_TEL': 0,
        'STOP_MAILING': 0,
        'STOP_GENERAL': 0,
        'PA_ACTIF': mean_PA,
        'CYCLE_PA_ACTIF': 'M',
        'CANAL_ACQUISITION_AGG': 'Street',
    }
    # fillna with a mapping fills each listed column with its own default and
    # returns a new frame, so the caller's object is left untouched.
    return data.fillna(value=defaults)

In [16]:
def join_info_mail(mail, data):
    """Add the e-mail ratios to ``data`` and fill their missing values.

    The ``mail`` table is left-joined on ``ID_CONTACT`` unless ``data``
    already has a ``RATIO_OPEN`` column. Missing values of ``RATIO_OPEN`` and
    ``RATIO_CLICK`` are then replaced by the mean of the respective column.
    """
    if 'RATIO_OPEN' not in data.columns:
        data = data.merge(mail, how='left', on='ID_CONTACT')

    # Mean imputation, one ratio at a time.
    for ratio in ('RATIO_OPEN', 'RATIO_CLICK'):
        valeurs = data[ratio]
        data[ratio] = valeurs.fillna(valeurs.mean())

    return data

In [17]:
def add_features(dons, contact, mail, data):
    """
    Enrich ``data`` with every feature built by the helper functions.

    The donation-based features (signatures, amounts per donation type,
    zero-amount debits) are added first, then the contact attributes and
    finally the e-mail ratios.
    """
    # Features computed from the donations share the same (dons, data) signature.
    for etape in (nb_signature, info_type_dons, nb_mvmnt_nuls):
        data = etape(dons, data)

    data = join_info_contact(contact, data)
    return join_info_mail(mail, data)

### Script principal

In [18]:
def generate_dataset(date_seuil, save=False):
    """Build the labelled churn dataset for a given threshold date.

    Steps: read the three input tables, split the donations around
    ``date_seuil`` and restrict them to the contacts still regular at that
    date, compute the per-contact 'PA' dates on both sides, label the churn
    against the latest 'PA' date found after the threshold, then add the
    features computed from the donations made up to the threshold.

    Args:
        date_seuil: threshold date separating the two periods.
        save: when True, the dataset is also written with ``save_files``.

    Returns:
        The labelled and enriched DataFrame.
    """
    # Read the inputs (same order as the printed counters).
    dons, data_contact, data_mail = read_dons(), read_contacts(), read_mails()

    # Split around the threshold, then narrow the scope to regular contacts.
    avant, apres = split_dons(dons, date_seuil)
    dons_av, dons_ap = adjust_scope(avant, apres, date_seuil)

    # Per-contact 'PA' dates on each side of the threshold.
    pa_dates_av = compute_info_date(dons_av, data_contact)
    pa_dates_ap = compute_info_date(dons_ap, data_contact)

    # Label against the latest 'PA' date observed after the threshold.
    date_max = dons_ap.loc[dons_ap.CAM_CODE == 'PA', 'DATE_DE_RECEPTION'].max()
    print("date_max : {}".format(date_max))
    labelled = compute_churn(pa_dates_av, pa_dates_ap, date_max, verbose=True)

    # Add the features.
    train_data = add_features(dons_av, data_contact, data_mail, labelled)

    if save:
        save_files(train_data)

    return train_data
    

In [22]:
# Build the training set with 2017-12-10 as the threshold date and write it to disk (save=True).
train = generate_dataset('2017-12-10', save=True)

nombre de clients réguliers : 22297
nombre clients mails : 18316
nombre de clients avant la date de seuil : 18196
date_max : 2018-12-10
nombre de churn : 2549
nombre de fidèles : 8685
total : 11234
files saved correctly !


In [20]:
# Preview the first rows of the generated training set.
train.head()

Unnamed: 0,ID_CONTACT,FIRST_TRANSACTION,LAST_TRANSACTION,DUREE_FIDELE,CHURN,NB_SIGN,SUM_PA,COUNT_PA,STD_PA,SUM_OCC,COUNT_OCC,SUM_EVENT,COUNT_EVENT,NB_MVMNT_NULS,TYPE_CONTACT,TRANCHE_AGE,ADRESSE_POSTALE,EMAIL,TELEPHONE,OPTIN,STOP_TEL,STOP_MAILING,STOP_GENERAL,PA_ACTIF,CYCLE_PA_ACTIF,CANAL_ACQUISITION_AGG,RATIO_OPEN,RATIO_CLICK
0,55,2014-01-10,2017-12-10,46,0,2.0,282.0,47.0,0.0,120.0,1.0,0.0,0.0,0.0,PHYSIQUE,65+,1,1,1.0,1.0,0.0,0.0,0.0,1.0,M,Courrier,0.121495,0.037383
1,112,2014-01-10,2017-12-10,46,0,3.0,376.0,47.0,0.0,0.0,0.0,0.0,0.0,0.0,PHYSIQUE,65+,1,1,1.0,0.0,0.0,0.0,0.0,1.0,M,Divers,0.0,0.0
2,135,2014-01-10,2017-12-10,46,1,0.0,282.0,47.0,0.0,0.0,0.0,0.0,0.0,0.0,PHYSIQUE,65+,1,0,0.0,1.0,0.0,0.0,0.0,0.0,M,Courrier,0.225603,0.018155
3,170,2014-01-10,2017-12-10,46,0,4.0,470.0,47.0,0.0,0.0,0.0,0.0,0.0,0.0,PHYSIQUE,65+,1,1,1.0,1.0,0.0,0.0,0.0,1.0,M,Courrier,0.057692,0.0
4,204,2014-01-10,2017-12-10,46,0,0.0,305.5,47.0,0.0,0.0,0.0,0.0,0.0,0.0,PHYSIQUE,45-65,1,1,1.0,1.0,0.0,0.0,0.0,1.0,M,Divers,0.378947,0.031579
