#### Importation des librairies

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
from imblearn.over_sampling import SMOTE
import plotly.express as px

#### Importation des données

In [5]:
# Load the raw dataset from a CSV path relative to the working directory.
data = pd.read_csv("data/data.csv")

In [6]:
# Frequency of each `job_type` value in the raw data (columns are not renamed yet).
data["job_type"].value_counts()

Self employed                   6437
Informally employed             5597
Farming and Fishing             5441
Remittance Dependent            2527
Other Income                    1080
Formally employed Private       1055
No Income                        627
Formally employed Government     387
Government Dependent             247
Dont Know/Refuse to answer       126
Name: job_type, dtype: int64

#### Traduction des colonnes 

In [7]:
# Translate the original English column names into French.
# Names that are absent from the frame are silently ignored by `rename`.
data = data.rename(
    {
        "country": "pays",
        "year": "annee",
        "uniqueid": "unique_id",
        "bank_account": "compte_bancaire",
        "location_type": "type_de_location",
        "cellphone_access": "acces_au_telephone",
        "household_size": "taille_du_menage",
        "age_of_respondent": "age",
        "gender_of_respondent": "sexe",
        "relationship_with_head": "relation_avec_le_chef_de_famille",
        "marital_status": "etat_civil",
        "education_level": "niveau_education",
        "job_type": "type_de_job",
    },
    axis="columns",
)

In [8]:
# Preview the first rows to check the renamed columns.
data.head()

Unnamed: 0,pays,annee,unique_id,compte_bancaire,type_de_location,acces_au_telephone,taille_du_menage,age,sexe,relation_avec_le_chef_de_famille,etat_civil,niveau_education,type_de_job
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


#### Création d'une colonne catégorielle "Group Age"

In [9]:
# Bucket `age` into decade-wide bands plus an open-ended 80+ band.
# `pd.cut` builds right-closed intervals by default, so age 20 lands in
# '[10-20]' and any age <= 10 is left as NaN.
data['Group_Age'] = pd.cut(
    x=data['age'],
    bins=[*range(10, 90, 10), np.inf],
    labels=[
        '[10-20]',
        '[20-30]',
        '[30-40]',
        '[40-50]',
        '[50-60]',
        '[60-70]',
        '[70-80]',
        '[80+]',
    ],
)

#### Classification des types de colonnes dans des listes

In [10]:
# Sort every column into exactly one role-specific list.

# Columns excluded from the model input.
drop = [
    "unique_id",
    "sexe",
]

# Columns carried through the encoders untouched
# ("niveau_education" gets its own ordered encoding later on).
passthrough = [
    "taille_du_menage",
    "age",
    "niveau_education",
]

# Columns that are one-hot encoded.
cat = [
    "pays",
    "Group_Age",
    "etat_civil",
    "type_de_job",
    "relation_avec_le_chef_de_famille",
    "annee",
]

# Columns that are ordinal-encoded.
text = [
    'type_de_location',
    'acces_au_telephone',
    'compte_bancaire',
]

In [11]:
# List of the four role lists (a list of lists, not a flat list of names);
# together they are meant to cover every column.
all_cols = [drop, passthrough, text, cat]

#### Vérification qu'aucune colonne n'a été oubliée

In [12]:
# Sanity check of the column lists (typos, forgotten columns, duplicates...).
def check_work(df, liste_all_listes):
    """Compare the declared column names with the columns of ``df``.

    Args:
        df: DataFrame whose ``columns`` are the reference.
        liste_all_listes: iterable of lists of column names.

    Prints a warning for each name declared more than once, then the set
    of DataFrame columns that no list mentions and the set of declared
    names that are not columns of ``df``. Returns nothing.
    """
    declared = set()
    # Walk through every name of every list in declaration order.
    for col in (name for group in liste_all_listes for name in group):
        if col in declared:
            print(f"Warning : La colonne '{col}' est déja présente !")
        else:
            declared.add(col)

    actual = set(df.columns)
    print(f"Des colonnes sont manquantes : {actual - declared}")
    print(f"Des colonnes sont mal écrites : {declared - actual}")

In [13]:
# Both printed sets should be empty when every column is listed exactly once.
check_work(df=data, liste_all_listes=all_cols)

Des colonnes sont manquantes : set()
Des colonnes sont mal écrites : set()


#### Encodage des colonnes "type de location", "accès au téléphone" et "compte bancaire" (OrdinalEncoder).

In [14]:
def ordinal_encode(data, columns):
    """Ordinal-encode the given columns and return the encoded frame.

    Each column is fitted independently with its own ``OrdinalEncoder``
    and replaced by the resulting float codes.

    Args:
        data: DataFrame containing ``columns``.
        columns: iterable of column names to encode.

    Returns:
        A new DataFrame with the same index and column order as ``data``;
        the listed columns hold the encoded values. ``data`` itself is
        left untouched, so passing a slice of another frame is safe.
    """
    # Work on a copy: writing into the caller's frame (often a slice)
    # is what triggers pandas' SettingWithCopy path.
    encoded = data.copy()
    for col in columns:
        codes = OrdinalEncoder().fit_transform(encoded[[col]])
        # fit_transform returns an (n, 1) array; flatten it before assigning.
        encoded[col] = np.asarray(codes).ravel()
    return encoded

#### Encodage des colonnes catégorielles (One Hot Encoder).

In [15]:
def one_hot_encode(data, columns):
    """One-hot encode ``columns`` of ``data``.

    Args:
        data: source DataFrame.
        columns: names of the columns to encode. ``None`` encodes every
            column of ``data``.

    Returns:
        A dense DataFrame with one indicator column per (column, category)
        pair, named by ``get_feature_names_out``, carrying the same index
        as ``data`` so it can be concatenated column-wise with other
        frames derived from ``data``.
    """
    subset = data if columns is None else data[list(columns)]

    try:
        one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # scikit-learn < 1.2 only knows the older `sparse` keyword
        # (renamed to `sparse_output` in 1.2, removed in 1.4).
        one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

    encoded = one_hot_encoder.fit_transform(subset)
    feature_names = one_hot_encoder.get_feature_names_out(input_features=subset.columns)
    # Keep the caller's index: a default RangeIndex would misalign on
    # concat(axis=1) whenever `data` is not indexed 0..n-1.
    return pd.DataFrame(encoded, columns=feature_names, index=data.index)

#### Traitement de la colonne "niveau_education" (anciennement "education_level")

In [16]:
def education_level(X_transform):
    """Replace ``niveau_education`` by an ordered numeric encoding.

    The levels are ranked from 'Other/Dont know/RTA' (0.0) up to
    'Tertiary education' (5.0), following the position in ``order``.

    Args:
        X_transform: DataFrame containing a ``niveau_education`` column.

    Returns:
        A new DataFrame with an ``education_level_encoded`` column and
        without ``niveau_education``. The input frame is not modified.

    Raises:
        ValueError: raised by ``OrdinalEncoder`` when the column holds a
            value that is not listed in ``order``.
    """
    order = [
        'Other/Dont know/RTA',
        'No formal education',
        'Primary education',
        'Secondary education',
        'Vocational/Specialised training',
        'Tertiary education',
    ]
    encoder = OrdinalEncoder(categories=[order])
    codes = encoder.fit_transform(X_transform[['niveau_education']])
    # `assign` returns a new frame, so the caller's object is left as-is.
    return (
        X_transform
        .assign(education_level_encoded=np.asarray(codes).ravel())
        .drop('niveau_education', axis=1)
    )

#### Preprocessing

In [17]:
def preprocessing(data, columns_ordinal: list, columns_onehot: list, passthrough: list):
    """Encode ``data`` and assemble the model-ready frame.

    Args:
        data: source DataFrame.
        columns_ordinal: columns handed to ``ordinal_encode``.
        columns_onehot: columns handed to ``one_hot_encode``.
        passthrough: columns copied over unchanged before the education
            level step; ``education_level`` reads ``niveau_education``
            from the assembled frame, so it has to be part of the inputs.

    Returns:
        DataFrame made of the ordinal-encoded columns, the one-hot
        columns and the passthrough columns (in that order), after
        ``education_level`` has been applied. Rows keep the index of
        ``data``.
    """
    # Explicit copy so the encoder never writes into a slice of `data`.
    df_ordinal = data[list(columns_ordinal)].copy()
    df_onehot = data[list(columns_onehot)]

    # Ordinal encoding.
    df_ordinal_encoded = ordinal_encode(data=df_ordinal, columns=columns_ordinal)

    # One-hot encoding.
    df_one_hot_encoded = one_hot_encode(data=df_onehot, columns=columns_onehot)
    # The one-hot frame may come back with a fresh 0..n-1 index; re-attach
    # the source index so the column-wise concat aligns row by row instead
    # of producing NaN-padded extra rows.
    df_one_hot_encoded = df_one_hot_encoded.set_axis(data.index, axis=0)

    # Column-wise concatenation of the encoded parts and the passthrough columns.
    data_transform = pd.concat(
        [df_ordinal_encoded, df_one_hot_encoded, data[passthrough]],
        axis=1,
    )

    # Ordered encoding of the education level.
    return education_level(data_transform)

#### Application du preprocessing

In [18]:
# Run the full encoding pipeline on the renamed data:
#   - `text` columns go through the OrdinalEncoder,
#   - `cat` columns go through the OneHotEncoder,
#   - `passthrough` columns are carried over as they are.
clean_data = preprocessing(
    data=data,
    columns_ordinal=text,
    columns_onehot=cat,
    passthrough=passthrough,
)

#### Equilibrage des données

In [19]:
def balanced_data(clean_data, target="compte_bancaire", random_state=42):
    """Oversample the minority class of ``target`` with SMOTE.

    Args:
        clean_data: fully numeric DataFrame containing ``target`` and an
            ``education_level_encoded`` column.
        target: name of the label column to balance on.
        random_state: seed passed to ``SMOTE`` for reproducibility.

    Returns:
        DataFrame with the resampled features followed by ``target`` as
        the last column. ``education_level_encoded`` is rounded back to
        whole levels.

    NOTE(review): only ``education_level_encoded`` is rounded here. If
    SMOTE interpolates the other encoded columns as well (one-hot and
    binary ones), they may hold fractional values; ``SMOTENC`` is the
    variant meant for categorical features. To be confirmed.
    """
    features = clean_data.drop(target, axis="columns")
    labels = clean_data[target]

    sampler = SMOTE(random_state=random_state, sampling_strategy="auto")
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)

    # Re-wrap the outputs so the result is a labelled frame whether the
    # sampler returns pandas objects or bare arrays.
    resampled = pd.concat(
        [
            pd.DataFrame(features_resampled, columns=features.columns),
            pd.Series(labels_resampled, name=target),
        ],
        axis=1,
    )
    resampled["education_level_encoded"] = resampled["education_level_encoded"].round()
    return resampled

In [20]:
# NOTE: this rebinds the name `balanced_data` from the function to its result.
# Re-running this cell fails until the cell defining the function is run again.
balanced_data = balanced_data(clean_data)

#### Export des données

In [22]:
# Write the balanced dataset to disk, leaving the index out of the file.
balanced_data.to_csv('data/preprocess_data.csv', index=False)