# Importing the modules

In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import tensorflow as tf
import tensorflow.keras as keras
import seaborn as sns

# Importing data

In [162]:
df_client_train = pd.read_csv('./data/dataset/sonelgaz_train/client_train.csv')
df_invoice_train = pd.read_csv('./data/dataset/sonelgaz_train/invoice_train.csv')

df_client_test = pd.read_csv('./data/dataset/sonelgaz_test/client_test.csv')
df_invoice_test = pd.read_csv('./data/dataset/sonelgaz_test/invoice_test.csv')

  df_invoice_train = pd.read_csv('./data/dataset/sonelgaz_train/invoice_train.csv')


# Exploring data

In [3]:
df_client_train.columns, df_invoice_train.columns

(Index(['disrict', 'client_id', 'client_catg', 'region', 'creation_date',
        'target'],
       dtype='object'),
 Index(['client_id', 'invoice_date', 'tarif_type', 'counter_number',
        'counter_statue', 'counter_code', 'reading_remarque',
        'counter_coefficient', 'consommation_level_1', 'consommation_level_2',
        'consommation_level_3', 'consommation_level_4', 'old_index',
        'new_index', 'months_number', 'counter_type'],
       dtype='object'))

In [4]:
def generate_empty_col(df):
    col = dict()

    for id in df['client_id'].values:
        col[id] = np.nan

    return col

In [5]:
def create_col(client, invoice, column, type):
    col = generate_empty_col(client)
    if type == "freq":
        for value, id in zip(invoice.groupby('client_id')[column].unique().map(len), invoice.groupby('client_id')[column].unique().map(len).index):
            col[id] = value

    elif type == "mean":
        for value, id in zip(invoice.groupby('client_id')[column].mean(), invoice.groupby('client_id')[column].mean().index):
            col[id] = value

    elif type == "var":
        for value, id in zip(invoice.groupby('client_id')[column].var(), invoice.groupby('client_id')[column].var().index):
            col[id] = value

    elif type == "std":
        for value, id in zip(invoice.groupby('client_id')[column].std(), invoice.groupby('client_id')[column].std().index):
            col[id] = value

    elif type == "mode":
        result = invoice.groupby('client_id')[column].unique().map(lambda x: x[0] if len(x) == 1 else "BOTH")
        for value, id in zip(result, result.index):
            col[id] = value

    return list(col.values())

In [6]:
def make_elec_gaz_col(client_train):
    ELEC = []
    GAZ = []

    for row in client_train["counter_type"]:
        if row=="ELEC":
            ELEC.append(1)
            GAZ.append(0)
        elif row=="GAZ":
            ELEC.append(0)
            GAZ.append(1)
        else:
            ELEC.append(1)
            GAZ.append(1)

    client_train["ELEC"] = ELEC
    client_train["GAZ"] = GAZ

    client_train.drop(["counter_type"], axis=1, inplace=True)

    return client_train

In [7]:
def generate_data(client, invoice):

    """
    Generate data for the invoice.
    """

    invoice['diff_index'] = invoice['new_index'] - invoice['old_index']

    invoice['mean_consommation'] = (  invoice['consommation_level_1'] + invoice['consommation_level_2'] +  invoice['consommation_level_3'] + invoice['consommation_level_4'] ) / 4

    freq_columns = ['tarif_type', 'counter_number', 'counter_code', 'reading_remarque', 'counter_statue', 'counter_code', 'reading_remarque', ]
    mean_columns = ['counter_coefficient', 'consommation_level_1', 'consommation_level_2', 'consommation_level_3', 'consommation_level_4', 'mean_consommation','old_index', 'new_index', 'diff_index']
    mode_columns = ['counter_type']

    for column in freq_columns:
        print("freq => ", column)
        client[column] = create_col(client, invoice, column, "freq")

    for column in mean_columns:
        print("mean => ", column)

        client[f"{column}_mean"] = create_col(client, invoice, column, "mean")
        client[f"{column}_var"] = create_col(client, invoice, column, "var")
        client[f"{column}_std"] = create_col(client, invoice, column, "std")

    for column in mode_columns:
        print("mode => ", column)
        client[column] = create_col(client, invoice, column, "mode")

    client['diff_mean_index'] = client['new_index_mean'] - client['old_index_mean']

    client = make_elec_gaz_col(client)

    return client, invoice

In [None]:
client_train, invoice_train = generate_data(df_client_train, df_invoice_train)

In [9]:
client_train.drop(["creation_date"], axis=1, inplace=True)

In [10]:
client_train.dropna(inplace=True)

In [None]:
client_train.isnull().sum()

# new dataset

In [14]:
client_train.to_csv('data_train.csv', index = False)

In [65]:
client_train.to_csv('clean_data.csv', index=False)

In [146]:
client_train = pd.read_csv('data_train.csv')

In [147]:
float_cols = client_train.select_dtypes(include=['float'])

client_train[float_cols.columns] = float_cols.round(2)

In [None]:
client_train.columns

In [103]:
drop = ['disrict', 'client_catg', 'region' ]

client_train.drop(drop, axis=1, inplace=True)

In [148]:
encode = ['disrict', 'client_catg', 'region']

for col in encode:
    client_train[col] = client_train[col].astype('str')

encoded_cols = pd.get_dummies(client_train[encode])
client_train = pd.concat([client_train, encoded_cols], axis=1)

client_train.drop(encode, axis=1, inplace=True)

In [None]:
client_train.columns

In [None]:
for col in client_train.columns:
    print(col)
    print(client_train[col].value_counts(), '\n')

In [149]:
normalize = ['counter_coefficient_mean', 'counter_coefficient_var', 'counter_coefficient_std', 'consommation_level_1_mean', 'consommation_level_1_var', 'consommation_level_1_std', 'consommation_level_2_mean', 'consommation_level_2_var', 'consommation_level_2_std', 'consommation_level_3_mean', 'consommation_level_3_var', 'consommation_level_3_std', 'consommation_level_4_mean', 'consommation_level_4_var', 'consommation_level_4_std', 'mean_consommation_mean', 'mean_consommation_var', 'mean_consommation_std', 'old_index_mean', 'old_index_var', 'old_index_std', 'new_index_mean', 'new_index_var', 'new_index_std', 'diff_index_mean', 'diff_index_var', 'diff_index_std', 'diff_mean_index']

for col in normalize:
    client_train[col] = (client_train[col] - client_train[col].mean()) / client_train[col].std()

In [125]:
def clean_data(client_train, categorize=True):

    float_cols = client_train.select_dtypes(include=['float'])
    client_train[float_cols.columns] = float_cols.round(2)

    if categorize:
        encode = ['disrict', 'client_catg', 'region']
        for col in encode:
            client_train[col] = client_train[col].astype('str')
        encoded_cols = pd.get_dummies(client_train[encode])
        client_train = pd.concat([client_train, encoded_cols], axis=1)
        client_train.drop(encode, axis=1, inplace=True)
    else:
        drop = ['disrict', 'client_catg', 'region' ]
        client_train.drop(drop, axis=1, inplace=True)

    normalize = ['counter_coefficient_mean', 'counter_coefficient_var', 'counter_coefficient_std', 'consommation_level_1_mean', 'consommation_level_1_var', 'consommation_level_1_std', 'consommation_level_2_mean', 'consommation_level_2_var', 'consommation_level_2_std', 'consommation_level_3_mean', 'consommation_level_3_var', 'consommation_level_3_std', 'consommation_level_4_mean', 'consommation_level_4_var', 'consommation_level_4_std', 'mean_consommation_mean', 'mean_consommation_var', 'mean_consommation_std', 'old_index_mean', 'old_index_var', 'old_index_std', 'new_index_mean', 'new_index_var', 'new_index_std', 'diff_index_mean', 'diff_index_var', 'diff_index_std', 'diff_mean_index']
    for col in normalize:
        client_train[col] = (client_train[col] - client_train[col].mean()) / client_train[col].std()

    return client_train

# Feature selection & engineering

In [None]:
client_train_1 = client_train[client_train['target'] == 1]
client_train_0 = client_train[client_train['target'] == 0]

for col in client_train_1.select_dtypes(include=['int64', 'float64']):
    plt.hist(client_train_1[col], alpha=0.5, label='Target 1')
    plt.hist(client_train_0[col], alpha=0.5, label='Target 0')
    plt.title(f'Histogram of {col} by target')
    plt.legend()
    plt.show()


In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

X = client_train.drop(['target', 'client_id'], axis=1)
y = client_train['target']

selector = SelectKBest(score_func=mutual_info_classif, k='all')
selector.fit(X, y)

scores = selector.scores_
feature_names = X.columns

mi_scores = dict(zip(feature_names, scores))

for name, score in sorted(mi_scores.items(), key=lambda x: x[1], reverse=True):
    print(f"{name}: {score:.2f}")


# Imbalance

In [150]:
ids = client_train['client_id'].values
X_imb = client_train.drop(['target', 'client_id'], axis=1)
y_imb = client_train['target']

In [151]:
from imblearn.over_sampling import RandomOverSampler

oversampler = RandomOverSampler(random_state=42)

X, y = oversampler.fit_resample(X_imb, y_imb)

In [109]:
y.value_counts()

0.0    123753
1.0    123753
Name: target, dtype: int64

In [158]:
X

Unnamed: 0,tarif_type,counter_number,counter_code,reading_remarque,counter_statue,counter_coefficient_mean,counter_coefficient_var,counter_coefficient_std,consommation_level_1_mean,consommation_level_1_var,...,region_308,region_309,region_310,region_311,region_312,region_313,region_371,region_372,region_379,region_399
0,1,1,2,3,1,-0.007346,-0.00621,-0.010978,-0.147801,-0.016457,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,3,1,-0.007346,-0.00621,-0.010978,0.260214,-0.020024,...,0,0,0,0,0,0,0,0,0,0
2,1,1,2,3,1,-0.007346,-0.00621,-0.010978,0.739692,-0.005987,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,2,1,-0.007346,-0.00621,-0.010978,-0.846322,-0.022469,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,2,1,-0.007346,-0.00621,-0.010978,0.471382,-0.019314,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247501,2,2,2,3,2,-0.007346,-0.00621,-0.010978,0.229624,-0.006464,...,0,0,0,0,0,0,0,0,0,0
247502,2,3,2,3,3,-0.007346,-0.00621,-0.010978,-0.156791,-0.015914,...,0,0,0,1,0,0,0,0,0,0
247503,2,3,2,3,1,-0.007346,-0.00621,-0.010978,0.151120,-0.011852,...,0,0,0,1,0,0,0,0,0,0
247504,2,3,2,3,2,-0.007346,-0.00621,-0.010978,-0.313680,-0.018668,...,0,0,0,0,0,0,0,0,0,0


# PCA 

In [211]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.99)

X_pca = pca.fit_transform(X)

# XGBoost

In [212]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

clf = XGBClassifier(objective='binary:logistic', eval_metric='auc', seed=42)

params = {
    'learning_rate': [0.15],
    'max_depth': [20],
    'n_estimators': [400]
}

cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid_search = GridSearchCV(estimator=clf, param_grid=params, scoring='roc_auc', cv=cv, verbose=2)

grid_search.fit(X_pca, y)

print("Best parameters: ", grid_search.best_params_)

Fitting 4 folds for each of 1 candidates, totalling 4 fits
[CV] END .learning_rate=0.15, max_depth=20, n_estimators=400; total time= 4.9min
[CV] END .learning_rate=0.15, max_depth=20, n_estimators=400; total time= 4.8min
[CV] END .learning_rate=0.15, max_depth=20, n_estimators=400; total time= 5.0min
[CV] END .learning_rate=0.15, max_depth=20, n_estimators=400; total time= 4.8min
Best parameters:  {'learning_rate': 0.15, 'max_depth': 20, 'n_estimators': 400}


In [213]:
print("Best parameters: ", grid_search.best_params_)

Best parameters:  {'learning_rate': 0.15, 'max_depth': 20, 'n_estimators': 400}


In [215]:
y_pred = grid_search.predict(X_pca)
y_probas = grid_search.predict_proba(X_pca)[:, 1]

roc_auc = roc_auc_score(y, y_probas)
accuracy = accuracy_score(y, y_pred)
f1 = f1_score(y, y_pred)

print("roc_auc: ", roc_auc)
print("accuracy: ", accuracy)
print("f1_score: ", f1)

roc_auc:  1.0
accuracy:  1.0
f1_score:  1.0


In [216]:
xgb_model = grid_search.best_estimator_

# DL

In [182]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import AUC
from keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint

In [225]:
model = Sequential()

model.add(BatchNormalization())
model.add(Dense(92, input_dim=X_pca.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.1))
model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

In [135]:
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath='./model_checkpoint/model',
    save_weights_only=True,
    save_best_only=False,
    save_freq='epoch')

checkpoint = ModelCheckpoint('model_weights.h5', save_weights_only=True, save_freq='epoch')

In [226]:
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001), metrics=[AUC(name='aucroc')])

early_stopping = EarlyStopping(monitor='aucroc', mode='max', patience=4, verbose=1)

  super().__init__(name, **kwargs)


In [None]:
model.fit(X_pca, y, epochs=20, batch_size=64, callbacks=[early_stopping, checkpoint_callback])

In [229]:
test_loss, test_aucroc = model.evaluate(X_pca, y)
print('Test AUC-ROC:', test_aucroc)

Test AUC-ROC: 0.90036940574646


# Test data

In [None]:
client_test, invoice_test = generate_data(df_client_test, df_invoice_test)

In [166]:
client_test.fillna(client_test.mean(), inplace=True)

  client_test.fillna(client_test.mean(), inplace=True)


In [167]:
client_test

Unnamed: 0,disrict,client_id,client_catg,region,creation_date,tarif_type,counter_number,counter_code,reading_remarque,counter_statue,...,old_index_std,new_index_mean,new_index_var,new_index_std,diff_index_mean,diff_index_var,diff_index_std,diff_mean_index,ELEC,GAZ
0,62,test_Client_0,11,307,28/05/2002,1,1,1,3,1,...,5401.043096,11881.216216,2.975780e+07,5455.070893,491.378378,5.554735e+04,235.684859,491.378378,1,0
1,69,test_Client_1,11,103,06/08/2009,1,1,1,3,2,...,17638.422718,32083.954545,3.116216e+08,17652.806615,2703.181818,1.024544e+07,3200.849986,2703.181818,1,0
2,62,test_Client_10,11,310,07/04/2004,2,2,2,3,1,...,10987.377541,12381.364865,1.276873e+08,11299.881222,607.310811,1.787428e+05,422.779872,607.310811,1,1
3,60,test_Client_100,11,101,08/10/1992,2,2,2,3,1,...,13242.361986,13868.075000,1.807456e+08,13444.166323,244.350000,6.113413e+04,247.253171,244.350000,1,1
4,62,test_Client_1000,11,301,21/07/1977,2,3,2,3,2,...,14371.199312,14676.584906,2.087575e+08,14448.442049,749.113208,7.470055e+05,864.294814,749.113208,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58064,63,test_Client_9995,11,399,17/03/2010,2,2,2,1,1,...,11969.869545,10822.500000,1.563153e+08,12502.612354,461.250000,2.838289e+05,532.755964,461.250000,1,1
58065,63,test_Client_9996,11,311,28/05/2011,2,3,2,3,3,...,1616.041631,2099.173913,2.784451e+06,1668.667337,181.565217,1.156847e+04,107.556838,181.565217,1,1
58066,60,test_Client_9997,11,101,04/03/1978,2,3,2,3,1,...,9270.282678,12132.220339,8.590230e+07,9268.349198,172.491525,1.368598e+04,116.987086,172.491525,1,1
58067,60,test_Client_9998,11,101,23/02/2018,1,1,1,1,1,...,8056.220038,1927.000000,2.908807e+08,8191.814912,1927.000000,2.941785e+07,809.859108,1927.000000,1,0


In [168]:
client_test = clean_data(client_test, True)

In [169]:
client_test.drop(["creation_date"], axis=1, inplace=True)

In [170]:
ids_test = client_test['client_id'].values
X_test = client_test.drop(['client_id'], axis=1)

# Predict

In [218]:
X_test_pca = pca.transform(X_test)

In [230]:
# DL
predictions_dl = model.predict(X_test_pca)



In [231]:
predictions_dl

array([[0.21277757],
       [0.6482666 ],
       [0.09552129],
       ...,
       [0.9439009 ],
       [0.39999476],
       [0.64516926]], dtype=float32)

In [232]:
columns = {
    "client_id": ids_test,
    "target" : [proba[0] for proba in predictions_dl]
}

In [220]:
# ML
predcitions_ml = grid_search.predict_proba(X_test_pca)[:, 1]

In [208]:
predcitions_ml

array([1.4720542e-03, 1.2000856e-01, 3.8787842e-04, ..., 8.6289847e-01,
       3.9754476e-04, 1.5268045e-03], dtype=float32)

In [200]:
columns = {
    "client_id": ids_test,
    "target" :  [ (0.8 * probaML + 0.2 * probaDL[0] ) for probaML, probaDL in zip(predcitions_ml, predictions_dl) ]  
}

In [222]:
submit = pd.DataFrame(columns)

submit.to_csv("submission.csv", index=False)