In [1]:
# Load modules for data manipulation
from dotenv import load_dotenv
import seaborn as sns
import matplotlib.pyplot as plt
from subprocess import call
import pandas as pd
import numpy as np
import os

In [2]:
# Load modules for machine learning
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Calculate the accurancy of the model
from sklearn.metrics import accuracy_score, classification_report

# For drawing the graph
from sklearn.tree import export_graphviz

In [3]:
load_dotenv()

True

In [4]:
# clt = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'TJ38.csv', encoding='ISO-8859-1', sep='\t')
clt = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'TR35.csv', encoding='ISO-8859-1', sep='\t', low_memory=False)
ctr = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'TJ7S.csv', encoding='ISO-8859-1', sep='\t', usecols=['COCO', 'COMAX', 'COPRO'], low_memory=False)

eco = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'eco-products.csv', encoding='ISO-8859-1', sep='\t')

## Get the best ecological product

In [5]:
# Get the list of all the eco products code
eco = [str(i).zfill(5) for i in eco['COPRO']]

# Declare the list of products for each family
# Les familles sont: CREDITS, DEPOTS MONETAIRES, PRODUITS EXTERNES COMMERCIALISES
credit = ['568', '943', '942', '546', '547', '3992', '940', '941', '548', '6458', '509', '565']
depot = ['7648', '7649', '7606', '3']
comm = ['5808', '5807']

# ECO: 00003 | 242568 - LIVRET DEVELOPPEMENT DURABLE ET SOLIDAIRE
# DEPOTS MONETAIRES - EPARGNE MONETAIRE LIQUIDE - LIVRETS REGLEMENTES
# ctr[ctr['COPRO'].isin(eco)]['COPRO'].value_counts()

## Concatinate with other tables TJ39 - TJDR - TJER

In [6]:
tj39 = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tj39.csv', encoding='ISO-8859-1', sep=';', usecols=['COMAX', 'MSMENC', 'MTPATR', 'MTVAOP'])
tjdr = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tjdr.csv', encoding='ISO-8859-1', sep=';', usecols=['COMAX', 'MTRVIM', 'MTRVFR', 'QTPAFI', 'COHAVI'])
tjer = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tjer.csv', encoding='ISO-8859-1', sep=';', usecols=['COMAX', 'MTAPJE', 'MTPJE'])

In [7]:
# pour le cas de la table tj39, la variable COMAX elle est de 13 caracteres
# or celle des autres tables sont de 10, donc il fallait convertir de 13 a 10
# on prend les 10 premiers caractères du COMAX de tj39, d'apres le data engineer
# de la PBS.
tj39['COMAX'] = [val[:10] for val in tj39['COMAX'].values]

In [8]:
# This table is only good fot the moral clients only
data = pd.merge(clt,tj39,on='COMAX',how='left')
data = pd.merge(data,tjer,on='COMAX',how='left')
data = pd.merge(data,tjdr,on='COMAX',how='left')

In [9]:
data.shape

(1689052, 120)

In [10]:
data = data.drop_duplicates(subset="COMAX")

In [11]:
data['CTSCPI'] = data['CTSCPI'].astype(str)
data['COPOST'] = data['COPOST'].astype(str)

data['COPOST'].replace('nan', np.nan, inplace=True)
data['CTSCPI'].replace('nan', np.nan, inplace=True)

data.dropna(subset=['COPOST'], inplace=True)
data.dropna(subset=['CTSCPI'], inplace=True)

In [11]:
# replace Male with 0, et F avec 1
data['COSEXE'].replace('M', 0, inplace=True)
data['COSEXE'].replace('F', 1, inplace=True)

In [12]:
# get the ids of the products
ids = ctr[ctr['COPRO'].isin(eco)]['COMAX'].values
data['ECO'] = data['COMAX'].isin(ids)

In [13]:
# these values are based int but detected as object
why = ['CTCOPO', 'CTMENB', 'CTSIFA', 'COPOAG', 'CTSC90', 'CTSC91', 'CTSC92', 'CTFORT','PSGPAR',
'CEBPF1', 'CEBPF2', 'CEBPF3', 'CEBPF4', 'CEBPF5', 'CEBPF6', 'CEBPF7', 'CTBP']

for i in why:
    for k in [j for j in data[i].unique() if j.strip() == '']:
         data[i].replace(k, np.nan, inplace=True)

In [14]:
to_del = {'QCBPFA', 'COPOST', 'COGRRB', 'COESPF', 'QCLDD', 'MTELDD', 'COPOAG', 'QCCONT', 'QCLIVR', 'QCLIVJ', 'QCCSL', 'QCLEP', 'MTELDD', 'MTELIJ'}
cols = list(data.columns.values[1:-1])

for i in list(to_del):
    cols.remove(i)

for l in cols:
    try:
        data[l] = data[l].fillna(data[l].median())
    except:
        cols.remove(l)

for l in cols:
    if data[l].isna().sum() != 0:
        data[l] = data[l].fillna(data[l].median())

In [15]:
'''for i in cols:
    print(i, data[i].isna().sum())'''
# data['CTMENB'] = data[''].fillna(data[''].median())

'for i in cols:\n    print(i, data[i].isna().sum())'

In [16]:
len(cols)

103

In [25]:
%store data
%store cols
%store eco

Stored 'data' (DataFrame)
Stored 'cols' (list)
Stored 'eco' (list)


In [18]:
def random_forest(data, ecos, cols):   
    ''' The Model'''
    
    # Labels are the values we want to predict
    labels = np.array(data['ECO'])

    # Remove the labels from the features
    features = np.array(data[cols])

    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2)

    # Instantiate model with 1000 decision trees
    rf = RandomForestRegressor(n_estimators = 300)

    # Train the model on training data
    rf.fit(train_features, train_labels)

    # Use the forest's predict method on the test data
    predictions = rf.predict(test_features)

    # Classification report
    print(classification_report(test_labels, predictions.round()))    

In [19]:
def knn(data, ecos, cols):
    ''' The Model'''
    
    # Labels are the values we want to predict
    labels = np.array(data['ECO'])

    # Remove the labels from the features
    features = np.array(data[cols])

    # Split the data into training and testing sets, random_state = 42
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state=42)

    # Instantiate model with 1000 decision trees
    model = KNeighborsClassifier(n_neighbors=3)
    
    # Train the model on training data
    model.fit(train_features, train_labels)
    
    # Use the forest's predict method on the test data
    predictions = model.predict(test_features)

    # Classification report
    print(classification_report(test_labels, predictions.round()))

In [20]:
def log_reg(data, ecos, cols):
    ''' The Model'''
    
    # Labels are the values we want to predict
    labels = np.array(data['ECO'])

    # Remove the labels from the features
    features = np.array(data[cols])

    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state=42)

    # Instantiate model with 1000 decision trees
    logisticRegr = LogisticRegression()
    
    # Train the model on training data
    logisticRegr.fit(train_features, train_labels)
    
    # Use the forest's predict method on the test data
    predictions = logisticRegr.predict(test_features)

    # Classification report
    print(classification_report(test_labels, predictions.round()))

In [21]:
''' The Model'''

# Labels are the values we want to predict
labels = np.array(data['ECO'])

# Remove the labels from the features
features = np.array(data[cols])

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2)

In [86]:
# Instantiate model with 300 decision trees
rf = RandomForestRegressor(n_estimators = 300)

st = time.time()

# Train the model on training data
rf.fit(train_features, train_labels)

fn = time.time()
print(fn - st)

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

print(time.time() - fn)

# Classification report
print(classification_report(test_labels, predictions.round()))

2347.960119009018
11.459137678146362
              precision    recall  f1-score   support

       False       0.88      0.91      0.90     69134
        True       0.79      0.74      0.76     32145

    accuracy                           0.86    101279
   macro avg       0.84      0.82      0.83    101279
weighted avg       0.85      0.86      0.85    101279



In [88]:
# CREDITS
print('CREDITS')
random_forest(data, credit, cols)

# DEPOTS MONETAIRES
print('DEPOTS MONETAIRES')
random_forest(data, depot, cols)

# PRODUITS EXTERNES COMMERCIALISES
print('PRODUITS EXTERNES COMMERCIALISES')
random_forest(data, comm, cols)

CREDITS
              precision    recall  f1-score   support

       False       0.88      0.91      0.90     69279
        True       0.79      0.74      0.77     32000

    accuracy                           0.86    101279
   macro avg       0.84      0.83      0.83    101279
weighted avg       0.85      0.86      0.86    101279

DEPOTS MONETAIRES
              precision    recall  f1-score   support

       False       0.88      0.91      0.90     69200
        True       0.79      0.74      0.77     32079

    accuracy                           0.86    101279
   macro avg       0.84      0.83      0.83    101279
weighted avg       0.85      0.86      0.86    101279

PRODUITS EXTERNES COMMERCIALISES
              precision    recall  f1-score   support

       False       0.88      0.91      0.90     69007
        True       0.79      0.74      0.77     32272

    accuracy                           0.86    101279
   macro avg       0.84      0.82      0.83    101279
weighted avg   

# - - - - - - - -- - - - - - - - - - - - - - - - -- - - - - - 

In [77]:
def plot_feature_importance(importance,names,model_type):
    #Create arrays from feature importance and feature name logistic regressions
    feature_importance = np.array(importance)
    feature_names = np.array(names)

    #Create a DataFrame using a Dictionary
    data={'feature_names':feature_names,'feature_importance':feature_importance}
    fi_df = pd.DataFrame(data)

    #Sort the DataFrame in order decreasing feature importance
    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)

    #Define size of bar plot
    plt.figure(figsize=(10,8))
    #Plot Searborn bar chart
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    #Add chart labels
    plt.title(model_type + '-FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    
    plt.savefig('{}.svg'.format(model_type))

In [None]:
plot_feature_importance(rf.feature_importances_[:30],cols[:30],'Random-Forest')

In [None]:
for lol in sorted(zip(cols, rf.feature_importances_), key=lambda x: x[1], reverse=True):
    print(lol)

In [22]:
matrix = data.corr().values

In [None]:
for i in range(len(cols) - 1):
    for j in range(i+1, len(cols)):
        print('{} -- {}:: {}'.format(cols[i], cols[j], matrix[i][j]*100))

In [35]:
print(data.groupby('ECO')['COSEXE'].value_counts())
print('\n')
print(data.groupby('ECO')['COSEXE'].value_counts(normalize=True))

ECO    COSEXE
False  1         179900
       0         174767
True   1          85488
       0          76045
Name: COSEXE, dtype: int64


ECO    COSEXE
False  1         0.507236
       0         0.492764
True   1         0.529229
       0         0.470771
Name: COSEXE, dtype: float64


In [56]:
# 'QTAGCL'
# lol = ['COSEXE', 'QTAGCL', 'CTSCPI']
def f(col, bins=None):
    print(data.groupby('ECO')[col].value_counts(bins=bins))
    print('\n')
    print(data.groupby('ECO')[col].value_counts(normalize=True, bins=bins))

In [57]:
f('COSEXE')

ECO    COSEXE
False  1         179900
       0         174767
True   1          85488
       0          76045
Name: COSEXE, dtype: int64


ECO    COSEXE
False  1         0.507236
       0         0.492764
True   1         0.529229
       0         0.470771
Name: COSEXE, dtype: float64


In [58]:
f('QTAGCL', 5)

ECO                 
False  (-0.11, 21.8]    121935
       (21.8, 43.6]      99298
       (43.6, 65.4]      84965
       (65.4, 87.2]      43453
       (87.2, 109.0]      5016
True   (43.8, 65.2]      67922
       (22.4, 43.8]      43919
       (65.2, 86.6]      41604
       (86.6, 108.0]      4508
       (0.892, 22.4]      3580
Name: QTAGCL, dtype: int64


ECO                 
False  (-0.11, 21.8]    0.343801
       (21.8, 43.6]     0.279975
       (43.6, 65.4]     0.239563
       (65.4, 87.2]     0.122518
       (87.2, 109.0]    0.014143
True   (43.8, 65.2]     0.420484
       (22.4, 43.8]     0.271889
       (65.2, 86.6]     0.257557
       (86.6, 108.0]    0.027908
       (0.892, 22.4]    0.022163
Name: QTAGCL, dtype: float64


In [59]:
f('CTSCPI')

ECO    CTSCPI
False  8400.0    105406
       8500.0     41821
       7600.0     25360
       5400.0     16953
       5200.0     16100
                  ...  
True   4400.0        17
       0.0            8
       8300.0         5
       6100.0         1
       8200.0         1
Name: CTSCPI, Length: 101, dtype: int64


ECO    CTSCPI
False  8400.0    0.297197
       8500.0    0.117916
       7600.0    0.071504
       5400.0    0.047800
       5200.0    0.045395
                   ...   
True   4400.0    0.000105
       0.0       0.000050
       8300.0    0.000031
       6100.0    0.000006
       8200.0    0.000006
Name: CTSCPI, Length: 101, dtype: float64


In [61]:
f('ECO')

ECO    ECO  
False  False    354667
True   True     161533
Name: ECO, dtype: int64


ECO    ECO  
False  False    1.0
True   True     1.0
Name: ECO, dtype: float64


In [63]:
print(data['ECO'].value_counts())
print('\n')
print(data['ECO'].value_counts(normalize=True))

False    354667
True     161533
Name: ECO, dtype: int64


False    0.687073
True     0.312927
Name: ECO, dtype: float64
