In [1]:
# Load modules for data manipulation
from dotenv import load_dotenv
import seaborn as sns
import matplotlib.pyplot as plt
from subprocess import call
import pandas as pd
import numpy as np
import os

In [2]:
# Load modules for machine learning
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Calculate the accurancy of the model
from sklearn.metrics import accuracy_score, classification_report

# For drawing the graph
from sklearn.tree import export_graphviz

In [3]:
load_dotenv()

True

In [4]:
# clt = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'TJ38.csv', encoding='ISO-8859-1', sep='\t')
clt = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'TR35.csv', encoding='ISO-8859-1', sep='\t', low_memory=False)
ctr = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'TJ7S.csv', encoding='ISO-8859-1', sep='\t', usecols=['COCO', 'COMAX', 'COPRO'], low_memory=False)

eco = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'eco-products.csv', encoding='ISO-8859-1', sep='\t')

## Get the best ecological product

In [5]:
# Get the list of all the eco products code
eco = [str(i).zfill(5) for i in eco['COPRO']]

# Declare the list of products for each family
# Les familles sont: CREDITS, DEPOTS MONETAIRES, PRODUITS EXTERNES COMMERCIALISES
credit = ['568', '943', '942', '546', '547', '3992', '940', '941', '548', '6458', '509', '565']
depot = ['7648', '7649', '7606', '3']
comm = ['5808', '5807']

# ECO: 00003 | 242568 - LIVRET DEVELOPPEMENT DURABLE ET SOLIDAIRE
# DEPOTS MONETAIRES - EPARGNE MONETAIRE LIQUIDE - LIVRETS REGLEMENTES
# ctr[ctr['COPRO'].isin(eco)]['COPRO'].value_counts()

## Concatinate with other tables TJ39 - TJDR - TJER

In [8]:
tj39 = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tj39.csv', encoding='ISO-8859-1', sep=';', usecols=['COMAX', 'MSMENC', 'MTPATR', 'MTVAOP'])
tjdr = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tjdr.csv', encoding='ISO-8859-1', sep=';', usecols=['COMAX', 'MTRVIM', 'MTRVFR', 'QTPAFI', 'COHAVI'])
tjer = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tjer.csv', encoding='ISO-8859-1', sep=';', usecols=['COMAX', 'MTAPJE', 'MTPJE'])

In [9]:
# pour le cas de la table tj39, la variable COMAX elle est de 13 caracteres
# or celle des autres tables sont de 10, donc il fallait convertir de 13 a 10
# on prend les 10 premiers caractères du COMAX de tj39, d'apres le data engineer
# de la PBS.
tj39['COMAX'] = [val[:10] for val in tj39['COMAX'].values]

In [10]:
# This table is only good fot the moral clients only
data = pd.merge(clt,tj39,on='COMAX',how='left')
data = pd.merge(data,tjer,on='COMAX',how='left')
data = pd.merge(data,tjdr,on='COMAX',how='left')

In [11]:
data.shape

(1689052, 120)

In [12]:
data = data.drop_duplicates(subset="COMAX")

In [13]:
data['CTSCPI'] = data['CTSCPI'].astype(str)
data['COPOST'] = data['COPOST'].astype(str)

data['COPOST'].replace('nan', np.nan, inplace=True)
data['CTSCPI'].replace('nan', np.nan, inplace=True)

data.dropna(subset=['COPOST'], inplace=True)
data.dropna(subset=['CTSCPI'], inplace=True)

In [14]:
# replace Male with 0, et F avec 1
data['COSEXE'].replace('M', 0, inplace=True)
data['COSEXE'].replace('F', 1, inplace=True)

In [15]:
# get the ids of the products
ids = ctr[ctr['COPRO'].isin(eco)]['COMAX'].values
data['ECO'] = data['COMAX'].isin(ids)

In [16]:
# these values are based int but detected as object
why = ['CTCOPO', 'CTMENB', 'CTSIFA', 'COPOAG', 'CTSC90', 'CTSC91', 'CTSC92', 'CTFORT','PSGPAR',
'CEBPF1', 'CEBPF2', 'CEBPF3', 'CEBPF4', 'CEBPF5', 'CEBPF6', 'CEBPF7', 'CTBP']

for i in why:
    for k in [j for j in data[i].unique() if j.strip() == '']:
         data[i].replace(k, np.nan, inplace=True)

In [17]:
to_del = {'COHAVI', 'CERCPT', 'CERCPE', 'CERCEP', 'CERCPS', 'CERCPL', 'CERCPC', 'CERCPP', 'QCBPFA', 'COPOST', 'COGRRB', 'COESPF', 'QCLDD', 'MTELDD', 'COPOAG', 'QCCONT', 'QCLIVR', 'QCLIVJ', 'QCCSL', 'QCLEP', 'MTELDD', 'MTELIJ'}
cols = list(data.columns.values[1:-1])

for i in list(to_del):
    cols.remove(i)

for l in cols:
    try:
        data[l] = data[l].fillna(data[l].median())
    except:
        cols.remove(l)

for l in cols:
    if data[l].isna().sum() != 0:
        data[l] = data[l].fillna(data[l].median())

In [18]:
for i in cols:
    if data[i].isna().sum() != 0:
        print(i)
# data['CTMENB'] = data[''].fillna(data[''].median())

In [19]:
len(cols)

95

In [20]:
%store data
%store cols
%store eco

Stored 'data' (DataFrame)
Stored 'cols' (list)
Stored 'eco' (list)


In [21]:
def random_forest(data, ecos, cols):   
    ''' The Model'''
    
    # Labels are the values we want to predict
    labels = np.array(data['ECO'])

    # Remove the labels from the features
    features = np.array(data[cols])

    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2)

    # Instantiate model with 1000 decision trees
    rf = RandomForestRegressor(n_estimators = 300)

    # Train the model on training data
    rf.fit(train_features, train_labels)

    # Use the forest's predict method on the test data
    predictions = rf.predict(test_features)

    # Classification report
    print(classification_report(test_labels, predictions.round()))    

In [22]:
def knn(data, ecos, cols):
    ''' The Model'''
    
    # Labels are the values we want to predict
    labels = np.array(data['ECO'])

    # Remove the labels from the features
    features = np.array(data[cols])

    # Split the data into training and testing sets, random_state = 42
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state=42)

    # Instantiate model with 1000 decision trees
    model = KNeighborsClassifier(n_neighbors=3)
    
    # Train the model on training data
    model.fit(train_features, train_labels)
    
    # Use the forest's predict method on the test data
    predictions = model.predict(test_features)

    # Classification report
    print(classification_report(test_labels, predictions.round()))

In [23]:
def log_reg(data, ecos, cols):
    ''' The Model'''
    
    # Labels are the values we want to predict
    labels = np.array(data['ECO'])

    # Remove the labels from the features
    features = np.array(data[cols])

    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state=42)

    # Instantiate model with 1000 decision trees
    logisticRegr = LogisticRegression()
    
    # Train the model on training data
    logisticRegr.fit(train_features, train_labels)
    
    # Use the forest's predict method on the test data
    predictions = logisticRegr.predict(test_features)

    # Classification report
    print(classification_report(test_labels, predictions.round()))

In [23]:
''' The Model'''

# Labels are the values we want to predict
labels = np.array(data['ECO'])

# Remove the labels from the features
features = np.array(data[cols])

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2)

In [86]:
# Instantiate model with 300 decision trees
rf = RandomForestRegressor(n_estimators = 300)

st = time.time()

# Train the model on training data
rf.fit(train_features, train_labels)

fn = time.time()
print(fn - st)

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

print(time.time() - fn)

# Classification report
print(classification_report(test_labels, predictions.round()))

2347.960119009018
11.459137678146362
              precision    recall  f1-score   support

       False       0.88      0.91      0.90     69134
        True       0.79      0.74      0.76     32145

    accuracy                           0.86    101279
   macro avg       0.84      0.82      0.83    101279
weighted avg       0.85      0.86      0.85    101279



In [88]:
# CREDITS
print('CREDITS')
random_forest(data, credit, cols)

# DEPOTS MONETAIRES
print('DEPOTS MONETAIRES')
random_forest(data, depot, cols)

# PRODUITS EXTERNES COMMERCIALISES
print('PRODUITS EXTERNES COMMERCIALISES')
random_forest(data, comm, cols)

CREDITS
              precision    recall  f1-score   support

       False       0.88      0.91      0.90     69279
        True       0.79      0.74      0.77     32000

    accuracy                           0.86    101279
   macro avg       0.84      0.83      0.83    101279
weighted avg       0.85      0.86      0.86    101279

DEPOTS MONETAIRES
              precision    recall  f1-score   support

       False       0.88      0.91      0.90     69200
        True       0.79      0.74      0.77     32079

    accuracy                           0.86    101279
   macro avg       0.84      0.83      0.83    101279
weighted avg       0.85      0.86      0.86    101279

PRODUITS EXTERNES COMMERCIALISES
              precision    recall  f1-score   support

       False       0.88      0.91      0.90     69007
        True       0.79      0.74      0.77     32272

    accuracy                           0.86    101279
   macro avg       0.84      0.82      0.83    101279
weighted avg   

# Importance Graph

In [77]:
def plot_feature_importance(importance,names,model_type):
    #Create arrays from feature importance and feature name logistic regressions
    feature_importance = np.array(importance)
    feature_names = np.array(names)

    #Create a DataFrame using a Dictionary
    data={'feature_names':feature_names,'feature_importance':feature_importance}
    fi_df = pd.DataFrame(data)

    #Sort the DataFrame in order decreasing feature importance
    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)

    #Define size of bar plot
    plt.figure(figsize=(10,8))
    #Plot Searborn bar chart
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    #Add chart labels
    plt.title(model_type + '-FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    
    plt.savefig('{}.svg'.format(model_type))

In [None]:
plot_feature_importance(rf.feature_importances_[:30],cols[:30],'Random-Forest')

In [None]:
for lol in sorted(zip(cols, rf.feature_importances_), key=lambda x: x[1], reverse=True):
    print(lol)

# Statistics

In [24]:
data.shape

(506392, 121)

In [40]:
print(data['ECO'].value_counts())

False    346365
True     160027
Name: ECO, dtype: int64


In [41]:
print(data['COSEXE'].value_counts())

1    258539
0    247853
Name: COSEXE, dtype: int64


In [43]:
print('0-17 ::', data[(0 <= data['QTAGCL']) & (data['QTAGCL'] < 18)]['COSEXE'].count())
print('18-39 ::', data[(18 <= data['QTAGCL']) & (data['QTAGCL'] < 40)]['COSEXE'].count())
print('40-64 ::', data[(40 <= data['QTAGCL']) & (data['QTAGCL'] < 65)]['COSEXE'].count())
print('65-~ ::', data[65 <= data['QTAGCL']]['COSEXE'].count())

0-17 :: 98172
18-39 :: 135327
40-64 :: 173403
65-~ :: 99490


In [47]:
# lol = ['COSEXE', 'QTAGCL', 'CTSCPI']
print(data.groupby('ECO')['COSEXE'].value_counts())

ECO    COSEXE
False  1         174093
       0         172272
True   1          84446
       0          75581
Name: COSEXE, dtype: int64


In [57]:
print('0-17 ::\n', data[(0 <= data['QTAGCL']) & (data['QTAGCL'] < 18)].groupby('ECO')['COSEXE'].count(), '\n')
print('18-39 ::\n', data[(18 <= data['QTAGCL']) & (data['QTAGCL'] < 40)].groupby('ECO')['COSEXE'].count(), '\n')
print('40-64 ::\n', data[(40 <= data['QTAGCL']) & (data['QTAGCL'] < 65)].groupby('ECO')['COSEXE'].count(), '\n')
print('65-~ ::\n', data[65 <= data['QTAGCL']].groupby('ECO')['COSEXE'].count(), '\n')

0-17 ::
 ECO
False    97505
True       667
Name: COSEXE, dtype: int64 

18-39 ::
 ECO
False    100192
True      35135
Name: COSEXE, dtype: int64 

40-64 ::
 ECO
False    98117
True     75286
Name: COSEXE, dtype: int64 

65-~ ::
 ECO
False    50551
True     48939
Name: COSEXE, dtype: int64 



In [64]:
data.head()

Unnamed: 0,COMAX,CTCOPO,COESPF,COGRRB,COSGPA,CTMENB,COCINS,COPOST,CTSIFA,CTSCPI,...,MSMENC,MTPATR,MTVAOP,MTAPJE,MTPJE,COHAVI,MTRVIM,MTRVFR,QTPAFI,ECO
0,39e4b5c00a,582,d10869c66b,194acf0904,3200,4,34172,34000.0,1,5500.0,...,0.0,150000.0,150.0,0.0,4000.0,1.0,0.0,100.0,1.0,False
1,77cedb77d9,527,c3ea678087,fd7c332a52,3200,7,66136,66000.0,2,6700.0,...,0.0,150000.0,150.0,0.0,4000.0,1.0,0.0,100.0,1.0,False
2,cfb9f1d41b,542,76b274b0ad,e9a0dda18b,3200,7,66021,66430.0,5,4700.0,...,0.0,150000.0,150.0,0.0,4000.0,1.0,0.0,100.0,1.0,True
3,3a738f58cc,542,cb74f7d1d3,7044fd434c,3200,7,66148,66660.0,5,4200.0,...,0.0,300000.0,300.0,0.0,4000.0,1.0,0.0,100.0,1.0,True
15,57fdb1b811,582,7db43147e8,85bda13ea9,3200,1,66050,66530.0,1,4300.0,...,0.0,150000.0,150.0,0.0,4000.0,1.0,0.0,100.0,1.0,False


## Save it

In [82]:
ninja = ctr[ctr['COPRO'].isin(eco)][['COMAX', 'COPRO']]

In [86]:
ninja = ninja[ninja['COMAX'].isin(data[data['ECO'] == True]['COMAX'])]

In [87]:
ninja.shape

(215419, 2)

In [95]:
ninja = list(zip(ninja['COPRO'].value_counts().index, ninja['COPRO'].value_counts().values))

In [107]:
for ko in ninja:
    if ko[0] in credit:
        print('CREDIT::', ko[0], ',', ko[1])
    if ko[0] in depot:
        print('DEPOT::', ko[0], ',', ko[1])
    if ko[0] in comm:
        print('COMM::', ko[0], ',', ko[1])

DEPOT:: 00003 , 201998
CREDIT:: 00546 , 4115
CREDIT:: 00548 , 3598
DEPOT:: 07649 , 2321
CREDIT:: 00940 , 1734
CREDIT:: 00547 , 832
CREDIT:: 00509 , 379
CREDIT:: 00941 , 299
CREDIT:: 03992 , 85
CREDIT:: 06458 , 52
CREDIT:: 00565 , 6
