In [1]:
# Load modules for data manipulation
from dotenv import load_dotenv
import seaborn as sns
import matplotlib.pyplot as plt
from subprocess import call
import pandas as pd
import numpy as np
import time
import os

In [2]:
# Load modules for machine learning
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Calculate the accurancy of the model
from sklearn.metrics import accuracy_score, classification_report

# For drawing the graph
from sklearn.tree import export_graphviz

In [3]:
load_dotenv()

True

In [4]:
# clt = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'TJ38.csv', encoding='ISO-8859-1', sep='\t')
clt = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'TR35.csv', encoding='ISO-8859-1', sep='\t', low_memory=False)
ctr = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'TJ7S.csv', encoding='ISO-8859-1', sep='\t', usecols=['COCO', 'COMAX', 'COPRO'], low_memory=False)

eco = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'eco-products.csv', encoding='ISO-8859-1', sep='\t')

## Get the best ecological product

In [5]:
# Get the list of all the eco products code
eco = [str(i).zfill(5) for i in eco['COPRO']]

# Declare the list of products for each family
# Les familles sont: CREDITS, DEPOTS MONETAIRES, PRODUITS EXTERNES COMMERCIALISES
credit = ['00568', '00943', '00942', '00546', '00547', '03992', '00940', '00941', '00548', '06458', '00509', '00565']
depot = ['07648', '07649', '07606', '00003']
comm = ['05808', '05807']

# ECO: 00003 | 242568 - LIVRET DEVELOPPEMENT DURABLE ET SOLIDAIRE
# DEPOTS MONETAIRES - EPARGNE MONETAIRE LIQUIDE - LIVRETS REGLEMENTES
# ctr[ctr['COPRO'].isin(eco)]['COPRO'].value_counts()

## Concatinate with other tables TJ39 - TJDR - TJER

In [6]:
tj39 = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tj39.csv', encoding='ISO-8859-1', sep=';', usecols=['COMAX', 'MSMENC', 'MTPATR', 'MTVAOP'])
tjdr = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tjdr.csv', encoding='ISO-8859-1', sep=';', usecols=['COMAX', 'MTRVIM', 'MTRVFR', 'QTPAFI', 'COHAVI'])
tjer = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tjer.csv', encoding='ISO-8859-1', sep=';', usecols=['COMAX', 'MTAPJE', 'MTPJE'])

In [7]:
# pour le cas de la table tj39, la variable COMAX elle est de 13 caracteres
# or celle des autres tables sont de 10, donc il fallait convertir de 13 a 10
# on prend les 10 premiers caractères du COMAX de tj39, d'apres le data engineer
# de la PBS.
tj39['COMAX'] = [val[:10] for val in tj39['COMAX'].values]

In [8]:
# This table is only good fot the moral clients only
data = pd.merge(clt,tj39,on='COMAX',how='left')
data = pd.merge(data,tjer,on='COMAX',how='left')
data = pd.merge(data,tjdr,on='COMAX',how='left')

In [9]:
data.shape

(1689052, 120)

In [10]:
data = data.drop_duplicates(subset="COMAX")

In [11]:
data['CTSCPI'] = data['CTSCPI'].astype(str)
data['COPOST'] = data['COPOST'].astype(str)

data['COPOST'].replace('nan', np.nan, inplace=True)
data['CTSCPI'].replace('nan', np.nan, inplace=True)

data.dropna(subset=['COPOST'], inplace=True)
data.dropna(subset=['CTSCPI'], inplace=True)

In [12]:
# replace Male with 0, et F avec 1
data['COSEXE'].replace('M', 0, inplace=True)
data['COSEXE'].replace('F', 1, inplace=True)

In [13]:
# get the ids of the products
ids = ctr[ctr['COPRO'].isin(eco)]['COMAX'].values
data['ECO'] = data['COMAX'].isin(ids)

In [14]:
# these values are based int but detected as object
why = ['CTCOPO', 'CTMENB', 'CTSIFA', 'COPOAG', 'CTSC90', 'CTSC91', 'CTSC92', 'CTFORT','PSGPAR',
'CEBPF1', 'CEBPF2', 'CEBPF3', 'CEBPF4', 'CEBPF5', 'CEBPF6', 'CEBPF7', 'CTBP']

for i in why:
    for k in [j for j in data[i].unique() if j.strip() == '']:
         data[i].replace(k, np.nan, inplace=True)

In [15]:
to_del = {'COHAVI', 'CERCPT', 'CERCPE', 'CERCEP', 'CERCPS', 'CERCPL', 'CERCPC', 'CERCPP', 'QCBPFA', 'COPOST', 'COGRRB', 'COESPF', 'QCLDD', 'MTELDD', 'COPOAG', 'QCCONT', 'QCLIVR', 'QCLIVJ', 'QCCSL', 'QCLEP', 'MTELDD', 'MTELIJ'}
cols = list(data.columns.values[1:-1])

for i in list(to_del):
    cols.remove(i)

for l in cols:
    try:
        data[l] = data[l].fillna(data[l].median())
    except:
        cols.remove(l)

for l in cols:
    if data[l].isna().sum() != 0:
        data[l] = data[l].fillna(data[l].median())

In [16]:
for i in cols:
    if data[i].isna().sum() != 0:
        print(i)
# data['CTMENB'] = data[''].fillna(data[''].median())

In [17]:
len(cols)

95

In [18]:
%store data
%store cols
%store eco
%store ctr

Stored 'data' (DataFrame)
Stored 'cols' (list)
Stored 'eco' (list)
Stored 'ctr' (DataFrame)


In [21]:
def random_forest(data, ecos, cols):   
    ''' The Model'''
    
    # Labels are the values we want to predict
    labels = np.array(data['ECO'])

    # Remove the labels from the features
    features = np.array(data[cols])

    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2)

    # Instantiate model with 1000 decision trees
    rf = RandomForestRegressor(n_estimators = 300)

    # Train the model on training data
    rf.fit(train_features, train_labels)

    # Use the forest's predict method on the test data
    predictions = rf.predict(test_features)

    # Get the best variables
    shit = sorted(zip(cols, rf.feature_importances_), key=lambda x: x[1], reverse=True)
    for i in shit:
        print(i)    

In [22]:
def knn(data, ecos, cols):
    ''' The Model'''
    
    # Labels are the values we want to predict
    labels = np.array(data['ECO'])

    # Remove the labels from the features
    features = np.array(data[cols])

    # Split the data into training and testing sets, random_state = 42
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state=42)

    # Instantiate model with 1000 decision trees
    model = KNeighborsClassifier(n_neighbors=3)
    
    # Train the model on training data
    model.fit(train_features, train_labels)
    
    # Use the forest's predict method on the test data
    predictions = model.predict(test_features)

    # Classification report
    print(classification_report(test_labels, predictions.round()))

In [23]:
def log_reg(data, ecos, cols):
    ''' The Model'''
    
    # Labels are the values we want to predict
    labels = np.array(data['ECO'])

    # Remove the labels from the features
    features = np.array(data[cols])

    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state=42)

    # Instantiate model with 1000 decision trees
    logisticRegr = LogisticRegression()
    
    # Train the model on training data
    logisticRegr.fit(train_features, train_labels)
    
    # Use the forest's predict method on the test data
    predictions = logisticRegr.predict(test_features)

    # Classification report
    print(classification_report(test_labels, predictions.round()))

In [81]:
''' The Model'''

# Labels are the values we want to predict
labels = np.array(data['ECO'])

# Remove the labels from the features
features = np.array(data[cols])

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2)

In [82]:
# Instantiate model with 300 decision trees
rf = RandomForestRegressor(n_estimators = 1000)

st = time.time()

# Train the model on training data
rf.fit(train_features, train_labels)

fn = time.time()
print(fn - st)

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

print(time.time() - fn)

# Classification report
print(classification_report(test_labels, predictions.round()))

7644.804353713989
36.22756266593933
              precision    recall  f1-score   support

       False       0.88      0.91      0.90     68956
        True       0.80      0.74      0.77     32323

    accuracy                           0.86    101279
   macro avg       0.84      0.82      0.83    101279
weighted avg       0.85      0.86      0.85    101279



# Statistics

In [24]:
data.shape

(506392, 121)

In [40]:
print(data['ECO'].value_counts())

False    346365
True     160027
Name: ECO, dtype: int64


In [41]:
print(data['COSEXE'].value_counts())

1    258539
0    247853
Name: COSEXE, dtype: int64


In [43]:
print('0-17 ::', data[(0 <= data['QTAGCL']) & (data['QTAGCL'] < 18)]['COSEXE'].count())
print('18-39 ::', data[(18 <= data['QTAGCL']) & (data['QTAGCL'] < 40)]['COSEXE'].count())
print('40-64 ::', data[(40 <= data['QTAGCL']) & (data['QTAGCL'] < 65)]['COSEXE'].count())
print('65-~ ::', data[65 <= data['QTAGCL']]['COSEXE'].count())

0-17 :: 98172
18-39 :: 135327
40-64 :: 173403
65-~ :: 99490


In [47]:
# lol = ['COSEXE', 'QTAGCL', 'CTSCPI']
print(data.groupby('ECO')['COSEXE'].value_counts())

ECO    COSEXE
False  1         174093
       0         172272
True   1          84446
       0          75581
Name: COSEXE, dtype: int64


In [37]:
print('0-17 ::\n', data[(0 <= data['QTAGCL']) & (data['QTAGCL'] < 18)].groupby('ECO')['QTAGCL'].count(), '\n')
print('18-39 ::\n', data[(18 <= data['QTAGCL']) & (data['QTAGCL'] < 40)].groupby('ECO')['QTAGCL'].count(), '\n')
print('40-64 ::\n', data[(40 <= data['QTAGCL']) & (data['QTAGCL'] < 65)].groupby('ECO')['QTAGCL'].count(), '\n')
print('65-~ ::\n', data[65 <= data['QTAGCL']].groupby('ECO')['QTAGCL'].count(), '\n')

0-17 ::
 ECO
False    97505
True       667
Name: QTAGCL, dtype: int64 

18-39 ::
 ECO
False    100192
True      35135
Name: QTAGCL, dtype: int64 

40-64 ::
 ECO
False    98117
True     75286
Name: QTAGCL, dtype: int64 

65-~ ::
 ECO
False    50551
True     48939
Name: QTAGCL, dtype: int64 



## Save it

In [45]:
ninja = ctr[ctr['COPRO'].isin(eco)][['COMAX', 'COPRO']]
ninja = ninja[ninja['COMAX'].isin(data[data['ECO'] == True]['COMAX'])]

In [62]:
one = set(ninja[ninja['COPRO'].isin(credit)]['COMAX'].values)
two = set(ninja[ninja['COPRO'].isin(depot)]['COMAX'].values)
three = set(ninja[ninja['COPRO'].isin(comm)]['COMAX'].values)