In [1]:
# Load modules for data manipulation
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import os

In [38]:
# Load modules for machine learning
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Calculate the accurancy of the model
from sklearn.metrics import accuracy_score, classification_report

In [3]:
load_dotenv()

True

In [4]:
eco = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'eco-products.csv', encoding='ISO-8859-1', sep='\t')
clt = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'TJ38.csv', encoding='ISO-8859-1', sep='\t')
ctr = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'TJ7S.csv', encoding='ISO-8859-1', sep='\t', low_memory=False)

## Get only the physical clients

In [5]:
# theses are the only columns that matters
ctr = ctr[['COCO', 'COMAX', 'COPRO' ]]

In [6]:
ctr = ctr[ctr['COMAX'].isin(clt['COMAX'])]

In [7]:
ctr.shape

(32773662, 3)

## Get the best ecological product

In [8]:
eco = [str(i).zfill(5) for i in eco['COPRO']]

In [9]:
## ECO: 00003 | 242568 - LIVRET DEVELOPPEMENT DURABLE ET SOLIDAIRE
## DEPOTS MONETAIRES - EPARGNE MONETAIRE LIQUIDE - LIVRETS REGLEMENTES
ctr[ctr['COPRO'].isin(eco)]['COPRO'].value_counts()

00003    242568
00546      5141
00548      4785
07649      2724
00940      2166
00509      1876
00547      1027
00941       388
03992       101
00565        83
06458        75
Name: COPRO, dtype: int64

## Concatinate with other tables TJ39 - TJDR - TJER

In [10]:
tj39 = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tj39.csv', encoding='ISO-8859-1', sep=';')
tjdr = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tjdr.csv', encoding='ISO-8859-1', sep=';')
tjer = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tjer.csv', encoding='ISO-8859-1', sep=';')

In [11]:
tj39 = tj39[['COMAX', 'MSMENC', 'MTPATR', 'MTVAOP']]
tjdr = tjdr[['COMAX', 'MTRVIM', 'MTRVFR', 'QTPAFI', 'COHAVI']]
tjer = tjer[['COMAX', 'MTAPJE', 'MTCPJE', 'MTPJE']]

In [12]:
tj39['COMAX'] = [val[:10] for val in tj39['COMAX'].values]

In [13]:
# pour le cas de la table tj39, la variable COMAX elle est de 13 caracteres
# or celle des autres tables sont de 10, donc il fallait convertir de 13 a 10
# on prend les 10 premiers caractères du COMAX de tj39, d'apres le data engineer
# de la PBS.
clt[clt['COMAX'] == '1ce15b6b35']

Unnamed: 0,COMAX,AGE,COSEXE,CTSCPI
565671,1ce15b6b35,59,1,2300


In [14]:
# This table is only good fot the moral clients only
data = pd.merge(clt,tj39,on='COMAX',how='left')
data = pd.merge(data,tjer,on='COMAX',how='left')
data = pd.merge(data,tjdr,on='COMAX',how='left')

In [15]:
data = data.dropna()

In [16]:
data = data.drop_duplicates(subset=['COMAX'])

In [17]:
data.shape

(2998, 14)

## les avec le 00003

In [18]:
ids = ctr[ctr['COPRO'] == '00003']['COMAX'].values

In [19]:
data['ECO'] = data['COMAX'].isin(ids)

In [20]:
data.head()

Unnamed: 0,COMAX,AGE,COSEXE,CTSCPI,MSMENC,MTPATR,MTVAOP,MTAPJE,MTCPJE,MTPJE,MTRVIM,MTRVFR,QTPAFI,COHAVI,ECO
169,07a5ef7d08,42,1,3400,0.0,131000.0,131.0,0.0,0.0,2500.0,1.0,0.0,0.0,1.0,False
974,bf8d8de347,45,0,6300,0.0,99000.0,99.0,0.0,0.0,0.0,0.0,29134.0,3.0,1.0,False
2718,76d59b11a9,60,0,1100,0.0,400000.0,400.0,0.0,0.0,81832.88,0.0,0.0,0.0,1.0,True
2747,5115550eeb,27,1,1100,0.0,0.0,0.0,0.0,0.0,6000.0,0.0,5954.0,1.0,1.0,True
3034,28ffeda2ef,44,1,4200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23776.0,2.0,1.0,False


## Model

In [26]:
# Labels are the values we want to predict
labels = np.array(data['ECO'])

# Remove the labels from the features
features = np.array(data[data.columns[1:-1]])

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state = 42)

In [48]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(train_features, train_labels)

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Classification report
print(classification_report(test_labels, predictions.round()))

              precision    recall  f1-score   support

       False       0.39      0.23      0.29       175
        True       0.73      0.85      0.79       425

    accuracy                           0.67       600
   macro avg       0.56      0.54      0.54       600
weighted avg       0.63      0.67      0.64       600



### This time without deleting the nan values

In [61]:
# This table is only good fot the moral clients only
data = pd.merge(clt,tj39,on='COMAX',how='left')
data = pd.merge(data,tjer,on='COMAX',how='left')
data = pd.merge(data,tjdr,on='COMAX',how='left')

In [62]:
data = data.drop_duplicates(subset=['COMAX'])

In [64]:
data.fillna(data.mean(), inplace=True)

In [65]:
data.shape

(656954, 14)

In [67]:
data['MTPATR']

0          0.000000e+00
3          8.700000e+04
7          3.039181e+07
8          3.039181e+07
9          3.039181e+07
               ...     
2039737    3.039181e+07
2039738    3.039181e+07
2039739    3.039181e+07
2039740    3.039181e+07
2039741    1.300000e+05
Name: MTPATR, Length: 656954, dtype: float64

In [68]:
data['ECO'] = data['COMAX'].isin(ids)

In [69]:
data.head(2)

Unnamed: 0,COMAX,AGE,COSEXE,CTSCPI,MSMENC,MTPATR,MTVAOP,MTAPJE,MTCPJE,MTPJE,MTRVIM,MTRVFR,QTPAFI,COHAVI,ECO
0,6e3a2b9fa1,55,1,4600,0.0,0.0,0.0,787.452407,0.0,24581.208182,246.968502,9004.050139,0.923987,1.162566,False
3,1b44a67f61,41,0,4700,0.0,87000.0,87.0,787.452407,0.0,24581.208182,246.968502,9004.050139,0.923987,1.162566,False


In [70]:
# Labels are the values we want to predict
labels = np.array(data['ECO'])

# Remove the labels from the features
features = np.array(data[data.columns[1:-1]])

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state = 42)

In [71]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(train_features, train_labels)

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Classification report
print(classification_report(test_labels, predictions.round()))

              precision    recall  f1-score   support

       False       0.75      0.90      0.82     93330
        True       0.52      0.27      0.36     38061

    accuracy                           0.72    131391
   macro avg       0.64      0.59      0.59    131391
weighted avg       0.69      0.72      0.69    131391

