In [1]:
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import os

In [2]:
load_dotenv()

True

In [12]:
clt = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'clients.csv', encoding='ISO-8859-1', sep='\t')
ctr = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tj7s.csv', encoding='ISO-8859-1', sep=';', low_memory=False)

In [13]:
# theses are the only columns that matters
ctr = ctr[['COCO', 'COMAX', 'COPRO' ]]

In [14]:
ctr = ctr[ctr['COMAX'].isin(clt['COMAX'])]

In [21]:
ctr.shape

(32773662, 3)

In [61]:
# ECO: CODEVAIR
ids = list(ctr[ctr['COPRO'] == '07649']['COMAX'].values)

# NON-ECO: Direct & Proche Référencement
ids.extend(list(ctr[ctr['COPRO'] == '02716']['COMAX'].values))

In [63]:
clt = clt[clt['COMAX'].isin(ids)]

In [65]:
clt.head()

Unnamed: 0,COMAX,AGE,COSEXE,CTSCPI,CESITC,DDVALE,DFVALE,COPOST
764,b0ce1a9aae,63,1,7300,1,2018-07-02,9999-01-01,34990
1521,141c7a4de2,59,1,3400,1,2018-10-22,9999-01-01,66120
1613,03834c8de8,83,0,7600,1,2015-12-09,9999-01-01,11000
2144,98846a5d47,79,0,7200,1,2013-10-10,9999-01-01,11100
2304,9ac220a4f2,87,0,7100,1,2015-11-23,9999-01-01,66140


In [79]:
clt['ECO'] = clt['COMAX'].isin(ctr[ctr['COPRO'] == '07649']['COMAX'].values)

In [80]:
clt.head()

Unnamed: 0,COMAX,AGE,COSEXE,CTSCPI,CESITC,DDVALE,DFVALE,COPOST,ECO
764,b0ce1a9aae,63,1,7300,1,2018-07-02,9999-01-01,34990,True
1521,141c7a4de2,59,1,3400,1,2018-10-22,9999-01-01,66120,True
1613,03834c8de8,83,0,7600,1,2015-12-09,9999-01-01,11000,True
2144,98846a5d47,79,0,7200,1,2013-10-10,9999-01-01,11100,True
2304,9ac220a4f2,87,0,7100,1,2015-11-23,9999-01-01,66140,True


In [82]:
eco = clt[clt['ECO'] == True]

In [84]:
eco['AGE'].describe()

count    2009.000000
mean       59.423096
std        16.027652
min        19.000000
25%        48.000000
50%        60.000000
75%        70.000000
max       101.000000
Name: AGE, dtype: float64

In [85]:
eco['CTSCPI'].unique()

array([7300, 3400, 7600, 7200, 7100, 5500, 8500, 6300, 8100, 2300, 5400,
       3100, 5200, 8400, 3300, 3800, 5300, 8600, 4600, 6400, 2100, 4300,
       3700, 6500, 4200, 4700, 3500, 4800, 1100, 6200, 7500, 5600, 6900,
       2200, 4500, 1200, 6700, 6800, 7800])

In [86]:
insee = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tj10.csv', encoding='ISO-8859-1', sep=';', low_memory=False)

In [90]:
insee
for val in eco['CTSCPI'].unique():
    print(val, insee[insee['CTSCPI'] == val]['LISCPI'].values[0])

7300 ANCIENS CADRES PROFESSIONS INTER
3400 PROFESSEURS PROF SCIENTIFIQUES  
7600 ANCIENS EMPLOYES ET OUVRIERS    
7200 ANCIENS ARTIS COMMERC CHEF ENTRE
7100 ANCIENS AGRICULTEURS EXPLOITANTS
5500 EMPLOYES DE COMMERCE            
8500 INACTIFS DIVERS SAUF RET - 60ANS
6300 OUVRIER QUALIFIE DE L'ARTISANAT 
8100 CHOMEURS                        
2300 CHEFS D'ENTREPRISES             
5400 EMPLOYES ADMI.ENTREPRISE PRIVEE 
3100 PROFESSIONS LIBERALES           
5200 EMPL CIVILS AGENTS FONCT PUBLIQ 
8400 ELEVES ETUDIANTS                
3300 CADRES DE LA FONCTION PUBLIQUE  
3800 INGENIEURS CADRES TECHNI ENTREP 
5300 POLIC.MILIT.GEND.POMPIER GARDE  
8600 INACTIFS DIVERS SAUF RET +60ANS 
4600 PROFESSIONS ADM COMMERCIAL PRIVE
6400 CHAUFFEURS CONDUCTEURS ROUTIERS 
2100 ARTISANS                        
4300 PROFESSIONS SANTE TRAVAIL SOCIAL
3700 CADRES ADMN COMMERC ENTREPRISES 
6500 OUV QUAL MANUT MAGASIN TRANSPORT
4200 INSTITUTEURS ET ASSIMILES       
4700 TECHNICIENS                     
3500 PROFESS

In [91]:
eco['COSEXE'].value_counts(normalize=True)

0    0.509209
1    0.490791
Name: COSEXE, dtype: float64

In [95]:
eco['COPOST'].value_counts(normalize=True)

11100    0.035839
66000    0.034345
30000    0.031857
34000    0.031857
30900    0.027875
           ...   
7330     0.000498
11420    0.000498
97435    0.000498
97434    0.000498
30720    0.000498
Name: COPOST, Length: 412, dtype: float64

# Model

In [96]:
# Labels are the values we want to predict
labels = np.array(clt['ECO'])

In [99]:
# Remove the labels from the features
features = np.array(clt[['AGE', 'COSEXE', 'CTSCPI', 'CESITC', 'COPOST']])

In [100]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

In [101]:
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state = 42)

In [102]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

In [103]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(train_features, train_labels);

In [105]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 0.0 degrees.


In [106]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 99.67 %.
