In [1]:
# Load modules for data manipulation
from dotenv import load_dotenv
import seaborn as sns
import matplotlib.pyplot as plt
from subprocess import call
import pandas as pd
import numpy as np
import time
import os

In [147]:
# Load modules for machine learning
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Calculate the accurancy of the model
from sklearn.metrics import accuracy_score, classification_report

# For drawing the graph
from sklearn.tree import export_graphviz

# will be used for tree visualization
from dtreeviz.trees import dtreeviz

In [29]:
# Declare the list of products for each family
# Les familles sont: CREDITS, DEPOTS MONETAIRES, PRODUITS EXTERNES COMMERCIALISES
credit = ['00568', '00943', '00942', '00546', '00547', '03992', '00940', '00941', '00548', '06458', '00509', '00565']
depot = ['07648', '07649', '07606', '00003']
comm = ['05808', '05807']

In [3]:
def plot_feature_importance(importance,names,model_type):
    #Create arrays from feature importance and feature name logistic regressions
    feature_importance = np.array(importance)
    feature_names = np.array(names)

    #Create a DataFrame using a Dictionary
    data={'feature_names':feature_names,'feature_importance':feature_importance}
    fi_df = pd.DataFrame(data)

    #Sort the DataFrame in order decreasing feature importance
    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)

    #Define size of bar plot
    plt.figure(figsize=(10,8))
    #Plot Searborn bar chart
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    #Add chart labels
    plt.title(model_type + '-FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    
    plt.savefig('{}.svg'.format(model_type))

In [86]:
def random_forest(data, ecos, cols):   
    ''' The Model'''
    
    # Labels are the values we want to predict
    labels = np.array(data['ECO'])

    # Remove the labels from the features
    features = np.array(data[cols])

    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2)

    # Instantiate model with 1000 decision trees
    rf = RandomForestRegressor(n_estimators = 300)

    # Train the model on training data
    rf.fit(train_features, train_labels)

    # Use the forest's predict method on the test data
    predictions = rf.predict(test_features)

    # Classification report
    print(classification_report(test_labels, predictions.round()))
    
    # Get the best variables
    shit = sorted(zip(cols, rf.feature_importances_), key=lambda x: x[1], reverse=True)
    for i in shit:
        print(i)

In [79]:
def log_reg(data, ecos, cols):
    ''' The Model'''
    
    # Labels are the values we want to predict
    labels = np.array(data['ECO'])

    # Remove the labels from the features
    features = np.array(data[cols])

    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state=42)

    # Instantiate model with 1000 decision trees
    logisticRegr = LogisticRegression()
    
    # Train the model on training data
    logisticRegr.fit(train_features, train_labels)
    
    # Use the forest's predict method on the test data
    predictions = logisticRegr.predict(test_features)

    # Classification report
    print(classification_report(test_labels, predictions.round()))

In [5]:
def knn(data, ecos, cols):
    ''' The Model'''

    # Labels are the values we want to predict
    labels = np.array(data['ECO'])

    # Remove the labels from the features
    features = np.array(data[cols])

    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state=42)

    # Instantiate model with 1000 decision trees
    model = KNeighborsClassifier(n_neighbors=5)
    
    # Train the model on training data
    model.fit(train_features, train_labels)
    
    # Use the forest's predict method on the test data
    predictions = model.predict(test_features)

    # Classification report
    print(classification_report(test_labels, predictions.round()))

In [17]:
# Get the data from the other notebook
%store -r data
%store -r cols
%store -r eco
%store -r ctr

# -------------------------------------------------------------------------------------------------------------

In [7]:
knn(data, eco, cols)

              precision    recall  f1-score   support

       False       0.81      0.86      0.84     69151
        True       0.65      0.58      0.61     32128

    accuracy                           0.77    101279
   macro avg       0.73      0.72      0.72    101279
weighted avg       0.76      0.77      0.77    101279



In [105]:
''' The Model'''

# Labels are the values we want to predict
labels = np.array(data['ECO'])

# Remove the labels from the features
features = np.array(data[cols])

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2)

In [106]:
# Instantiate model with 300 decision trees
rf = RandomForestRegressor(n_estimators = 100)

st = time.time()

# Train the model on training data
rf.fit(train_features, train_labels)

fn = time.time()
print(fn - st)

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

print(time.time() - fn)

# Classification report
print(classification_report(test_labels, predictions.round()))

748.1507966518402
4.549225330352783
              precision    recall  f1-score   support

       False       0.88      0.91      0.90     69265
        True       0.79      0.73      0.76     32014

    accuracy                           0.86    101279
   macro avg       0.84      0.82      0.83    101279
weighted avg       0.85      0.86      0.85    101279



In [73]:
shit = sorted(zip(cols, rf.feature_importances_), key=lambda x: x[1], reverse=True)

In [74]:
for i in shit[:10]:
    print(i[0])

SGMIM
CEBPF5
MTEEML
QTAGCL
MTECSL
LON
LAT
MTRSMO
MCTOTA
MTELEP


# ----------------------------------------------------------------------------------------------------------------------

# CREDIT

In [97]:
# Copy the data
temp = data.copy()

# drop the ECO, to generate the new one
temp = temp.drop(columns=['ECO'])

# get the ids of the products
ids = ctr[ctr['COPRO'].isin(credit)]['COMAX'].values
temp['ECO'] = temp['COMAX'].isin(ids)

In [98]:
random_forest(temp, credit, cols)

              precision    recall  f1-score   support

       False       0.99      1.00      0.99    100109
        True       0.47      0.06      0.11      1170

    accuracy                           0.99    101279
   macro avg       0.73      0.53      0.55    101279
weighted avg       0.98      0.99      0.98    101279

('MTCDIM', 0.10479987212900707)
('QCCRIM', 0.04374009975577079)
('LAT', 0.0429505400390894)
('LON', 0.04107389691017614)
('QTAGCL', 0.03847015163331383)
('MTRECD', 0.03492262537081712)
('MTEEML', 0.034146770492524514)
('MCTOTA', 0.032795507703260444)
('MTRSMO', 0.02923518112755668)
('MTRETT', 0.0279355348061196)
('MCTOTE', 0.027846462541866353)
('MTPATR', 0.027421111387807427)
('MTECSL', 0.025275713127085632)
('MTECIM', 0.02325049466187304)
('CTSCPI', 0.022045092253425207)
('QCCOCY', 0.021676655638697025)
('CTCOPO', 0.020443102190693868)
('MTEASV', 0.017947792654111165)
('MTESOC', 0.017711770277922353)
('CTSC91', 0.017620894054295327)
('MTRSFI', 0.01758399029892446

In [99]:
knn(temp, credit, cols)

              precision    recall  f1-score   support

       False       0.99      1.00      0.99    100078
        True       0.37      0.07      0.11      1201

    accuracy                           0.99    101279
   macro avg       0.68      0.53      0.55    101279
weighted avg       0.98      0.99      0.98    101279



# DEPOTS MONETAIRES

In [100]:
# Copy the data
temp = data.copy()

# drop the ECO, to generate the new one
temp = temp.drop(columns=['ECO'])

# get the ids of the products
ids = ctr[ctr['COPRO'].isin(depot)]['COMAX'].values
temp['ECO'] = temp['COMAX'].isin(ids)

In [101]:
random_forest(temp, depot, cols)

              precision    recall  f1-score   support

       False       0.89      0.92      0.90     69923
        True       0.80      0.74      0.77     31356

    accuracy                           0.86    101279
   macro avg       0.84      0.83      0.83    101279
weighted avg       0.86      0.86      0.86    101279

('SGMIM', 0.1725416468852227)
('CEBPF5', 0.12638723715605574)
('MTEEML', 0.08852332558514217)
('QTAGCL', 0.06351368195984274)
('MTECSL', 0.03825161526827269)
('LON', 0.034166722976683425)
('LAT', 0.03394054034085259)
('MTRSMO', 0.03330451866912964)
('MTELEP', 0.028525812890119327)
('MCTOTA', 0.028428317989679135)
('MTRSFI', 0.02394788469101665)
('CTSCPI', 0.020293946546187956)
('CTCOPO', 0.020183383326299396)
('QCCOCY', 0.01539253172668944)
('CTMENB', 0.013686607634675222)
('CTSC91', 0.01222715724599037)
('MCTOTE', 0.01188829406759302)
('MTESOC', 0.010511451603358247)
('CTSC90', 0.010484090665415178)
('CTSIFA', 0.01028719589927404)
('MTPATR', 0.009939555083450884)


In [102]:
knn(temp, depot, cols)

              precision    recall  f1-score   support

       False       0.82      0.86      0.84     69686
        True       0.65      0.57      0.61     31593

    accuracy                           0.77    101279
   macro avg       0.73      0.72      0.72    101279
weighted avg       0.76      0.77      0.77    101279



# Test The theory of Audrey

In [60]:
# Copy the data
temp = data.copy()

# Delete age below then 18
temp = temp[temp['QTAGCL'] >= 18]

In [61]:
random_forest(temp, eco, cols)

              precision    recall  f1-score   support

       False       0.84      0.88      0.86     49683
        True       0.79      0.75      0.77     31961

    accuracy                           0.82     81644
   macro avg       0.82      0.81      0.81     81644
weighted avg       0.82      0.82      0.82     81644



In [83]:
knn(temp, eco, cols)

              precision    recall  f1-score   support

       False       0.75      0.80      0.77     49619
        True       0.65      0.58      0.61     32025

    accuracy                           0.72     81644
   macro avg       0.70      0.69      0.69     81644
weighted avg       0.71      0.72      0.71     81644



# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

In [84]:
knn(data, eco, ['SGMIM', 'CEBPF5', 'MTEEML', 'QTAGCL', 'MTECSL', 'LON', 'LAT', 'MTRSMO', 'MCTOTA', 'MTELEP'])

              precision    recall  f1-score   support

       False       0.83      0.87      0.85     69151
        True       0.70      0.63      0.66     32128

    accuracy                           0.79    101279
   macro avg       0.76      0.75      0.76    101279
weighted avg       0.79      0.79      0.79    101279



In [142]:
''' The Model'''

# Labels are the values we want to predict
labels = np.array(data['ECO'])

# Remove the labels from the features
features = np.array(data[['SGMIM', 'CEBPF5', 'MTEEML', 'QTAGCL', 'MTECSL', 'LON', 'LAT', 'MTRSMO', 'MCTOTA', 'MTELEP']])

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2)

In [154]:
# Instantiate model with 300 decision trees
rf = RandomForestRegressor(n_estimators = 100, max_depth=5)

st = time.time()

# Train the model on training data
rf.fit(train_features, train_labels)

fn = time.time()
print(fn - st)

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

print(time.time() - fn)

# Classification report
print(classification_report(test_labels, predictions.round()))

61.757065534591675
0.48086094856262207
              precision    recall  f1-score   support

       False       0.88      0.85      0.86     69077
        True       0.69      0.75      0.72     32202

    accuracy                           0.81    101279
   macro avg       0.79      0.80      0.79    101279
weighted avg       0.82      0.81      0.82    101279



In [156]:
export_graphviz(rf.estimators_[0], out_file='tree.dot', feature_names = ['SGMIM', 'CEBPF5', 'MTEEML', 'QTAGCL', 'MTECSL', 'LON', 'LAT', 'MTRSMO', 'MCTOTA', 'MTELEP'])