In [1]:
# Load modules for data manipulation
from dotenv import load_dotenv
import seaborn as sns
import matplotlib.pyplot as plt
from subprocess import call
import pandas as pd
import numpy as np
import time
import os

In [2]:
# Load modules for machine learning
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Calculate the accurancy of the model
from sklearn.metrics import accuracy_score, classification_report

# For drawing the graph
from sklearn.tree import export_graphviz

In [29]:
# Declare the list of products for each family
# Les familles sont: CREDITS, DEPOTS MONETAIRES, PRODUITS EXTERNES COMMERCIALISES
credit = ['00568', '00943', '00942', '00546', '00547', '03992', '00940', '00941', '00548', '06458', '00509', '00565']
depot = ['07648', '07649', '07606', '00003']
comm = ['05808', '05807']

In [3]:
def plot_feature_importance(importance,names,model_type):
    #Create arrays from feature importance and feature name logistic regressions
    feature_importance = np.array(importance)
    feature_names = np.array(names)

    #Create a DataFrame using a Dictionary
    data={'feature_names':feature_names,'feature_importance':feature_importance}
    fi_df = pd.DataFrame(data)

    #Sort the DataFrame in order decreasing feature importance
    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)

    #Define size of bar plot
    plt.figure(figsize=(10,8))
    #Plot Searborn bar chart
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    #Add chart labels
    plt.title(model_type + '-FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    
    plt.savefig('{}.svg'.format(model_type))

In [32]:
def random_forest(data, ecos, cols):   
    ''' The Model'''
    
    # Labels are the values we want to predict
    labels = np.array(data['ECO'])

    # Remove the labels from the features
    features = np.array(data[cols])

    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2)

    # Instantiate model with 1000 decision trees
    rf = RandomForestRegressor(n_estimators = 300)

    # Train the model on training data
    rf.fit(train_features, train_labels)

    # Use the forest's predict method on the test data
    predictions = rf.predict(test_features)

    # Classification report
    print(classification_report(test_labels, predictions.round()))  

In [4]:
def log_reg(data, ecos, cols):
    ''' The Model'''
    
    # Labels are the values we want to predict
    labels = np.array(data['ECO'])

    # Remove the labels from the features
    features = np.array(data[cols])

    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state=42)

    # Instantiate model with 1000 decision trees
    logisticRegr = LogisticRegression()
    
    # Train the model on training data
    logisticRegr.fit(train_features, train_labels)
    
    # Use the forest's predict method on the test data
    predictions = logisticRegr.predict(test_features)

    # Classification report
    print(classification_report(test_labels, predictions.round()))

In [5]:
def knn(data, ecos, cols):
    ''' The Model'''

    # Labels are the values we want to predict
    labels = np.array(data['ECO'])

    # Remove the labels from the features
    features = np.array(data[cols])

    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state=42)

    # Instantiate model with 1000 decision trees
    model = KNeighborsClassifier(n_neighbors=5)
    
    # Train the model on training data
    model.fit(train_features, train_labels)
    
    # Use the forest's predict method on the test data
    predictions = model.predict(test_features)

    # Classification report
    print(classification_report(test_labels, predictions.round()))

In [17]:
# Get the data from the other notebook
%store -r data
%store -r cols
%store -r eco
%store -r ctr

# -------------------------------------------------------------------------------------------------------------

In [7]:
knn(data, eco, cols)

              precision    recall  f1-score   support

       False       0.81      0.86      0.84     69151
        True       0.65      0.58      0.61     32128

    accuracy                           0.77    101279
   macro avg       0.73      0.72      0.72    101279
weighted avg       0.76      0.77      0.77    101279



In [63]:
''' The Model'''

# Labels are the values we want to predict
labels = np.array(data['ECO'])

# Remove the labels from the features
features = np.array(data[cols])

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2)

In [64]:
# Instantiate model with 300 decision trees
rf = RandomForestRegressor(n_estimators = 300)

st = time.time()

# Train the model on training data
rf.fit(train_features, train_labels)

fn = time.time()
print(fn - st)

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

print(time.time() - fn)

# Classification report
print(classification_report(test_labels, predictions.round()))

2692.503211736679
13.138068914413452
              precision    recall  f1-score   support

       False       0.88      0.91      0.90     69304
        True       0.79      0.74      0.76     31975

    accuracy                           0.86    101279
   macro avg       0.84      0.82      0.83    101279
weighted avg       0.85      0.86      0.85    101279



In [73]:
shit = sorted(zip(cols, rf.feature_importances_), key=lambda x: x[1], reverse=True)

In [74]:
for i in shit[:10]:
    print(i[0])

SGMIM
CEBPF5
MTEEML
QTAGCL
MTECSL
LON
LAT
MTRSMO
MCTOTA
MTELEP


# ----------------------------------------------------------------------------------------------------------------------

In [59]:
temp = data.copy()

# CREDIT

In [42]:
# drop the ECO, to generate the new one
temp = temp.drop(columns=['ECO'])

# get the ids of the products
ids = ctr[ctr['COPRO'].isin(credit)]['COMAX'].values
temp['ECO'] = temp['COMAX'].isin(ids)

In [36]:
random_forest(temp, credit, cols)

              precision    recall  f1-score   support

       False       0.99      1.00      0.99    100130
        True       0.60      0.09      0.16      1149

    accuracy                           0.99    101279
   macro avg       0.80      0.54      0.58    101279
weighted avg       0.99      0.99      0.98    101279



In [43]:
knn(temp, credit, cols)

              precision    recall  f1-score   support

       False       0.99      1.00      0.99    100078
        True       0.37      0.07      0.11      1201

    accuracy                           0.99    101279
   macro avg       0.68      0.53      0.55    101279
weighted avg       0.98      0.99      0.98    101279



# DEPOTS MONETAIRES

In [37]:
# drop the ECO, to generate the new one
temp = temp.drop(columns=['ECO'])

# get the ids of the products
ids = ctr[ctr['COPRO'].isin(depot)]['COMAX'].values
temp['ECO'] = temp['COMAX'].isin(ids)

In [39]:
random_forest(temp, depot, cols)

              precision    recall  f1-score   support

       False       0.89      0.92      0.90     69772
        True       0.80      0.74      0.77     31507

    accuracy                           0.86    101279
   macro avg       0.84      0.83      0.83    101279
weighted avg       0.86      0.86      0.86    101279



In [41]:
knn(temp, depot, cols)

              precision    recall  f1-score   support

       False       0.82      0.86      0.84     69686
        True       0.65      0.57      0.61     31593

    accuracy                           0.77    101279
   macro avg       0.73      0.72      0.72    101279
weighted avg       0.76      0.77      0.77    101279



# Test The theory of Audrey

In [60]:
temp = temp[temp['QTAGCL'] >= 18]

In [61]:
random_forest(temp, eco, cols)

              precision    recall  f1-score   support

       False       0.84      0.88      0.86     49683
        True       0.79      0.75      0.77     31961

    accuracy                           0.82     81644
   macro avg       0.82      0.81      0.81     81644
weighted avg       0.82      0.82      0.82     81644



# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

In [76]:
''' The Model'''

# Labels are the values we want to predict
labels = np.array(data['ECO'])

# Remove the labels from the features
features = np.array(data[['SGMIM', 'CEBPF5', 'MTEEML', 'QTAGCL', 'MTECSL', 'LON', 'LAT', 'MTRSMO', 'MCTOTA', 'MTELEP']])

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2)

In [77]:
# Instantiate model with 300 decision trees
rf = RandomForestRegressor(n_estimators = 300)

st = time.time()

# Train the model on training data
rf.fit(train_features, train_labels)

fn = time.time()
print(fn - st)

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

print(time.time() - fn)

# Classification report
print(classification_report(test_labels, predictions.round()))

714.0915184020996
10.427082300186157
              precision    recall  f1-score   support

       False       0.88      0.90      0.89     69387
        True       0.77      0.73      0.75     31892

    accuracy                           0.85    101279
   macro avg       0.82      0.81      0.82    101279
weighted avg       0.84      0.85      0.84    101279

