In [2]:
# Load modules for data manipulation
from dotenv import load_dotenv
import seaborn as sns
import matplotlib.pyplot as plt
from subprocess import call
import pandas as pd
import numpy as np
import time
import os

In [3]:
# Load modules for machine learning
# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Calculate the accuracy of the model
from sklearn.metrics import accuracy_score, classification_report

# For drawing the graph
from sklearn.tree import export_graphviz

# will be used for tree visualization
from dtreeviz.trees import dtreeviz

In [4]:
# Load variables from a local .env file into the process environment
# (python-dotenv). NOTE(review): no cell visible in this excerpt reads them.
load_dotenv()

True

## Algorithms

In [14]:
def random_forest(data, ecos, cols, var='ECO', importance=False, random_state=42):
    '''Fit a random forest on `cols` and print a hold-out classification report.

    The data is split 80/20 into train and test sets, a 300-tree
    RandomForestRegressor is fit on the training part, and its test-set
    predictions are rounded before being scored with classification_report.

    Parameters
    ----------
    data : mapping-like indexed by column name (presumably a pandas
        DataFrame -- TODO confirm against the notebook that stores it).
    ecos : unused; kept so existing calls keep working.
    cols : list of feature column names selected from `data`.
    var : name of the target column in `data`.
    importance : when True, also print (column, importance) pairs sorted
        from most to least important.
    random_state : seed used for both the train/test split and the forest so
        that repeated runs print the same report. Pass None to get the
        previous unseeded behaviour.

    Returns
    -------
    None. Everything is printed.
    '''

    # Labels are the values we want to predict
    labels = np.array(data[var])

    # Feature matrix built from the selected columns
    features = np.array(data[cols])

    # Hold out 20% of the rows for evaluation; seeded for reproducibility
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.2, random_state=random_state)

    # Instantiate the model with 300 trees
    # NOTE(review): a regressor is used and its output rounded below; consider
    # RandomForestClassifier if the target is always categorical.
    rf = RandomForestRegressor(n_estimators=300, random_state=random_state)

    # Train the model on training data
    rf.fit(train_features, train_labels)

    # Use the forest's predict method on the test data
    predictions = rf.predict(test_features)

    # Round the continuous predictions to the nearest integer so they can be
    # compared against the labels in the classification report
    print(classification_report(test_labels, predictions.round()))

    # Print the features ranked by importance, highest first
    if importance:
        ranked = sorted(zip(cols, rf.feature_importances_), key=lambda x: x[1], reverse=True)
        for pair in ranked:
            print(pair)

In [6]:
def knn(data, ecos, cols, var='ECO'):
    '''Fit a 3-nearest-neighbours classifier and print a hold-out report.

    `data[var]` supplies the target and `data[cols]` the features. The rows
    are split 80/20 with a fixed seed (random_state=42), the classifier is
    fit on the training part, and a classification report for the test part
    is printed. `ecos` is accepted but not used. Returns None.
    '''
    # Target first, then the feature matrix
    y = np.array(data[var])
    X = np.array(data[cols])

    # Seeded 80/20 train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # k-NN classifier with k = 3, fit on the training rows
    classifier = KNeighborsClassifier(n_neighbors=3)
    classifier.fit(X_train, y_train)

    # Predict the held-out rows; predictions are rounded before scoring
    y_pred = classifier.predict(X_test)
    report = classification_report(y_test, y_pred.round())
    print(report)

## Execute algorithms

In [7]:
# Restore variables that another notebook persisted with `%store`.
# On a fresh kernel that notebook must have been run (and stored) first.
# `data` and `columns` feed the models below; `eco`, `credit` and `depot` are
# only passed as the `ecos` argument, which neither model function uses.
%store -r data
%store -r columns
%store -r eco
%store -r credit
%store -r depot

## For all products

In [12]:
# k-NN report for the default 'ECO' target column
knn(data, eco, columns)

              precision    recall  f1-score   support

       False       0.83      0.87      0.85     84797
        True       0.59      0.52      0.55     31359

    accuracy                           0.77    116156
   macro avg       0.71      0.69      0.70    116156
weighted avg       0.77      0.77      0.77    116156



In [15]:
# Random-forest report for the default 'ECO' target, followed by the
# feature importances sorted from highest to lowest
random_forest(data, eco, columns, importance=True)

              precision    recall  f1-score   support

       False       0.90      0.91      0.91     84955
        True       0.75      0.73      0.74     31201

    accuracy                           0.86    116156
   macro avg       0.83      0.82      0.82    116156
weighted avg       0.86      0.86      0.86    116156

('SGMIM', 0.16917231175993028)
('CEBPF5', 0.12512523790614685)
('MTEEML', 0.09187939701435839)
('QTAGCL', 0.05291748459191832)
('MTECSL', 0.03706872733157467)
('MTRSMO', 0.03427950544940391)
('LON', 0.03367482470088518)
('MTELEP', 0.03301773248881539)
('LAT', 0.03283548040552195)
('MCTOTA', 0.029386878424200187)
('MTRSFI', 0.02136942921970259)
('CTSCPI', 0.020211347776412845)
('CTCOPO', 0.019513192444741347)
('QCCOCY', 0.016160794954376924)
('CTMENB', 0.01283147434196038)
('CTSC91', 0.012499018963000399)
('MCTOTE', 0.012312043747551762)
('CTSC90', 0.011309297925802774)
('MTESOC', 0.01063140838197331)
('CTSIFA', 0.010610416145568013)
('MTPATR', 0.010117555259462863)

## For credit products

In [13]:
# k-NN report for the 'ECOC' target column
knn(data, credit, columns, var='ECOC')

              precision    recall  f1-score   support

       False       0.99      1.00      1.00    115385
        True       0.18      0.04      0.07       771

    accuracy                           0.99    116156
   macro avg       0.58      0.52      0.53    116156
weighted avg       0.99      0.99      0.99    116156



In [16]:
# Random-forest report for the 'ECOC' target, followed by the feature
# importances sorted from highest to lowest
random_forest(data, credit, columns, var='ECOC', importance=True)

              precision    recall  f1-score   support

       False       0.99      1.00      1.00    115360
        True       0.59      0.06      0.11       796

    accuracy                           0.99    116156
   macro avg       0.79      0.53      0.55    116156
weighted avg       0.99      0.99      0.99    116156

('MTCDIM', 0.09061565172076949)
('MTRECD', 0.04815603229593861)
('QCCRIM', 0.04445154450295985)
('LAT', 0.03864027098980306)
('LON', 0.03837847156206433)
('QTAGCL', 0.03825659459347242)
('MTECIM', 0.034663223586472156)
('MTEEML', 0.034310036821454365)
('MCTOTE', 0.030973848481371354)
('MCTOTA', 0.030575619719754994)
('MTRETT', 0.028696332114649958)
('MTECSL', 0.02792459394245652)
('MTRSMO', 0.027550238432676833)
('MTPATR', 0.02244380182991973)
('QCCOCY', 0.02200250256355965)
('CTSCPI', 0.02144985108333101)
('CTCOPO', 0.020124670983617043)
('MTECCS', 0.017969363708976258)
('CTSC91', 0.017213009849635662)
('MTEASV', 0.017022937248883898)
('MTEPEL', 0.0168403740758437

## For depot products

In [10]:
# k-NN report for the 'ECOD' target column
knn(data, depot, columns, var='ECOD')

              precision    recall  f1-score   support

       False       0.83      0.87      0.85     85184
        True       0.59      0.51      0.55     30972

    accuracy                           0.77    116156
   macro avg       0.71      0.69      0.70    116156
weighted avg       0.77      0.77      0.77    116156



In [17]:
# Random-forest report for the 'ECOD' target, followed by the feature
# importances sorted from highest to lowest
random_forest(data, depot, columns, var='ECOD', importance=True)

              precision    recall  f1-score   support

       False       0.90      0.91      0.91     85227
        True       0.75      0.73      0.74     30929

    accuracy                           0.86    116156
   macro avg       0.83      0.82      0.82    116156
weighted avg       0.86      0.86      0.86    116156

('SGMIM', 0.16695481613453478)
('CEBPF5', 0.12590181938977366)
('MTEEML', 0.09299477708601606)
('QTAGCL', 0.05310248517844704)
('MTECSL', 0.03865420332513071)
('MTRSMO', 0.03523654017254471)
('MTELEP', 0.03402926334977182)
('LON', 0.03353846221317383)
('LAT', 0.03288487141852338)
('MCTOTA', 0.029745418145315925)
('MTRSFI', 0.021108441244347874)
('CTSCPI', 0.020257304354977242)
('CTCOPO', 0.01964872125545484)
('QCCOCY', 0.016175514515697546)
('CTMENB', 0.012861546851780396)
('CTSC91', 0.012361461593669543)
('MCTOTE', 0.012246241439416078)
('CTSC90', 0.0111673811577332)
('MTESOC', 0.01068031911470799)
('CTSIFA', 0.01046427678237946)
('MTPATR', 0.009902522438013218)
(