# Product Backorders

## Package Import and Configuration

In [0]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, KFold

In [0]:
# Display all columns in a DataFrame.
pd.set_option('display.max_columns', None)

## Data Import and Preprocessing

In [0]:
# Import Dataset with Product Backorders.
data = pd.read_csv('https://github.com/saschaschworm/dsb/blob/master/Data%20Sets/Demos%20and%20Exercises/backorders/backorder_sample.csv?raw=true')

### Data Preview

In [4]:
data.head()

Unnamed: 0,national_inv,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,sales_9_month,min_bank,potential_issue,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop,went_on_backorder
0,0.0,0.0,3.0,4.0,6.0,1.0,1.0,3.0,3.0,0.0,0.0,0.0,0.17,0.33,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.63,0.72,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,605.0,0.0,0.0,0.0,0.0,0.0,0.0,80.0,90.0,0.0,0.0,0.0,0.94,0.95,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,18.0,0.0,37.0,72.0,117.0,8.0,30.0,31.0,31.0,10.0,0.0,0.0,0.83,0.86,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,12.0,18.0,18.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.94,0.95,0.0,0.0,0.0,1.0,1.0,0.0,1.0


### Data Preprocessing

There is not data preprocessing step in this example. The only thing we do here is to transform the features to a matrix $X$ and the target variable to a vector $y$.

In [0]:
X, y = data[data.columns[0:20]].values, data['went_on_backorder'].values

## Modelling and Evaluation

In [0]:
# Set seed for "deterministic randomness".
np.random.seed(1909)

# Initialize 10-Fold-Cross-Validation.
k_fold = KFold(n_splits=10)

# Initialize Random Forest Model.
random_forest = RandomForestClassifier(n_estimators=10, random_state=1909, n_jobs=-1)

# Empty lists for persisting the performances measures calculated in each iteration. 
accuracies = []
precisions = []
recalls = []
f1s = []

for train_idx, test_idx in k_fold.split(X):
    # Split dataset into a train and test set.
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Run the Random Forest on the current training set.
    random_forest.fit(X_train, y_train)

    # Perform the Prediction on the Test-Set-Features.
    y_pred = random_forest.predict(X_test)

    # Calculate the performance measures on the current test set.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Append to performances measures to lists.
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

### Result

In [7]:
random_forest

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=1909, verbose=0,
            warm_start=False)

In [8]:
print(f'Average accuracy: {np.mean(accuracies) * 100:.2f}%')
print(f'Average precision: {np.mean(precisions) * 100:.2f}%')
print(f'Average recall: {np.mean(recalls) * 100:.2f}%')
print(f'Average F1: {np.mean(f1s) * 100:.2f}%')

Average accuracy: 90.37%
Average precision: 89.67%
Average recall: 91.26%
Average F1: 90.45%


## Grid Search Cross Validation

In [9]:
# Initialize a Hyperperameter Grid with various Settings around the default Hyperparameter Settings.
param_grid = {'n_estimators': [5, 15, 20], 'min_samples_leaf': [1, 2, 3], 'min_samples_split': [2, 3, 4]}

# Create an empty Random Forest Model to be used during Grid Search Cross Validation.
gscv_random_forest = RandomForestClassifier()

# Initialize a 10-Fold Grid Search Cross Validation.
grid_search = GridSearchCV(estimator=gscv_random_forest, param_grid=param_grid, cv=10, n_jobs=-1, verbose=1)

# Perform the Grid Search.
grid_search.fit(X, y)

Fitting 10 folds for each of 27 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:  1.5min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [5, 15, 20], 'min_samples_leaf': [1, 2, 3], 'min_samples_split': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

### Result

In [14]:
# Display the best Hyperparameter Set.
grid_search.best_params_

{'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 20}

In [0]:
# Retrieve the best estimator from all estimators generated during Grid Search CSV.
best_estimator = grid_search.best_estimator_

In [11]:
best_estimator

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### 10-Fold-Cross-Validation using the Best Estimator

In [0]:
# Set seed for "deterministic randomness".
np.random.seed(1909)

# Initialize 10-Fold-Cross-Validation.
k_fold = KFold(n_splits=10)

# Empty lists for persisting the performances measures calculated in each iteration. 
accuracies = []
precisions = []
recalls = []
f1s = []

for train_idx, test_idx in k_fold.split(X):
    # Split dataset into a train and test set.
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Run the Random Forest with the Best Estimator on the current training set.
    best_estimator.fit(X_train, y_train)

    # Perform the Prediction on the Test-Set-Features.
    y_pred = random_forest.predict(X_test)

    # Calculate the performance measures on the current test set.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Append to performances measures to lists.
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

In [13]:
print(f'Average accuracy: {np.mean(accuracies) * 100:.2f}%')
print(f'Average precision: {np.mean(precisions) * 100:.2f}%')
print(f'Average recall: {np.mean(recalls) * 100:.2f}%')
print(f'Average F1: {np.mean(f1s) * 100:.2f}%')

Average accuracy: 98.31%
Average precision: 98.45%
Average recall: 98.17%
Average F1: 98.31%
