---
title: "GIOZA Case Study: Production Analysis"
format:
  html:
    toc: true
    toc-title: Contents
    toc-depth: 4
    code-fold: show
    self-contained: true
jupyter: python3
---

# Loading Packages and Data

In [1]:
import math
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from plotnine import *

from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
factory_prod = pd.read_csv('data/production.csv')
# convert predictors into proper datatype
factory_prod['Confirmation'] = factory_prod['Confirmation'].astype(str)
factory_prod['Ordre'] = factory_prod['Ordre'].astype(str)
factory_prod['Poste de travail'] = factory_prod['Poste de travail'].astype(str)
factory_prod['Centre de coûts'] = factory_prod['Centre de coûts'].astype(str)
factory_prod['sku'] = factory_prod['sku'].astype(str)

factory_prod['at'] = pd.to_datetime(factory_prod['at'])
factory_prod = factory_prod.sort_values(by=['at'], ascending=True).reset_index(drop=True)

print( factory_prod.dtypes )

Confirmation                     object
Ordre                            object
Poste de travail                 object
Centre de coûts                  object
sku                              object
units                             int64
weight_kg                       float64
scrap                             int64
group                            object
at                  datetime64[ns, UTC]
dtype: object


# Units

In [3]:
data_unit = factory_prod.drop(['Confirmation', 'Ordre', 'at'], axis=1)
data_unit_num = data_unit.select_dtypes(include=['int64', 'float64']).drop(['units'], axis=1)
data_unit_cat = data_unit.select_dtypes(exclude=['int64', 'float64'])
n_train = math.floor(0.9 * len(data_unit))

# split into train / test
trainX_num = data_unit_num[0:n_train]
trainY = data_unit[['units']][0:n_train]

testX_num = data_unit_num[n_train:]
testY = data_unit[['units']][n_train:]

# apply normalization and one-hot encoding
scaler = MinMaxScaler()
trainX_num = scaler.fit_transform(trainX_num)
testX_num = scaler.transform(testX_num)

data_unit_cat = pd.get_dummies(data_unit_cat)

# get final train and test set
data_scaled = pd.DataFrame(
    np.concatenate((np.concatenate((trainX_num, testX_num), axis=0), data_unit_cat), axis=1),
    columns=list(data_unit_num.columns) + list(data_unit_cat.columns)
)

trainX = data_scaled[0:n_train]
testX = data_scaled[n_train:]

print(trainX.shape, trainY.shape)
print(testX.shape, testY.shape)

(8443, 534) (8443, 1)
(939, 534) (939, 1)


In [4]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [5]:
# fit decision tree model
model_tree = DecisionTreeRegressor(max_depth=10)
model_tree.fit(trainX, trainY)

# evaluate model with RMSE
train_preds_tree = model_tree.predict(trainX).reshape(-1,1)
print('Training Error:', math.sqrt( metrics.mean_squared_error( trainY, train_preds_tree ) ) )

test_preds_tree = model_tree.predict(testX).reshape(-1,1)
print('Testing Error:', math.sqrt( metrics.mean_squared_error( testY , test_preds_tree ) ) )

Training Error: 16423319.083621074
Testing Error: 15995566.473672068


In [6]:
model_tree_coefs = pd.DataFrame(
    model_tree.feature_importances_,
    index=list(trainX.columns), columns=['coef']
)
model_tree_coefs.sort_values(by=['coef'], key=abs, ascending=False)

Unnamed: 0,coef
weight_kg,0.46975
sku_1663717,0.1167535
group_4G130160,0.09400354
sku_762385,0.05804926
sku_762388,0.04459438
sku_1779728,0.0424952
sku_1099287,0.03534159
sku_1099285,0.02888065
sku_1689096,0.02853637
sku_1390940,0.02711134


In [7]:
# fit random forest model
model_forest = RandomForestRegressor(max_depth=10)
model_forest.fit(trainX, trainY)

# evaluate model with RMSE
train_preds_forest = model_forest.predict(trainX).reshape(-1,1)
print('Training Error:', math.sqrt( metrics.mean_squared_error( trainY, train_preds_tree ) ) )

test_preds_forest = model_forest.predict(testX).reshape(-1,1)
print('Testing Error:', math.sqrt( metrics.mean_squared_error( testY , test_preds_tree ) ) )

Training Error: 16423319.083621074
Testing Error: 15995566.473672068


In [8]:
model_forest_coefs = pd.DataFrame(
    model_forest.feature_importances_,
    index=list(trainX.columns), columns=['coef']
)
model_forest_coefs.sort_values(by=['coef'], key=abs, ascending=False)

Unnamed: 0,coef
weight_kg,0.4081432
sku_1663717,0.1156731
sku_1779728,0.05732722
group_4G130160,0.05079679
sku_762385,0.04698893
Poste de travail_40940,0.04364495
sku_762388,0.04050617
sku_1099285,0.02733238
sku_1099287,0.02443214
sku_1390940,0.02281073


# Weight

In [9]:
data_unit = factory_prod.drop(['Confirmation', 'Ordre', 'at'], axis=1)
data_unit_num = data_unit.select_dtypes(include=['int64', 'float64']).drop(['weight_kg'], axis=1)
data_unit_cat = data_unit.select_dtypes(exclude=['int64', 'float64'])
n_train = math.floor(0.9 * len(data_unit))

# split into train / test
trainX_num = data_unit_num[0:n_train]
trainY = data_unit[['weight_kg']][0:n_train]

testX_num = data_unit_num[n_train:]
testY = data_unit[['weight_kg']][n_train:]

# apply normalization and one-hot encoding
scaler = MinMaxScaler()
trainX_num = scaler.fit_transform(trainX_num)
testX_num = scaler.transform(testX_num)

data_unit_cat = pd.get_dummies(data_unit_cat)

# get final train and test set
data_scaled = pd.DataFrame(
    np.concatenate((np.concatenate((trainX_num, testX_num), axis=0), data_unit_cat), axis=1),
    columns=list(data_unit_num.columns) + list(data_unit_cat.columns)
)

trainX = data_scaled[0:n_train]
testX = data_scaled[n_train:]

print(trainX.shape, trainY.shape)
print(testX.shape, testY.shape)

(8443, 534) (8443, 1)
(939, 534) (939, 1)


In [10]:
# fit decision tree model
model_tree = DecisionTreeRegressor(max_depth=40)
model_tree.fit(trainX, trainY)

# evaluate model with RMSE
train_preds_tree = model_tree.predict(trainX).reshape(-1,1)
print('Training Error:', math.sqrt( metrics.mean_squared_error( trainY, train_preds_tree ) ) )

test_preds_tree = model_tree.predict(testX).reshape(-1,1)
print('Testing Error:', math.sqrt( metrics.mean_squared_error( testY , test_preds_tree ) ) )

Training Error: 83.0895632590248
Testing Error: 172.6312921178736


In [11]:
# fit random forest model
model_forest = RandomForestRegressor(max_depth=40)
model_forest.fit(trainX, trainY)

# evaluate model with RMSE
train_preds_forest = model_forest.predict(trainX).reshape(-1,1)
print('Training Error:', math.sqrt( metrics.mean_squared_error( trainY, train_preds_tree ) ) )

test_preds_forest = model_forest.predict(testX).reshape(-1,1)
print('Testing Error:', math.sqrt( metrics.mean_squared_error( testY , test_preds_tree ) ) )

Training Error: 83.0895632590248
Testing Error: 172.6312921178736


In [12]:
model_weight_coefs = pd.DataFrame(
    np.concatenate((model_tree.feature_importances_.reshape(-1,1), model_forest.feature_importances_.reshape(-1,1)), axis=1),
    index=list(trainX.columns), columns=['coef_tree', 'coef_forest']
)

model_weight_coefs.sort_values(by=['coef_forest'], key=abs, ascending=False)

Unnamed: 0,coef_tree,coef_forest
units,0.4938589,0.5274253
Poste de travail_40904,0.0,0.1446074
group_4G200100,0.3126822,0.1278273
sku_1633466,0.02981702,0.01988506
group_4G140400,0.02014445,0.01642238
group_4G140120,0.01669345,0.01575676
group_4G130410,0.01279142,0.01265414
group_4G130310,0.01208237,0.01112442
sku_1589487,0.01073258,0.007750892
Poste de travail_40443,0.0004243333,0.006021577


# Scrap

In [13]:
data_unit = factory_prod.drop(['Confirmation', 'Ordre', 'at'], axis=1)
data_unit_num = data_unit.select_dtypes(include=['int64', 'float64']).drop(['scrap'], axis=1)
data_unit_cat = data_unit.select_dtypes(exclude=['int64', 'float64'])
n_train = math.floor(0.9 * len(data_unit))

# split into train / test
trainX_num = data_unit_num[0:n_train]
trainY = data_unit[['scrap']][0:n_train]

testX_num = data_unit_num[n_train:]
testY = data_unit[['scrap']][n_train:]

# apply normalization and one-hot encoding
scaler = MinMaxScaler()
trainX_num = scaler.fit_transform(trainX_num)
testX_num = scaler.transform(testX_num)

data_unit_cat = pd.get_dummies(data_unit_cat)

# get final train and test set
data_scaled = pd.DataFrame(
    np.concatenate((np.concatenate((trainX_num, testX_num), axis=0), data_unit_cat), axis=1),
    columns=list(data_unit_num.columns) + list(data_unit_cat.columns)
)

trainX = data_scaled[0:n_train]
testX = data_scaled[n_train:]

print(trainX.shape, trainY.shape)
print(testX.shape, testY.shape)

(8443, 534) (8443, 1)
(939, 534) (939, 1)


In [14]:
# fit decision tree model
model_tree = DecisionTreeRegressor(max_depth=10)
model_tree.fit(trainX, trainY)

# evaluate model with RMSE
train_preds_tree = model_tree.predict(trainX).reshape(-1,1)
print('Training Error:', math.sqrt( metrics.mean_squared_error( trainY, train_preds_tree ) ) )

test_preds_tree = model_tree.predict(testX).reshape(-1,1)
print('Testing Error:', math.sqrt( metrics.mean_squared_error( testY , test_preds_tree ) ) )

Training Error: 1482.6301111050077
Testing Error: 6010.224338763769


In [15]:
# fit random forest model
model_forest = RandomForestRegressor(max_depth=10)
model_forest.fit(trainX, trainY)

# evaluate model with RMSE
train_preds_forest = model_forest.predict(trainX).reshape(-1,1)
print('Training Error:', math.sqrt( metrics.mean_squared_error( trainY, train_preds_tree ) ) )

test_preds_forest = model_forest.predict(testX).reshape(-1,1)
print('Testing Error:', math.sqrt( metrics.mean_squared_error( testY , test_preds_tree ) ) )

Training Error: 1482.6301111050077
Testing Error: 6010.224338763769


In [16]:
model_scrap_coefs = pd.DataFrame(
    np.concatenate((model_tree.feature_importances_.reshape(-1,1), model_forest.feature_importances_.reshape(-1,1)), axis=1),
    index=list(trainX.columns), columns=['coef_tree', 'coef_forest']
)

model_scrap_coefs.sort_values(by=['coef_forest'], key=abs, ascending=False)

Unnamed: 0,coef_tree,coef_forest
units,0.587793,0.4157451
weight_kg,0.297996,0.3298863
Centre de coûts_208006,0.010346,0.01111379
Poste de travail_40135,0.011138,0.01092105
sku_1397615,0.002364,0.009984234
sku_891546,0.010109,0.009597723
group_4G120410,0.0,0.009432141
sku_776288,0.004134,0.009322287
Poste de travail_40438,0.008688,0.008831412
group_4G120110,0.0,0.008733281
