# Predictions

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split

In [2]:
worker_order_df = pd.read_csv('cleanworkorder.csv')

In [3]:
job_df_predict = worker_order_df
job_df_predict = shuffle(job_df_predict, random_state=0)
job_df_predict = job_df_predict[0:10000]

X_wo_D = job_df_predict.loc[:, job_df_predict.columns != "Duration"]
X_wo_AP = job_df_predict.loc[:, job_df_predict.columns != "AffectedProduction"]
X_wo_GPL = job_df_predict.loc[:, job_df_predict.columns != "GrossProductionLoss"]
X_wo_DLF = job_df_predict.loc[:, job_df_predict.columns != "DaysFromLastFailure_int"]

Xs = [X_wo_D, X_wo_AP, X_wo_GPL, X_wo_DLF]
y = ["Duration","AffectedProduction","GrossProductionLoss","DaysFromLastFailure_int"]

for i in range(len(Xs)):
    # Normalization
    curr_norm_scaler = MinMaxScaler().fit(Xs[i])
    Xs[i] = curr_norm_scaler.transform(Xs[i])

    # Scaling
    curr_std_scaler = MinMaxScaler().fit(Xs[i])
    Xs[i] = curr_std_scaler.transform(Xs[i])


X_D_train, X_D_test, y_D_train, y_D_test = train_test_split(Xs[0], job_df_predict['Duration'], test_size=0.2, random_state=42)
X_AP_train, X_AP_test, y_AP_train, y_AP_test = train_test_split(Xs[1], job_df_predict['AffectedProduction'], test_size=0.2, random_state=42)
X_GPL_train, X_GPL_test, y_GPL_train, y_GPL_test = train_test_split(Xs[2], job_df_predict['GrossProductionLoss'], test_size=0.2, random_state=42)
X_DLF_train, X_DLF_test, y_DLF_train, y_DLF_test = train_test_split(Xs[3], job_df_predict['DaysFromLastFailure_int'], test_size=0.2, random_state=42)

splits = [[X_D_train, X_D_test, y_D_train, y_D_test], [X_AP_train, X_AP_test, y_AP_train, y_AP_test], [X_GPL_train, X_GPL_test, y_GPL_train, y_GPL_test], [X_DLF_train, X_DLF_test, y_DLF_train, y_DLF_test]]


In [39]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000, random_state=42),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "Support Vector Regressor": SVR(),
    "K-Nearest Neighbors Regressor": KNeighborsRegressor(),
}

i = 0
for split in splits:
    print("Predicting: ", y[i])
    print("")
    for name, model in models.items():
        print("Training", name)

        model.fit(split[0], split[2])

        y_pred = model.predict(split[1])
        
        mse = mean_squared_error(split[3], y_pred)
        mae = mean_absolute_error(split[3], y_pred)
        r2 = r2_score(split[3], y_pred)
        print("MSE:", mse)
        # MAE = 0.5 -> This means that on average, the predicted target values are off by 0.5 units from the true target values.
        print("MAE:", mae)
        print("R2:", r2)
        print("")
    i+=1
    print("")
    print("")
    print("")

Predicting:  Duration

Training Logistic Regression
MSE: 196.97
MAE: 2.737
R2: -0.03953565134671955

Training Decision Tree Regressor
MSE: 135.892
MAE: 2.962
R2: 0.2828117036461978

Training Random Forest Regressor
MSE: 106.05943994999998
MAE: 2.844455
R2: 0.4402570493481671

Training Support Vector Regressor
MSE: 195.73241546852296
MAE: 2.770161181589232
R2: -0.033004132628002836

Training K-Nearest Neighbors Regressor
MSE: 164.17816
MAE: 3.2456000000000005
R2: 0.1335276920723667




Predicting:  AffectedProduction

Training Logistic Regression
MSE: 1692.4675
MAE: 4.2865
R2: 0.017781514364049178

Training Decision Tree Regressor
MSE: 3066.124
MAE: 6.633
R2: -0.7794159545468637

Training Random Forest Regressor
MSE: 1677.09968585
MAE: 5.089615
R2: 0.026700179651239386

Training Support Vector Regressor
MSE: 1667.30130660712
MAE: 4.103675836875183
R2: 0.032386639935781614

Training K-Nearest Neighbors Regressor
MSE: 1775.7131599999998
MAE: 5.2902000000000005
R2: -0.030529857110419334




### Duration Hyper Parammeter Tuning

In [86]:
forest_param_grid = {
    'n_estimators': [5, 10, 50, 100, 250],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 10, 50, 100],
    'min_samples_split': [ 2, 5, 10],
    'min_samples_leaf': [0.01, 0.1, 1, 3, 5],
    'bootstrap': [True, False]
}

forest_model = RandomForestRegressor(random_state=42)


grid_search =RandomizedSearchCV(forest_model, forest_param_grid, cv=4,  scoring='neg_mean_squared_error', verbose=-1, n_jobs=-1, n_iter=500, random_state=92)


grid_search.fit(splits[0][0], splits[0][2])


print("Best hyperparameters: ", grid_search.best_params_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(splits[0][1])

mse = mean_squared_error(splits[0][3], y_pred)
mae = mean_absolute_error(splits[0][3], y_pred)
r2 = r2_score(splits[0][3], y_pred)

print("")

print("MSE:", mse)
print("MAE:", mae)
print("R2:", r2)



Best hyperparameters:  {'n_estimators': 250, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': False}

MSE: 111.71731769358604
MAE: 2.9210369714285718
R2: 0.4103968390348258


### Affected Production Hyper Parammeter Tuning

In [22]:
svr_param_grid = {
'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
'degree': [3, 5, 7, 10],
'C': [0.001, 0.01, 0.1, 1, 10],
'epsilon': [0.1, 0.01, 0.001],
'gamma': ['scale', 'auto']
}

support_vector_model = SVR()

grid_search =RandomizedSearchCV(support_vector_model, svr_param_grid, cv=2,  scoring='neg_mean_squared_error', verbose=1, n_jobs=-1, n_iter=20, random_state=92)



grid_search.fit(splits[1][0], splits[1][2])


print("Best hyperparameters: ", grid_search.best_params_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(splits[1][1])

mse = mean_squared_error(splits[1][3], y_pred)
mae = mean_absolute_error(splits[1][3], y_pred)
r2 = r2_score(splits[1][3], y_pred)

print("")

print("MSE:", mse)
print("MAE:", mae)
print("R2:", r2)

Fitting 2 folds for each of 20 candidates, totalling 40 fits
Best hyperparameters:  {'kernel': 'poly', 'gamma': 'scale', 'epsilon': 0.001, 'degree': 5, 'C': 0.1}

MSE: 1636.5282070856938
MAE: 4.008512273918287
R2: 0.05024571682219725


### Gross Production Loss Hyper Parammeter Tuning

In [16]:
forest_param_grid = {
    'n_estimators': [5, 10, 50, 100, 250],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 10, 50, 100],
    'min_samples_split': [ 2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
    'bootstrap': [True, False]
}

forest_model = RandomForestRegressor(random_state=42)


grid_search =RandomizedSearchCV(forest_model, forest_param_grid, cv=4,  scoring='neg_mean_squared_error', verbose=1, n_jobs=-1, n_iter=500, random_state=92)



grid_search.fit(splits[2][0], splits[2][2])


print("Best hyperparameters: ", grid_search.best_params_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(splits[2][1])

mse = mean_squared_error(splits[2][3], y_pred)
mae = mean_absolute_error(splits[2][3], y_pred)
r2 = r2_score(splits[2][3], y_pred)

print("")

print("MSE:", mse)
print("MAE:", mae)
print("R2:", r2)



Fitting 4 folds for each of 500 candidates, totalling 2000 fits
Best hyperparameters:  {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 1.0, 'max_depth': None, 'bootstrap': True}

MSE: 416.3729526744609
MAE: 4.673825409337738
R2: 0.15768865710979807


### Days From Last Failure Hyper Parammeter Tuning

In [19]:
forest_param_grid = {
    'n_estimators': [5, 10, 50, 100, 250],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 10, 50, 100],
    'min_samples_split': [ 2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
    'bootstrap': [True, False]
}

forest_model = RandomForestRegressor(random_state=42)


grid_search =RandomizedSearchCV(forest_model, forest_param_grid, cv=4,  scoring='neg_mean_squared_error', verbose=1, n_jobs=-1, n_iter=500, random_state=92)



grid_search.fit(splits[3][0], splits[3][2])


print("Best hyperparameters: ", grid_search.best_params_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(splits[3][1])

mse = mean_squared_error(splits[3][3], y_pred)
mae = mean_absolute_error(splits[3][3], y_pred)
r2 = r2_score(splits[3][3], y_pred)

print("")

print("MSE:", mse)
print("MAE:", mae)
print("R2:", r2)



Fitting 4 folds for each of 500 candidates, totalling 2000 fits
Best hyperparameters:  {'n_estimators': 250, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 100, 'bootstrap': True}

MSE: 270971.19649087446
MAE: 318.4206034014656
R2: 0.3515847307487474
