# Predictions

In [23]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, train_test_split

In [24]:
worker_order_df = pd.read_csv('workorderdata/cleanworkerorder.csv')

In [25]:
job_df_predict = worker_order_df
job_df_predict = shuffle(job_df_predict, random_state=0)
job_df_predict = job_df_predict[0:10000]

X_wo_D = job_df_predict.loc[:, job_df_predict.columns != "Duration"]
X_wo_AP = job_df_predict.loc[:, job_df_predict.columns != "AffectedProduction"]
X_wo_GPL = job_df_predict.loc[:, job_df_predict.columns != "GrossProductionLoss"]
X_wo_DLF = job_df_predict.loc[:, job_df_predict.columns != "DaysFromLastFailure_int"]

Xs = [X_wo_D, X_wo_AP, X_wo_GPL, X_wo_DLF]
y = ["Duration","AffectedProduction","GrossProductionLoss","DaysFromLastFailure_int"]

for i in range(len(Xs)):
    # Normalization
    curr_norm_scaler = MinMaxScaler().fit(Xs[i])
    Xs[i] = curr_norm_scaler.transform(Xs[i])

    # Scaling
    curr_std_scaler = MinMaxScaler().fit(Xs[i])
    Xs[i] = curr_std_scaler.transform(Xs[i])


X_D_train, X_D_test, y_D_train, y_D_test = train_test_split(Xs[0], job_df_predict['Duration'], test_size=0.2, random_state=42)
X_AP_train, X_AP_test, y_AP_train, y_AP_test = train_test_split(Xs[1], job_df_predict['AffectedProduction'], test_size=0.2, random_state=42)
X_GPL_train, X_GPL_test, y_GPL_train, y_GPL_test = train_test_split(Xs[2], job_df_predict['GrossProductionLoss'], test_size=0.2, random_state=42)
X_DLF_train, X_DLF_test, y_DLF_train, y_DLF_test = train_test_split(Xs[3], job_df_predict['DaysFromLastFailure_int'], test_size=0.2, random_state=42)

splits = [[X_D_train, X_D_test, y_D_train, y_D_test], [X_AP_train, X_AP_test, y_AP_train, y_AP_test], [X_GPL_train, X_GPL_test, y_GPL_train, y_GPL_test], [X_DLF_train, X_DLF_test, y_DLF_train, y_DLF_test]]


In [26]:
models = {
    "Linear Regression": LinearRegression(),
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "Random Forest Regressor 2": RandomForestRegressor(n_estimators=500, random_state=42), 
    "Support Vector Regressor": SVR(),
    "Support Vector Regressor 2": SVR(kernel= 'rbf', C=100, gamma=0.1, epsilon=0.1),
    "Support Vector Regressor 3": SVR(kernel='linear', C=1, epsilon=0.2, max_iter=10000),
    "Support Vector Regressor 4": SVR(kernel='poly', degree=3, C=10, epsilon=0.1),
    "K-Nearest Neighbors Regressor": KNeighborsRegressor(),
}
i = 0
for split in splits:
    print("Predicting: ", y[i])
    print("")
    for name, model in models.items():
        print("Training", name)

        model.fit(split[0], split[2])

        y_pred = model.predict(split[1])

        mse = mean_squared_error(split[3], y_pred)
        mae = mean_absolute_error(split[3], y_pred)
        r2 = r2_score(split[3], y_pred)
        print("MSE:", mse)
        # MAE = 0.5 -> This means that on average, the predicted target values are off by 0.5 units from the true target values.
        print("MAE:", mae)
        print("R2:", r2)
        print("")
    i+=1
    print("")
    print("")
    print("")

Predicting:  Duration

Training Linear Regression
MSE: 182.39945741274795
MAE: 4.481033578145158
R2: 0.037362345703156885

Training Logistic Regression
MSE: 196.978
MAE: 2.739
R2: -0.039577872422064786

Training Decision Tree Regressor
MSE: 200.368
MAE: 3.058
R2: -0.05746905309965733

Training Random Forest Regressor
MSE: 121.6217991
MAE: 2.9388400000000003
R2: 0.358124607070222

Training Random Forest Regressor 2
MSE: 119.673689968
MAE: 2.908152
R2: 0.3684060148756142

Training Support Vector Regressor
MSE: 194.939658354613
MAE: 2.75680514911276
R2: -0.02882025039838343

Training Support Vector Regressor 2
MSE: 195.82740241549192
MAE: 2.77791460448561
R2: -0.033505439008603144

Training Support Vector Regressor 3
MSE: 195.91473733920847
MAE: 2.847018486045495
R2: -0.033966360807917706

Training Support Vector Regressor 4
MSE: 195.8665378359016
MAE: 2.8053767302299275
R2: -0.03371198145032683

Training K-Nearest Neighbors Regressor
MSE: 167.07152000000002
MAE: 3.2798000000000003
R2: 0.

In [None]:
forest_param_grid = {
    'n_estimators': [100, 200],
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [ 30,  50],
    'min_samples_split': [ 5, 10],
    'min_samples_leaf': [2, 4],
    'bootstrap': [True, False]
}

forest_model = RandomForestRegressor()

i = 0
for split in splits:
    print("Predicting: ", y[i])
    print("")


    grid_search = GridSearchCV(forest_model, forest_param_grid, cv=5)


    grid_search.fit(split[0], split[2])


    print("Best hyperparameters: ", grid_search.best_params_)

    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(split[1])
    mse = mean_squared_error(split[3], y_pred)
    mae = mean_absolute_error(split[3], y_pred)
    r2 = r2_score(split[3], y_pred)
    # MAE = 0.5 -> This means that on average, the predicted target values are off by 0.5 units from the true target values.

    print("MSE:", mse)
    print("MAE:", mae)
    print("R2:", r2)
    print("")
    i+=1



Predicting:  Duration



In [None]:
# param_grid = {
#     'hidden_layer_sizes': [(50, 25), (100, 50), (100,)],
#     'activation': ['relu', 'logistic', 'tanh'],
#     'solver': ['adam', 'sgd'],
#     'alpha': [0.0001, 0.001, 0.01],
#     'batch_size': [32, 64, 128],
#     'learning_rate': ['constant', 'adaptive'],
#     'max_iter': [2500, 5000],
#     'early_stopping': [True, False]
# }

param_grid = {
    'hidden_layer_sizes': [(100, 50)],
    'activation': ['relu', 'logistic', 'tanh'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'batch_size': [128],
    'learning_rate': ['adaptive'],
    'max_iter': [5000, 10000],
    'early_stopping': [True]
}


model = MLPRegressor()


grid_search = GridSearchCV(model, param_grid, cv=5)


grid_search.fit(X_train, y_train)


print("Best hyperparameters: ", grid_search.best_params_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# MAE = 0.5 -> This means that on average, the predicted target values are off by 0.5 units from the true target values.

print("MSE:", mse)
print("MAE:", mae)
print("R2:", r2)

