In [95]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score





In [96]:
worker_order_df = pd.read_csv('cleanworkorderoutliers.csv')


### Columns to encode
1. 'AffectedProduction'
2. ,'GrossProductionLoss',
3. 'Duration',
4. 'DaysFromLastFailure_int

In [97]:
outliers = worker_order_df[['AffectedProduction','GrossProductionLoss','Duration', 'DaysFromLastFailure_int']].quantile([0.25,0.3 ,0.5,0.8, 0.9, 0.95, 0.99, 0.995, 0.999, 0.9999]).transpose()
outliers['max'] = worker_order_df[['AffectedProduction','GrossProductionLoss','Duration','DaysFromLastFailure_int' ]].max()
outliers['mean'] = worker_order_df[['AffectedProduction','GrossProductionLoss','Duration', 'DaysFromLastFailure_int']].mean()
outliers['min'] = worker_order_df[['AffectedProduction','GrossProductionLoss','Duration', 'DaysFromLastFailure_int']].min()

print(outliers)

                         0.25   0.3    0.5    0.8     0.9    0.95    0.99  \
AffectedProduction        0.0   0.0    0.0    0.0     4.0    14.0    75.0   
GrossProductionLoss       0.0   0.0    0.0    0.0     0.0     0.0   100.0   
Duration                  1.0   1.0    1.0    2.0     7.0     7.0    93.0   
DaysFromLastFailure_int  24.0  36.0  127.0  597.0  1171.0  1936.0  3565.0   

                           0.995       0.999       0.9999        max  \
AffectedProduction        150.00     940.968    7000.0000  318455302   
GrossProductionLoss       300.00  594062.484  595199.2968     595336   
Duration                   93.00     259.484     621.6484     687389   
DaysFromLastFailure_int  3916.42    4216.000    4328.0000       5081   

                                mean  min  
AffectedProduction        671.439087    0  
GrossProductionLoss      1436.399883    0  
Duration                    9.072405    1  
DaysFromLastFailure_int   410.142711    1  


In [98]:
clean_worker_order_df = worker_order_df
AP_bins = [0.0,4.0, 14.0, 75.0, 150.0, 940.0, 7000.0, 318455302]
# Right = False -> Inclusive on both sides
clean_worker_order_df['AffectedProduction'] = pd.cut(clean_worker_order_df['AffectedProduction'], AP_bins, labels=range(len(AP_bins)-1), right=False)

GPL_bins = [0.0, 100.0, 300.0, 595336 ]
clean_worker_order_df['GrossProductionLoss'] = pd.cut(clean_worker_order_df['GrossProductionLoss'], GPL_bins, labels=range(len(GPL_bins)-1), right=False)

D_bins = [1.0, 2.0, 7.0, 93.0, 259.0, 621.0, 687389]
clean_worker_order_df['Duration'] = pd.cut(clean_worker_order_df['Duration'], D_bins, labels=range(len(D_bins)-1), right=False)

DLF_bins = [0.0 ,1.0, 24.0 , 36.0  ,127.0,  597.0 , 1171.0,  1936.0, 3916.0, 5081]
clean_worker_order_df['DaysFromLastFailure_int'] = pd.cut(clean_worker_order_df['DaysFromLastFailure_int'], DLF_bins, labels=range(len(DLF_bins)-1), right=False)


In [99]:
print(clean_worker_order_df.isna().sum())

SupervisorRole                0
TradeGroup                    0
IsAffectingProduction         0
AffectedProduction            1
GrossProductionLoss           1
Duration                      1
Safety                        0
WOType                        0
Reopened                      0
StatusCode                    0
EquipmentType                 0
FailureCount                  0
DayOfYear                     0
equipment_model_id_encoded    0
DaysFromLastFailure_int       3
dtype: int64


In [100]:
job_df_predict = clean_worker_order_df
job_df_predict = shuffle(job_df_predict, random_state=0)
job_df_predict = job_df_predict[0:10000]

X_wo_D = job_df_predict.loc[:, job_df_predict.columns != "Duration"]
X_wo_AP = job_df_predict.loc[:, job_df_predict.columns != "AffectedProduction"]
X_wo_GPL = job_df_predict.loc[:, job_df_predict.columns != "GrossProductionLoss"]
X_wo_DLF = job_df_predict.loc[:, job_df_predict.columns != "DaysFromLastFailure_int"]

Xs = [X_wo_D, X_wo_AP, X_wo_GPL, X_wo_DLF]
y = ["Duration","AffectedProduction","GrossProductionLoss","DaysFromLastFailure_int"]

X_D_train, X_D_test, y_D_train, y_D_test = train_test_split(Xs[0], job_df_predict['Duration'], test_size=0.2, random_state=42)
X_AP_train, X_AP_test, y_AP_train, y_AP_test = train_test_split(Xs[1], job_df_predict['AffectedProduction'], test_size=0.2, random_state=42)
X_GPL_train, X_GPL_test, y_GPL_train, y_GPL_test = train_test_split(Xs[2], job_df_predict['GrossProductionLoss'], test_size=0.2, random_state=42)
X_DLF_train, X_DLF_test, y_DLF_train, y_DLF_test = train_test_split(Xs[3], job_df_predict['DaysFromLastFailure_int'], test_size=0.2, random_state=42)

splits = [[X_D_train, X_D_test, y_D_train, y_D_test], [X_AP_train, X_AP_test, y_AP_train, y_AP_test], [X_GPL_train, X_GPL_test, y_GPL_train, y_GPL_test], [X_DLF_train, X_DLF_test, y_DLF_train, y_DLF_test]]



In [101]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree Regressor": DecisionTreeClassifier(),
    "Random Forest Regressor": RandomForestClassifier(),
    "Support Vector Regressor": SVC(),
    "K-Nearest Neighbors Regressor": KNeighborsClassifier(),
}

i = 0
for split in splits:
    print("Predicting: ", y[i])
    print("")
    for name, model in models.items():
        print("Training", name)
        model.fit(split[0], split[2])

        y_pred = model.predict(split[1])
        accuracy = accuracy_score(split[3], y_pred)


        cm = confusion_matrix(split[3], y_pred)

    # print the confusion matrix and the model's name
        print("Confusion Matrix for {model}:")
        print(cm)
        print("Accuracy:", accuracy)
    i+=1
    print("")
    print("")
    print("")

Predicting:  Duration

Training Logistic Regression
Confusion Matrix for {model}:
[[1539   12    0    0    0]
 [ 209   12    0    0    0]
 [ 203    0    0    0    0]
 [  23    0    0    0    0]
 [   2    0    0    0    0]]
Accuracy: 0.7755
Training Decision Tree Regressor


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Confusion Matrix for {model}:
[[1318   86  141    4    2]
 [ 108   89   21    3    0]
 [ 139   17   43    4    0]
 [   7    2    2   12    0]
 [   2    0    0    0    0]]
Accuracy: 0.731
Training Random Forest Regressor
Confusion Matrix for {model}:
[[1500   36   14    1    0]
 [ 131   83    7    0    0]
 [ 181    7   14    1    0]
 [   4    1    0   18    0]
 [   2    0    0    0    0]]
Accuracy: 0.8075
Training Support Vector Regressor
Confusion Matrix for {model}:
[[1551    0    0    0    0]
 [ 221    0    0    0    0]
 [ 203    0    0    0    0]
 [  23    0    0    0    0]
 [   2    0    0    0    0]]
Accuracy: 0.7755
Training K-Nearest Neighbors Regressor
Confusion Matrix for {model}:
[[1466   41   37    7    0]
 [ 134   81    5    1    0]
 [ 187    7    9    0    0]
 [  16    1    0    6    0]
 [   2    0    0    0    0]]
Accuracy: 0.781



Predicting:  AffectedProduction

Training Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Confusion Matrix for {model}:
[[1794    0    0    0    0    0    0]
 [  96    0    0    0    0    0    0]
 [  90    0    0    0    0    0    0]
 [  11    0    0    0    0    0    0]
 [   7    0    0    0    0    0    0]
 [   1    0    0    0    0    0    0]
 [   1    0    0    0    0    0    0]]
Accuracy: 0.897
Training Decision Tree Regressor
Confusion Matrix for {model}:
[[1764   17   13    0    0    0    0]
 [  25   35   31    2    3    0    0]
 [  19   31   35    2    3    0    0]
 [   3    2    5    0    1    0    0]
 [   0    3    3    0    1    0    0]
 [   0    0    1    0    0    0    0]
 [   0    0    0    0    1    0    0]]
Accuracy: 0.9175
Training Random Forest Regressor
Confusion Matrix for {model}:
[[1761   23   10    0    0    0    0]
 [  19   51   26    0    0    0    0]
 [  13   37   39    1    0    0    0]
 [   2    2    7    0    0    0    0]
 [   0    0    7    0    0    0    0]
 [   0    0    1    0    0    0    0]
 [   0    1    0    0    0    0    0]]
Accuracy: 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Confusion Matrix for {model}:
[[1976    0    1]
 [  17    0    0]
 [   6    0    0]]
Accuracy: 0.988
Training Support Vector Regressor
Confusion Matrix for {model}:
[[1977    0    0]
 [  17    0    0]
 [   6    0    0]]
Accuracy: 0.9885
Training K-Nearest Neighbors Regressor
Confusion Matrix for {model}:
[[1977    0    0]
 [  17    0    0]
 [   6    0    0]]
Accuracy: 0.9885



Predicting:  DaysFromLastFailure_int

Training Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Confusion Matrix for {model}:
[[272   0   0 226   0   0   0   0]
 [ 32   0   0  61   0   0   0   0]
 [ 98   0   0 343   0   0   0   0]
 [ 73   0   0 518   0   0   0   0]
 [ 10   0   0 183   0   0   0   0]
 [  5   0   0  82   0   0   0   0]
 [  3   0   0  90   0   0   0   0]
 [  0   0   0   4   0   0   0   0]]
Accuracy: 0.395
Training Decision Tree Regressor
Confusion Matrix for {model}:
[[260  23  98  83  17  13   4   0]
 [ 24   6  23  26   8   4   2   0]
 [103  17 112 145  37  20   6   1]
 [107  23 117 224  71  31  16   2]
 [ 15   8  27  70  51  13   9   0]
 [ 11   2   7  28  18  14   5   2]
 [  5   1  12  21  20   7  27   0]
 [  1   0   0   0   0   0   1   2]]
Accuracy: 0.348
Training Random Forest Regressor
Confusion Matrix for {model}:
[[292   5  79 108   4   3   7   0]
 [ 27   0  29  27   7   1   2   0]
 [ 87   9 131 178  23   9   3   1]
 [ 81   8  87 346  44  16   8   1]
 [ 18   1  20 103  37   8   5   1]
 [  4   0   8  43  13   9   9   1]
 [  6   0   6  36  13   8  24   0]
 [  1