In [10]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, recall_score, precision_score
import matplotlib.pyplot as plt

In [5]:
# Load the work-order table from the CSV file; the relative path is resolved
# against the notebook's current working directory.
worker_order_df = pd.read_csv(
    'cleanworkorderoutliers.csv',
)

### Columns to encode
1. 'AffectedProduction'
2. 'GrossProductionLoss'
3. 'Duration'
4. 'DaysFromLastFailure_int'

In [6]:
# Columns whose distributions are inspected before they are binned.
summary_columns = ['AffectedProduction', 'GrossProductionLoss', 'Duration', 'DaysFromLastFailure_int']
# Quantile levels to report; the upper tail is sampled densely to expose extreme values.
quantile_levels = [0.25, 0.3, 0.5, 0.8, 0.9, 0.95, 0.99, 0.995, 0.999, 0.9999]

selected = worker_order_df[summary_columns]

# One row per column, one column per quantile level, followed by max / mean / min.
outliers = selected.quantile(quantile_levels).T
outliers['max'] = selected.max()
outliers['mean'] = selected.mean()
outliers['min'] = selected.min()

print(outliers)

                         0.25   0.3    0.5    0.8     0.9    0.95    0.99  \
AffectedProduction        0.0   0.0    0.0    0.0     4.0    14.0    75.0   
GrossProductionLoss       0.0   0.0    0.0    0.0     0.0     0.0   100.0   
Duration                  1.0   1.0    1.0    2.0     7.0     7.0    93.0   
DaysFromLastFailure_int  24.0  36.0  127.0  597.0  1171.0  1936.0  3565.0   

                           0.995       0.999       0.9999        max  \
AffectedProduction        150.00     940.968    7000.0000  318455302   
GrossProductionLoss       300.00  594062.484  595199.2968     595336   
Duration                   93.00     259.484     621.6484     687389   
DaysFromLastFailure_int  3916.42    4216.000    4328.0000       5081   

                                mean  min  
AffectedProduction        671.439087    0  
GrossProductionLoss      1436.399883    0  
Duration                    9.072405    1  
DaysFromLastFailure_int   410.142711    1  


In [7]:
# Discretise the four heavy-tailed numeric columns into ordinal bins whose edges
# follow the quantile summary printed earlier in the notebook.
#
# Work on a copy: binning replaces the numeric columns, so mutating the loaded
# frame in place would make this cell (and the quantile summary) fail on re-run.
clean_worker_order_df = worker_order_df.copy()

# right=False makes every interval half-open: [left, right) -- the left edge is
# included and the right edge is excluded. The last edge is therefore +inf
# rather than the observed maximum; otherwise the row(s) holding the maximum
# would fall outside every bin and be turned into NaN.
AP_bins = [0.0, 4.0, 14.0, 75.0, 150.0, 940.0, 7000.0, float('inf')]
GPL_bins = [0.0, 100.0, 300.0, float('inf')]
D_bins = [1.0, 2.0, 7.0, 93.0, 259.0, 621.0, float('inf')]
DLF_bins = [0.0, 1.0, 24.0, 36.0, 127.0, 597.0, 1171.0, 1936.0, 3916.0, float('inf')]

bin_edges_by_column = {
    'AffectedProduction': AP_bins,
    'GrossProductionLoss': GPL_bins,
    'Duration': D_bins,
    'DaysFromLastFailure_int': DLF_bins,
}

for column, edges in bin_edges_by_column.items():
    # Labels are the integer bin indices 0 .. len(edges) - 2 (one per interval).
    clean_worker_order_df[column] = pd.cut(
        clean_worker_order_df[column],
        edges,
        labels=range(len(edges) - 1),
        right=False,
    )


In [8]:
# Shuffle the binned frame reproducibly and keep the first 10,000 rows as the
# modelling sample.
job_df_predict = shuffle(clean_worker_order_df, random_state=0).iloc[:10000]

# Names of the four target columns; each is predicted from all remaining columns.
y = ["Duration", "AffectedProduction", "GrossProductionLoss", "DaysFromLastFailure_int"]

# One feature frame per target: every column except the target itself.
Xs = [job_df_predict.loc[:, job_df_predict.columns != target] for target in y]
X_wo_D, X_wo_AP, X_wo_GPL, X_wo_DLF = Xs

# 80/20 train/test split per target, same seed for each. Every entry is
# [X_train, X_test, y_train, y_test], in the same order as `y`.
splits = [
    train_test_split(features, job_df_predict[target], test_size=0.2, random_state=42)
    for features, target in zip(Xs, y)
]

# Expose the individual pieces under their per-target names as well.
(
    (X_D_train, X_D_test, y_D_train, y_D_test),
    (X_AP_train, X_AP_test, y_AP_train, y_AP_test),
    (X_GPL_train, X_GPL_test, y_GPL_train, y_GPL_test),
    (X_DLF_train, X_DLF_test, y_DLF_train, y_DLF_test),
) = splits




In [18]:
# Candidate classifiers, keyed by the name printed in the training log.
# Every estimator here is a classifier (the targets are binned class labels),
# so the display names say "Classifier" rather than "Regressor".
models = {
    "Logistic Regression": LogisticRegression(max_iter=20000, random_state=42),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "Support Vector Classifier": SVC(),
    "K-Nearest Neighbors Classifier": KNeighborsClassifier(),
}

# `y` holds the target names and `splits` the matching
# [X_train, X_test, y_train, y_test] lists, in the same order.
for target, (X_train, X_test, y_train, y_test) in zip(y, splits):
    print("Predicting: ", target)
    print("")
    for name, model in models.items():
        print("Training", name)
        # fit() re-trains from scratch, so reusing the estimator across targets is safe.
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        # Macro averaging weights every class equally. Micro averaging in a
        # single-label multiclass problem is identical to accuracy, which made
        # the three printed numbers redundant. zero_division=0 scores classes
        # that are never predicted as 0 instead of emitting a warning.
        recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
        precision = precision_score(y_test, y_pred, average='macro', zero_division=0)

        # Use the classes actually present in the truth or the predictions so
        # the plot's tick labels are the real bin labels, not positional indices.
        labels = sorted(set(y_test) | set(y_pred))
        cm = confusion_matrix(y_test, y_pred, labels=labels)

        print("Accuracy:", accuracy)
        print("Recall:", recall)
        print("Precision:", precision)

        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
        disp.plot()
        # Title each figure so it can be read without the surrounding log lines.
        disp.ax_.set_title(f"{name}: {target}")
        plt.show()

    print("")
    print("")
    print("")

Predicting:  Duration

Training Logistic Regression


KeyboardInterrupt: 