In [1]:
#Load and Setup

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from scipy.stats import randint
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import plot_roc_curve

from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_csv('machine_event.csv', sep = ';')
df['date'] = pd.to_datetime(df.date, format="%m/%d/%Y")

In [4]:
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

In [5]:
X = df.drop(['event', 'machine', 'date'], axis=1)
y = df.event

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify = y)

In [7]:
model1 = sm.OLS(y, X)
result=model1.fit()
print(result.summary())

Select only features with p < 0.05, 1 by 1 by eliminating the highest p value

In [12]:
model2 = sm.OLS(y, X[['feature1', 'feature3', 'feature4', 'feature5', 'feature6', 'feature7', 'feature8', 'feature9']])
result2 = model2.fit()
print(result2.summary())

                                 OLS Regression Results                                
Dep. Variable:                  event   R-squared (uncentered):                   0.020
Model:                            OLS   Adj. R-squared (uncentered):              0.020
Method:                 Least Squares   F-statistic:                              357.2
Date:                Wed, 26 May 2021   Prob (F-statistic):                        0.00
Time:                        23:47:27   Log-Likelihood:                      2.6459e+05
No. Observations:              124494   AIC:                                 -5.292e+05
Df Residuals:                  124487   BIC:                                 -5.291e+05
Df Model:                           7                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [13]:
model3 = sm.OLS(y, X[['feature1', 'feature4', 'feature5', 'feature6', 'feature7', 'feature8', 'feature9']])
result3 = model3.fit()
print(result3.summary())

                                 OLS Regression Results                                
Dep. Variable:                  event   R-squared (uncentered):                   0.020
Model:                            OLS   Adj. R-squared (uncentered):              0.020
Method:                 Least Squares   F-statistic:                              416.4
Date:                Wed, 26 May 2021   Prob (F-statistic):                        0.00
Time:                        23:48:00   Log-Likelihood:                      2.6458e+05
No. Observations:              124494   AIC:                                 -5.292e+05
Df Residuals:                  124488   BIC:                                 -5.291e+05
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

Eliminated : feature 2 and 3

In [15]:
df2 = df[['feature1', 'feature4', 'feature5', 'feature6', 'feature7', 'feature8', 'feature9', 'event']]
df2

Unnamed: 0,feature1,feature4,feature5,feature6,feature7,feature8,feature9,event
0,0,215630672,6,0,52,56,0,0
1,3,61370680,6,0,0,0,0,0
2,0,173295968,12,0,0,0,0,0
3,0,79694024,6,0,0,0,0,0
4,0,135970480,15,0,0,0,0,0
...,...,...,...,...,...,...,...,...
124489,0,18310224,10,8,0,0,12,0
124490,107,172556680,11,0,4,96,0,0
124491,0,19029120,11,0,0,4832,0,0
124492,0,226953408,12,0,0,0,0,0


# Multicolinearity

In [16]:
df2.corr(method='pearson')

Unnamed: 0,feature1,feature4,feature5,feature6,feature7,feature8,feature9,event
feature1,1.0,0.003702,-0.006696,-0.001884,0.097452,-0.002617,-0.001884,-0.000948
feature4,0.003702,1.0,-0.00337,0.000151,0.001837,-0.004248,0.000151,0.001984
feature5,-0.006696,-0.00337,1.0,-0.009384,-0.009773,-0.013999,-0.009384,0.00227
feature6,-0.001884,0.000151,-0.009384,1.0,0.045631,0.141367,1.0,0.119055
feature7,0.097452,0.001837,-0.009773,0.045631,1.0,0.146593,0.045631,0.067398
feature8,-0.002617,-0.004248,-0.013999,0.141367,0.146593,1.0,0.141367,0.052902
feature9,-0.001884,0.000151,-0.009384,1.0,0.045631,0.141367,1.0,0.119055
event,-0.000948,0.001984,0.00227,0.119055,0.067398,0.052902,0.119055,1.0


Eliminate either features that correlated to each other. close to -1 or 1

safe.

In [17]:
X = df2.drop(['event'], axis=1)
y = df2.event

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify = y)

In [21]:
sscaler = StandardScaler()
X_train_sscaled = sscaler.fit_transform(X_train)
X_test_sscaled = sscaler.transform(X_test)

In [22]:
#instiate logreg model
logreg = LogisticRegression()

# fit logreg to the train set
logreg.fit(X_train_sscaled, y_train)
logregpred = logreg.predict(X_test_sscaled)
logregperf = logreg.score(X_test_sscaled, y_test)*100
logregperf

99.91003727027375

In [23]:
confusion_matrix(y_test, logregpred)

array([[31096,     1],
       [   27,     0]], dtype=int64)

In [25]:
smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X, y)

In [26]:
X_sm_train, X_sm_test, y_sm_train, y_sm_test = train_test_split(X_sm,
                                                    y_sm,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify = y_sm)

In [27]:
sscaler = StandardScaler()
X_train_smsscaled = sscaler.fit_transform(X_sm_train)
X_test_smsscaled = sscaler.transform(X_sm_test)

In [28]:
logregsm = LogisticRegression()

#fit logreg to the train set
logregsm.fit(X_train_smsscaled, y_sm_train)
logregsmpred = logregsm.predict(X_test_smsscaled)
logregsmperf = logregsm.score(X_test_smsscaled, y_sm_test)*100
logregsmperf

83.2703593536458

In [29]:
confusion_matrix(y_sm_test, logregsmpred)

array([[24173,   705],
       [ 7619, 17259]], dtype=int64)

In [30]:
print(classification_report(y_sm_test, logregsmpred))

              precision    recall  f1-score   support

           0       0.76      0.97      0.85     24878
           1       0.96      0.69      0.81     24878

    accuracy                           0.83     49756
   macro avg       0.86      0.83      0.83     49756
weighted avg       0.86      0.83      0.83     49756



In [31]:
X_train_sscaled2 = sscaler.transform(X_train)
X_test_sscaled2 = sscaler.transform(X_test)

In [32]:
realpred1 = logregsm.predict(X_test_sscaled2)

In [33]:
print(classification_report(y_test, realpred1))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99     31097
           1       0.01      0.48      0.03        27

    accuracy                           0.97     31124
   macro avg       0.51      0.73      0.51     31124
weighted avg       1.00      0.97      0.98     31124



# the model did not improve after eliminating unsignificant features