In [1]:
# Import used libraries
from imblearn.over_sampling import SMOTE # solving imbalanced dataset
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier # model
import lightgbm as lgb # model

# evaluation metrics
from sklearn import metrics
from sklearn.metrics import fbeta_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score

# parametar tuning
from sklearn.model_selection import GridSearchCV

import pandas as pd
from sklearn import preprocessing
from matplotlib import pyplot as plt
import numpy as np

In [2]:
# Dataset
train_numerical = pd.read_pickle("features_v9.pkl") # only numerical
train_date = pd.read_pickle("train_date_extended.pkl")
train_date = train_date[['Id', 'max', 'min', 'time_difference','min_time_station','max_time_station','path_length','part_week_start','part_week_end','day_of_week_start','day_of_week_end','parts_in_same_time','faults_next_1_hour','faults_next_10_hours','faults_next_24_hours','faults_last_1_hour','faults_last_10_hours','faults_last_24_hours']]
# Standardize date
train_date = train_date.fillna(train_date.mean())
for column in train_date.columns:
    if column == 'Id': continue
    train_date[column] = (train_date[column] - train_date[column].mean()) / train_date[column].std()
train_features = pd.merge(train_date, train_numerical)
train_features.columns = train_features.columns.astype(str)

# Include all faults
df = train_features.sample(n=round(len(train_features)/2))
faults = train_features[train_features['Response']==1]
df = pd.concat([faults,df])

# Replace nulls with mean
df = df.fillna(10)
X = df.drop(columns=['Id', 'Response'])
y = df['Response']
print(len(X), len(y))
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
X = X.replace(10, np.nan)
print(len(X), len(y))

598753 598753
1176860 1176860


In [3]:
from sklearn.ensemble import AdaBoostClassifier

# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create and train the AdaBoost classifier
clf = AdaBoostClassifier(n_estimators=100, random_state=42)
X_train = X_train.fillna(X.mean())
clf.fit(X_train, y_train)

# make predictions on the test set
X_test = X_test.fillna(X.mean())
y_pred = clf.predict(X_test)

# Print model report:
print("MCC : %.4g" % matthews_corrcoef(y_test, y_pred))
print("AUC Score: %f" % metrics.roc_auc_score(y_test, y_pred))

MCC : 0.8099
AUC Score: 0.903517


In [4]:
# PREDICT TRAINING DATA
df = df
df = df.fillna(np.nan)

# Predict
y_pred = clf.predict(df.drop(columns=['Id', 'Response']))
y_test = df['Response']
# Print model report:
print("MCC : %.4g" % matthews_corrcoef(y_test, y_pred))
print("AUC Score: %f" % metrics.roc_auc_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

MCC : 0.1871
AUC Score: 0.629177
              precision    recall  f1-score   support

         0.0       0.99      0.97      0.98    588395
         1.0       0.15      0.29      0.20     10358

    accuracy                           0.96    598753
   macro avg       0.57      0.63      0.59    598753
weighted avg       0.97      0.96      0.97    598753



In [5]:
from sklearn.ensemble import RandomForestClassifier

# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create and train the Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
X_train = X_train.fillna(X.mean())
clf.fit(X_train, y_train)

# make predictions on the test set
X_test = X_test.fillna(X.mean())
y_pred = clf.predict(X_test)

# Print model report:
print("MCC : %.4g" % matthews_corrcoef(y_test, y_pred))
print("AUC Score: %f" % metrics.roc_auc_score(y_test, y_pred))

MCC : 0.6138
AUC Score: 0.801738


In [6]:
# PREDICT TRAINING DATA
df = df
df = df.fillna(np.nan)

# Predict
y_pred = clf.predict(df.drop(columns=['Id', 'Response']))
y_test = df['Response']
# Print model report:
print("MCC : %.4g" % matthews_corrcoef(y_test, y_pred))
print("AUC Score: %f" % metrics.roc_auc_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

MCC : 0.1486
AUC Score: 0.705510
              precision    recall  f1-score   support

         0.0       0.99      0.85      0.92    588395
         1.0       0.06      0.56      0.11     10358

    accuracy                           0.85    598753
   macro avg       0.53      0.71      0.51    598753
weighted avg       0.97      0.85      0.90    598753



In [7]:
# LightGBM

# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = lgb.LGBMClassifier(max_depth=10,num_iterations=100, boosting='gbdt',application='binary',metric='binary_logloss')
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)
        
# Print model report:
print("MCC : %.4g" % matthews_corrcoef(y_test, y_pred))
print("AUC Score: %f" % metrics.roc_auc_score(y_test, y_pred))



MCC : 0.8917
AUC Score: 0.945110


In [8]:
# PREDICT TRAINING DATA
df = df
df = df.fillna(np.nan)

# Predict
y_pred = model.predict(df.drop(columns=['Id', 'Response']))
y_test = df['Response']
# Print model report:
print("MCC : %.4g" % matthews_corrcoef(y_test, y_pred))
print("AUC Score: %f" % metrics.roc_auc_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

MCC : 0.1312
AUC Score: 0.691938
              precision    recall  f1-score   support

         0.0       0.99      0.83      0.90    588395
         1.0       0.05      0.55      0.10     10358

    accuracy                           0.82    598753
   macro avg       0.52      0.69      0.50    598753
weighted avg       0.97      0.82      0.89    598753



In [3]:
from sklearn.tree import DecisionTreeClassifier
# In the first step we will split the data in training and remaining dataset
# X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state = 4)
# X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state = 4)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit
clf = DecisionTreeClassifier(random_state=42)
X_train = X_train.fillna(X.mean())
clf.fit(X_train, y_train)
# Predict
X_test = X_test.fillna(X.mean())
y_pred = clf.predict(X_test)
    
# Print model report:
print("MCC : %.4g" % matthews_corrcoef(y_test, y_pred))
print("AUC Score: %f" % metrics.roc_auc_score(y_test, y_pred))

MCC : 0.9724
AUC Score: 0.986143


In [None]:
# PREDICT TRAINING DATA
df = df
df = df.fillna(np.nan)

# Predict
y_pred = clf.predict(df.drop(columns=['Id', 'Response']))
y_test = df['Response']
# Print model report:
print("MCC : %.4g" % matthews_corrcoef(y_test, y_pred))
print("AUC Score: %f" % metrics.roc_auc_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

In [5]:
from sklearn.ensemble import GradientBoostingClassifier

# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create and train the Gradient Boosting classifier
clf = GradientBoostingClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42)
X_train = X_train.fillna(X.mean())
clf.fit(X_train, y_train)

# make predictions on the test set
X_test = X_test.fillna(X.mean())
y_pred = clf.predict(X_test)

# Print model report:
print("MCC : %.4g" % matthews_corrcoef(y_test, y_pred))
print("AUC Score: %f" % metrics.roc_auc_score(y_test, y_pred))

MCC : 0.8372
AUC Score: 0.917490


In [None]:
# PREDICT TRAINING DATA
df = df
df = df.fillna(np.nan)

# Predict
y_pred = clf.predict(df.drop(columns=['Id', 'Response']))
y_test = df['Response']
# Print model report:
print("MCC : %.4g" % matthews_corrcoef(y_test, y_pred))
print("AUC Score: %f" % metrics.roc_auc_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))