In [1]:
# Import used libraries
from imblearn.over_sampling import SMOTE # solving imbalanced dataset
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier # model
import lightgbm as lgb # model

# evaluation metrics
from sklearn import metrics
from sklearn.metrics import fbeta_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score

# parametar tuning
from sklearn.model_selection import GridSearchCV

import pandas as pd
from sklearn import preprocessing
from matplotlib import pyplot as plt
import numpy as np

In [3]:
# Random forrest
train_numerical = pd.read_pickle("features_v10.pkl") # only numerical
train_date = pd.read_pickle("train_date_features.pkl")
# Standardize date
train_date = train_date.fillna(train_date.mean())
for column in train_date.columns:
    if column == 'Id': continue
    train_date[column] = (train_date[column] - train_date[column].mean()) / train_date[column].std()
train_features = pd.merge(train_date, train_numerical)
train_features.columns = train_features.columns.astype(str)

# Replace nulls with mean
train_features = train_features.fillna(train_features.mean())

# Include all faults
df = train_features.sample(n=100000)
faults = train_features[train_features['Response']==1]
df = pd.concat([faults,df])

X = df.drop(columns=['Id', 'Response'])
y = df['Response']
print(len(X), len(y))
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
print(len(X), len(y))

test = train_features.sample(n=20000)
X_test = test.drop(columns=['Id', 'Response'])
y_test = test['Response']

model = RandomForestClassifier(max_depth=10, random_state=0)
model.fit(X, y)

# Predict
y_pred = model.predict(X_test)
        
# Print model report:
print("MCC : %.4g" % matthews_corrcoef(y_test, y_pred))
print("AUC Score: %f" % metrics.roc_auc_score(y_test, y_pred))

106879 106879
198908 198908
MCC : 0.06539
AUC Score: 0.617734


In [2]:
# LightGBM
# Random forrest
train_numerical = pd.read_pickle("features_v10.pkl") # only numerical
train_date = pd.read_pickle("train_date_features.pkl")
# Standardize date
train_date = train_date.fillna(train_date.mean())
for column in train_date.columns:
    if column == 'Id': continue
    train_date[column] = (train_date[column] - train_date[column].mean()) / train_date[column].std()
train_features = pd.merge(train_date, train_numerical)
train_features.columns = train_features.columns.astype(str)

# Replace nulls with mean
train_features = train_features.fillna(train_features.mean())

# Include all faults
df = train_features.sample(n=100000)
faults = train_features[train_features['Response']==1]
df = pd.concat([faults,df])

X = df.drop(columns=['Id', 'Response'])
y = df['Response']
print(len(X), len(y))
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
print(len(X), len(y))

test = train_features.sample(n=20000)
X_test = test.drop(columns=['Id', 'Response'])
y_test = test['Response']

model = lgb.LGBMClassifier(max_depth=10,num_iterations=100, boosting='gbdt',application='binary',metric='binary_logloss')
model.fit(X, y)

# Predict
y_pred = model.predict(X_test)
        
# Print model report:
print("MCC : %.4g" % matthews_corrcoef(y_test, y_pred))
print("AUC Score: %f" % metrics.roc_auc_score(y_test, y_pred))

# MCC : 0.1033
# AUC Score: 0.550648

106879 106879
198822 198822




MCC : 0.1496
AUC Score: 0.576547
