In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, plot_roc_curve, classification_report
from sklearn import model_selection

In [2]:
# Load the train and test dataframes of every component from the data folder.
rooth_path = '../data/'

# Training split: one CSV per component, read in the order the names are listed.
(generator_train,
 gen_bear_train,
 transformer_train,
 hydraulic_train,
 gearbox_train) = (
    pd.read_csv(f'{rooth_path}{component}_train.csv')
    for component in ('generator', 'gen_bear', 'transformer', 'hydraulic', 'gearbox')
)

# Test split: same components, same order.
(generator_test,
 gen_bear_test,
 transformer_test,
 hydraulic_test,
 gearbox_test) = (
    pd.read_csv(f'{rooth_path}{component}_test.csv')
    for component in ('generator', 'gen_bear', 'transformer', 'hydraulic', 'gearbox')
)

### We are ready to build a baseline model for each component.

- We will need to scale the features, within each turbine
- Since we have a very unbalanced dataset we will use SMOTE for balancing the training data set
- Establish a baseline model
- Try other algorithms and fine tune
- We will probably need to adjust the decision threshold to reduce the false positives (FP)
- Calculate the different costs of our best models

In [3]:
# Scale within each turbine
def scale(df_train, df_test, scaler='StandardScaler'):

    '''Scale the feature columns separately for each turbine.

    For every turbine found in the training data a fresh scaler is fitted on
    that turbine's training rows and applied to its training and test rows.
    The scaled rows are written back at their original positions, so the
    returned arrays stay aligned row by row with the input dataframes (and
    therefore with label columns taken directly from them).

    Args:
            df_train      : Train dataframe; must contain the columns
                            'Turbine_ID', 'Date', 'TTF' and 'Failure'.
                            Every remaining column is treated as a feature.
            df_test       : Test dataframe with the same columns
            scaler        : 'MinMaxScaler' selects MinMaxScaler; any other
                            value selects StandardScaler

    Returns:
            tuple         : (X_train, X_test) scaled float arrays, in the
                            same row order as df_train and df_test

    Raises:
            ValueError    : if df_test contains a turbine that has no rows
                            in df_train (no scaler can be fitted for it)'''

    non_feature_cols = ['Turbine_ID', 'Date', 'TTF', 'Failure']
    feats_train = df_train.drop(columns=non_feature_cols)
    feats_test = df_test.drop(columns=non_feature_cols)

    train_ids = df_train['Turbine_ID'].to_numpy()
    test_ids = df_test['Turbine_ID'].to_numpy()

    # A test turbine never seen in training cannot be scaled; fail loudly
    # instead of returning rows that no longer line up with the labels.
    unseen = set(test_ids) - set(train_ids)
    if unseen:
        raise ValueError(
            'Turbines present in df_test but not in df_train: '
            + ', '.join(sorted(map(str, unseen)))
        )

    # NaN-initialised so that a row which is never assigned is easy to spot.
    X_train = np.full(feats_train.shape, np.nan, dtype=float)
    X_test = np.full(feats_test.shape, np.nan, dtype=float)

    for turbine in pd.unique(train_ids):
        # New scaler per turbine: statistics must not leak between turbines.
        sc = MinMaxScaler() if scaler == 'MinMaxScaler' else StandardScaler()

        train_mask = train_ids == turbine
        X_train[train_mask] = sc.fit_transform(feats_train.loc[train_mask])

        test_mask = test_ids == turbine
        if test_mask.any():
            X_test[test_mask] = sc.transform(feats_test.loc[test_mask])

    return X_train, X_test

In [4]:
# Data augmentation using SMOTE
def data_aug(X_train, y_train, random_state=0):

    '''Oversample the training data with SMOTE.

    Args:
            X_train       : Training features
            y_train       : Training labels
            random_state  : Seed handed to SMOTE (default 0)

    Returns:
            tuple         : (X_train_over, y_train_over) as returned by
                            SMOTE.fit_resample
    '''
    over = SMOTE(random_state=random_state)

    # fit_resample is the supported sampler API; fit_sample was only a
    # deprecated alias and is missing from newer imbalanced-learn releases.
    X_train_over, y_train_over = over.fit_resample(X_train, y_train)

    return X_train_over, y_train_over

In [5]:
# Function to predict
def bin_classify(model, clf, X_train, X_test, y_train, y_test, params=None, score=None):

    '''Tune a classifier with a 5-fold grid search and predict on the test set.

    Args:
            model         : Display name of the model (not used here; kept so
                            existing callers keep working)
            clf           : Estimator handed to GridSearchCV
            X_train       : Training features
            X_test        : Test features
            y_train       : Training labels
            y_test        : Test labels (not used here; kept so existing
                            callers keep working)
            params        : Parameter grid for GridSearchCV; None means
                            "no tuning", i.e. clf is evaluated as configured
            score         : Scoring passed to GridSearchCV

    Returns:
            tuple         : (best estimator found by the search, DataFrame
                            with the columns 'y_pred' and 'y_score')
    '''

    # GridSearchCV rejects param_grid=None; an empty grid yields exactly one
    # candidate, the estimator with its current parameters.
    param_grid = {} if params is None else params

    #Grid Search
    grid_search = model_selection.GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring=score, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    y_pred = grid_search.predict(X_test)

    # Prefer a continuous score; fall back to the hard labels when the
    # estimator exposes neither probabilities nor a decision function.
    if hasattr(grid_search, 'predict_proba'):
        y_score = grid_search.predict_proba(X_test)[:,1]
    elif hasattr(grid_search, 'decision_function'):
        y_score = grid_search.decision_function(X_test)
    else:
        y_score = y_pred

    #Predictions
    df_predictions = pd.DataFrame({'y_pred' : y_pred, 'y_score' : y_score})

    return grid_search.best_estimator_, df_predictions

In [6]:
# Function for metrics
def metrics(estimator, X_test, y_test, y_pred, label):

    '''Print the classification report and confusion matrix, then plot the ROC curve.

    Args:
            estimator     : Fitted estimator handed to plot_roc_curve
            X_test        : Test features
            y_test        : True test labels
            y_pred        : Predicted test labels
            label         : Text appended to the estimator in the curve name
    '''

    # Text summaries first: classification report, then confusion matrix
    for summarise in (classification_report, confusion_matrix):
        print(summarise(y_test, y_pred))

    # ROC curve, named after the estimator and the given label
    curve_name = f'{estimator} {label}'
    plot_roc_curve(estimator, X_test, y_test, name=curve_name)

In [7]:
# Logistic Regression
def logreg(X_train, X_test, y_train, y_test, label):
    '''Grid-search a LogisticRegression on F1, report it and return the results.

    Args:
            X_train, y_train : Training features and labels
            X_test, y_test   : Test features and labels
            label            : Passed through to metrics()

    Returns:
            tuple            : The two values returned by bin_classify
                               (tuned classifier, predictions)
    '''
    param_grid = {
        'C': [.01, 0.1, 1.0, 10],
        'solver': ['liblinear', 'lbfgs'],
    }
    base_clf = LogisticRegression(random_state=42, max_iter=3000, n_jobs=-1)

    best_clf, predictions = bin_classify(
        'Logistic Regression', base_clf, X_train, X_test, y_train, y_test,
        params=param_grid, score='f1',
    )
    print('\nBest Parameters:\n', best_clf)

    metrics(best_clf, X_test, y_test, predictions['y_pred'], label)

    return best_clf, predictions

In [8]:
# Random Forest Classifier
def rfc(X_train, X_test, y_train, y_test, label):
    '''Grid-search a RandomForestClassifier on F1, report it and return the results.

    Args:
            X_train, y_train : Training features and labels
            X_test, y_test   : Test features and labels
            label            : Passed through to metrics()

    Returns:
            tuple            : The two values returned by bin_classify
                               (tuned classifier, predictions)
    '''
    param_grid = {
        'n_estimators': [800, 900, 1000, 1300, 1400, 1500],
        'criterion': ['gini', 'entropy'],
        'class_weight': ['balanced', None],
    }
    base_clf = RandomForestClassifier(random_state=42, n_jobs=-1)

    best_clf, predictions = bin_classify(
        'Random Forest Classifier', base_clf, X_train, X_test, y_train, y_test,
        params=param_grid, score='f1',
    )
    print('\nBest Parameters:\n', best_clf)

    metrics(best_clf, X_test, y_test, predictions['y_pred'], label)

    return best_clf, predictions

In [9]:
# Gradient Boosting Classifier
def gbc(X_train, X_test, y_train, y_test, label):
    '''Grid-search a GradientBoostingClassifier on F1, report it and return the results.

    Args:
            X_train, y_train : Training features and labels
            X_test, y_test   : Test features and labels
            label            : Passed through to metrics()

    Returns:
            tuple            : The two values returned by bin_classify
                               (tuned classifier, predictions)
    '''
    param_grid = {
        'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
        'n_estimators': [100, 200, 500, 700],
    }
    base_clf = GradientBoostingClassifier(random_state=42)

    best_clf, predictions = bin_classify(
        'Gradient Boosting Classifier', base_clf, X_train, X_test, y_train, y_test,
        params=param_grid, score='f1',
    )
    print('\nBest Parameters:\n', best_clf)

    metrics(best_clf, X_test, y_test, predictions['y_pred'], label)

    return best_clf, predictions

In [10]:
# K-Neighbors
def knn(X_train, X_test, y_train, y_test, label):
    '''Grid-search a KNeighborsClassifier on F1, report it and return the results.

    Args:
            X_train, y_train : Training features and labels
            X_test, y_test   : Test features and labels
            label            : Passed through to metrics()

    Returns:
            tuple            : The two values returned by bin_classify
                               (tuned classifier, predictions)
    '''
    param_grid = {'n_neighbors': [5, 7, 10, 15]}
    base_clf = KNeighborsClassifier(n_jobs=-1)

    best_clf, predictions = bin_classify(
        'KNN', base_clf, X_train, X_test, y_train, y_test,
        params=param_grid, score='f1',
    )
    print('\nBest Parameters:\n', best_clf)

    metrics(best_clf, X_test, y_test, predictions['y_pred'], label)

    return best_clf, predictions

In [11]:
# Ada Boost Classifier
def abc(X_train, X_test, y_train, y_test, label):
    '''Grid-search an AdaBoostClassifier on F1, report it and return the results.

    Args:
            X_train, y_train : Training features and labels
            X_test, y_test   : Test features and labels
            label            : Passed through to metrics()

    Returns:
            tuple            : The two values returned by bin_classify
                               (tuned classifier, predictions)
    '''
    param_grid = {'n_estimators': [50, 100, 200, 300, 500]}
    base_clf = AdaBoostClassifier(random_state=42)

    best_clf, predictions = bin_classify(
        'AdaBoostClassifier', base_clf, X_train, X_test, y_train, y_test,
        params=param_grid, score='f1',
    )
    print('\nBest Parameters:\n', best_clf)

    metrics(best_clf, X_test, y_test, predictions['y_pred'], label)

    return best_clf, predictions

In [12]:
# SVC
def svc(X_train, X_test, y_train, y_test, label):
    '''Grid-search an SVC on F1, report it and return the results.

    Args:
            X_train, y_train : Training features and labels
            X_test, y_test   : Test features and labels
            label            : Passed through to metrics()

    Returns:
            tuple            : The two values returned by bin_classify
                               (tuned classifier, predictions)
    '''
    param_grid = {
        'C': [0.01, 0.1, 1, 1.2],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': [2, 3],
        'class_weight': ['balanced', None],
        'gamma': ['auto', 'scale'],
    }
    base_clf = SVC(random_state=42)

    best_clf, predictions = bin_classify(
        'SVC', base_clf, X_train, X_test, y_train, y_test,
        params=param_grid, score='f1',
    )
    print('\nBest Parameters:\n', best_clf)

    metrics(best_clf, X_test, y_test, predictions['y_pred'], label)

    return best_clf, predictions

In [13]:
# XGBoost
def xgboost(X_train, X_test, y_train, y_test, label):
    '''Grid-search an XGBClassifier on F1, report it and return the results.

    Args:
            X_train, y_train : Training features and labels
            X_test, y_test   : Test features and labels
            label            : Passed through to metrics()

    Returns:
            tuple            : The two values returned by bin_classify
                               (tuned classifier, predictions)
    '''
    param_grid = {
        'colsample_bytree': [0.3, 0.7],
        'learning_rate': [0.01, 0.1, 0.2, 0.5],
        'n_estimators': [50, 100, 200, 300],
        'subsample': [0.2, 0.5, 0.8],
        'max_depth': [2, 3, 5],
    }
    base_clf = XGBClassifier()

    best_clf, predictions = bin_classify(
        'XGBoost', base_clf, X_train, X_test, y_train, y_test,
        params=param_grid, score='f1',
    )
    print('\nBest Parameters:\n', best_clf)

    metrics(best_clf, X_test, y_test, predictions['y_pred'], label)

    return best_clf, predictions

# GENERATOR

In [14]:
# Columns to drop, chosen by exploration: every signal listed below is removed
# together with its '_av' and '_sd' companion columns.
gen_col_drop = [
    f'{signal}{suffix}'
    for signal in (
        'Gen_RPM_Min', 'Gen_RPM_Avg',
        'Gen_Phase1_Temp_Avg',
        'Gen_Phase3_Temp_Avg',
        'Blds_PitchAngle_Min',
        'Blds_PitchAngle_Avg',
        'Amb_WindSpeed_Avg',
        'Amb_WindSpeed_Std',
        'Cont_Hub_Temp_Avg',
        'Cont_Top_Temp_Avg',
        'Cont_VCP_Temp_Avg',
        'Nac_Temp_Avg',
        'Nac_Direction_Avg',
        'Grd_Busbar_Temp_Avg',
    )
    for suffix in ('', '_av', '_sd')
]

In [15]:
# Remove the columns listed in gen_col_drop from both generator splits
generator_train = generator_train.drop(labels=gen_col_drop, axis='columns')
generator_test = generator_test.drop(labels=gen_col_drop, axis='columns')

In [20]:
# Scale the generator features; the labels are the frames' 'Failure' columns
X_train_scale, X_test_scale = scale(df_train=generator_train, df_test=generator_test)
y_train, y_test = generator_train['Failure'], generator_test['Failure']

In [16]:
# One separate empty dict per model (presumably filled with generator predictions later)
(pred_logreg_gen,
 pred_rfc_gen,
 pred_gbc_gen,
 pred_knn_gen,
 pred_abc_gen,
 pred_svc_gen,
 pred_xgb_gen) = ({} for _ in range(7))