In [None]:
import logging
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


In [None]:
# Table read from sensor.csv; the path presumes Google Drive is mounted at /content/drive.
df1 = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/sensor.csv")

In [None]:
# Table read from sensor_high_freq.csv; the path presumes Google Drive is mounted at /content/drive.
df2 = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/sensor_high_freq.csv")

In [None]:
# Table read from percent_reference.csv; the path presumes Google Drive is mounted at /content/drive.
df3 = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/percent_reference.csv")

In [None]:
class ExploratoryDataAnalysis():
    """Quick-look plots for one dataframe: histograms, pairwise scatter plots and box plots.

    The constructor works on a copy of the dataframe, so coercing the
    measurement columns to numeric for plotting never alters the caller's data.
    """

    def __init__(self,dataframe):
        """Store a private copy of ``dataframe`` for plotting."""
        self.dataframe = dataframe.copy()

    def _measurement_columns(self):
        """Return the columns whose label starts with "B", plus 'Percent' when present."""
        # str(...) keeps this safe for non-string or empty column labels,
        # where indexing the label with [0] would raise.
        cols = [c for c in self.dataframe.columns if str(c).startswith("B")]
        if 'Percent' in self.dataframe.columns:
            cols.append('Percent')
        return cols

    def _numeric_columns(self):
        """Return the labels of the truly numeric columns (bool/datetime/text are skipped)."""
        return self.dataframe.select_dtypes(include='number').columns

    def _object_to_numeric(self):
        """Coerce the measurement columns to numeric; unparseable values become NaN."""
        for col in self._measurement_columns():
            self.dataframe[col] = pd.to_numeric(self.dataframe[col], errors="coerce")

    def _univariate_plots(self):
        """Draw one histogram per numeric column."""
        for col in self._numeric_columns():
            plt.figure(figsize=(15,5))
            sns.histplot(data=self.dataframe, x=col)
            plt.title(f"Distribution of {col}")
            plt.show()

    def _bivariate_plots(self):
        """Draw one scatter plot per unordered pair of numeric columns."""
        # The number of figures grows quadratically with the number of numeric columns.
        for x_col, y_col in combinations(self._numeric_columns(), 2):
            plt.figure(figsize=(10,5))
            sns.scatterplot(x=x_col, y=y_col, data=self.dataframe)
            plt.title(f"{y_col} vs {x_col}")
            plt.show()

    def _outlier_detection(self):
        """Draw one box plot per numeric column to make outliers visible."""
        for col in self._numeric_columns():
            plt.figure(figsize=(10,5))
            # Pass the series by keyword: what a positional first argument
            # means differs between seaborn versions.
            sns.boxplot(x=self.dataframe[col])
            plt.title(f"Box plot of {col}")
            plt.show()

    def plot(self):
        """Coerce the measurement columns to numeric, then draw every plot family."""
        self._object_to_numeric()
        self._univariate_plots()
        self._bivariate_plots()
        self._outlier_detection()
    

In [None]:
class DataPreprocessing():
    """Turn the three raw tables into model-ready train/test features and targets.

    ``run`` chains the stages: clean each table, join them, drop helper
    columns, trim outliers, split into train/test, scale the numeric columns,
    separate the 'Good/Bad' target and prune redundant features.

    The constructor copies its inputs, so none of the stages modify the
    dataframes owned by the caller.
    """

    def __init__(self, df1,df2,df3):
        """Keep private copies of the three tables.

        Parameters:
            df1: table joined to the others on 'timestamp' in ``merging_dataframes``.
            df2: table expected to carry 'timestamp' and 'Percent' columns.
            df3: table expected to carry 'Percent Min' and 'Percent Max' columns.
        """
        self.df1 = df1.copy()
        self.df2 = df2.copy()
        self.df3 = df3.copy()

    def clean_data(self):
        """Validate dtypes and impute missing values in each of the three tables."""
        for attr in ('df1', 'df2', 'df3'):
            validated = self.data_validation(getattr(self, attr))
            setattr(self, attr, self.missing_values(validated))

    def data_validation(self, dataframe):
        """Coerce the measurement columns of ``dataframe`` to numeric, in place.

        Measurement columns are those whose label starts with "B", plus
        'Percent' when present. Values that cannot be parsed become NaN.
        Returns the same dataframe object.
        """
        # str(...) keeps this safe for non-string or empty column labels.
        rel_col = [c for c in dataframe.columns if str(c).startswith("B")]
        if 'Percent' in dataframe.columns:
            rel_col.append('Percent')
        for col in rel_col:
            dataframe[col] = pd.to_numeric(dataframe[col], errors="coerce")
        return dataframe

    def missing_values(self, dataframe):
        """Impute numeric NaNs per column and drop rows with an invalid target.

        Strategy, tailored to the distributions seen in the univariate analysis:
        median for skewed columns, mode for single-valued columns and mean for
        everything else. When a 'Good/Bad' column exists, only rows whose value
        is the string '0' or '1' are kept.

        Returns the cleaned dataframe (a new object when rows were filtered).
        """
        # Column groups are matched with underscores removed so that both the
        # 'B16' and 'B_16' spellings are recognised.
        # NOTE(review): the original lists mixed both spellings - confirm the
        # real column names against the CSV headers.
        skew_keys = {'B16', 'B17', 'B18', 'B11', 'B13'}
        single_value_keys = {'B4', 'B5', 'B9', 'B10', 'B14', 'B20', 'B22', 'B23'}

        for col in dataframe.select_dtypes(include='number').columns:
            series = dataframe[col]
            if not series.isna().any():
                continue
            key = str(col).replace('_', '')
            if key in skew_keys:
                fill = series.median()
            elif key in single_value_keys:
                # mode() returns a Series; take its first entry as the scalar fill value.
                modes = series.mode()
                fill = modes.iloc[0] if not modes.empty else np.nan
            else:
                fill = series.mean()
            # An all-NaN column has no usable statistic; leave it untouched.
            if pd.notna(fill):
                dataframe[col] = series.fillna(fill)

        # Dropping incorrect rows in target variable column
        if 'Good/Bad' in dataframe.columns:
            dataframe = dataframe[dataframe['Good/Bad'].isin(['0', '1'])].copy()
        return dataframe

    def merging_dataframes(self):
        """Join the three tables into one dataframe.

        ``df2`` and ``df3`` share no key, so they are cross-joined through a
        constant helper column 'ind' and then filtered to the rows where
        'Percent Min' <= 'Percent' <= 'Percent Max'. The result is inner-joined
        with ``df1`` on 'timestamp'. The helper column stays in the result;
        ``drop_redundant`` removes it.
        """
        # assign() adds the helper column to temporaries, not to self.df2/self.df3.
        left = self.df2.assign(ind=1)
        right = self.df3.assign(ind=1)
        df4 = left.merge(right, on='ind', how='left')
        in_range = (df4['Percent'] >= df4['Percent Min']) & (df4['Percent'] <= df4['Percent Max'])
        df4 = df4[in_range]
        # joining with df1 using timestamp
        df5 = df4.merge(self.df1, on='timestamp', how='inner')
        return df5

    def drop_redundant(self,df):
        """Remove the identifier/helper columns from ``df`` in place and return it.

        Raises KeyError if any of the listed columns is missing.
        """
        cols = ['Cycle ID', 'Period Code', 'Percent Min','Percent Max','ind','timestamp']
        df.drop(columns=cols, inplace=True)
        return df

    def outlier_removal(self,df, threshold=0.01):
        """Trim extreme rows for the columns that showed significant outliers in EDA.

        For each listed column present in ``df``, keep only the rows between the
        ``threshold/2`` and ``1 - threshold/2`` quantiles. Columns are processed
        one after another, so later quantiles are computed on already-trimmed data.
        """
        outlier_cols = ['B_18','B_7','B_8','B_13']   # columns with significant outliers
        for col in outlier_cols:
            if col not in df.columns:
                continue
            lower = df[col].quantile(threshold / 2)
            upper = df[col].quantile(1 - threshold / 2)
            df = df[(df[col] <= upper) & (df[col] >= lower)]
        return df

    def run(self):
        """Execute the full pipeline and return ``X_train, y_train, X_test, y_test``."""
        self.clean_data()
        merged_df = self.merging_dataframes()
        reduced_df = self.drop_redundant(merged_df)
        non_outlier_df = self.outlier_removal(reduced_df)
        df_train, df_test = self.data_split(non_outlier_df)
        df_train_scaled, df_test_scaled = self.feature_scaling(df_train, df_test)
        X_train, y_train, X_test, y_test = self.train_test_splitting(df_train_scaled, df_test_scaled, 'Good/Bad')
        X_train, y_train, X_test, y_test = self.feature_selection(X_train, y_train, X_test, y_test)
        return X_train, y_train, X_test, y_test

    def data_split(self, df, test_size=0.2):
        """Split ``df`` into shuffled train and test dataframes.

        Parameters:
            test_size: fraction of the rows placed in the test set.
        Returns:
            ``(df_train, df_test)``; the split is seeded with random_state=42.
        """
        df_train, df_test = train_test_split(df, test_size=test_size, shuffle=True, random_state=42)
        return df_train, df_test

    def feature_scaling(self, df_train,df_test):
        """Standardise the numeric columns; non-numeric columns pass through unchanged.

        The scaler is fitted on the training data only and then applied to the
        test data, so both sets share one scale and no test statistics leak
        into the transformation. The input frames are not modified.

        Returns:
            ``(df_train_scaled, df_test_scaled)`` with numeric columns first,
            followed by the remaining columns.
        """
        scaler = StandardScaler()
        num_col = df_train.select_dtypes(include='number').columns
        cat_col = [c for c in df_train.columns if c not in num_col]

        train_num = pd.DataFrame(scaler.fit_transform(df_train[num_col]),
                                 columns=num_col, index=df_train.index)
        # transform (not fit_transform): reuse the statistics learned from the training set
        test_num = pd.DataFrame(scaler.transform(df_test[num_col]),
                                columns=num_col, index=df_test.index)

        df_train_scaled = pd.concat([train_num, df_train[cat_col]], axis=1)
        df_test_scaled = pd.concat([test_num, df_test[cat_col]], axis=1)
        return df_train_scaled, df_test_scaled

    def train_test_splitting(self, df_train, df_test, column_name):
        """Separate features from the target in both datasets.

        Parameters:
            df_train: training dataframe.
            df_test: testing dataframe.
            column_name: name of the target column.
        Returns:
            ``X_train, y_train, X_test, y_test``.
        Raises:
            KeyError if ``column_name`` is missing from either dataframe.
        """
        X_train = df_train.drop(columns=column_name)
        y_train = df_train[column_name]
        X_test = df_test.drop(columns=column_name)
        y_test = df_test[column_name]
        return X_train, y_train, X_test, y_test

    def feature_selection(self,X_train,y_train,X_test,y_test, corr_threshold=0.8):
        """Drop constant features and one feature of every highly correlated pair.

        Parameters:
            corr_threshold: absolute correlation above which the later column
                of a pair is dropped.
        Returns:
            ``X_train, y_train, X_test, y_test`` with identical feature columns
            in both X frames; the targets are returned untouched.
        """
        single_value_col = ['B_4','B_5', 'B_9', 'B_10', 'B_14', 'B_20', 'B_22', 'B_23']
        # remove all variables which have a constant value
        constant_cols = [c for c in single_value_col if c in X_train.columns]
        X_train = X_train.drop(columns=constant_cols)
        X_test = X_test.drop(columns=constant_cols)

        # The correlation is estimated on the training features only. Looking
        # at the strict upper triangle means each pair is examined once and the
        # later column of a correlated pair is the one removed.
        corr_matrix = X_train.select_dtypes(include='number').corr().abs()
        upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
        upper = corr_matrix.where(upper_mask)
        to_drop = [c for c in upper.columns if (upper[c] > corr_threshold).any()]

        X_train = X_train.drop(columns=to_drop)
        X_test = X_test.drop(columns=to_drop)
        return X_train, y_train, X_test, y_test


In [None]:
# Run the whole preprocessing pipeline on the three loaded tables.
X_train, y_train, X_test, y_test = (
    DataPreprocessing(df1, df2, df3).run()
)

  X_train = X_train.drop(col,1)
  X_test = X_test.drop(col,1)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
  X_train = X_train.drop(to_drop,1)
  X_test = X_test.drop(to_drop,1)


In [None]:
class Models:
    """Fit several classifiers on one train/test split and report accuracy and F1.

    Hyper-parameters of the random forest, XGBoost and SVC models are chosen
    with a randomized search that optimises the F1 score of the positive class,
    the same metric that is reported on the test set.
    """

    N_ITER = 5      # parameter settings sampled per randomized search
    CV_FOLDS = 10   # cross-validation folds per sampled setting

    def __init__(self, X_train, y_train, X_test, y_test, pos_label='1', random_state=42):
        """Store the data split and evaluation settings.

        Parameters:
            X_train, y_train: training features and target.
            X_test, y_test: held-out features and target used for scoring.
            pos_label: target value treated as the positive class for F1.
            random_state: seed shared by the searches and the stochastic models.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.pos_label = pos_label
        self.random_state = random_state

    def model_evaluation(self, prediction):
        """Print and return accuracy and F1 of ``prediction`` against ``y_test``.

        Returns:
            dict with keys 'accuracy' and 'f1'.
        """
        score_acc = accuracy_score(self.y_test, prediction)
        print(f"Accuracy of the model is {score_acc}")
        score_f1 = f1_score(self.y_test, prediction, pos_label=self.pos_label)
        print(f"F-1 score of the model is {score_f1}")
        return {'accuracy': score_acc, 'f1': score_f1}

    def model_prediction(self, model, X):
        """Predict on ``X`` with a fitted ``model`` and evaluate the result."""
        return self.model_evaluation(model.predict(X))

    def _tune(self, estimator, param_distributions):
        """Run a randomized search on the training data and return the best estimator.

        The returned estimator has already been refitted on the complete
        training set by the search (scikit-learn's default ``refit=True``), so
        callers do not need to fit it again.
        """
        search = RandomizedSearchCV(estimator=estimator,
                                    param_distributions=param_distributions,
                                    n_iter=self.N_ITER,
                                    # classification metric consistent with model_evaluation
                                    scoring=make_scorer(f1_score, pos_label=self.pos_label),
                                    cv=self.CV_FOLDS,
                                    verbose=5,
                                    random_state=self.random_state,
                                    n_jobs=-1,
                                    return_train_score=True)
        search.fit(self.X_train, self.y_train)
        return search.best_estimator_

    def logistic_regressor(self):
        """Fit a default logistic regression and evaluate it on the test set."""
        print("Evaluating Logistic Regression")
        lm = LogisticRegression()
        lm.fit(self.X_train, self.y_train)
        return self.model_prediction(lm, self.X_test)

    def random_forest_classifier(self):
        """Tune a random forest with randomized search and evaluate it on the test set."""
        print("Evaluating Random Forest")
        params = {'n_estimators': [5, 10, 20, 40, 80, 100, 200],
                  'max_depth': [2, 5, 10, 20],
                  'min_samples_split': [2, 4, 8, 12],
                  'oob_score': [True]}
        forest = self._tune(RandomForestClassifier(random_state=self.random_state), params)
        return self.model_prediction(forest, self.X_test)

    def xgboost(self):
        """Tune an XGBoost classifier with randomized search and evaluate it on the test set."""
        print("Evaluating XGBoost")
        # NOTE(review): recent xgboost releases may reject string class labels
        # such as '0'/'1' - confirm against the installed version.
        booster = XGBClassifier(booster='gbtree', subsample=0.8, random_state=self.random_state)
        params = {'learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
                  'max_depth': [3, 4, 5, 6, 8, 10, 12, 15],
                  'min_child_weight': [1, 3, 5, 7],
                  'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
                  'colsample_bytree': [0.3, 0.4, 0.5, 0.7]}
        best = self._tune(booster, params)
        return self.model_prediction(best, self.X_test)

    def svc(self):
        """Tune an RBF support vector classifier and evaluate it on the test set."""
        print("Evaluating Support Vector Classifier")
        param_grid = {'C': [0.1, 1, 10, 100, 1000],
                      'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                      'kernel': ['rbf']}
        best = self._tune(svm.SVC(), param_grid)
        return self.model_prediction(best, self.X_test)

    def evaluate(self):
        """Evaluate every model in turn and return their metrics keyed by model name."""
        return {
            'logistic_regression': self.logistic_regressor(),
            'random_forest': self.random_forest_classifier(),
            'xgboost': self.xgboost(),
            'svc': self.svc(),
        }

In [None]:
# Train and score every model on the prepared split.
Models(
    X_train, y_train, X_test, y_test
).evaluate()

Evaluating Logistic Regression
Accuracy of the model is 0.9573630698589701
F-1 score of the model is 0.8271276595744682
Evaluating Random Forest
Fitting 10 folds for each of 5 candidates, totalling 50 fits
Accuracy of the model is 0.959986880944572
F-1 score of the model is 0.8342391304347826
Evaluating XGBoost
Fitting 10 folds for each of 5 candidates, totalling 50 fits
Accuracy of the model is 0.9596589045588717
F-1 score of the model is 0.8362183754993342
Evaluating Support Vector Classifier
Fitting 10 folds for each of 5 candidates, totalling 50 fits
Accuracy of the model is 0.9616267628730731
F-1 score of the model is 0.8425302826379543
