# **SAVE THE ATTACK**

*   **Designed a classification model to correctly identify vulnerable machines**
*   **Performed Exploratory Data Analysis and preprocessing on big data with 7.1 million rows, followed by classification models such as Decision Tree and Random Forest**
*   **Implemented Bagging and Boosting techniques that helped to achieve 62% accuracy**

*Designed and Developed By: Ashutosh Soni*

# Import Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import sklearn.model_selection as model_selection
import sklearn.linear_model as linear_model
import sklearn.svm as svm
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
import sklearn.metrics as metric
import sklearn.preprocessing as preprocessing
from scipy.linalg import svd
import pickle
import missingno as msno

from sklearn.ensemble import AdaBoostClassifier

from sklearn.decomposition import IncrementalPCA

import tensorflow as tf
tf.__version__

'2.6.0'

# Data Preprocessing

**Definition of pre-processing class that helps to perform some necessary tasks to clean dataset. Some of the techniques implemented are as follows:**


1.   Removing unnecessary columns
2.   Missing value management
3.   Removal of duplicate entries
4.   Transformation of Data Distribution
5.   Outlier Removal
6.   PCA



In [None]:
class preproc:
    """Cleaning / feature-scaling pipeline for the malware-detection data.

    Steps applied by `train_preprocess` and mirrored by `test_preprocess`:
    drop uninformative columns, impute numeric columns with their median,
    label-encode the categorical columns, clamp numeric outliers to the
    box-plot whiskers, then apply RobustScaler followed by a
    QuantileTransformer (normal output) to the numeric columns.

    NOTE(review): every call to `train_preprocess` refits the label encoders
    and both scalers, so when it is called once per chunk the fitted state
    kept on the instance is that of the most recent chunk only.
    """

    ## Outlier Removal class
    class OutlierRemoval:
        """Clamp values to the whiskers lower_q - 1.5*IQR and upper_q + 1.5*IQR."""

        def __init__(self, lower_quartile, upper_quartile):
            iqr = upper_quartile - lower_quartile
            self.lower_whisker = lower_quartile - 1.5*iqr
            self.upper_whisker = upper_quartile + 1.5*iqr

        def removeOutlier(self, x):
            """Return `x` clamped into [lower_whisker, upper_whisker].

            A value that compares false against both whiskers (e.g. NaN)
            falls through to the upper whisker.
            """
            if x < self.lower_whisker:
                return self.lower_whisker
            if x <= self.upper_whisker:
                return x
            return self.upper_whisker

    def __init__(self):
        self.cols_to_drop=[]
        self.l_encoder={}
        ## Observed categorical attributes of the data
        self.categorical_cols=['EngineVersion', 'AppVersion', 'AvSigVersion', 'OsPlatformSubRelease', 'OsBuildLab', 
                               'SkuEdition', 'Census_MDC2FormFactor', 'Census_PrimaryDiskTypeName', 
                               'Census_ChassisTypeName', 'Census_PowerPlatformRoleName', 'Census_OSVersion', 
                               'Census_OSBranch', 'Census_OSEdition', 'Census_OSSkuName', 'Census_OSInstallTypeName', 
                               'Census_OSWUAutoUpdateOptionsName', 'Census_GenuineStateName', 
                               'Census_ActivationChannel', 'Wdft_IsGamer']
        self.std_scaler=preprocessing.StandardScaler()
        ## PCA object is created but not used by the current pipeline
        self.pca=IncrementalPCA(68, whiten=True)
        self.robScaler=preprocessing.RobustScaler()
        self.qTransform=preprocessing.QuantileTransformer(output_distribution='normal', random_state=0)

        ## one independent encoder per categorical attribute
        for col in self.categorical_cols:
            self.l_encoder[col]=LabelEncoder()

    ## clamps each column of `frame` to its own whiskers (vectorised)
    def _clip_outliers(self, frame):
        """Clamp every column of `frame` in place to its own whiskers; return it."""
        for col in frame.columns:
            if col == 'HasDetections':
                continue
            bounds = self.OutlierRemoval(frame[col].quantile(0.25), frame[col].quantile(0.75))
            # Series.clip gives the same result as applying removeOutlier
            # element by element, without a Python call per value.
            frame[col] = frame[col].clip(lower=bounds.lower_whisker, upper=bounds.upper_whisker)
        return frame

    ## preprocessing helper function for train data
    def train_preprocess(self, df, random_state=None):
        """Fit the pipeline on `df` and return a stratified 80/20 split.

        Parameters:
            df: raw training frame containing the 'HasDetections' label.
                It is modified in place (columns dropped, values imputed
                and encoded).
            random_state: optional seed forwarded to `train_test_split` so
                the split can be reproduced; None keeps the split random.

        Returns:
            x_train, x_test, y_train, y_test, where the x parts are
            DataFrames (numeric columns first, categorical columns last)
            and the y parts are single-column DataFrames.
        """
        self.cols_to_drop=['MachineIdentifier', 'ProductName', 'IsBeta', 'HasTpm', 'DefaultBrowsersIdentifier', 'OrganizationIdentifier', 
                           'PuaMode', 'SmartScreen', 'Census_ProcessorClass', 'Census_InternalBatteryType', 
                           'Census_IsFlightingInternal', 'Census_ThresholdOptIn', 'Census_IsWIMBootEnabled', 'SMode',
                           'Platform', 'OsVer', 'Processor', 'IsProtected', 'AutoSampleOptIn', 'Firewall', 'UacLuaenable',
                           'Census_DeviceFamily', 'Census_PrimaryDiskTotalCapacity', 'Census_SystemVolumeTotalCapacity',
                           'Census_TotalPhysicalRAM', 'Census_OSArchitecture', 'Census_IsPortableOperatingSystem', 
                           'Census_IsFlightsDisabled', 'Census_FlightRing', 'Census_IsVirtualDevice', 'Census_IsPenCapable',
                           'Census_IsAlwaysOnAlwaysConnectedCapable'
                          ]
        df.drop(axis='columns', labels=self.cols_to_drop, inplace=True)
        print("shape after dropping unnecesary columns: ", df.shape)

        ## Handling missing values and categorical valued attributes
        print("Attributes count that have missing values before missing value handle: ")
        count = int(df.isna().any().sum())
        print("Total attributes that have missing values: ", count)

        for col in df.columns:
            if col == 'HasDetections':
                continue
            if col in self.categorical_cols:
                df[col] = self.l_encoder[col].fit_transform(df[col])
            else:
                # Assign the result back: `df[col].fillna(..., inplace=True)`
                # acts on a temporary under pandas copy-on-write.
                df[col] = df[col].fillna(df[col].median())

        df_X=df.drop(axis='columns', labels=['HasDetections'])
        df_Y=df['HasDetections']
        print(df_X.shape, df_Y.shape)

        df_X_cont=df_X.drop(axis='columns', labels=self.categorical_cols)
        df_X_cat=df_X[self.categorical_cols]
        print(df_X_cont.shape)
        print(df_X_cat.shape)

        ## checking whether attributes having type as "object"
        count = sum(df_X[col].dtype == 'object' for col in df_X.columns)
        print("Attribute counts that have type as Object: ", count)

        ## Outlier Removal [Need research]
        df_X_cont = self._clip_outliers(df_X_cont)

        ## Standardisation of continuous valued attributes
        df_X_cont_scaled=self.robScaler.fit_transform(df_X_cont)
        df_X_cont_scaled=self.qTransform.fit_transform(df_X_cont_scaled)
        df_X_cont_scaled=pd.DataFrame(df_X_cont_scaled, index=df_X_cont.index, columns=df_X_cont.columns)

        df_X=pd.concat([df_X_cont_scaled, df_X_cat], axis=1)

        df=pd.concat([df_X, df_Y], axis=1)

        print("Data shape after preprocessing: ", df.shape)

        ## data and label spliting from the dataset
        df_X = df.drop(axis='columns', labels=['HasDetections'])
        df_Y = df[['HasDetections']]
        print(df_X.shape, df_Y.shape)

        x_train, x_test, y_train, y_test = model_selection.train_test_split(
            df_X, df_Y, train_size=0.8, stratify=df_Y, random_state=random_state)

        return x_train, x_test, y_train, y_test

    ## preprocessing helper function for test data
    def test_preprocess(self, df_test):
        """Apply the fitted scalers to unlabelled data and return a NumPy matrix.

        `df_test` is modified in place (columns dropped, values imputed and
        encoded). The returned columns are ordered like the training
        features: numeric columns first, categorical columns last.
        """
        df_test.drop(axis='columns', labels=self.cols_to_drop, inplace=True)
        print("shape after dropping unnecesary columns: ", df_test.shape)

        for col in df_test.columns:
            # Decide by the declared categorical list (as the training path
            # does) rather than by dtype / number of distinct values.
            if col in self.categorical_cols:
                # NOTE(review): the encoder is refitted on the test data, so
                # integer codes are not guaranteed to match the training
                # codes; LabelEncoder.transform would raise on unseen labels.
                df_test[col] = self.l_encoder[col].fit_transform(df_test[col])
            else:
                df_test[col] = df_test[col].fillna(df_test[col].median())

        df_test_cont=df_test.drop(axis='columns', labels=self.categorical_cols)
        df_test_cat=df_test[self.categorical_cols]
        print(df_test_cont.shape)
        print(df_test_cat.shape)

        ## whiskers are derived from the quartiles of the test data itself
        df_test_cont = self._clip_outliers(df_test_cont)

        ## Standardisation of continuous valued attributes
        df_test_cont_scaled=self.robScaler.transform(df_test_cont)
        # Chain on the robust-scaled values, exactly as during training.
        df_test_cont_scaled=self.qTransform.transform(df_test_cont_scaled)
        df_test_cont_scaled=pd.DataFrame(df_test_cont_scaled, index=df_test_cont.index, columns=df_test_cont.columns)

        df_test_scaled=pd.concat([df_test_cont_scaled, df_test_cat], axis=1)

        return df_test_scaled.to_numpy()
        

# Preprocessing Initialisation

In [None]:
# One shared pipeline instance; its encoders and scalers carry the fitted state
pre = preproc()

**As the dataset has more than 7.1 million rows, the data is loaded and preprocessed chunk by chunk**

Here the data is loaded and preprocessed in chunks of 50,000 rows

In [None]:
# Per-chunk train/validation splits, collected in parallel lists
X_train, X_test, Y_train, Y_test = [], [], [], []

# Number of chunks processed; the training cells iterate up to this value
count = 0

chunk_reader = pd.read_csv("save-the-attack-contest/train_data.csv", chunksize=50000)
for chunk_of_df in chunk_reader:
    print(chunk_of_df.shape)

    # Each chunk is cleaned, scaled and split independently of the others
    x_train, x_test, y_train, y_test = pre.train_preprocess(chunk_of_df)

    for bucket, part in zip((X_train, X_test, Y_train, Y_test),
                            (x_train, x_test, y_train, y_test)):
        bucket.append(part)
    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

    count += 1

# Training model declaration and training

In [None]:
# --- Linear and kernel models -------------------------------------------
model_LR = linear_model.LogisticRegression(
    class_weight='balanced',
    max_iter=500,
    verbose=2,
)

model_svm_linear = svm.SVC(C=0.1, kernel="linear", probability=True)

model_svm_rbf = svm.SVC(class_weight='balanced', kernel="rbf", probability=True)

# --- Naive Bayes ----------------------------------------------------------
model_BNB = BernoulliNB()

# --- Tree-based models ----------------------------------------------------
clf_dt = DecisionTreeClassifier(class_weight='balanced', random_state=0)

clf_rf = RandomForestClassifier(n_jobs=-1, n_estimators=500)

# --- Ensembles ------------------------------------------------------------
# Soft voting averages the class probabilities of the two member models
clf_voting = VotingClassifier(
    voting='soft',
    estimators=[('lr', model_LR), ('rf', clf_rf)],
)

# Bagging over shallow, randomly-split trees
clf_bagging = BaggingClassifier(
    DecisionTreeClassifier(max_leaf_nodes=16, splitter="random"),
    bootstrap=True,
    max_samples=1.0,
    n_estimators=500,
    n_jobs=-1,
)

# Boosting over randomly-split trees
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(splitter="random"),
    algorithm="SAMME.R",
    learning_rate=0.5,
    n_estimators=200,
)

In [None]:
## RandomForestClassifier(n_estimators=500, n_jobs=-1)
##
## NOTE(review): scikit-learn's fit() retrains the estimator from scratch on
## every call (no warm_start is configured), so after this loop `clf_rf`
## only reflects the last chunk. The printed score is the ROC AUC on each
## chunk's own held-out split.

for x_tr, y_tr, x_val, y_val in zip(X_train, Y_train, X_test, Y_test):
    # Labels are stored as a single-column frame; estimators expect 1-D y
    clf_rf.fit(x_tr, np.ravel(y_tr))

    # Probability of the last class, used as the ranking score
    test_yhat = clf_rf.predict_proba(x_val)[:, -1]
    print('intermediate ROC AUC : ', metric.roc_auc_score(y_val, test_yhat))

In [None]:
## LogisticRegression(max_iter=500, verbose=2, class_weight='balanced')
##
## NOTE(review): scikit-learn's fit() retrains the estimator from scratch on
## every call, so after this loop `model_LR` only reflects the last chunk.
## The printed score is the ROC AUC on each chunk's own held-out split.

for x_tr, y_tr, x_val, y_val in zip(X_train, Y_train, X_test, Y_test):
    # Labels are stored as a single-column frame; estimators expect 1-D y
    model_LR.fit(x_tr, np.ravel(y_tr))

    # Probability of the last class, used as the ranking score
    test_yhat = model_LR.predict_proba(x_val)[:, -1]
    print('intermediate ROC AUC : ', metric.roc_auc_score(y_val, test_yhat))

In [None]:
## BernoulliNB()
##
## NOTE(review): scikit-learn's fit() retrains the estimator from scratch on
## every call, so after this loop `model_BNB` only reflects the last chunk.
## The printed score is the ROC AUC on each chunk's own held-out split.

for x_tr, y_tr, x_val, y_val in zip(X_train, Y_train, X_test, Y_test):
    # Labels are stored as a single-column frame; estimators expect 1-D y
    model_BNB.fit(x_tr, np.ravel(y_tr))

    # Probability of the last class, used as the ranking score
    test_yhat = model_BNB.predict_proba(x_val)[:, -1]
    print('intermediate ROC AUC : ', metric.roc_auc_score(y_val, test_yhat))

In [None]:
## DecisionTreeClassifier(random_state=0, class_weight='balanced')
##
## NOTE(review): scikit-learn's fit() retrains the estimator from scratch on
## every call, so after this loop `clf_dt` only reflects the last chunk.
## The printed score is the ROC AUC on each chunk's own held-out split.

for x_tr, y_tr, x_val, y_val in zip(X_train, Y_train, X_test, Y_test):
    # Labels are stored as a single-column frame; estimators expect 1-D y
    clf_dt.fit(x_tr, np.ravel(y_tr))

    # Probability of the last class, used as the ranking score
    test_yhat = clf_dt.predict_proba(x_val)[:, -1]
    print('intermediate ROC AUC : ', metric.roc_auc_score(y_val, test_yhat))

In [None]:
## VotingClassifier(
##            estimators=[('lr', model_LR), ('rf', clf_rf)],
##            voting='soft')
##
## NOTE(review): scikit-learn's fit() retrains the estimator from scratch on
## every call, so after this loop `clf_voting` only reflects the last chunk.
## The printed score is the ROC AUC on each chunk's own held-out split.

for x_tr, y_tr, x_val, y_val in zip(X_train, Y_train, X_test, Y_test):
    # Labels are stored as a single-column frame; estimators expect 1-D y
    clf_voting.fit(x_tr, np.ravel(y_tr))

    # Probability of the last class, used as the ranking score
    test_yhat = clf_voting.predict_proba(x_val)[:, -1]
    print('intermediate ROC AUC : ', metric.roc_auc_score(y_val, test_yhat))

In [None]:
## BaggingClassifier(
##                DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
##                n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1)
##
## NOTE(review): scikit-learn's fit() retrains the estimator from scratch on
## every call (no warm_start is configured), so after this loop
## `clf_bagging` only reflects the last chunk. The printed score is the
## ROC AUC on each chunk's own held-out split.

for x_tr, y_tr, x_val, y_val in zip(X_train, Y_train, X_test, Y_test):
    # Labels are stored as a single-column frame; estimators expect 1-D y
    clf_bagging.fit(x_tr, np.ravel(y_tr))

    # Probability of the last class, used as the ranking score
    test_yhat = clf_bagging.predict_proba(x_val)[:, -1]
    print('intermediate ROC AUC : ', metric.roc_auc_score(y_val, test_yhat))

In [None]:
## AdaBoostClassifier(
##            DecisionTreeClassifier(splitter="random"), n_estimators=200,
##            algorithm="SAMME.R", learning_rate=0.5)
##
## NOTE(review): scikit-learn's fit() retrains the estimator from scratch on
## every call, so after this loop `ada_clf` only reflects the last chunk.
## The printed score is the ROC AUC on each chunk's own held-out split.

for x_tr, y_tr, x_val, y_val in zip(X_train, Y_train, X_test, Y_test):
    # Labels are stored as a single-column frame; estimators expect 1-D y
    ada_clf.fit(x_tr, np.ravel(y_tr))

    # Probability of the last class, used as the ranking score
    test_yhat = ada_clf.predict_proba(x_val)[:, -1]
    print('intermediate ROC AUC : ', metric.roc_auc_score(y_val, test_yhat))

# Model Evaluation

In [None]:
# Load the competition test set in one go and preview the first rows
df_test = pd.read_csv("save-the-attack-contest/test_data.csv")
df_test.head()

In [None]:
# Keep the machine identifiers aside before preprocessing: 'MachineIdentifier'
# is one of the columns that test_preprocess drops from df_test in place.
ids = df_test['MachineIdentifier']
ids.shape

In [None]:
# Feature matrix for the competition test set.
# NOTE: this rebinds X_test, which previously held the list of per-chunk
# validation splits used by the training cells.
X_test = pre.test_preprocess(df_test)
X_test.shape

In [None]:
# Score the pre-processed feature matrix returned by test_preprocess.
# `df_test` itself only holds the partially cleaned frame (imputed and
# label-encoded, but not outlier-clamped or scaled, and in a different
# column order), so it must not be passed to the model.
y_pred_hat = clf_rf.predict_proba(X_test)[:, -1]
len(y_pred_hat)

In [None]:
# Two-column submission table: machine id and predicted detection probability
submission_dict = dict(MachineIdentifier=ids, HasDetections=y_pred_hat)
sub_df = pd.DataFrame(submission_dict)
sub_df.head()

In [None]:
# Write the submission file without the DataFrame index column
sub_df.to_csv(path_or_buf='PO_sub_23_12_11_16_pm.csv', index=False)