In [22]:
#Required Libraries
# !pip install graphviz
# !pip install seaborn
# !pip install pydotplus
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import sklearn
from sklearn.svm import SVC
from sklearn import  datasets
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix,plot_confusion_matrix,accuracy_score
from mlxtend.plotting import plot_decision_regions
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.tree import export_graphviz 
import seaborn as sns
from IPython.display import Image  
import pydotplus
from sklearn.naive_bayes import GaussianNB
from scipy.stats import norm
import pandas
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
import pandas
from sklearn.utils import shuffle
from sklearn import svm
import xgboost as xgb
import imblearn
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from xgboost import XGBClassifier
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import f1_score
from pandas.api.types import CategoricalDtype 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import learning_curve,GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, f_classif


In [19]:
np.random.seed(50)
df = pandas.read_csv('KDDTrain_Initial.csv')
df_sub = pandas.read_csv('KDDTest_Initial.csv')

In [23]:
class Preprocessor( BaseEstimator, TransformerMixin ):
    def __init__( self, 
                 cols_tobe_dropped=[],
                 log_vars = [],
                 z_vars = [],
                 minmax_vars = [],
                 bin_vars = [],
                 catunique_vars=[],
                 categorical_vars=[]):
        '''
        Params
        ---------------
        outlier_vars: list
            list of columns to be considered for removal when applying transform
        '''
        
        # Variables to be dropped
        self.cols_tobe_dropped = cols_tobe_dropped
        
        # Variables to scale down using log function
        self.log_vars = [c for c in log_vars if c not in self.cols_tobe_dropped]

        # Variables to apply z normalization on
        self.z_vars = [c for c in z_vars if c not in self.cols_tobe_dropped]

        # Variables to apply min max normalization on
        self.minmax_vars = [c for c in minmax_vars if c not in self.cols_tobe_dropped]

        # Variables to apply binning on
        self.bin_vars = [c for c in bin_vars if c not in self.cols_tobe_dropped]

        # Variables to apply categorization by unique value on
        self.catunique_vars = [*catunique_vars, *bin_vars]
        self.catunique_vars = [c for c in self.catunique_vars if c not in self.cols_tobe_dropped]

        # Variables to one hot encode
        self.categorical_vars = [*categorical_vars, *catunique_vars]
        self.categorical_vars = [c for c in self.categorical_vars if c not in self.cols_tobe_dropped]
    

    def fit( self, df, outlier_removal=False, target_col=None, target_value=None ):
        self.target_col = target_col
        self.target_value = target_value
        
        df_clean = df.copy()
        
        # drop columns
        df_clean = df.drop(self.cols_tobe_dropped, axis=1)

        # Apply Nan and non-finite values removal so that fitting would occur on clean data
        df_clean = self.clean_nans(df_clean)
        
        
        # Apply outlier removal so that fitting would occur on clean data
        if outlier_removal:
            df_clean = self.remove_outlier(df_clean)
        
        # Apply log2 transformation
        self._min_log = [] # the minimum value for each log var
        for v in self.log_vars:
            self._min_log.append(min(df_clean[v]))
            df_clean[v] = np.log2([v if v > 1 else 1 for v in (df_clean[v].values - min(df_clean[v]) + 1)])
        
        # Apply z normalization
        self._trans_z = [] # the standard scaler transformers for each z var
        for z in self.z_vars:
            scaler = StandardScaler()
            scaler.fit(np.reshape(df_clean[z].values, (-1,1)))
            self._trans_z.append(scaler)

        # Apply min max normalization
        self._trans_minmax = [] # the standard scaler transformers for each z var
        for mm in self.minmax_vars:
            scaler = MinMaxScaler()
            scaler.fit(np.reshape(df_clean[mm].values, (-1,1)))
            self._trans_minmax.append(scaler)

        # Apply binning ===============================================
        self._bins = [] # the bins used for each bin var
        for b in self.bin_vars:
            _, bins = pd.qcut(df_clean[b].copy(), q=6, duplicates='drop', retbins=True)
            # expand the bins boundaries to infinity
            bins[0] = -np.inf
            bins[-1] = np.inf
            self._bins.append(bins)
            
            v = pd.cut(df_clean[b], bins, duplicates='drop')
            df_clean[b] = v.astype(str)
            
        
        # Apply categorization by unique value
        self._cat_unique = [] # unique values for each cat unique vars
        for cc in self.catunique_vars:
            unique_values = np.unique(df_clean[cc])
            freq_unique_values = []
            
            # Only take the values that are frequent (repeated more than 5% of the data)
            for u in unique_values:
                if sum(df_clean[cc] == u) > 0.05*len(df_clean):
                    # we need to put the values here so that the categorization transformer would work
                    df_clean.loc[df_clean[cc] == u, cc]  = str(u) 
                    freq_unique_values.append(u)
        
            # Set any value that was not in the transformation set to 'other'
            
            df_clean.loc[df_clean.applymap(np.isreal)[cc],cc] = "other" # same here
            self._cat_unique.append(freq_unique_values)
        
        # Apply One-hot Encoding for categorical variables
        cat_indices = [df_clean.columns.get_loc(col_name) for col_name in self.categorical_vars]
        self._trans_cat = ColumnTransformer(transformers=[('cat', OneHotEncoder(), cat_indices)], remainder='drop', sparse_threshold=0)
        self._trans_cat.fit(df_clean)
        return self

    def transform( self, df, outlier_removal=False ):
        '''
        Params
        ---------------
        outlier_removal: str -- Changed
            strategy for outlier removal, default is None resulting in no outlier removal
            possible string values:
            "before" : apply outlier removal before preprocessing
            "after" : apply outlier removal after preprocessing
        '''
        df_clean = df.copy()
        
        # Drop columns
        df_clean = df.drop(self.cols_tobe_dropped, axis=1).reset_index().iloc[:,1:]
        
        # Apply Nan and non-finite values removal
        df_clean = self.clean_nans(df_clean)
        
        # Apply outlier removal before preprocessing
        if outlier_removal:
            df_clean = self.remove_outlier(df_clean)

        # Apply log2 transformation
        for v, m in zip(self.log_vars, self._min_log):
            df_clean[v] = np.log2([v if v > 1 else 1 for v in (df_clean[v].values - m + 1)])

        # Apply z normalization
        for z, trans in zip(self.z_vars, self._trans_z):
            df_clean[z] = trans.transform(np.reshape(df_clean[z].values, (-1,1)))

        # Apply min max normalization
        for mm, trans in zip(self.minmax_vars, self._trans_minmax):
            df_clean[mm] = trans.transform(np.reshape(df_clean[mm].values, (-1,1)))

        # Apply binning ===============================================
        for b, bins in zip(self.bin_vars, self._bins):
            v = pd.cut(df_clean[b], bins, duplicates='drop')
            df_clean[b] = v.astype(str)

        # Apply categorization by unique value
        for cc, unique_values in zip(self.catunique_vars, self._cat_unique):
            # Set "other' for unkown unique values 
            for u in unique_values:
                df_clean.loc[df_clean[cc] == u, cc]  = str(u) 
            # Set any value that was not in the transformation set to 'other'
            df_clean.loc[df_clean.applymap(np.isreal)[cc],cc] = "other"
        
            
        # Apply One-hot Encoding for categorical variables
        cat_indices = [df_clean.columns.get_loc(col_name) for col_name in self.categorical_vars]
        
        # Contains one hot encoded variables + dataframe variables
        x = np.array(self._trans_cat.transform(df_clean))
        
        # get the columns names for the categorical data columns only
        feature_names = [[f"{c}_{u}" for u in np.unique(df_clean[c])] for c in self.categorical_vars]
        feature_names = [y for x in feature_names for y in x]

        # drop categorical columns from the real dataframe
        df_clean = df_clean.drop(self.categorical_vars, axis=1)

        # create the categorical data frame (contains the categorical colummns)
        df_cat = pd.DataFrame(x[:,:len(feature_names)], columns=feature_names).astype(float)
        # concatenate the dataframe that contains no categorical columns and the one with categorical columns
        df_clean = pd.concat([df_clean, df_cat], axis=1)
        

        
        return df_clean
    
    def remove_outlier(self, df):
        df = df.copy()
        
        # remove numeric outliers
        num_df = df.select_dtypes(include='number')
        
        if self.target_col:
            df = df[~((np.abs(stats.zscore(num_df)) > 3).any(1) & np.array(df[self.target_col]==self.target_value))]
        else:
            df = df[~((np.abs(stats.zscore(num_df)) > 3).any(1))]

#         print(len(df), sum(df[self.target_col]!=self.target_value))
        # remove string (categorical) outliers that are less than 5% of the data
        v = df.select_dtypes(include='object')
        if self.target_col:
            df = df[~(v.replace(v.stack().value_counts()).lt(0.05*len(df)).any(1)& np.array(df[self.target_col]==self.target_value))]
        else:
            df = df[~(v.replace(v.stack().value_counts()).lt(0.05*len(df)).any(1))]
#         print(len(df),sum(df[self.target_col]!=self.target_value))

        return df.reset_index().iloc[:,1:]
    
    def clean_nans(self, df):
        # Remove NaNs
        df.dropna(inplace=True)
        df = df.reset_index().iloc[:,1:]
        
        # Remove non finit values
        num_df = df.select_dtypes(include='number')
        indices = np.unique(np.array(np.where(~(np.isfinite(num_df))))[0])
        df.drop(indices, inplace=True)
        df = df.reset_index().iloc[:,1:]
        
        return df


In [26]:
cols_tobe_dropped = [
                    'urgent',
                    'land',
                    'duration',
                    'ID',
                    'hot',
                    'num_failed_logins',
                    'wrong_fragment',
                    'num_file_creations',
                    'root_shell',
                    "num_compromised",
                    'su_attempted',
                    'num_root',
                    'num_shells',
                    'num_outbound_cmds',
                    'is_host_login',
                    'is_guest_login',
                    'flag',
                    'service',
#                     'protocol_type',
                    'dst_host_serror_rate',
                    'dst_host_srv_serror_rate',
                    'dst_host_rerror_rate',
                    'dst_host_srv_rerror_rate',
                    'num_access_files',
#                     'src_bytes',
#                     'dst_bytes', 
                    'srv_serror_rate',
                    'srv_rerror_rate',
                    'serror_rate',
                    'srv_diff_host_rate',
#                     'dst_host_same_srv_rate',
#                     'logged_in',
                    'rerror_rate',
                    'dst_host_srv_diff_host_rate',
#                     'level',
#                     'count',
                    ]

# Variables to scale down using log function
log_vars = ['src_bytes', 'dst_bytes', 'count', "srv_count"]
log_vars = [c for c in log_vars if c not in cols_tobe_dropped]

# Variables to apply z normalization on
z_vars = ['src_bytes', 'dst_bytes', 'count', 'srv_count', 'level', "wrong_fragment"]
z_vars = [c for c in z_vars if c not in cols_tobe_dropped]

# Variables to apply min max normalization on
minmax_vars = ['dst_host_count', 'dst_host_srv_count']
minmax_vars = [c for c in minmax_vars if c not in cols_tobe_dropped]

# Variables to apply binning on
bin_vars = ['same_srv_rate', 'diff_srv_rate']
bin_vars = [c for c in bin_vars if c not in cols_tobe_dropped]

# Variables to apply categorization by unique value on
catunique_vars = ['dst_host_srv_diff_host_rate', *bin_vars]
catunique_vars = [c for c in catunique_vars if c not in cols_tobe_dropped]

# Variables to one hot encode
categorical_vars = ["protocol_type", "service", "flag", *catunique_vars]
categorical_vars = [c for c in categorical_vars if c not in cols_tobe_dropped]

# Replace this one value "tftp_u" in the test data
df_clean_sub = df_sub.copy()
indices = np.where(df_clean_sub["service"] == "tftp_u")
df_clean_sub["service"][indices[0][0]] = df_clean_sub["service"][0] 


preprocessor = Preprocessor(
    cols_tobe_dropped=cols_tobe_dropped,
    log_vars = log_vars,
    z_vars = z_vars,
    minmax_vars = minmax_vars,
    bin_vars = bin_vars,
    catunique_vars=catunique_vars,
    categorical_vars=categorical_vars)
skb = SelectKBest(f_classif, k=20)

preprocessor.fit(df, outlier_removal=True, target_col='Class', target_value=0)

df_pre = preprocessor.transform(df, outlier_removal=True)
df_sub_pre = preprocessor.transform(df_clean_sub)

x_train = df_pre.loc[:, df_pre.columns != "Class"]
df_sub_pre  = df_sub_pre.loc[:, df_sub_pre.columns != "Class"]
y_train = df_pre["Class"]

skb.fit(x_train,y_train)
x_train = pd.DataFrame(skb.transform(x_train), columns=skb.get_feature_names_out())
df_sub_pre = pd.DataFrame(skb.transform(df_sub_pre), columns=skb.get_feature_names_out())

# reduce the features to 2D
# pca = PCA(n_components=16)
# x_reduced     = pca.fit_transform(x_enc)
# x_reduced_sub = pca.transform(x_enc_sub)
# x_reduced     = x_enc
# x_reduced_sub = x_enc_sub

# print(pca.explained_variance_)


# vis_indices = np.arange(0,1000)
y = np.array(y_train)#[vis_indices]
x = np.array(x_train)#[vis_indices]
x_sub = np.array(df_sub_pre)#[vis_indices]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean_sub["service"][indices[0][0]] = df_clean_sub["service"][0]


KeyError: "['ID'] not found in axis"

In [3]:
def ChangeCateg(df, testing):
    for i in range(0,len(categ)):
        le = LabelEncoder()
        le.fit(df[categ[i]])
        df[categ[i]] = le.transform(df[categ[i]])
        testing[categ[i]] = le.transform(testing[categ[i]])
    return df, testing
    

In [4]:
categ = ['protocol_type','service','flag','attack']

df , testing=ChangeCateg(df, testing)
# Encode Categorical Columns
#le = LabelEncoder()
#le.fit(df[categ[0]])
#df[categ] = df[categ].apply(le.fit_transform)
#df.head()

In [5]:
print(len(df))
print(len(testing))

125973
22544


In [6]:
df.columns.values.tolist()

['duration',
 'protocol_type',
 'service',
 'flag',
 'src_bytes',
 'dst_bytes',
 'land',
 'wrong_fragment',
 'urgent',
 'hot',
 'num_failed_logins',
 'logged_in',
 'num_compromised',
 'root_shell',
 'su_attempted',
 'num_root',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'num_outbound_cmds',
 'is_host_login',
 'is_guest_login',
 'count',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate',
 'attack']

In [7]:
def Preprocessing(df):
    drop=['num_outbound_cmds',]
    df = df.drop(drop,axis=1)
    df.drop_duplicates(keep=False, inplace=True)
    X = df
    y= X.attack
    X = X.drop(['attack'],axis=1)
    
    categ=['protocol_type','service','flag']
    X_categ= X[categ]
    X = X.drop(categ,axis=1)
    #Scaling
    categories=X.columns.values.tolist()
    transformer_MinMax = MinMaxScaler().fit(X_categ)
    X_categ = transformer_MinMax.fit_transform(X_categ)
    
    transformer_Robust = RobustScaler().fit(X)
    #X = transformer_Robust.transform(X)
    
    transformer_Standard = StandardScaler().fit(X)
    X = transformer_Standard.transform(X)
    
    #concat
    print(type(X_categ))
    X_categ=pd.DataFrame(X_categ, columns = categ)
    X=pd.DataFrame(X, columns = categories)
    X = pd.concat([X, X_categ], axis=1)
    
    return X, y, transformer_MinMax, transformer_Robust, transformer_Standard
    
    

In [8]:
def TestPreprocessing(df, transformer_MinMax, transformer_Robust, transformer_Standard):
    drop=['num_outbound_cmds']
    df = df.drop(drop,axis=1)
    df.drop_duplicates(keep=False, inplace=True)
    X = df
    y= X.attack
    X = X.drop(['attack'],axis=1)
    
    categ=['protocol_type','service','flag']
    X_categ= X[categ]
    X = X.drop(categ,axis=1)
    #Scaling
    categories=X.columns.values.tolist()
    #transformer_MinMax = MinMaxScaler().fit(X_categ)
    X_categ = transformer_MinMax.fit_transform(X_categ)
    
    #transformer_Robust = RobustScaler().fit(X)
    #X = transformer_Robust.transform(X)
    
    #transformer_Standard = StandardScaler().fit(X)
    X = transformer_Standard.transform(X)
    
    #concat
    print(type(X_categ))
    X_categ=pd.DataFrame(X_categ, columns = categ)
    X=pd.DataFrame(X, columns = categories)
    X = pd.concat([X, X_categ], axis=1)
    
    return X, y

In [9]:
X, y, transformer_MinMax, transformer_Robust, transformer_Standard= Preprocessing(df)

<class 'numpy.ndarray'>


In [10]:
X_test, y_test =TestPreprocessing(testing, transformer_MinMax, transformer_Robust, transformer_Standard)

<class 'numpy.ndarray'>


In [11]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=1)

In [12]:
X_train = X_train.assign(e=pd.Series(y_train).values)
X_train.rename({'e': 'attack'}, axis=1, inplace=True)
print(len(X_train))
X_train.drop_duplicates()
print(len(X_train))
X_train.to_csv('./Train.csv', index=False)

100764
100764


In [13]:
X_val = X_val.assign(e=pd.Series(y_val).values)
X_val.rename({'e': 'attack'}, axis=1, inplace=True)
print(len(X_val))
X_val.drop_duplicates()
print(len(X_val))
X_val.to_csv('./Val.csv', index=False)

25192
25192


In [14]:
X_test = X_test.assign(e=pd.Series(y_test).values)
X_test.rename({'e': 'attack'}, axis=1, inplace=True)
print(len(X_test))
X_test.drop_duplicates()
print(len(X_test))
X_test.to_csv('./Test.csv', index=False)

22538
22538
