In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np  # for array
import pandas as pd  # for csv files and dataframe
import matplotlib.pyplot as plt  # for plotting
import seaborn as sns  # plotting
from scipy import stats

import pickle  # To load data int disk
from prettytable import PrettyTable  # To print in tabular format

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.metrics import auc, f1_score, roc_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_predict

In [None]:
path="/content/drive/MyDrive/PPP/IDS_DATASET"

In [None]:
# Restore the parameter dictionary written by an earlier step.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('/content/drive/MyDrive/PPP/IDS_DATASET/saved_dict', 'rb') as dict_file:
    saved_dict = pickle.load(dict_file)

print(saved_dict)

{'columns': ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime', 'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat'], 'binary_col': ['is_sm_ips_ports', 'is_ftp_login']}


In [None]:
# Read the CSV from Drive into the `train` DataFrame.
# NOTE(review): the "_EDA" suffix suggests this file is the output of an earlier EDA step — confirm.
train = pd.read_csv('/content/drive/MyDrive/PPP/IDS_DATASET/train_alldata_EDA.csv')

In [None]:
# Absolute pairwise correlation between the numeric columns of `train`.
# Non-numeric columns (proto/state/service/... strings) are excluded explicitly:
# pandas >= 2.0 no longer drops them silently, so a bare train.corr() raises.
corr_matrix = train.select_dtypes(include=["number", "bool"]).corr().abs()

In [None]:
# Keep only the strict upper triangle (k=1 excludes the diagonal) so every pair
# of columns is represented once; all other cells become NaN.
# The builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

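The next cell collects every column of the DataFrame `upper` that has at least one absolute correlation value greater than 0.95 with another column; such a column is considered highly correlated. Because `upper` keeps only the upper triangle of the correlation matrix, for each highly correlated pair it is the column that appears later in the DataFrame that gets flagged.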

In [None]:
# Flag a column when any of its upper-triangle correlations exceeds 0.95
# (NaN cells compare as False and are therefore ignored).
to_drop = [name for name in upper.columns if (upper[name] > 0.95).any()]

In [None]:
to_drop

['sloss',
 'dloss',
 'dpkts',
 'dwin',
 'ltime',
 'ct_srv_dst',
 'ct_src_dport_ltm',
 'ct_dst_src_ltm']

In [None]:
# Write the `label` column to its own CSV before it is removed from `train`.
train["label"].to_csv(f"{path}/labeled_train.csv", index=False)

In [None]:
# Remove the `label` column from the feature frame.
train = train.drop(columns=["label"])

In [None]:
# Remove the source/destination address and port columns.
train = train.drop(columns=['srcip', 'sport', 'dstip', 'dsport'])

In [None]:
train_cnn=train.copy()

In [None]:
saved_dict['corr_col'] = to_drop

In [None]:
# Remove the highly correlated columns collected in `to_drop`.
train = train.drop(columns=to_drop)

In [None]:
# Remove the same highly correlated columns from the CNN copy.
train_cnn = train_cnn.drop(columns=to_drop)

In [None]:
train.shape

(1016018, 36)

In [None]:
train_cnn.shape

(1016018, 36)

In [None]:
# New feature: sum of the sbytes and dbytes columns for each row.
train['network_bytes'] = train['sbytes'].add(train['dbytes'])

In [None]:
# Same derived feature on the CNN copy: sum of sbytes and dbytes for each row.
train_cnn['network_bytes'] = train_cnn['sbytes'].add(train_cnn['dbytes'])

In [None]:
saved_dict["to_drop"]=['srcip', 'sport', 'dstip', 'dsport']

In [None]:
train.shape

(1016018, 37)

In [None]:
train_cnn.shape

(1016018, 37)

In [None]:
col_unique_values = train.nunique()

Identify the columns that have more than 200 unique values. A hand-picked subset of these columns (`log1p_col` below) is then log1p-transformed to reduce their skewness.

In [None]:
col = col_unique_values[col_unique_values>200].index

In [None]:
col

Index(['dur', 'sbytes', 'dbytes', 'sload', 'dload', 'spkts', 'stcpb', 'dtcpb',
       'smeansz', 'dmeansz', 'res_bdy_len', 'sjit', 'djit', 'stime', 'sintpkt',
       'dintpkt', 'tcprtt', 'synack', 'ackdat', 'network_bytes'],
      dtype='object')

In [None]:
# Columns that get replaced by their log1p-transformed version further down.
log1p_col = [
    'dur',
    'sbytes', 'dbytes',
    'sload', 'dload',
    'spkts',
    'stcpb', 'dtcpb',
    'smeansz', 'dmeansz',
    'sjit', 'djit',
    'network_bytes',
]

In [None]:
saved_dict['log1p_col'] = log1p_col

In [None]:
def log1p_transform(col, df=None):
    """Replace column ``col`` of ``df`` with its ``log1p`` transform, in place.

    A new column named ``<col>_log1p`` holding ``np.log1p`` of the original
    values is appended to ``df`` and the original column is then removed.

    Parameters
    ----------
    col : str
        Name of the column to transform.
    df : pandas.DataFrame, optional
        Frame to modify in place. When omitted, the notebook-level ``train``
        frame is used. It is looked up at call time, so the default always
        follows the current ``train`` rather than whatever object that name
        pointed to when this function was defined.

    Returns
    -------
    None
        ``df`` is mutated; nothing is returned.
    """
    if df is None:
        df = train  # late-bound default: resolved on every call
    new_col = col + '_log1p'
    df[new_col] = np.log1p(df[col])
    df.drop(col, axis=1, inplace=True)

In [None]:
# Replace every column listed in log1p_col with its "<name>_log1p" counterpart in `train`.
# Note: the loop variable reuses (and overwrites) the notebook-level name `col`.
for col in log1p_col:
    log1p_transform(col, df=train)

In [None]:
# Apply the same log1p replacement to the CNN copy so both frames keep identical columns.
for col in log1p_col:
    log1p_transform(col, df=train_cnn)

In [None]:
train.shape

(1016018, 37)

In [None]:
train_cnn.shape

(1016018, 37)

In [None]:
train.columns

Index(['proto', 'state', 'sttl', 'dttl', 'service', 'swin', 'trans_depth',
       'res_bdy_len', 'stime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack',
       'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_dst_sport_ltm', 'attack_cat', 'dur_log1p', 'sbytes_log1p',
       'dbytes_log1p', 'sload_log1p', 'dload_log1p', 'spkts_log1p',
       'stcpb_log1p', 'dtcpb_log1p', 'smeansz_log1p', 'dmeansz_log1p',
       'sjit_log1p', 'djit_log1p', 'network_bytes_log1p'],
      dtype='object')

In [None]:
attack_encoder=LabelEncoder()

In [None]:
# Keep only the rows whose attack_cat is something other than 'normal'.
train_cnn = train_cnn.loc[train_cnn['attack_cat'].ne('normal')]

In [None]:
# Drop any remaining rows labelled 'normal'. When the preceding filter cell has
# already run there are none left, so this is a no-op kept as a safeguard.
train_cnn = train_cnn.drop(index=train_cnn.index[train_cnn['attack_cat'].eq('normal')])

In [None]:
# Renumber the rows 0..n-1 after filtering. The one-hot encoding cell further down
# concatenates freshly built frames column-wise, which aligns rows on index labels,
# so a gap-free default index matters here.
train_cnn = train_cnn.reset_index(drop=True)

In [None]:
train_cnn["attack_cat"]

0                generic
1                generic
2                generic
3               exploits
4               exploits
               ...      
128311           generic
128312    reconnaissance
128313           generic
128314           fuzzers
128315           fuzzers
Name: attack_cat, Length: 128316, dtype: object

In [None]:
# Collapse the listed attack categories into a single "other" class;
# categories not named here are left unchanged.
replacement_mapping = dict.fromkeys(
    ["shellcode", "backdoor", "analysis", "dos", "worms"],
    "other",
)
train_cnn["attack_cat"] = train_cnn["attack_cat"].replace(replacement_mapping)

In [None]:
# Feature frame for the full training data (target column removed).
x_train = train.drop(columns=['attack_cat'])

# Features and integer-encoded attack categories for the attack-only CNN subset.
x_train_cnn = train_cnn.drop(columns=['attack_cat'])
y_train_cnn = attack_encoder.fit_transform(train_cnn['attack_cat'])

In [None]:
print(x_train.shape)

(1016018, 36)


In [None]:
print(x_train_cnn.shape)

(128316, 36)


In [None]:
# Categorical columns that are one-hot encoded later on.
cat_col = ['proto', 'service', 'state']
# Every remaining column is treated as numeric. The list follows x_train's own
# column order: a set difference would yield a hash-dependent order that changes
# between kernel restarts, and this order is persisted in saved_dict and defines
# the feature order the scaler is fitted with.
num_col = [name for name in x_train.columns if name not in cat_col]

In [None]:
# Record the categorical / numeric column split alongside the other saved parameters.
saved_dict.update(cat_col=cat_col, num_col=num_col)

In [None]:
x_train.head()

Unnamed: 0,proto,state,sttl,dttl,service,swin,trans_depth,res_bdy_len,stime,sintpkt,...,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p,network_bytes_log1p
0,udp,CON,31,29,Non,0,0,0,1424224514,1.219333,...,13.49834,12.899222,1.609438,0.0,0.0,4.962845,4.369448,0.998607,0.89745,6.781058
1,udp,CON,31,29,Non,0,0,0,1424238737,0.402667,...,14.519448,13.96738,1.609438,0.0,0.0,4.890349,4.343805,0.44757,0.243181,6.725034
2,udp,CON,31,29,dns,0,0,0,1424228072,0.009,...,13.265729,13.463906,1.098612,0.0,0.0,4.304065,4.49981,0.0,0.0,5.783825
3,udp,INT,254,0,dns,0,0,0,1421932012,0.01,...,17.635418,0.0,1.098612,0.0,0.0,4.060443,0.0,0.0,0.0,4.744932
4,udp,INT,254,0,dns,0,0,0,1424238347,0.005,...,18.328565,0.0,1.098612,0.0,0.0,4.060443,0.0,0.0,0.0,4.744932


Applying StandardScaler to x_train and x_train_cnn (note: the cells below fit the scaler on x_train and transform only x_train)

In [None]:
# Fit the standardiser on the numeric columns of the full training frame.
scaler = StandardScaler().fit(x_train[num_col])

In [None]:
# Overwrite the numeric columns of x_train with their standardised values,
# using the scaler fitted in the previous cell.
# NOTE(review): x_train_cnn is never passed through this scaler in the visible cells,
# although the heading above mentions both frames — confirm whether that is intentional.
x_train[num_col]=scaler.transform(x_train[num_col])

In [None]:
x_train

Unnamed: 0,proto,state,sttl,dttl,service,swin,trans_depth,res_bdy_len,stime,sintpkt,...,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p,network_bytes_log1p
0,udp,CON,-0.425227,-0.040901,Non,-1.196443,-0.198505,-0.089491,0.850317,-0.069062,...,-0.039364,0.387033,-0.705200,-1.190417,-1.190141,0.683803,0.038839,-0.686958,-0.581499,-0.548926
1,udp,CON,-0.425227,-0.040901,Non,-1.196443,-0.198505,-0.089491,0.862853,-0.069356,...,0.285788,0.574959,-0.705200,-1.190417,-1.190141,0.567345,0.027738,-0.857449,-0.813872,-0.572888
2,udp,CON,-0.425227,-0.040901,dns,-1.196443,-0.198505,-0.089491,0.853453,-0.069497,...,-0.113435,0.486380,-1.081013,-1.190417,-1.190141,-0.374472,0.095278,-0.995927,-0.900241,-0.975452
3,udp,INT,2.567365,-0.718852,dns,-1.196443,-0.198505,-0.089491,-1.170238,-0.069497,...,1.278009,-1.882381,-1.081013,-1.190417,-1.190141,-0.765831,-1.852846,-0.995927,-0.900241,-1.419796
4,udp,INT,2.567365,-0.718852,dns,-1.196443,-0.198505,-0.089491,0.862509,-0.069499,...,1.498729,-1.882381,-1.081013,-1.190417,-1.190141,-0.765831,-1.852846,-0.995927,-0.900241,-1.419796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1016013,udp,CON,-0.425227,-0.040901,Non,-1.196443,-0.198505,-0.089491,0.861607,-0.069053,...,-0.034289,0.389837,-0.705200,-1.190417,-1.190141,0.683803,0.038839,-0.682425,-0.574940,-0.548926
1016014,udp,CON,-0.425227,-0.040901,Non,-1.196443,-0.198505,-0.089491,0.876051,-0.023728,...,-1.453633,-0.389886,-0.705200,-1.190417,-1.190141,0.683803,0.049663,0.612619,0.908240,-0.545060
1016015,udp,CON,-0.425227,-0.040901,dns,-1.196443,-0.198505,-0.089491,-1.170987,-0.069500,...,-0.161222,0.463828,-1.081013,-1.190417,-1.190141,-0.558263,0.054975,-0.995927,-0.900241,-1.019785
1016016,tcp,FIN,-0.425227,-0.040901,Non,0.835819,-0.198505,-0.089491,-1.170988,-0.069403,...,0.623347,0.781828,0.276956,0.924729,0.851938,1.316317,0.277184,-0.171061,-0.797324,0.289554


Apply one-hot encoding to the columns service, proto and state. It is beneficial because of:



*   Categorical Representation: Machine learning algorithms typically operate on numerical data. By converting categorical variables into numerical representations, such as one-hot encoding, we enable the algorithms to process and learn from these variables effectively.
* Capturing Non-Linear Relationships: One-hot encoding represents each unique category as a binary feature, indicating the presence or absence of that category. This approach allows the model to capture non-linear relationships between the categories and the target variable.  





In [None]:
# One encoder per categorical column, each fitted on the full training frame.
# handle_unknown="ignore": these encoders are pickled further down, and a category
# that was not seen here would otherwise make transform() raise; with "ignore" it
# is encoded as an all-zero row instead. Output for known categories is unchanged.
service_ = OneHotEncoder(handle_unknown="ignore")
proto_ = OneHotEncoder(handle_unknown="ignore")
state_ = OneHotEncoder(handle_unknown="ignore")
# Each encoder expects a 2-D input, hence the reshape to a single column.
ohe_service = service_.fit(x_train.service.values.reshape(-1,1))
ohe_proto = proto_.fit(x_train.proto.values.reshape(-1,1))
ohe_state = state_.fit(x_train.state.values.reshape(-1,1))

In [None]:
# Replace each categorical column of x_train with its one-hot indicator columns,
# named "<column>_<category>".
for col, ohe in zip(['proto', 'service', 'state'], [ohe_proto, ohe_service, ohe_state]):
    x = ohe.transform(x_train[col].values.reshape(-1,1))
    # Give the indicator frame x_train's own index: pd.concat(axis=1) aligns rows on
    # index labels, so a fresh 0..n-1 index would silently misalign (and NaN-pad)
    # the rows whenever x_train does not carry a default RangeIndex.
    tmp_df = pd.DataFrame(
        x.todense(),
        columns=[col+'_'+i for i in ohe.categories_[0]],
        index=x_train.index,
    )
    x_train = pd.concat([x_train.drop(col, axis=1), tmp_df], axis=1)

In [None]:
x_train.shape

(1016018, 197)

In [None]:
# Replace each categorical column of x_train_cnn with one-hot indicator columns,
# using the encoders fitted on x_train so both frames share the same columns.
for col, ohe in zip(['proto', 'service', 'state'], [ohe_proto, ohe_service, ohe_state]):
    x = ohe.transform(x_train_cnn[col].values.reshape(-1,1))
    # Give the indicator frame x_train_cnn's own index: pd.concat(axis=1) aligns rows
    # on index labels, so without this the result is only correct if x_train_cnn
    # happens to carry a default 0..n-1 index (i.e. the reset_index cell was run).
    tmp_df = pd.DataFrame(
        x.todense(),
        columns=[col+'_'+i for i in ohe.categories_[0]],
        index=x_train_cnn.index,
    )
    x_train_cnn = pd.concat([x_train_cnn.drop(col, axis=1), tmp_df], axis=1)


In [None]:
x_train_cnn.shape

(128316, 197)

In [None]:
x_train

Unnamed: 0,sttl,dttl,swin,trans_depth,res_bdy_len,stime,sintpkt,dintpkt,tcprtt,synack,...,state_INT,state_MAS,state_PAR,state_REQ,state_RST,state_TST,state_TXD,state_URH,state_URN,state_no
0,-0.425227,-0.040901,-1.196443,-0.198505,-0.089491,0.850317,-0.069062,-0.054498,-0.134875,-0.127975,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.425227,-0.040901,-1.196443,-0.198505,-0.089491,0.862853,-0.069356,-0.055081,-0.134875,-0.127975,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.425227,-0.040901,-1.196443,-0.198505,-0.089491,0.853453,-0.069497,-0.055217,-0.134875,-0.127975,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.567365,-0.718852,-1.196443,-0.198505,-0.089491,-1.170238,-0.069497,-0.055218,-0.134875,-0.127975,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.567365,-0.718852,-1.196443,-0.198505,-0.089491,0.862509,-0.069499,-0.055218,-0.134875,-0.127975,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1016013,-0.425227,-0.040901,-1.196443,-0.198505,-0.089491,0.861607,-0.069053,-0.054478,-0.134875,-0.127975,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1016014,-0.425227,-0.040901,-1.196443,-0.198505,-0.089491,0.876051,-0.023728,0.024276,-0.134875,-0.127975,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1016015,-0.425227,-0.040901,-1.196443,-0.198505,-0.089491,-1.170987,-0.069500,-0.055211,-0.134875,-0.127975,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1016016,-0.425227,-0.040901,0.835819,-0.198505,-0.089491,-1.170988,-0.069403,-0.055040,-0.119673,-0.106001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Take the target name out of the stored column list. Filtering instead of
# list.remove keeps the cell re-runnable: running it a second time is a no-op
# rather than a ValueError.
saved_dict["columns"] = [name for name in saved_dict["columns"] if name != "attack_cat"]

In [None]:
# Persist the fitted scaler and the parameter dictionary. The `with` blocks make
# sure each file is flushed and closed as soon as the dump finishes, instead of
# leaving the handle open until garbage collection.
with open(path+'/scaler.pkl', 'wb') as scaler_file:  # Standard scaler
    pickle.dump(scaler, scaler_file)
with open(path+'/saved_dict.pkl', 'wb') as dict_file:  # Dictionary with important parameters
    pickle.dump(saved_dict, dict_file)


In [None]:
# Persist the three fitted one-hot encoders, closing every file handle explicitly.
for file_name, encoder in [
    ('ohe_proto.pkl', ohe_proto),
    ('ohe_service.pkl', ohe_service),
    ('ohe_state.pkl', ohe_state),
]:
    with open(path+'/'+file_name, 'wb') as encoder_file:
        pickle.dump(encoder, encoder_file)

In [None]:
# Export the fully preprocessed feature frame (row index omitted).
x_train.to_csv(f"{path}/final_train.csv", index=False)

In [None]:
x_train.shape

In [None]:
# Export the preprocessed attack-only feature frame (row index omitted).
x_train_cnn.to_csv(f"{path}/final_train_cnn.csv", index=False)

In [None]:
x_train_cnn.shape

In [None]:
x_train.columns

In [None]:
# Map the integer-encoded labels back to their category names with the same
# encoder that produced them, so names and codes can be stored side by side.
attack_cat_cnn=pd.Series(attack_encoder.inverse_transform(y_train_cnn))

In [None]:
y_train_cnn=pd.Series(y_train_cnn)

In [None]:
label_train_cnn={"attack_cat":attack_cat_cnn,"y_train":y_train_cnn}

In [None]:
label_train_cnn = pd.DataFrame(label_train_cnn)

In [None]:
# Export the label table (category name + encoded value) for the CNN subset.
label_train_cnn.to_csv(f"{path}/label_train_cnn.csv", index=False)