In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score

# Column labels for the training file, in file order (43 labels, one per field).
# Kept as a whitespace-separated block so the list is easy to scan and diff.
names = """
    duration protocoltype service flag srcbytes dstbytes land
    wrongfragment urgent hot numfailedlogins loggedin
    numcompromised rootshell suattempted numroot numfilecreations
    numshells numaccessfiles numoutboundcmds ishostlogin
    isguestlogin count srvcount serrorrate
    srvserrorrate rerrorrate srvrerrorrate samesrvrate diffsrvrate
    srvdiffhostrate dsthostcount dsthostsrvcount dsthostsamesrvrate
    dsthostdiffsrvrate dsthostsamesrcportrate
    dsthostsrvdiffhostrate dsthostserrorrate dsthostsrvserrorrate
    dsthostrerrorrate dsthostsrvrerrorrate attack lastflag
""".split()
# Load the comma-separated training file. The column labels are supplied through
# `names`, so the first line of the file is read as data (header=None).
df = pd.read_csv("Data/Train.txt", header=None, names=names, sep=",")

# Cell output: summary statistics of the numeric columns
df.describe()

In [None]:
# Count the missing entries in each column; a clean dataset shows 0 for every column
df.isnull().sum(axis=0)

In [None]:
# Only the normal-vs-attack distinction matters here, so collapse the label column
# to a binary target: 'normal' -> 0, every other label -> 1.
#
# The numeric-dtype guard keeps this cell safe to re-run. Once the column has been
# encoded it holds 0/1, and mapping it again would compare those integers with
# 'normal' and turn every row into 1 (attack).
if not pd.api.types.is_numeric_dtype(df['attack']):
    df['attack'] = df['attack'].ne('normal').astype('int64')

In [None]:
# Every attribute has to be numeric, so find the columns whose dtype is
# not int, float or bool.
non_numeric_frame = df.select_dtypes(exclude=(int, float, bool))
non_numeric_columns = non_numeric_frame.columns

print("Non numeric columns", non_numeric_columns)

# protocoltype, service and flag are the non-numeric columns. They are converted
# with the hand-written lookup tables below (LabelEncoder is not actually used).
# Each table numbers its labels 0..n-1 in the order they are listed.
protocol_map = {
    label: code for code, label in enumerate(('icmp', 'tcp', 'udp'))
}

service_map = {
    label: code
    for code, label in enumerate((
        'IRC', 'X11', 'Z39_50', 'aol', 'auth', 'bgp', 'courier', 'csnet_ns',
        'ctf', 'daytime', 'discard', 'domain', 'domain_u', 'echo', 'eco_i',
        'ecr_i', 'efs', 'exec', 'finger', 'ftp', 'ftp_data', 'gopher',
        'harvest', 'hostnames', 'http', 'http_2784', 'http_443', 'http_8001',
        'imap4', 'iso_tsap', 'klogin', 'kshell', 'ldap', 'link', 'login',
        'mtp', 'name', 'netbios_dgm', 'netbios_ns', 'netbios_ssn', 'netstat',
        'nnsp', 'nntp', 'ntp_u', 'other', 'pm_dump', 'pop_2', 'pop_3',
        'printer', 'private', 'red_i', 'remote_job', 'rje', 'shell', 'smtp',
        'sql_net', 'ssh', 'sunrpc', 'supdup', 'systat', 'telnet', 'tftp_u',
        'tim_i', 'time', 'urh_i', 'urp_i', 'uucp', 'uucp_path', 'vmnet',
        'whois',
    ))
}

flag_map = {
    label: code
    for code, label in enumerate((
        'OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2', 'S3',
        'SF', 'SH',
    ))
}

def map_func(x):
    """Return the integer code for a category label.

    The protocol, service and flag tables are searched in that order and the
    first table that contains ``x`` supplies the code. A label present in none
    of the tables yields ``None``.
    """
    for table in (protocol_map, service_map, flag_map):
        if x in table:
            return table[x]
    return None

# Encode each categorical column with the lookup table that belongs to it.
# Going through the column's own table (instead of the shared map_func, which
# searches all three tables) means a label can never pick up a code from a
# different column's table.
for column, table in (
    ('protocoltype', protocol_map),
    ('service', service_map),
    ('flag', flag_map),
):
    # A numeric dtype means this cell already ran. Mapping the integer codes a
    # second time would find none of them in the table and wipe the column.
    if pd.api.types.is_numeric_dtype(df[column]):
        continue

    encoded = df[column].map(table)

    # Labels missing from the table come back as NaN. Fail loudly rather than let
    # them leak into the correlation matrix and the exported CSV.
    unmapped = df.loc[encoded.isna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped {column} values: {list(unmapped)}")

    df[column] = encoded

df.head()


In [None]:
# Annotated heatmap of the pairwise correlations between all columns
fig, ax = plt.subplots(figsize=(25, 15))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Columns chosen for removal after inspecting the correlation heatmap; they were
# judged not to add value to the model.
columns_to_drop = [
    "dstbytes", "urgent", "hot", "srcbytes", "land",
    "numfailedlogins", "numroot", "rootshell", "numcompromised",
    "numoutboundcmds",
]
df = df.drop(columns=columns_to_drop)

# Redraw the correlation heatmap for the current (reduced) set of columns
fig, ax = plt.subplots(figsize=(25, 15))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# X=df.drop(['attack'],axis=1)
# y=df['attack']


# scaler = StandardScaler()
# scaler.fit(X)
# X_transformed = scaler.transform(X)

# # df = pd.concat([pd.DataFrame(X_transformed), y], axis=1)

# lr=LogisticRegression() # creates an instance of the Logistic Regression model. 
# lr.fit(X_transformed,y) #  trains the Logistic Regression model on the training data. X_transformed represents the feature matrix (input variables), and y represents the target variable (labels or classes).
# lr_pred=lr.predict(X_transformed) # generates predictions for the training set based on the trained Logistic Regression model.

# lr_df=pd.DataFrame()
# lr_df['actual']=y
# lr_df['pred']=lr_pred

# print(accuracy_score(y, lr_pred))


# Split the rows by class label (1 = attack, 0 = normal).
dataset_1 = df.loc[df['attack'].eq(1)]
dataset_0 = df.loc[df['attack'].eq(0)]

# Rebuild the frame from every attack row (shuffled) followed by a 90% random
# sample of the normal rows; both draws use seed 26.
shuffled_attacks = dataset_1.sample(frac=1, random_state=26)
sampled_normals = dataset_0.sample(frac=0.9, random_state=26)
df = pd.concat([shuffled_attacks, sampled_normals])

# Show the resulting class balance
print(df['attack'].value_counts())

# Persist the cleaned data without the index column
df.to_csv("Data/Train_cleaned.csv", index=False)