In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.models import Sequential
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



In [2]:
data_train=pd.read_csv('nsl-kdd/KDDTrain+.txt', header=None)

In [3]:
data_train.columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
                      'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
                      'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
                      'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
                      'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
                      'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
                      'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
                      'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'outcome', 'level']

In [4]:
def print_percentage_of_unique_values(column):
    value_counts = column.value_counts(normalize=True) * 100
    print(f"Column: {column.name}")
    for value, percentage in value_counts.items():
        print(f"{value}: {percentage:.2f}%")
    print()

# Applying the function to each column in the DataFrame
for column in data_train.columns:
    print_percentage_of_unique_values(data_train[column])

Column: duration
0: 92.05%
1: 1.58%
2: 0.67%
3: 0.44%
4: 0.28%
5: 0.24%
27: 0.16%
6: 0.15%
28: 0.14%
7: 0.10%
10: 0.10%
8: 0.08%
12: 0.08%
9: 0.08%
30: 0.07%
23: 0.07%
15: 0.06%
11: 0.06%
31: 0.06%
21: 0.06%
29: 0.06%
20: 0.05%
22: 0.05%
25: 0.05%
26: 0.05%
14: 0.04%
13: 0.04%
24: 0.04%
16: 0.04%
32: 0.03%
17: 0.03%
19: 0.03%
18: 0.02%
33: 0.02%
36: 0.02%
37: 0.01%
40: 0.01%
35: 0.01%
39: 0.01%
2630: 0.01%
38: 0.01%
34: 0.01%
45: 0.01%
60: 0.01%
63: 0.01%
5051: 0.01%
2620: 0.01%
2625: 0.01%
43: 0.01%
42: 0.01%
46: 0.01%
49: 0.01%
300: 0.01%
47: 0.01%
41: 0.01%
48: 0.00%
2470: 0.00%
70: 0.00%
93: 0.00%
44: 0.00%
5044: 0.00%
67: 0.00%
2474: 0.00%
58: 0.00%
540: 0.00%
107: 0.00%
102: 0.00%
2705: 0.00%
56: 0.00%
57: 0.00%
5057: 0.00%
2713: 0.00%
59: 0.00%
2585: 0.00%
53: 0.00%
4545: 0.00%
2638: 0.00%
62: 0.00%
65: 0.00%
2594: 0.00%
2710: 0.00%
5063: 0.00%
3030: 0.00%
1730: 0.00%
232: 0.00%
3995: 0.00%
3924: 0.00%
98: 0.00%
150: 0.00%
5056: 0.00%
189: 0.00%
101: 0.00%
179: 0.00%
96: 0.00%
6

3379: 0.00%
30755: 0.00%
3813: 0.00%
17804: 0.00%
885: 0.00%
17277: 0.00%
39551: 0.00%
2064: 0.00%
9254: 0.00%
375: 0.00%
27461: 0.00%
30297: 0.00%
8575: 0.00%
2996: 0.00%
1318: 0.00%
9781: 0.00%
12114: 0.00%
1861: 0.00%
40963: 0.00%
27361: 0.00%
12100: 0.00%
36510: 0.00%
23152: 0.00%
1141: 0.00%
6649: 0.00%
3173: 0.00%
5141: 0.00%
39133: 0.00%
16985: 0.00%
3711: 0.00%
6217: 0.00%
13618: 0.00%
1948: 0.00%
1061: 0.00%
6760: 0.00%
12353: 0.00%
8461: 0.00%
2592: 0.00%
32442: 0.00%
11010: 0.00%
5081: 0.00%
37313: 0.00%
2696: 0.00%
37572: 0.00%
31214: 0.00%
291: 0.00%
16534: 0.00%
14682: 0.00%
7993: 0.00%
39647: 0.00%
10132: 0.00%
8755: 0.00%
13520: 0.00%
12005: 0.00%
9732: 0.00%
22721: 0.00%
12873: 0.00%
11398: 0.00%
18478: 0.00%
37205: 0.00%
22376: 0.00%
134: 0.00%
4823: 0.00%
9285: 0.00%
5052: 0.00%
12713: 0.00%
1779: 0.00%
3343: 0.00%
13388: 0.00%
2309: 0.00%
39487: 0.00%
1174: 0.00%
10256: 0.00%
26327: 0.00%
6309: 0.00%
14420: 0.00%
1973: 0.00%
422: 0.00%
38460: 0.00%
13878: 0.00%
438:

6917: 0.00%
23120: 0.00%
10706: 0.00%
22556: 0.00%
15244: 0.00%
6372: 0.00%
38818: 0.00%
9093: 0.00%
36741: 0.00%
25910: 0.00%
4961: 0.00%
6531: 0.00%
4721: 0.00%
2845: 0.00%
727: 0.00%
914: 0.00%
8675: 0.00%
708: 0.00%
42260: 0.00%
432: 0.00%
4400: 0.00%
36653: 0.00%
4816: 0.00%
38657: 0.00%
2475: 0.00%
2333: 0.00%
638: 0.00%
18521: 0.00%
16744: 0.00%
39313: 0.00%
37668: 0.00%
2059: 0.00%
7412: 0.00%
284: 0.00%
276: 0.00%
9801: 0.00%
931: 0.00%
13141: 0.00%
14360: 0.00%
31829: 0.00%
4965: 0.00%
17784: 0.00%
1198: 0.00%
13308: 0.00%
37498: 0.00%
36429: 0.00%
1278: 0.00%
10103: 0.00%
16965: 0.00%
30217: 0.00%
1195: 0.00%
7999: 0.00%
5300: 0.00%
159: 0.00%
21767: 0.00%
27241: 0.00%
3106: 0.00%
156: 0.00%
36918: 0.00%
2289: 0.00%
40295: 0.00%
2530: 0.00%
27421: 0.00%
38384: 0.00%
8359: 0.00%
41307: 0.00%
2420: 0.00%
7850: 0.00%
2540: 0.00%
41218: 0.00%
1900: 0.00%
31581: 0.00%
1525: 0.00%
10835: 0.00%
38404: 0.00%
12044: 0.00%
13430: 0.00%
1045: 0.00%
17903: 0.00%
1235: 0.00%
17025: 0.00%

Column: src_bytes
0: 39.21%
8: 2.93%
1: 1.93%
44: 1.85%
45: 1.66%
1032: 1.59%
46: 1.03%
43: 1.02%
105: 0.79%
147: 0.75%
54540: 0.71%
28: 0.71%
42: 0.61%
30: 0.56%
146: 0.54%
520: 0.51%
334: 0.44%
18: 0.44%
215: 0.36%
207: 0.32%
224: 0.31%
216: 0.29%
221: 0.29%
222: 0.29%
516: 0.29%
214: 0.29%
209: 0.28%
217: 0.28%
232: 0.27%
218: 0.27%
235: 0.27%
230: 0.26%
229: 0.26%
219: 0.26%
233: 0.26%
145: 0.26%
225: 0.25%
213: 0.25%
208: 0.25%
245: 0.25%
212: 0.25%
234: 0.25%
226: 0.25%
211: 0.25%
220: 0.25%
210: 0.25%
223: 0.24%
192: 0.24%
236: 0.24%
227: 0.24%
231: 0.23%
306: 0.23%
305: 0.23%
237: 0.23%
206: 0.22%
205: 0.22%
228: 0.22%
309: 0.21%
297: 0.21%
204: 0.21%
12: 0.20%
293: 0.20%
238: 0.20%
307: 0.20%
295: 0.20%
308: 0.20%
316: 0.20%
200: 0.20%
294: 0.20%
241: 0.19%
203: 0.19%
201: 0.19%
303: 0.19%
299: 0.19%
239: 0.19%
244: 0.19%
296: 0.19%
383: 0.18%
317: 0.18%
302: 0.18%
78: 0.18%
325: 0.18%
304: 0.18%
240: 0.18%
246: 0.18%
300: 0.18%
242: 0.18%
298: 0.18%
319: 0.18%
314: 0.18%
243:

425: 0.00%
1808: 0.00%
985: 0.00%
40495: 0.00%
730: 0.00%
395: 0.00%
1623: 0.00%
1644: 0.00%
990: 0.00%
1472: 0.00%
1779: 0.00%
1952: 0.00%
1155: 0.00%
1410: 0.00%
1514: 0.00%
1679: 0.00%
684: 0.00%
1364: 0.00%
455: 0.00%
1652: 0.00%
1345: 0.00%
53452: 0.00%
1420: 0.00%
1655: 0.00%
1446: 0.00%
618: 0.00%
606: 0.00%
1403: 0.00%
1476: 0.00%
2163: 0.00%
1440: 0.00%
812: 0.00%
1198: 0.00%
50: 0.00%
1532: 0.00%
1713: 0.00%
846: 0.00%
2039: 0.00%
1393: 0.00%
1795: 0.00%
1283: 0.00%
1956: 0.00%
1098: 0.00%
1722: 0.00%
686: 0.00%
3475: 0.00%
1708: 0.00%
972: 0.00%
1124: 0.00%
1469: 0.00%
1789: 0.00%
406: 0.00%
996: 0.00%
517: 0.00%
601: 0.00%
792: 0.00%
1960: 0.00%
2029: 0.00%
2701: 0.00%
1654: 0.00%
2754: 0.00%
1054: 0.00%
1031: 0.00%
1059: 0.00%
439: 0.00%
1269: 0.00%
13140: 0.00%
2900: 0.00%
1511: 0.00%
717: 0.00%
1838: 0.00%
1145: 0.00%
1369: 0.00%
1301: 0.00%
57: 0.00%
1539: 0.00%
1102: 0.00%
742: 0.00%
1315: 0.00%
2704: 0.00%
1258: 0.00%
1070: 0.00%
823: 0.00%
1782: 0.00%
775: 0.00%
2522

3025: 0.00%
1962: 0.00%
3465: 0.00%
620: 0.00%
204881: 0.00%
3660: 0.00%
8641: 0.00%
14305: 0.00%
2865: 0.00%
5840: 0.00%
2317: 0.00%
11610: 0.00%
2312: 0.00%
3044: 0.00%
1842: 0.00%
3844: 0.00%
2768: 0.00%
4568: 0.00%
2478: 0.00%
43003: 0.00%
3229: 0.00%
12737: 0.00%
1575: 0.00%
2007160: 0.00%
5612: 0.00%
2574: 0.00%
4122: 0.00%
51180: 0.00%
20438: 0.00%
3052: 0.00%
381709090: 0.00%
11867: 0.00%
2692: 0.00%
36040: 0.00%
2394: 0.00%
2396: 0.00%
2104380: 0.00%
20309: 0.00%
3740: 0.00%
1709: 0.00%
2808: 0.00%
22270: 0.00%
2762: 0.00%
494: 0.00%
2397: 0.00%
11406: 0.00%
16551: 0.00%
11244: 0.00%
2285: 0.00%
2496: 0.00%
3239: 0.00%
3697: 0.00%
2464: 0.00%
3286: 0.00%
68620: 0.00%
424: 0.00%
12035: 0.00%
1777: 0.00%
1969: 0.00%
4364: 0.00%
2554: 0.00%
2367: 0.00%
4060: 0.00%
586: 0.00%
7727: 0.00%
6285: 0.00%
2278: 0.00%
14384: 0.00%
1918: 0.00%
630: 0.00%
3298: 0.00%
11707: 0.00%
1738: 0.00%
3233: 0.00%
11926: 0.00%
2752: 0.00%
2037: 0.00%
2839: 0.00%
1758: 0.00%
1850: 0.00%
14416: 0.00%
9

294: 0.07%
279: 0.07%
269: 0.06%
115: 0.06%
81: 0.06%
296: 0.06%
398: 0.06%
362: 0.06%
310: 0.06%
281: 0.06%
339: 0.06%
388: 0.06%
369: 0.06%
321: 0.06%
366: 0.06%
391: 0.05%
82: 0.05%
307: 0.05%
387: 0.05%
774: 0.05%
282: 0.05%
263: 0.05%
124: 0.05%
389: 0.05%
608: 0.05%
390: 0.05%
3380: 0.05%
284: 0.05%
179: 0.05%
788: 0.05%
38: 0.05%
364: 0.05%
458: 0.05%
259: 0.05%
368: 0.05%
2445: 0.05%
434: 0.05%
301: 0.05%
37: 0.05%
622: 0.05%
308: 0.05%
354: 0.04%
112: 0.04%
2531: 0.04%
1227: 0.04%
278: 0.04%
2720: 0.04%
140: 0.04%
386: 0.04%
39: 0.04%
363: 0.04%
188: 0.04%
367: 0.04%
3081: 0.04%
480: 0.04%
114: 0.04%
903: 0.04%
1054: 0.04%
1695: 0.04%
1425: 0.04%
770: 0.04%
260: 0.04%
109: 0.04%
290: 0.04%
1215: 0.04%
293: 0.04%
274: 0.04%
1719: 0.04%
762: 0.04%
12884: 0.04%
83: 0.04%
511: 0.04%
253: 0.04%
2431: 0.04%
34: 0.04%
405: 0.04%
267: 0.04%
113: 0.04%
318: 0.04%
340: 0.04%
15: 0.04%
468: 0.04%
88: 0.04%
482: 0.04%
1415: 0.04%
3200: 0.04%
361: 0.04%
481: 0.04%
123: 0.04%
1645: 0.03%
16

1560: 0.01%
3897: 0.01%
5238: 0.01%
9146: 0.01%
1750: 0.01%
3461: 0.01%
1733: 0.01%
1338: 0.01%
3758: 0.01%
3393: 0.01%
2472: 0.01%
853: 0.01%
2020: 0.01%
897: 0.01%
1280: 0.01%
1170: 0.01%
1001: 0.01%
2999: 0.01%
1637: 0.01%
2273: 0.01%
154: 0.01%
1124: 0.01%
1281: 0.01%
3913: 0.01%
1401: 0.01%
2654: 0.01%
2125: 0.01%
1496: 0.01%
2254: 0.01%
106: 0.01%
1762: 0.01%
8315: 0.01%
1609: 0.01%
1341: 0.01%
2324: 0.01%
2317: 0.01%
16060: 0.01%
51: 0.01%
2316: 0.01%
841: 0.01%
2193: 0.01%
1707: 0.01%
1442: 0.01%
6418: 0.01%
2322: 0.01%
537: 0.01%
1991: 0.01%
726: 0.01%
4143: 0.01%
148: 0.01%
1543: 0.01%
858: 0.01%
1417: 0.01%
2719: 0.01%
3270: 0.01%
2289: 0.01%
789: 0.01%
981: 0.01%
638: 0.01%
2120: 0.01%
1444: 0.01%
1308: 0.01%
2140: 0.01%
2966: 0.01%
1728: 0.01%
24786: 0.01%
2566: 0.01%
2769: 0.01%
904: 0.01%
2108: 0.01%
1522: 0.01%
3074: 0.01%
1643: 0.01%
2492: 0.00%
103: 0.00%
9428: 0.00%
1901: 0.00%
871: 0.00%
1379: 0.00%
1229: 0.00%
8497: 0.00%
639: 0.00%
3191: 0.00%
1534: 0.00%
3008: 0.

8157: 0.00%
6217: 0.00%
965: 0.00%
2341: 0.00%
15340: 0.00%
43129: 0.00%
2165: 0.00%
1872: 0.00%
4134: 0.00%
10111: 0.00%
1177: 0.00%
9531: 0.00%
5163: 0.00%
6232: 0.00%
1514: 0.00%
699: 0.00%
926: 0.00%
12149: 0.00%
7927: 0.00%
1949: 0.00%
2634: 0.00%
3581: 0.00%
2944: 0.00%
1070: 0.00%
865: 0.00%
4864: 0.00%
6052: 0.00%
1331: 0.00%
6409: 0.00%
1788: 0.00%
459: 0.00%
15511: 0.00%
2638: 0.00%
1355: 0.00%
2810: 0.00%
2563: 0.00%
1190: 0.00%
3319: 0.00%
5683: 0.00%
2330: 0.00%
5071: 0.00%
4721: 0.00%
1850: 0.00%
1910: 0.00%
1447: 0.00%
6979: 0.00%
1458: 0.00%
2824: 0.00%
1289: 0.00%
14183: 0.00%
4165: 0.00%
7104: 0.00%
9038: 0.00%
1343: 0.00%
2807: 0.00%
4743: 0.00%
1071: 0.00%
2535: 0.00%
1647: 0.00%
14206: 0.00%
6137: 0.00%
11415: 0.00%
2792: 0.00%
1846: 0.00%
1517: 0.00%
10333: 0.00%
10715: 0.00%
1922: 0.00%
883: 0.00%
2557: 0.00%
7156: 0.00%
2607: 0.00%
31639: 0.00%
8760: 0.00%
2441: 0.00%
2004: 0.00%
9525: 0.00%
3110: 0.00%
1438: 0.00%
4070: 0.00%
2406: 0.00%
1186: 0.00%
10431: 0.00

2128: 0.00%
2601: 0.00%
3366: 0.00%
5816: 0.00%
8311: 0.00%
6879: 0.00%
6678: 0.00%
15767: 0.00%
1700: 0.00%
15279: 0.00%
2879: 0.00%
2048: 0.00%
12218: 0.00%
9114: 0.00%
13332: 0.00%
3614: 0.00%
11546: 0.00%
621: 0.00%
6907: 0.00%
22368: 0.00%
8790: 0.00%
11541: 0.00%
6590: 0.00%
26597: 0.00%
21435: 0.00%
4586: 0.00%
6237: 0.00%
8871: 0.00%
11688: 0.00%
52451: 0.00%
11700: 0.00%
6390: 0.00%
13593: 0.00%
5478: 0.00%
5636: 0.00%
2130: 0.00%
20979: 0.00%
8033: 0.00%
27411: 0.00%
7491: 0.00%
2921: 0.00%
1990: 0.00%
10319: 0.00%
4906: 0.00%
13965: 0.00%
8615: 0.00%
4723: 0.00%
14934: 0.00%
1006: 0.00%
400291060: 0.00%
32439: 0.00%
12523: 0.00%
31998: 0.00%
4360: 0.00%
14313: 0.00%
8485: 0.00%
17039: 0.00%
6443: 0.00%
60: 0.00%
34548: 0.00%
5965: 0.00%
10920: 0.00%
4135: 0.00%
4849: 0.00%
6210: 0.00%
12120: 0.00%
3330: 0.00%
11379: 0.00%
10576: 0.00%
6675: 0.00%
8076: 0.00%
5408: 0.00%
1153: 0.00%
12426: 0.00%
3925: 0.00%
11750: 0.00%
1706: 0.00%
9042: 0.00%
7549: 0.00%
15405: 0.00%
35811: 

9847: 0.00%
6902: 0.00%
2735: 0.00%
1348: 0.00%
2686: 0.00%
7596: 0.00%
6496: 0.00%
33586: 0.00%
5678: 0.00%
7461: 0.00%
3831: 0.00%
5696: 0.00%
9001: 0.00%
5888: 0.00%
12067: 0.00%
10481: 0.00%
2115: 0.00%
4060: 0.00%
3236: 0.00%
2728: 0.00%
2771: 0.00%
10269: 0.00%
7648: 0.00%
3895: 0.00%
5773: 0.00%
9067: 0.00%
2516: 0.00%
1777: 0.00%
7197: 0.00%
11355: 0.00%
8415: 0.00%
2498: 0.00%
52465: 0.00%
2963: 0.00%
6406: 0.00%
15626: 0.00%
5905: 0.00%
10311: 0.00%
2532: 0.00%
80476: 0.00%
5949: 0.00%
11111: 0.00%
44324: 0.00%
708: 0.00%
2539: 0.00%
9979: 0.00%
13341: 0.00%
4951: 0.00%
1644: 0.00%
1625: 0.00%
10616: 0.00%
6509: 0.00%
7775: 0.00%
11271: 0.00%
5197: 0.00%
3058: 0.00%
1141: 0.00%
1622: 0.00%
7065: 0.00%
7071: 0.00%
4240: 0.00%
1845: 0.00%
4471: 0.00%
15252: 0.00%
2471: 0.00%
6726: 0.00%
9393: 0.00%
8104: 0.00%
1440: 0.00%
3654: 0.00%
2352: 0.00%
9562: 0.00%
197: 0.00%
18961: 0.00%
6758: 0.00%
27544: 0.00%
13356: 0.00%
14875: 0.00%
2274: 0.00%
1104: 0.00%
11633: 0.00%
3490: 0.00

16414: 0.00%
21281: 0.00%
77216: 0.00%
9370: 0.00%
5356: 0.00%
18765: 0.00%
25520: 0.00%
3281: 0.00%
9407: 0.00%
6873: 0.00%
6037: 0.00%
8190: 0.00%
11120: 0.00%
18022: 0.00%
10594: 0.00%
40552: 0.00%
9129: 0.00%
5946: 0.00%
31360: 0.00%
13804: 0.00%
11180: 0.00%
18728: 0.00%
27017: 0.00%
10212: 0.00%
21385: 0.00%
8633: 0.00%
6408: 0.00%
55150: 0.00%
16748: 0.00%
25386: 0.00%
32133: 0.00%
17562: 0.00%
1587397: 0.00%
10330: 0.00%
48614: 0.00%
16818: 0.00%
14812: 0.00%
31609: 0.00%
3209: 0.00%
9025: 0.00%
20042: 0.00%
650800: 0.00%
4540: 0.00%
8728: 0.00%
4651: 0.00%
74925: 0.00%
20229: 0.00%
7441: 0.00%
13045: 0.00%
28066: 0.00%
3268: 0.00%
16418: 0.00%
16307: 0.00%
5540: 0.00%
889: 0.00%
10437: 0.00%
20777: 0.00%
19680: 0.00%
3455: 0.00%
32884: 0.00%
35813: 0.00%
3746371: 0.00%
9571: 0.00%
4192: 0.00%
11850: 0.00%
39648: 0.00%
16597: 0.00%
26435: 0.00%
8591: 0.00%
3688: 0.00%
5633: 0.00%
15583: 0.00%
66067: 0.00%
23139: 0.00%
335851: 0.00%
34169: 0.00%
9262: 0.00%
19154: 0.00%
11625: 0

3931: 0.00%
16609: 0.00%
5288: 0.00%
11848: 0.00%
6087: 0.00%
3962: 0.00%
7225: 0.00%
4695: 0.00%
16696: 0.00%
3999: 0.00%
4547: 0.00%
3402: 0.00%
9275: 0.00%
22925: 0.00%
8227: 0.00%
6225: 0.00%
8391: 0.00%
21815: 0.00%
2974: 0.00%
20131: 0.00%
13503: 0.00%
10615: 0.00%
8075: 0.00%
28224: 0.00%
10733: 0.00%
10762: 0.00%
5557: 0.00%
5086: 0.00%
7285: 0.00%
3666: 0.00%
16162: 0.00%
160618: 0.00%
3287: 0.00%
3187: 0.00%
21937: 0.00%
8792: 0.00%
8082: 0.00%
12636: 0.00%
8724: 0.00%
51633: 0.00%
6725: 0.00%
3741: 0.00%
12326: 0.00%
28320: 0.00%
14609: 0.00%
52838: 0.00%
15119: 0.00%
3714: 0.00%
3891: 0.00%
11913: 0.00%
5337: 0.00%
15968: 0.00%
19572: 0.00%
28696: 0.00%
144411: 0.00%
4114: 0.00%
2791: 0.00%
13225: 0.00%
11455: 0.00%
646196: 0.00%
19309: 0.00%
4036: 0.00%
3205: 0.00%
5918: 0.00%
22886: 0.00%
2690: 0.00%
6471: 0.00%
41901: 0.00%
29325: 0.00%
95839: 0.00%
8284: 0.00%
18278: 0.00%
13508: 0.00%
23376: 0.00%
42841: 0.00%
18881: 0.00%
5397: 0.00%
3551: 0.00%
3709: 0.00%
16641: 0.0

19578: 0.00%
3242: 0.00%
3612: 0.00%
16751: 0.00%
4168: 0.00%
13713: 0.00%
2587: 0.00%
12138: 0.00%
10300: 0.00%
7502: 0.00%
18445: 0.00%
9891: 0.00%
11601: 0.00%
6089: 0.00%
7106: 0.00%
9005: 0.00%
9730: 0.00%
10347: 0.00%
9497: 0.00%
10886: 0.00%
18282: 0.00%
5149: 0.00%
28198: 0.00%
1495: 0.00%
4701: 0.00%
3318: 0.00%
13920: 0.00%
28344: 0.00%
7644: 0.00%
6200: 0.00%
42836: 0.00%
7385: 0.00%
5132: 0.00%
11067: 0.00%
13392: 0.00%
10054: 0.00%
31244: 0.00%
7427: 0.00%
11764: 0.00%
11574: 0.00%
11588: 0.00%
4432: 0.00%
10204: 0.00%
7880: 0.00%
137562: 0.00%
11887: 0.00%
3219: 0.00%
13728: 0.00%
21494: 0.00%
14789: 0.00%
17149: 0.00%
21374: 0.00%
42785: 0.00%
9877: 0.00%
41144: 0.00%
5265: 0.00%
11509: 0.00%
25128: 0.00%
14892: 0.00%
5100: 0.00%
99068: 0.00%
14089: 0.00%
219837: 0.00%
44023: 0.00%
25209: 0.00%
4539: 0.00%
36055: 0.00%
6437: 0.00%
5764: 0.00%
11153: 0.00%
4307: 0.00%
3610: 0.00%
29026: 0.00%
6219: 0.00%
9889: 0.00%
15388: 0.00%
31812: 0.00%
22793: 0.00%
14588: 0.00%
7087

Column: num_compromised
0: 98.98%
1: 0.77%
2: 0.08%
4: 0.03%
3: 0.03%
6: 0.02%
5: 0.01%
7: 0.00%
8: 0.00%
9: 0.00%
13: 0.00%
23: 0.00%
10: 0.00%
21: 0.00%
371: 0.00%
12: 0.00%
884: 0.00%
31: 0.00%
151: 0.00%
407: 0.00%
452: 0.00%
46: 0.00%
174: 0.00%
1739: 0.00%
16: 0.00%
378: 0.00%
75: 0.00%
37: 0.00%
375: 0.00%
543: 0.00%
198: 0.00%
809: 0.00%
22: 0.00%
456: 0.00%
237: 0.00%
74: 0.00%
691: 0.00%
187: 0.00%
94: 0.00%
349: 0.00%
676: 0.00%
568: 0.00%
258: 0.00%
175: 0.00%
373: 0.00%
14: 0.00%
761: 0.00%
78: 0.00%
1043: 0.00%
177: 0.00%
247: 0.00%
538: 0.00%
102: 0.00%
17: 0.00%
54: 0.00%
217: 0.00%
767: 0.00%
457: 0.00%
520: 0.00%
789: 0.00%
193: 0.00%
83: 0.00%
558: 0.00%
751: 0.00%
157: 0.00%
110: 0.00%
462: 0.00%
107: 0.00%
38: 0.00%
19: 0.00%
756: 0.00%
281: 0.00%
44: 0.00%
622: 0.00%
18: 0.00%
15: 0.00%
716: 0.00%
202: 0.00%
121: 0.00%
682: 0.00%
537: 0.00%
345: 0.00%
166: 0.00%
7479: 0.00%
307: 0.00%
40: 0.00%
405: 0.00%
11: 0.00%

Column: root_shell
0: 99.87%
1: 0.13%

Column: s

193: 0.03%
137: 0.03%
118: 0.03%
166: 0.03%
117: 0.03%
127: 0.03%
120: 0.03%
105: 0.03%
167: 0.03%
192: 0.03%
219: 0.03%
203: 0.03%
190: 0.03%
172: 0.03%
228: 0.03%
165: 0.03%
132: 0.03%
133: 0.03%
220: 0.03%
129: 0.03%
168: 0.03%
121: 0.03%
196: 0.03%
206: 0.03%
135: 0.03%
175: 0.03%
177: 0.03%
116: 0.03%
173: 0.03%
221: 0.03%
218: 0.03%
106: 0.03%
131: 0.03%
169: 0.03%
126: 0.03%
180: 0.03%
227: 0.03%
171: 0.03%
67: 0.03%
163: 0.03%
237: 0.03%
235: 0.03%
182: 0.03%
236: 0.03%
164: 0.03%
71: 0.03%
233: 0.03%
63: 0.03%
212: 0.02%
58: 0.02%
230: 0.02%
239: 0.02%
170: 0.02%
211: 0.02%
207: 0.02%
222: 0.02%
186: 0.02%
205: 0.02%
69: 0.02%
217: 0.02%
215: 0.02%
124: 0.02%
226: 0.02%
209: 0.02%
240: 0.02%
213: 0.02%
210: 0.02%
176: 0.02%
214: 0.02%
185: 0.02%
134: 0.02%
64: 0.02%
234: 0.02%
224: 0.02%
136: 0.02%
188: 0.02%
238: 0.02%
70: 0.02%
216: 0.02%
89: 0.02%
75: 0.02%
232: 0.02%
72: 0.02%
82: 0.02%
85: 0.02%
73: 0.02%
95: 0.02%
81: 0.02%
90: 0.02%
97: 0.02%
77: 0.02%
96: 0.02%
83: 0.0

48: 0.17%
54: 0.17%
46: 0.17%
72: 0.17%
57: 0.17%
56: 0.17%
74: 0.16%
64: 0.16%
67: 0.16%
66: 0.15%
77: 0.15%
68: 0.15%
75: 0.15%
61: 0.15%
69: 0.15%
84: 0.15%
71: 0.15%
70: 0.15%
79: 0.15%
76: 0.15%
65: 0.15%
63: 0.14%
83: 0.14%
78: 0.14%
81: 0.14%
73: 0.14%
88: 0.14%
85: 0.14%
82: 0.13%
108: 0.13%
92: 0.13%
114: 0.13%
104: 0.13%
90: 0.13%
87: 0.13%
99: 0.13%
80: 0.12%
120: 0.12%
96: 0.12%
106: 0.12%
89: 0.12%
125: 0.12%
103: 0.12%
116: 0.12%
91: 0.12%
98: 0.12%
95: 0.12%
86: 0.12%
94: 0.12%
97: 0.12%
128: 0.12%
100: 0.12%
93: 0.12%
109: 0.12%
112: 0.12%
115: 0.12%
113: 0.11%
111: 0.11%
101: 0.11%
119: 0.11%
127: 0.11%
124: 0.11%
102: 0.11%
105: 0.11%
169: 0.11%
144: 0.11%
152: 0.11%
130: 0.11%
118: 0.11%
123: 0.11%
122: 0.11%
136: 0.11%
110: 0.10%
163: 0.10%
129: 0.10%
147: 0.10%
137: 0.10%
107: 0.10%
121: 0.10%
126: 0.10%
142: 0.10%
141: 0.10%
166: 0.10%
117: 0.10%
156: 0.10%
133: 0.10%
168: 0.10%
138: 0.10%
161: 0.10%
150: 0.10%
132: 0.10%
143: 0.10%
135: 0.10%
154: 0.10%
177: 0.10

0.33: 0.08%
0.09: 0.08%
0.17: 0.08%
0.86: 0.08%
0.1: 0.08%
0.25: 0.07%
0.75: 0.07%
0.73: 0.07%
0.53: 0.07%
0.14: 0.07%
0.15: 0.07%
0.78: 0.07%
0.8: 0.07%
0.12: 0.06%
0.29: 0.06%
0.63: 0.06%
0.32: 0.06%
0.31: 0.06%
0.2: 0.06%
0.76: 0.06%
0.36: 0.06%
0.81: 0.06%
0.47: 0.06%
0.56: 0.06%
0.44: 0.06%
0.62: 0.05%
0.98: 0.05%
0.3: 0.05%
0.38: 0.05%
0.27: 0.05%
0.26: 0.05%
0.51: 0.05%
0.55: 0.05%
0.42: 0.05%
0.49: 0.05%
0.16: 0.05%
0.19: 0.05%
0.13: 0.05%
0.35: 0.05%
0.18: 0.05%
0.58: 0.05%
0.6: 0.05%
0.43: 0.05%
0.74: 0.05%
0.4: 0.05%
0.45: 0.05%
0.67: 0.05%
0.24: 0.05%
0.64: 0.05%
0.22: 0.05%
0.61: 0.05%
0.69: 0.05%
0.57: 0.05%
0.41: 0.05%
0.77: 0.04%
0.21: 0.04%
0.23: 0.04%
0.48: 0.04%
0.46: 0.04%
0.79: 0.04%
0.34: 0.04%
0.71: 0.04%
0.65: 0.04%
0.39: 0.04%
0.28: 0.04%
0.99: 0.04%
0.37: 0.04%
0.59: 0.04%
0.54: 0.04%
0.7: 0.04%
0.66: 0.03%
0.72: 0.03%
0.68: 0.03%

Column: dst_host_srv_rerror_rate
0.0: 84.63%
1.0: 10.50%
0.01: 1.10%
0.02: 0.46%
0.03: 0.28%
0.05: 0.28%
0.04: 0.27%
0.98: 0.15%
0

In [8]:
unique_counts = data_train['dst_host_count'].value_counts()
print(unique_counts)

255    74099
1       3119
2       2733
3       1280
4       1198
       ...  
252       77
243       76
226       76
254       70
0          3
Name: dst_host_count, Length: 256, dtype: int64


In [4]:
cat_cols = ['is_host_login','protocol_type','service','flag','land', 'logged_in','is_guest_login', 'level', 'outcome']

In [5]:
def preprocess(dataframe):
    df_num = dataframe.drop(cat_cols, axis=1)
    num_cols = df_num.columns
    scaled_df = Scaling(df_num, num_cols)

    dataframe.drop(labels=num_cols, axis="columns", inplace=True)
    dataframe[num_cols] = scaled_df[num_cols]

    # No need to convert 'outcome' to binary, keep it as it is

    dataframe = pd.get_dummies(dataframe, columns=['protocol_type', 'service', 'flag'])
    return dataframe

In [6]:
def Scaling(df_num, cols):
    std_scaler = RobustScaler()
    std_scaler_temp = std_scaler.fit_transform(df_num)
    with open('scale.pkl', 'wb') as file:
        pickle.dump(std_scaler, file)
    std_df = pd.DataFrame(std_scaler_temp, columns=cols)
    return std_df

In [7]:
scaled_train = preprocess(data_train)

In [8]:
x = scaled_train.drop(['outcome', 'level'], axis=1).values
y = scaled_train['outcome'].values

In [9]:
one_hot_encoding_mapping = {}
for col in ['protocol_type', 'service', 'flag']:
    one_hot_encoding_mapping[col] = scaled_train.filter(like=col).columns

In [10]:
pca = PCA(n_components=20)
pca = pca.fit(x)
x_reduced = pca.transform(x)
print("Number of original features is {} and of reduced features is {}".format(x.shape[1], x_reduced.shape[1]))

Number of original features is 122 and of reduced features is 20


In [13]:
import pickle
with open('reduced_features.pkl', 'wb') as file:
    pickle.dump(pca, file)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x_reduced, y, test_size=0.2, random_state=42)

In [35]:
#print(len(y_train))
for i in range(100):
    print(i, y_train[i])

0 normal
1 neptune
2 neptune
3 normal
4 normal
5 neptune
6 portsweep
7 ipsweep
8 neptune
9 satan
10 neptune
11 normal
12 normal
13 neptune
14 satan
15 smurf
16 normal
17 normal
18 ipsweep
19 neptune
20 normal
21 neptune
22 neptune
23 ipsweep
24 normal
25 normal
26 neptune
27 normal
28 normal
29 normal
30 normal
31 normal
32 neptune
33 neptune
34 portsweep
35 normal
36 satan
37 normal
38 normal
39 normal
40 neptune
41 normal
42 neptune
43 normal
44 normal
45 normal
46 neptune
47 normal
48 normal
49 normal
50 normal
51 normal
52 normal
53 normal
54 neptune
55 normal
56 neptune
57 neptune
58 teardrop
59 neptune
60 neptune
61 neptune
62 neptune
63 neptune
64 neptune
65 normal
66 normal
67 neptune
68 neptune
69 ipsweep
70 neptune
71 normal
72 normal
73 normal
74 neptune
75 neptune
76 ipsweep
77 normal
78 normal
79 normal
80 normal
81 normal
82 neptune
83 portsweep
84 neptune
85 normal
86 neptune
87 normal
88 neptune
89 neptune
90 normal
91 normal
92 neptune
93 portsweep
94 normal
95 normal


In [124]:
# Get unique class labels in the 'outcome' column
unique_labels = scaled_train['outcome'].unique()

# Create a mapping dictionary
outcome_mapping = {label: idx for idx, label in enumerate(unique_labels)}

# Replace 'scaled_train' with your actual DataFrame
scaled_train['outcome'] = scaled_train['outcome'].map(outcome_mapping)

In [125]:
outcome_mapping

{'normal': 0,
 'neptune': 1,
 'warezclient': 2,
 'ipsweep': 3,
 'portsweep': 4,
 'teardrop': 5,
 'nmap': 6,
 'satan': 7,
 'smurf': 8,
 'pod': 9,
 'back': 10,
 'guess_passwd': 11,
 'ftp_write': 12,
 'multihop': 13,
 'rootkit': 14,
 'buffer_overflow': 15,
 'imap': 16,
 'warezmaster': 17,
 'phf': 18,
 'land': 19,
 'loadmodule': 20,
 'spy': 21,
 'perl': 22}

In [19]:
from keras.utils import to_categorical

# Assuming 'y_train' and 'y_test' contain class labels
y_train_encoded = [outcome_mapping[label] for label in y_train]
y_test_encoded = [outcome_mapping[label] for label in y_test]

# Convert encoded labels to one-hot encoded format
y_train_categorical = to_categorical(y_train_encoded, num_classes=len(outcome_mapping))
y_test_categorical = to_categorical(y_test_encoded, num_classes=len(outcome_mapping))

In [20]:
model = Sequential()
model.add(LSTM(units=32, input_shape=(x_train.shape[1], 1)))
model.add(Dense(units=len(outcome_mapping), activation='softmax'))  # Output units equal to the number of classes, with 'softmax' activation
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [21]:
history = model.fit(x_train.reshape((x_train.shape[0], x_train.shape[1], 1)), y_train_categorical,
                    epochs=10, batch_size=32,
                    validation_data=(x_test.reshape((x_test.shape[0], x_test.shape[1], 1)), y_test_categorical))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

NameError: name 'model' is not defined

In [76]:
scaled_train.head()

Unnamed: 0,land,logged_in,is_host_login,is_guest_login,outcome,level,duration,src_bytes,dst_bytes,wrong_fragment,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,0,0,0,normal,20,0.0,1.619565,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,normal,15,0.0,0.369565,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,neptune,19,0.0,-0.15942,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
3,0,1,0,0,normal,21,0.0,0.681159,15.800388,0.0,...,0,0,0,0,0,0,0,0,1,0
4,0,1,0,0,normal,21,0.0,0.561594,0.813953,0.0,...,0,0,0,0,0,0,0,0,1,0


In [9]:
#####################
custom_data=pd.DataFrame()
for col in scaled_train.columns:
    if col in input_data.keys():
        custom_data[col]=[input_data[col]]
custom_data

NameError: name 'scaled_train' is not defined

In [104]:
#############################
# Define the list of column names
column_names = [
    'protocol_type_icmp', 'protocol_type_tcp', 'protocol_type_udp',
    'service_IRC', 'service_X11', 'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp',
    'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard',
    'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs',
    'service_exec', 'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher', 'service_harvest',
    'service_hostnames', 'service_http', 'service_http_2784', 'service_http_443', 'service_http_8001',
    'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell', 'service_ldap', 'service_link',
    'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm', 'service_netbios_ns',
    'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u',
    'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private',
    'service_red_i', 'service_remote_job', 'service_rje', 'service_shell', 'service_smtp', 'service_sql_net',
    'service_ssh', 'service_sunrpc', 'service_supdup', 'service_systat', 'service_telnet', 'service_tftp_u',
    'service_tim_i', 'service_time', 'service_urh_i', 'service_urp_i', 'service_uucp', 'service_uucp_path',
    'service_vmnet', 'service_whois',
    'flag_OTH', 'flag_REJ', 'flag_RSTO', 'flag_RSTOS0', 'flag_RSTR', 'flag_S0', 'flag_S1', 'flag_S2', 'flag_S3',
    'flag_SF', 'flag_SH'
]

# Create an empty DataFrame with the specified columns
custom_data_df = pd.DataFrame(columns=column_names)

# Add the initial row of zeros
initial_row = pd.Series(np.zeros(len(column_names)), index=column_names)
custom_data_df = custom_data_df.append(initial_row, ignore_index=True)

# Update 'protocol_type' columns
custom_data_df['protocol_type_'+input_data['protocol_type']]=1

# Update 'service' columns
custom_data_df['service_' + input_data['service']] = 1

# Update 'flag' columns
custom_data_df['flag_' + input_data['flag']] = 1

In [105]:
custom_data_df

Unnamed: 0,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0


In [106]:
#########################################################
custom_data=pd.concat([custom_data, custom_data_df], axis=1)
for col in custom_data.columns:
    print(col)

land
logged_in
is_host_login
is_guest_login
duration
src_bytes
dst_bytes
wrong_fragment
urgent
hot
num_failed_logins
num_compromised
root_shell
su_attempted
num_root
num_file_creations
num_shells
num_access_files
num_outbound_cmds
count
srv_count
serror_rate
srv_serror_rate
rerror_rate
srv_rerror_rate
same_srv_rate
diff_srv_rate
srv_diff_host_rate
dst_host_count
dst_host_srv_count
dst_host_same_srv_rate
dst_host_diff_srv_rate
dst_host_same_src_port_rate
dst_host_srv_diff_host_rate
dst_host_serror_rate
dst_host_srv_serror_rate
dst_host_rerror_rate
dst_host_srv_rerror_rate
protocol_type_icmp
protocol_type_tcp
protocol_type_udp
service_IRC
service_X11
service_Z39_50
service_aol
service_auth
service_bgp
service_courier
service_csnet_ns
service_ctf
service_daytime
service_discard
service_domain
service_domain_u
service_echo
service_eco_i
service_ecr_i
service_efs
service_exec
service_finger
service_ftp
service_ftp_data
service_gopher
service_harvest
service_hostnames
service_http
service_ht

In [107]:
custom_data

Unnamed: 0,land,logged_in,is_host_login,is_guest_login,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,1,0,0,10,1000,500,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0


In [110]:
custom_data_reduced=pca.transform(custom_data)



In [12]:
import pickle

# Specify the path to the pickled model file
model_file_path = 'model.pkl'  # Replace with the actual file path

# Load the pickled model
with open(model_file_path, 'rb') as model_file:
    model = pickle.load(model_file)

# Now 'loaded_model' contains your previously pickled model

In [121]:
predicted_probabilities = model.predict(custom_data_reduced)



In [13]:
y_pred = model.predict(x_test)



In [28]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [14]:
outcome_mapping = {'normal': 0,
 'neptune': 1,
 'warezclient': 2,
 'ipsweep': 3,
 'portsweep': 4,
 'teardrop': 5,
 'nmap': 6,
 'satan': 7,
 'smurf': 8,
 'pod': 9,
 'back': 10,
 'guess_passwd': 11,
 'ftp_write': 12,
 'multihop': 13,
 'rootkit': 14,
 'buffer_overflow': 15,
 'imap': 16,
 'warezmaster': 17,
 'phf': 18,
 'land': 19,
 'loadmodule': 20,
 'spy': 21,
 'perl': 22}

In [15]:
# Convert numerical predictions to class labels for the entire y_pred
y_pred_labels = [list(outcome_mapping.keys())[np.argmax(row)] for row in y_pred]

In [16]:
#from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_labels)
print("Accuracy:", accuracy)

# Generate a classification report
#report = classification_report(y_test, y_pred_labels)
#print("Classification Report:\n", report)


Accuracy: 0.9864655685651915


In [20]:
unique_values, counts = np.unique(y_test, return_counts=True)
unique_counts = dict(zip(unique_values, counts))

In [21]:
for value, count in unique_counts.items():
    print(f"{value}: {count}")

back: 185
buffer_overflow: 9
guess_passwd: 11
imap: 1
ipsweep: 733
land: 3
neptune: 8228
nmap: 313
normal: 13422
perl: 1
phf: 1
pod: 43
portsweep: 573
rootkit: 1
satan: 738
smurf: 534
spy: 1
teardrop: 188
warezclient: 202
warezmaster: 8


In [27]:
y_pred_unique_values, y_pred_counts = np.unique(y_pred_labels, return_counts=True)
y_pred_unique_counts = dict(zip(y_pred_unique_values, y_pred_counts))

In [28]:
for value, count in y_pred_unique_counts.items():
    print(f"{value}: {count}")

back: 170
buffer_overflow: 1
guess_passwd: 12
ipsweep: 709
neptune: 8228
nmap: 308
normal: 13547
pod: 11
portsweep: 561
satan: 725
smurf: 535
teardrop: 188
warezclient: 200


In [122]:
predicted_probabilities

array([[4.6816358e-04, 6.8556212e-02, 3.3962981e-05, 3.0318476e-02,
        3.5976004e-03, 1.2593243e-04, 4.1653213e-01, 3.1434763e-02,
        9.3788065e-02, 3.1283870e-01, 1.1208211e-03, 1.5852530e-02,
        1.8374657e-03, 7.5940569e-03, 1.7142333e-03, 3.8045624e-04,
        1.1319865e-03, 1.4457171e-03, 1.6973961e-04, 5.9851287e-03,
        1.0685424e-03, 3.2326831e-03, 7.7280408e-04]], dtype=float32)

In [127]:
# Get the predicted class label (index with highest probability)
predicted_class_index = np.argmax(predicted_probabilities)

# Inverse mapping to get the class label string
predicted_class = list(outcome_mapping.keys())[predicted_class_index]

In [128]:
predicted_class

'nmap'

In [47]:
for i in data_train.columns:
    print(i, data_train[i][8])

duration 0
protocol_type tcp
service remote_job
flag S0
src_bytes 0
dst_bytes 0
land 0
wrong_fragment 0
urgent 0
hot 0
num_failed_logins 0
logged_in 0
num_compromised 0
root_shell 0
su_attempted 0
num_root 0
num_file_creations 0
num_shells 0
num_access_files 0
num_outbound_cmds 0
is_host_login 0
is_guest_login 0
count 270
srv_count 23
serror_rate 1.0
srv_serror_rate 1.0
rerror_rate 0.0
srv_rerror_rate 0.0
same_srv_rate 0.09
diff_srv_rate 0.05
srv_diff_host_rate 0.0
dst_host_count 255
dst_host_srv_count 23
dst_host_same_srv_rate 0.09
dst_host_diff_srv_rate 0.05
dst_host_same_src_port_rate 0.0
dst_host_srv_diff_host_rate 0.0
dst_host_serror_rate 1.0
dst_host_srv_serror_rate 1.0
dst_host_rerror_rate 0.0
dst_host_srv_rerror_rate 0.0
outcome neptune
level 21


In [48]:
input_data = {
    'duration': 0,
    'protocol_type': 'tcp',
    'service': 'remote_job',
    'flag': 'S0',
    'src_bytes': 0,
    'dst_bytes': 0,
    'land': 0,
    'wrong_fragment': 0,
    'urgent': 0,
    'hot': 0,
    'num_failed_logins': 0,
    'logged_in': 0,
    'num_compromised': 0,
    'root_shell': 0,
    'su_attempted': 0,
    'num_root': 0,
    'num_file_creations': 0,
    'num_shells': 0,
    'num_access_files': 0,
    'num_outbound_cmds': 0,
    'is_host_login': 0,
    'is_guest_login': 0,
    'count': 260,
    'srv_count': 23,
    'serror_rate': 1.0,
    'srv_serror_rate': 1.0,
    'rerror_rate': 0.0,
    'srv_rerror_rate': 0.0,
    'same_srv_rate': 0.09,
    'diff_srv_rate': 0.05,
    'srv_diff_host_rate': 0.0,
    'dst_host_count': 255,
    'dst_host_srv_count': 23,
    'dst_host_same_srv_rate': 0.09,
    'dst_host_diff_srv_rate': 0.05,
    'dst_host_same_src_port_rate': 0.0,
    'dst_host_srv_diff_host_rate': 0.0,
    'dst_host_serror_rate': 1.0,
    'dst_host_srv_serror_rate': 1.0,
    'dst_host_rerror_rate': 0.0,
    'dst_host_srv_rerror_rate': 0.0
}

In [3]:
with open('scale.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

In [4]:
#Function to preprocess input data as per the model training.
def preprocess_input_data(input_data):

    # Columns to be excluded from scaling
    columns_to_exclude = ['is_host_login', 'land', 'logged_in','is_guest_login']
    
    scaled_column_names = [
        'land', 'logged_in', 'is_host_login', 'is_guest_login', 'duration', 'src_bytes', 'dst_bytes', 'wrong_fragment',
        'urgent', 'hot', 'num_failed_logins', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
        'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'count', 'srv_count', 'serror_rate',
        'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
        'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
        'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'protocol_type_icmp', 'protocol_type_tcp', 'protocol_type_udp',
        'service_IRC', 'service_X11', 'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp', 'service_courier',
        'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_domain_u',
        'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs', 'service_exec', 'service_finger', 'service_ftp',
        'service_ftp_data', 'service_gopher', 'service_harvest', 'service_hostnames', 'service_http', 'service_http_2784',
        'service_http_443', 'service_http_8001', 'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell',
        'service_ldap', 'service_link', 'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm',
        'service_netbios_ns', 'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u',
        'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private',
        'service_red_i', 'service_remote_job', 'service_rje', 'service_shell', 'service_smtp', 'service_sql_net',
        'service_ssh', 'service_sunrpc', 'service_supdup', 'service_systat', 'service_telnet', 'service_tftp_u',
        'service_tim_i', 'service_time', 'service_urh_i', 'service_urp_i', 'service_uucp', 'service_uucp_path',
        'service_vmnet', 'service_whois', 'flag_OTH', 'flag_REJ', 'flag_RSTO', 'flag_RSTOS0', 'flag_RSTR', 'flag_S0',
        'flag_S1', 'flag_S2', 'flag_S3', 'flag_SF', 'flag_SH'
    ]
    custom_data=pd.DataFrame()
    for col in scaled_column_names:
        if col in input_data.keys() and col not in columns_to_exclude:
            custom_data[col]=[input_data[col]]
    cols=custom_data.columns
    #print(custom_data.columns)
    scaled_data = scaler.transform(custom_data)
    custom_data = pd.DataFrame(scaled_data, columns=cols)
    

    for col in columns_to_exclude:
        if col in input_data.keys():
            custom_data[col]=[input_data[col]]
    print(custom_data.columns)        
    # Defining the list of column names
    column_names = [
        'protocol_type_icmp', 'protocol_type_tcp', 'protocol_type_udp',
        'service_IRC', 'service_X11', 'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp',
        'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard',
        'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs',
        'service_exec', 'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher', 'service_harvest',
        'service_hostnames', 'service_http', 'service_http_2784', 'service_http_443', 'service_http_8001',
        'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell', 'service_ldap', 'service_link',
        'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm', 'service_netbios_ns',
        'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u',
        'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private',
        'service_red_i', 'service_remote_job', 'service_rje', 'service_shell', 'service_smtp', 'service_sql_net',
        'service_ssh', 'service_sunrpc', 'service_supdup', 'service_systat', 'service_telnet', 'service_tftp_u',
        'service_tim_i', 'service_time', 'service_urh_i', 'service_urp_i', 'service_uucp', 'service_uucp_path',
        'service_vmnet', 'service_whois',
        'flag_OTH', 'flag_REJ', 'flag_RSTO', 'flag_RSTOS0', 'flag_RSTR', 'flag_S0', 'flag_S1', 'flag_S2', 'flag_S3',
        'flag_SF', 'flag_SH'
    ]

    # Creating an empty DataFrame with the specified columns
    custom_data_df = pd.DataFrame(columns=column_names)

    # Adding the initial row of zeros
    initial_row = pd.Series(np.zeros(len(column_names)), index=column_names)
    custom_data_df = custom_data_df.append(initial_row, ignore_index=True)

    # Updating 'protocol_type' columns
    custom_data_df['protocol_type_'+input_data['protocol_type']] = 1

    # Updating 'service' columns
    custom_data_df['service_' + input_data['service']] = 1

    # Updating 'flag' columns
    custom_data_df['flag_' + input_data['flag']] = 1
    
    custom_data=pd.concat([custom_data, custom_data_df], axis=1)
    return custom_data


In [49]:
a=preprocess_input_data(input_data)

Index(['duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'num_compromised', 'root_shell', 'su_attempted',
       'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
       'num_outbound_cmds', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'is_host_login', 'land', 'logged_in',
       'is_guest_login'],
      dtype='object')


  custom_data_df = custom_data_df.append(initial_row, ignore_index=True)


In [6]:
with open('reduced_features.pkl', 'rb') as rf:
    pca = pickle.load(rf)

In [60]:
numpy_array = a.values
numpy_array

array([[0.0, -0.15942028985507245, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7446808510638299, 0.9375, 1.0,
        1.0, 0.0, 0.0, -1.0, 0.8333333333333334, 0.0, 0.0,
        -0.16326530612244897, -0.4421052631578948, 0.42857142857142855,
        0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0, 0, 0, 0, 0.0, 1, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1, 0.0, 0.0, 0.0, 0.0,
        0.0]], dtype=object)

In [58]:
x[8]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.15942029,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.81560284,
        0.9375    ,  1.        ,  1.        ,  0.        ,  0.        ,
       -1.        ,  0.83333333,  0.        ,  0.        , -0.16326531,
       -0.44210526,  0.42857143,  0.        ,  0.        ,  1.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

In [57]:
a_reduced=pca.transform(a)
print("Number of original features is {} and of reduced features is {}".format(a.shape[1], a_reduced.shape[1]))

Number of original features is 122 and of reduced features is 20




In [53]:
a_reduced

array([[-1.67459882e+02, -4.20548943e+01, -2.85157635e+02,
        -2.40304150e-01, -2.70941241e+00, -1.24824313e+00,
         8.14468711e-01, -9.54180420e-01, -2.47978236e-01,
        -1.17150694e+00, -2.90420469e-01, -5.71406335e-02,
        -2.93635786e-02,  4.70250108e-01,  1.68408841e+00,
        -1.34406228e-03,  4.50058638e-01, -4.89056135e-03,
        -4.13121809e-01,  5.46525963e-01]])

In [55]:
x_reduced[8]

array([-1.67619297e+02, -4.20548653e+01, -2.85156575e+02, -2.42221267e-01,
       -2.89591103e+00, -4.74661400e-01,  7.42808673e-01,  3.04383581e-01,
       -3.81601169e-01, -1.96573908e-01,  2.14258482e+00, -8.69079306e-03,
       -1.48881278e-01, -3.27190034e-01,  2.54608593e-03, -4.25505630e-01,
       -2.35732079e-01,  2.51277087e-01, -1.87329618e-01, -3.05154919e-02])

In [8]:
with open('model.pkl', 'rb') as mod:
    model = pickle.load(mod)

In [51]:
predicted_probabilities = model.predict(a_reduced)
predicted_probabilities



array([[9.30834591e-01, 1.68303959e-02, 5.09734991e-06, 4.13081318e-04,
        1.18241701e-02, 4.75534180e-04, 8.17871466e-03, 7.78757455e-03,
        5.20282099e-03, 1.58519186e-02, 6.44619286e-04, 3.26272100e-04,
        1.19691016e-04, 1.76114685e-04, 4.00379213e-04, 2.51895457e-04,
        2.57748528e-04, 5.10573045e-05, 6.80192679e-05, 3.12815064e-05,
        1.35800728e-04, 3.34748402e-05, 9.97570678e-05]], dtype=float32)

In [52]:
outcome_mapping={'normal': 0, 'neptune': 1, 'warezclient': 2, 'ipsweep': 3, 
                     'portsweep': 4,'teardrop': 5,'nmap': 6,'satan': 7,'smurf': 8,
                     'pod': 9,'back': 10,'guess_passwd': 11,'ftp_write': 12,
                     'multihop': 13,'rootkit': 14,'buffer_overflow': 15,'imap': 16,
                     'warezmaster': 17,'phf': 18,'land': 19,'loadmodule': 20,
                     'spy': 21,'perl': 22}
# Get the predicted class label (index with highest probability)
predicted_class_index = np.argmax(predicted_probabilities)

# Inverse mapping to get the class label string
predicted_class = list(outcome_mapping.keys())[predicted_class_index]
predicted_class

'normal'

In [1]:
pip show scikit-learn

Name: scikit-learn
Version: 1.0.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: c:\users\ashfa\anaconda3\lib\site-packages
Requires: threadpoolctl, scipy, joblib, numpy
Required-by: scikit-learn-intelex
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip show tensorflow

Name: tensorflow
Version: 2.13.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: c:\users\ashfa\anaconda3\lib\site-packages
Requires: tensorflow-intel
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Count unique values in the 'column_name' column
unique_counts = data_train['service'].value_counts()
print(unique_counts)

http         40338
private      21853
domain_u      9043
smtp          7313
ftp_data      6860
             ...  
tftp_u           3
http_8001        2
aol              2
harvest          2
http_2784        1
Name: service, Length: 70, dtype: int64


In [8]:
for i in unique_counts.keys():
    print(i, unique_counts[i])

http 40338
private 21853
domain_u 9043
smtp 7313
ftp_data 6860
eco_i 4586
other 4359
ecr_i 3077
telnet 2353
finger 1767
ftp 1754
auth 955
Z39_50 862
uucp 780
courier 734
bgp 710
whois 693
uucp_path 689
iso_tsap 687
time 654
imap4 647
nnsp 630
vmnet 617
urp_i 602
domain 569
ctf 563
csnet_ns 545
supdup 544
discard 538
http_443 530
daytime 521
gopher 518
efs 485
systat 477
link 475
exec 474
hostnames 460
name 451
mtp 439
echo 434
klogin 433
login 429
ldap 410
netbios_dgm 405
sunrpc 381
netbios_ssn 362
netstat 360
netbios_ns 347
ssh 311
kshell 299
nntp 296
pop_3 264
sql_net 245
IRC 187
ntp_u 168
rje 86
remote_job 78
pop_2 78
X11 73
printer 69
shell 65
urh_i 10
tim_i 8
red_i 8
pm_dump 5
tftp_u 3
http_8001 2
aol 2
harvest 2
http_2784 1
