In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

####################################### Data preparations before analysis #######################################

df = pd.read_csv("kddcup99_train_.csv")
print(df['connection_category'].value_counts())

df.describe()
names = df.columns

### checking for nulls in the data:
print('Number of nulls in the data',len(df[df.isnull().any(1)]))

df['connection_category'] = df['connection_category'].astype('category')

### checking for columns containing only 0's in the data:
empty = []
for name in names:
    if (df[name] == 0).all() == True:
        empty.append(name)
print('we found that these columns contain only 0s:', empty, ' so we deleted them.')    
df = df.drop(columns=empty)

### Transformations and Feature engineering:

# we thought of various options:
# 1. No feature engineering at all --> files used are: df, X, y
# 2. We made several feature engineering based on our common sense, and saved the results of each new df.

df1 = df
df2 = df
df3 = df

feture1 = df["src_bytes"] + df["dst_bytes"]
feture2 = df["same_srv_rate"] + df["diff_srv_rate"]
feture3 = df["dst_host_same_srv_rate"] + df["dst_host_diff_srv_rate"]
feture4 = df["serror_rate"] + df["rerror_rate"]
feture5 = df["srv_serror_rate"] + df["srv_rerror_rate"]
feture6 = df["dst_host_serror_rate"] + df["dst_host_rerror_rate"]
feture7 = df["dst_host_srv_serror_rate"] + df["dst_host_srv_rerror_rate"]

df1["feture1"] = feture1
df1["feture2"] = feture2
df1["feture3"] = feture3
df1["feture4"] = feture4
df1["feture5"] = feture5
df1["feture6"] = feture6
df1["feture7"] = feture7
df1 = df1.drop(columns=['src_bytes','dst_bytes','same_srv_rate','diff_srv_rate','dst_host_same_srv_rate',
                        'dst_host_diff_srv_rate', 'serror_rate','rerror_rate','srv_serror_rate','srv_rerror_rate',
                        'dst_host_serror_rate','dst_host_rerror_rate','dst_host_srv_serror_rate',
                        'dst_host_srv_rerror_rate'])
                           
df2["feture2"] = feture2
df2["feture3"] = feture3
df2["feture4"] = feture4
df2["feture5"] = feture5
df2["feture6"] = feture6
df2["feture7"] = feture7
df2 = df2.drop(columns=['same_srv_rate','diff_srv_rate','dst_host_same_srv_rate','dst_host_diff_srv_rate',
                        'serror_rate','rerror_rate','srv_serror_rate','srv_rerror_rate',
                        'dst_host_serror_rate','dst_host_rerror_rate','dst_host_srv_serror_rate',
                        'dst_host_srv_rerror_rate'])

df3["feture1"] = feture1
df3= df3.drop(columns=['src_bytes','dst_bytes'])

dos       388043
normal     97565
probe       4117
r2l          563
Name: connection_category, dtype: int64
Number of nulls in the data 0
we found that these columns contain only 0s: ['land', 'num_outbound_cmds', 'is_host_login', 'service_http_2784', 'service_red_i', 'service_tim_i']  so we deleted them.


In [10]:
y = df["connection_category"]
X = df.drop(columns=['connection_category'])
X1 = df1.drop(columns=['connection_category'])
X2 = df2.drop(columns=['connection_category'])
X3 = df3.drop(columns=['connection_category'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y, test_size=0.3)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y, test_size=0.3)
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y, test_size=0.3)

## Scaling the data:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

scaler = StandardScaler()
scaler.fit(X_train1)
X_train1 = scaler.transform(X_train1)
X_test1 = scaler.transform(X_test1)

scaler = StandardScaler()
scaler.fit(X_train2)
X_train2 = scaler.transform(X_train2)
X_test2 = scaler.transform(X_test2)

scaler = StandardScaler()
scaler.fit(X_train3)
X_train3 = scaler.transform(X_train3)
X_test3 = scaler.transform(X_test3)

data = {'Model                 ': [],
    'Train F1 score': [],
    'Test F1 score': [],
    'Test roc-auc Score': []}

row = {}
finalTable = pd.DataFrame(data)

print("---------------- End of Data Processing ----------------")

---------------- End of Data Processing ----------------


In [11]:

####################################### Models Calculation Functions #######################################

# here we created functions that will run every model we used, and print us the results.

from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing


def multiclass_roc_auc_score(y_test, y_pred, average="macro"): # We wanted roc-auc score for more quality validation.
    
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return roc_auc_score(y_test, y_pred, average=average)

def model_analysis(model_name, model, X_train, y_train, X_test, y_test): # This is the main model analysis function.
    global finalTable
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    results_tr = dict()
    results_tr['f1_score'] = f1_score(y_train, y_train_pred, average='macro')
  
    print('---------------------------')
    print("Train Results Are: \n")
    print("f1 score:")
    print(results_tr['f1_score'])
    print('---------------------------')
    
    results_test = dict()
    y_test_pred = model.predict(X_test)
    confusion_matrix(y_test, y_test_pred)
    results_test['f1_score'] = f1_score(y_test, y_test_pred, average='macro')

    print('---------------------------')
    print("Test Results Are: \n")
    print("f1 score:")
    print(results_test['f1_score'])
    print('---------------------------')
    
    print('---------------------------')
    print('roc-auc score: ', multiclass_roc_auc_score(y_test, y_test_pred))
    print('---------------------------')

    row = {'Model                 ':model_name, 'Train F1 score':results_tr['f1_score'],
           'Test F1 score':results_test['f1_score'], 'Test roc-auc Score':multiclass_roc_auc_score(y_test, y_test_pred)}

    finalTable = finalTable.append(row, ignore_index=True)
    


def best_parameters(model): # This function gives us the best parameters to use for each model.
    
    print('---------------------------')
    print('      Best Estimator     ')
    print('---------------------------')
    print('\n\t{}\n'.format(model.best_estimator_))

    print('---------------------------')
    print('     Best parameters     ')
    print('---------------------------')
    print('\tParameters of best estimator : \n\n\t{}\n'.format(model.best_params_))

    print('---------------------------')
    print('        Best Score       ')
    print('---------------------------')
    print('\n\tAverage Cross Validate scores of best estimator : \n\n\t{}\n'.format(model.best_score_))

    

In [12]:
### Decision Tree - Basic model:

parameters = {'max_depth':[5, 10, 20, 50, 100, 500], 
              'min_samples_split':[5, 10, 100, 500], 
              'min_impurity_decrease':[0.0, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2]}
decision_tree = DecisionTreeClassifier(criterion='gini',
                                       splitter='best',class_weight='balanced')
decision_tree_grid = GridSearchCV(decision_tree, param_grid=parameters,
                                  cv=3, verbose=1, n_jobs=-1)

decision_tree_grid_results = model_analysis('Decision Tree         ', decision_tree_grid, X_train, y_train, X_test, y_test)
best_parameters(decision_tree_grid)

print(finalTable)


Fitting 3 folds for each of 192 candidates, totalling 576 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 576 out of 576 | elapsed:  6.1min finished


---------------------------
Train Results Are: 

f1 score:
0.9986248900825009
---------------------------
---------------------------
Test Results Are: 

f1 score:
0.9842185423702909
---------------------------
---------------------------
roc-auc score:  0.9903938490165479
---------------------------
---------------------------
      Best Estimator     
---------------------------

	DecisionTreeClassifier(class_weight='balanced', max_depth=500,
                       min_samples_split=5)

---------------------------
     Best parameters     
---------------------------
	Parameters of best estimator : 

	{'max_depth': 500, 'min_impurity_decrease': 0.0, 'min_samples_split': 5}

---------------------------
        Best Score       
---------------------------

	Average Cross Validate scores of best estimator : 

	0.9997202804334298

   Model                   Train F1 score  Test F1 score  Test roc-auc Score
0  Decision Tree                 0.998625       0.984219            0.990394


In [13]:
### Random Forest Classifier - Basic model:
    

parameters = {'criterion':('gini', 'entropy'), 
              'min_samples_split':[2, 4, 6, 8, 10], 
              'min_impurity_decrease':[0.0, 1e-8, 1e-7, 1e-6]}
random_forest = RandomForestClassifier(random_state=0)
random_forest_grid = GridSearchCV(random_forest, param_grid=parameters,
                                  cv=3, verbose=1, n_jobs=-1)

random_forest_grid_results = model_analysis('Random Forest         ', random_forest_grid, X_train, y_train, X_test, y_test)
best_parameters(random_forest_grid)

print(finalTable)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 11.0min finished


---------------------------
Train Results Are: 

f1 score:
1.0
---------------------------
---------------------------
Test Results Are: 

f1 score:
0.9903194784726286
---------------------------
---------------------------
roc-auc score:  0.9907694188878291
---------------------------
---------------------------
      Best Estimator     
---------------------------

	RandomForestClassifier(random_state=0)

---------------------------
     Best parameters     
---------------------------
	Parameters of best estimator : 

	{'criterion': 'gini', 'min_impurity_decrease': 0.0, 'min_samples_split': 2}

---------------------------
        Best Score       
---------------------------

	Average Cross Validate scores of best estimator : 

	0.9998572264638012

   Model                   Train F1 score  Test F1 score  Test roc-auc Score
0  Decision Tree                 0.998625       0.984219            0.990394
1  Random Forest                 1.000000       0.990319            0.990769


In [14]:
### Logistic Regression - Basic model:

parameters = {'random_state':[0]}
logistic = LogisticRegression(penalty = 'l2')
logistic_grid = GridSearchCV(logistic, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)

logistic_result = model_analysis('Logistic              ', logistic_grid, X_train, y_train, X_test, y_test)
best_parameters(logistic_grid)

print(finalTable)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    9.5s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


---------------------------
Train Results Are: 

f1 score:
0.9744366199151939
---------------------------
---------------------------
Test Results Are: 

f1 score:
0.9706257015912509
---------------------------
---------------------------
roc-auc score:  0.9867432046451716
---------------------------
---------------------------
      Best Estimator     
---------------------------

	LogisticRegression(random_state=0)

---------------------------
     Best parameters     
---------------------------
	Parameters of best estimator : 

	{'random_state': 0}

---------------------------
        Best Score       
---------------------------

	Average Cross Validate scores of best estimator : 

	0.9993036148874275

   Model                   Train F1 score  Test F1 score  Test roc-auc Score
0  Decision Tree                 0.998625       0.984219            0.990394
1  Random Forest                 1.000000       0.990319            0.990769
2  Logistic                      0.974437       0.97

In [15]:
### Gradient Boosting - Basic model:

parameters = {'n_estimators':[50, 100], 'max_depth':[1,2]}

gradient_boost = GradientBoostingClassifier()
gradient_boost_grid = GridSearchCV(gradient_boost, param_grid=parameters,
                                  cv=3, verbose=1, n_jobs=-1)

gradient_boost_grid_results = model_analysis('Gradient Boost        ', gradient_boost_grid, X_train, y_train, X_test, y_test)
best_parameters(gradient_boost_grid)

print(finalTable)


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed: 11.9min remaining:  2.4min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 13.1min finished


---------------------------
Train Results Are: 

f1 score:
0.9625549826341846
---------------------------
---------------------------
Test Results Are: 

f1 score:
0.9565982860043021
---------------------------
---------------------------
roc-auc score:  0.9658824550280654
---------------------------
---------------------------
      Best Estimator     
---------------------------

	GradientBoostingClassifier(max_depth=2)

---------------------------
     Best parameters     
---------------------------
	Parameters of best estimator : 

	{'max_depth': 2, 'n_estimators': 100}

---------------------------
        Best Score       
---------------------------

	Average Cross Validate scores of best estimator : 

	0.9993618899966418

   Model                   Train F1 score  Test F1 score  Test roc-auc Score
0  Decision Tree                 0.998625       0.984219            0.990394
1  Random Forest                 1.000000       0.990319            0.990769
2  Logistic                  

In [16]:
### Running the best model (from the basic model) on the datasets we used feature engineering on:

# Random Forest (df_trans1):
random_forest_grid_results = model_analysis('Random Forest df1     ', random_forest_grid, X_train1, y_train1, X_test1, y_test1)
best_parameters(random_forest_grid)

print(finalTable)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 10.9min finished


---------------------------
Train Results Are: 

f1 score:
1.0
---------------------------
---------------------------
Test Results Are: 

f1 score:
0.991546455597026
---------------------------
---------------------------
roc-auc score:  0.9924757935382107
---------------------------
---------------------------
      Best Estimator     
---------------------------

	RandomForestClassifier(criterion='entropy', min_impurity_decrease=1e-08,
                       random_state=0)

---------------------------
     Best parameters     
---------------------------
	Parameters of best estimator : 

	{'criterion': 'entropy', 'min_impurity_decrease': 1e-08, 'min_samples_split': 2}

---------------------------
        Best Score       
---------------------------

	Average Cross Validate scores of best estimator : 

	0.9998426579029895

   Model                   Train F1 score  Test F1 score  Test roc-auc Score
0  Decision Tree                 0.998625       0.984219            0.990394
1  Rand

In [18]:
# Random Forest (df_trans2):
random_forest_grid_results = model_analysis('Random Forest df2     ', random_forest_grid, X_train2, y_train2, X_test2, y_test2)
best_parameters(random_forest_grid)

print(finalTable)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 10.7min finished


---------------------------
Train Results Are: 

f1 score:
1.0
---------------------------
---------------------------
Test Results Are: 

f1 score:
0.9935937820885804
---------------------------
---------------------------
roc-auc score:  0.9938578963364089
---------------------------
---------------------------
      Best Estimator     
---------------------------

	RandomForestClassifier(criterion='entropy', min_impurity_decrease=1e-08,
                       random_state=0)

---------------------------
     Best parameters     
---------------------------
	Parameters of best estimator : 

	{'criterion': 'entropy', 'min_impurity_decrease': 1e-08, 'min_samples_split': 2}

---------------------------
        Best Score       
---------------------------

	Average Cross Validate scores of best estimator : 

	0.9998339165169

   Model                   Train F1 score  Test F1 score  Test roc-auc Score
0  Decision Tree                 0.998625       0.984219            0.990394
1  Random

In [19]:
# Random Forest (df_trans3):
random_forest_grid_results = model_analysis('Random Forest df3     ', random_forest_grid, X_train3, y_train3, X_test3, y_test3)
best_parameters(random_forest_grid)

print(finalTable)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 10.8min finished


---------------------------
Train Results Are: 

f1 score:
1.0
---------------------------
---------------------------
Test Results Are: 

f1 score:
0.9906459138918643
---------------------------
---------------------------
roc-auc score:  0.9917215055664637
---------------------------
---------------------------
      Best Estimator     
---------------------------

	RandomForestClassifier(criterion='entropy', min_impurity_decrease=1e-08,
                       random_state=0)

---------------------------
     Best parameters     
---------------------------
	Parameters of best estimator : 

	{'criterion': 'entropy', 'min_impurity_decrease': 1e-08, 'min_samples_split': 2}

---------------------------
        Best Score       
---------------------------

	Average Cross Validate scores of best estimator : 

	0.9998076929953728

   Model                   Train F1 score  Test F1 score  Test roc-auc Score
0  Decision Tree                 0.998625       0.984219            0.990394
1  Ran

In [None]:
# Here we tried using PCA to see if we can get a better score:

from sklearn.decomposition import PCA

pca = PCA(n_components=10)
pca.fit(X_train2)
new_X_train = pca.transform(X_train2)
new_X_test = pca.transform(X_test2)

random_forest_grid_results = model_analysis('PCA Random Forest df2 ', random_forest_grid, new_X_train, y_train, new_X_test, y_test)
best_parameters(random_forest_grid)

# The score wasn't better. We won't use PCA.

In [23]:
### After testing all the models, we compared F1-score and saw that
### the best one we found was Random Forest on df2 --> We will use the feature engineering done on that dataset.

### Now we'll run the best model on the entire train dataset, and predict for the test dataset:

df_train = pd.read_csv("kddcup99_train_.csv")
df_test = pd.read_csv("kddcup99_test_blind_.csv")
df_test = df_test.drop(columns=['connection_category', 'ID'])

### checking for nulls in the data:
print('Number of nulls in the data',len(df_train[df_train.isnull().any(1)]))
print('Number of nulls in the data',len(df_test[df_test.isnull().any(1)]))

df_train['connection_category'] = df_train['connection_category'].astype('category')

### checking for columns containing only 0's in the data:
empty = []
for name in names:
    if (df_train[name] == 0).all() == True:
        empty.append(name)
print('we found that these columns contain only 0s:', empty, ' so we deleted them.')    
df_train = df_train.drop(columns=empty)
df_test = df_test.drop(columns=empty) # Because we don't use the 'empty' columns for training, we deleted them from the test df as well.

## Implementing the best feature engineering we got (based on df2):

feture2 = df_train["same_srv_rate"] + df_train["diff_srv_rate"]
feture3 = df_train["dst_host_same_srv_rate"] + df_train["dst_host_diff_srv_rate"]
feture4 = df_train["serror_rate"] + df_train["rerror_rate"]
feture5 = df_train["srv_serror_rate"] + df_train["srv_rerror_rate"]
feture6 = df_train["dst_host_serror_rate"] + df_train["dst_host_rerror_rate"]
feture7 = df_train["dst_host_srv_serror_rate"] + df_train["dst_host_srv_rerror_rate"]

df_train["feture2"] = feture2
df_train["feture3"] = feture3
df_train["feture4"] = feture4
df_train["feture5"] = feture5
df_train["feture6"] = feture6
df_train["feture7"] = feture7
df_train = df_train.drop(columns=['same_srv_rate','diff_srv_rate','dst_host_same_srv_rate','dst_host_diff_srv_rate',
                                   'serror_rate','rerror_rate','srv_serror_rate','srv_rerror_rate',
                                   'dst_host_serror_rate','dst_host_rerror_rate','dst_host_srv_serror_rate','dst_host_srv_rerror_rate'])

feture2 = df_test["same_srv_rate"] + df_test["diff_srv_rate"]
feture3 = df_test["dst_host_same_srv_rate"] + df_test["dst_host_diff_srv_rate"]
feture4 = df_test["serror_rate"] + df_test["rerror_rate"]
feture5 = df_test["srv_serror_rate"] + df_test["srv_rerror_rate"]
feture6 = df_test["dst_host_serror_rate"] + df_test["dst_host_rerror_rate"]
feture7 = df_test["dst_host_srv_serror_rate"] + df_test["dst_host_srv_rerror_rate"]

df_test["feture2"] = feture2
df_test["feture3"] = feture3
df_test["feture4"] = feture4
df_test["feture5"] = feture5
df_test["feture6"] = feture6
df_test["feture7"] = feture7
df_test = df_test.drop(columns=['same_srv_rate','diff_srv_rate','dst_host_same_srv_rate','dst_host_diff_srv_rate',
                                   'serror_rate','rerror_rate','srv_serror_rate','srv_rerror_rate',
                                   'dst_host_serror_rate','dst_host_rerror_rate','dst_host_srv_serror_rate','dst_host_srv_rerror_rate'])

X_train = df_train.drop(columns=['connection_category'])
y_train = df_train['connection_category']
X_test = df_test

## Scaling the data:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Best Parameters we got are: {'criterion': 'entropy', 'min_impurity_decrease': 1e-08, 'min_samples_split': 2}

random_forest = RandomForestClassifier(random_state=0, criterion='entropy', min_samples_split=2, min_impurity_decrease=1e-08)

random_forest.fit(X_train, y_train)
y_test_pred = random_forest.predict(X_test)

print(y_test_pred)



Number of nulls in the data 0
Number of nulls in the data 0
we found that these columns contain only 0s: ['land', 'num_outbound_cmds', 'is_host_login', 'service_http_2784', 'service_red_i', 'service_tim_i']  so we deleted them.
['dos' 'normal' 'dos' ... 'r2l' 'r2l' 'r2l']


In [24]:
import pandas as pd

prediction = pd.DataFrame(y_test_pred)
prediction = prediction.rename(columns={0: 'connection_category'})
print(prediction)
prediction.to_csv('predict.csv', index=True)

       connection_category
0                      dos
1                   normal
2                      dos
3                      dos
4                      dos
...                    ...
490283                 r2l
490284                 r2l
490285                 r2l
490286                 r2l
490287                 r2l

[490288 rows x 1 columns]


In [25]:
print(prediction['connection_category'].value_counts())

dos       389141
normal     96572
probe       4037
r2l          538
Name: connection_category, dtype: int64
