In [None]:
#importing relevant packages
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
import sklearn.metrics as metrics
from sklearn import tree
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the training CSV (kddcup99_train_.csv) into a DataFrame and render it.
# NOTE(review): the path is absolute and machine-specific; this cell only runs
# where that exact file exists.
kdd = pd.read_csv(
    r"C:\Users\ASUS\OneDrive\Desktop\DataMiningProject\kddcup99_train_.csv"
)
display(kdd)

In [None]:
# Explanatory variables: every column of the training frame except the target.
# drop() returns a new frame, so `kdd` itself is left untouched.
X = kdd.drop(columns=['connection_category'])

# Explained variable: an independent copy of the target column.
y = kdd['connection_category'].copy()

# Sanity check: render the target first, then the feature frame.
display(y)
display(X)

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Render the training portion (features, then target) for inspection.
display(x_train)
display(y_train)

In [None]:
# Grid-search the Decision Tree hyperparameters with 5-fold cross-validation,
# ranking every parameter combination by macro-averaged F1.
#
# random_state is fixed (same seed as the train/test split) so the search is
# repeatable: scikit-learn permutes the candidate features at each split, so an
# unseeded tree may break ties differently from one run to the next and the
# reported best parameters could change between runs.
DecisionTree = DecisionTreeClassifier(random_state=42)
parameters = {
    'max_depth': range(1, 7),
    'min_impurity_decrease': [0.1, 0.0001, 0.001, 0.01],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 4, 6],
}
grid_DesicionTree = GridSearchCV(
    estimator=DecisionTree,
    param_grid=parameters,
    cv=5,
    scoring='f1_macro',
)
grid_DesicionTree.fit(x_train, y_train)

# Print the parameter combination with the best mean cross-validated score.
print(grid_DesicionTree.best_params_)

In [None]:
## Optimal DecisionTree model
# The hyperparameters are hard-coded (presumably copied from a grid-search
# run -- verify they still match its output).
# random_state is fixed so the fitted tree, and therefore the confusion matrix
# and F1 printed below, are the same on every run.
Dec_tree_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=6,
    min_impurity_decrease=0.0001,
    min_samples_split=4,
    random_state=42,
)
Dec_tree_model.fit(x_train, y_train)
print(Dec_tree_model)

# Predict on the held-out split; `expected` is the ground truth for it.
expected = y_test
predicted = Dec_tree_model.predict(x_test)

# Confusion matrix (called as (expected, predicted)).
cm = metrics.confusion_matrix(expected, predicted)
print(cm)

# Macro-averaged F1 on the held-out split.
print("f1 score of the DecisionTree model is: ", f1_score(expected, predicted, average='macro'))


In [None]:
# Grid-search the Random Forest hyperparameters with 5-fold cross-validation,
# ranking every parameter combination by macro-averaged F1.
#
# random_state is fixed (same seed as the train/test split): a Random Forest
# is a randomized model, so without a seed the CV scores -- and possibly the
# winning parameter set -- differ between runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
parameters_rf = {
    'max_depth': range(1, 7),
    'min_impurity_decrease': [0.0001, 0.001, 0.01, 0.1],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 4, 6],
}

grid_params_rf = GridSearchCV(
    estimator=rf,
    param_grid=parameters_rf,
    cv=5,
    scoring='f1_macro',
)
# .values.ravel() hands the target to scikit-learn as a flat NumPy array.
grid_params_rf.fit(x_train, y_train.values.ravel())

# Print the parameter combination with the best mean cross-validated score.
print(grid_params_rf.best_params_)

In [None]:
## RandomForest model
# The hyperparameters are hard-coded (presumably copied from a grid-search
# run -- verify they still match its output).
# n_estimators is spelled out so this model uses the same forest size as the
# Random Forest grid search (which fixes 100) instead of whatever the installed
# library defaults to; random_state makes the reported score repeatable.
RandomForest = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=6,
    min_impurity_decrease=0.0001,
    min_samples_split=4,
    random_state=42,
)

RandomForest.fit(x_train, y_train.values.ravel())
# NOTE: `y_pred` is reused by later cells for other models' predictions.
y_pred = RandomForest.predict(x_test)

# Macro-averaged F1 on the held-out split.
print("f1 score of the RandomForest model is: ", f1_score(y_test, y_pred, average='macro'))


In [None]:
# Grid-search AdaBoost (boosting decision trees) with 5-fold cross-validation,
# ranking every parameter combination by macro-averaged F1.
# NOTE(review): no max_depth is passed to the base tree -- confirm that an
# unrestricted tree is really the intended weak learner for boosting.
DecisionTree = DecisionTreeClassifier()

# scikit-learn 1.2 renamed AdaBoost's `base_estimator` argument to `estimator`
# and 1.4 removed the old name, so hard-coding either one fails on some
# versions. Use whichever keyword the installed version exposes; the nested
# grid keys ("<keyword>__criterion", ...) must use the same prefix.
base_param = (
    'estimator'
    if 'estimator' in AdaBoostClassifier().get_params()
    else 'base_estimator'
)

# random_state seeds the ensemble so the search is repeatable (the grid
# includes splitter='random', which is explicitly randomized).
AdaBoost = AdaBoostClassifier(random_state=42, **{base_param: DecisionTree})
parameters_AdaBoost = {
    'learning_rate': [0.05, 0.01, 0.1, 0.5, 1],
    base_param + '__criterion': ['gini', 'entropy'],
    base_param + '__splitter': ['best', 'random'],
    'n_estimators': [100, 120, 150, 200],
}

grid_params_Ada = GridSearchCV(
    estimator=AdaBoost,
    param_grid=parameters_AdaBoost,
    cv=5,
    scoring='f1_macro',
)
grid_params_Ada.fit(x_train, y_train.values.ravel())

# Print the parameter combination that gives the best AdaBoost model.
print(grid_params_Ada.best_params_)


In [None]:
## AdaBoost optimal model
# The hyperparameters are hard-coded (presumably copied from a grid-search
# run -- verify they still match its output).
DecisionTree = DecisionTreeClassifier(criterion='entropy', splitter='best')

# scikit-learn 1.2 renamed AdaBoost's `base_estimator` argument to `estimator`
# and 1.4 removed the old name; pass the tree under whichever keyword the
# installed version exposes so the cell runs on both.
base_param = (
    'estimator'
    if 'estimator' in AdaBoostClassifier().get_params()
    else 'base_estimator'
)

# random_state seeds the ensemble so the fitted model and its predictions are
# the same on every run.
opt_AdaBoost = AdaBoostClassifier(
    learning_rate=0.5,
    n_estimators=200,
    random_state=42,
    **{base_param: DecisionTree},
)
opt_AdaBoost.fit(x_train, y_train.values.ravel())
# NOTE: this overwrites the `y_pred` produced by the Random Forest cell.
y_pred = opt_AdaBoost.predict(x_test)


In [None]:
# Macro-averaged F1 on the held-out split.
# `y_pred` holds whichever model's predictions were assigned last, so this
# reports AdaBoost only if the AdaBoost cell ran immediately before.
print(
    "f1 score of the AdaBoost model is: ",
    f1_score(y_test, y_pred, average='macro'),
)

In [None]:
############################################################
#### Final stage - applying the chosen model to the test data

### Read the blind test CSV (kddcup99_test_blind_.csv) and render it.
# NOTE(review): absolute, machine-specific path.
final_test = pd.read_csv(
    r"C:\Users\ASUS\OneDrive\Desktop\DataMiningProject\kddcup99_test_blind_.csv"
)
display(final_test)

In [None]:
## Explanatory variables of the test set: everything except the row ID and
## the target column. drop() returns a new frame, so `final_test` is untouched.
final_X = final_test.drop(columns=['ID', 'connection_category'])

display(final_X)

In [None]:
# Apply the fitted AdaBoost model to the test-set features and print the
# predicted labels.
final_y_pred = opt_AdaBoost.predict(final_X)
print(final_y_pred)

In [None]:
## Write the predictions to the submission CSV.
# Read the upload template, fill its target column with the predictions
# (assigned positionally -- assumes the template rows are in the same order as
# the blind test file; TODO confirm), render it, then save it without the index.
test_to_submit = pd.read_csv(
    r"C:\Users\ASUS\OneDrive\Desktop\DataMiningProject\kddcup99_test_blind_to_upload.csv"
)
test_to_submit['connection_category'] = final_y_pred
display(test_to_submit)
test_to_submit.to_csv(
    r"C:\Users\ASUS\OneDrive\Desktop\DataMiningProject\pred_submittion_file.csv",
    index=False,
)