In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
import time

# Load the data
train_data = pd.read_csv('Train_Data.csv')
test_data = pd.read_csv('Test_Data.csv')

# Feature columns fed to the models; any other column in the CSVs is not used as a feature.
columns_to_keep = [
    'samesrvrate', 'srcbytes', 'dsthostsrvserrorrate', 'diffsrvrate', 'count',
    'dstbytes', 'serrorrate', 'dsthostsamesrvrate', 'dsthostserrorrate',
    'srvserrorrate', 'dsthostdiffsrvrate', 'dsthostsrvcount',
    'dsthostsamesrcportrate', 'dsthostsrvrerrorrate', 'dsthostsrvdiffhostrate',
    'dsthostrerrorrate', 'rerrorrate', 'srvdiffhostrate', 'dsthostcount',
    'lastflag', 'srvrerrorrate', 'service', 'srvcount', 'flag',
    'loggedin', 'duration', 'protocoltype', 'hot', 'land', 'isguestlogin',
]

# Preprocess the data: feature matrix and target column
X = train_data[columns_to_keep]
y = train_data['attack']

# Identify categorical and numerical columns.
# The two groups are deliberately complementary (numeric vs. everything else):
# ColumnTransformer drops any column that is not listed in a transformer
# (remainder='drop' is its default), so a hard-coded dtype allow-list such as
# ['int64', 'float64'] / ['object'] would silently discard features stored as
# int32, float32, bool, category or a dedicated string dtype.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

# Preprocessing pipelines for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# handle_unknown='ignore' lets categories that only appear at predict time be
# encoded as all-zero rows instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps: scale numeric columns, one-hot encode the rest
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])



In [None]:
# One seed shared by the split and every stochastic estimator below, so that
# re-running the cell reproduces the same scores and the same "best" model.
RANDOM_STATE = 42

# Split data for training and validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)


def fit_and_score(label, pipeline):
    """Fit ``pipeline`` on the training split and score it on the validation split.

    Reads ``X_train``, ``y_train``, ``X_val`` and ``y_val`` from the notebook
    namespace, fits ``pipeline`` in place, and prints the training time and
    the validation F1 score prefixed with ``label``.

    Parameters
    ----------
    label : str
        Model name used in the printed report (e.g. ``'Stacking'``).
    pipeline : sklearn.pipeline.Pipeline
        Preprocessing + classifier pipeline to train.

    Returns
    -------
    tuple
        ``(validation_predictions, f1, elapsed_seconds)``.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() can.
    started = time.perf_counter()
    pipeline.fit(X_train, y_train)
    elapsed = time.perf_counter() - started

    val_predictions = pipeline.predict(X_val)
    score = f1_score(y_val, val_predictions)

    print(f'{label} Classifier Training Time: {elapsed:.2f} seconds')
    print(f'{label} Classifier F1 Score: {score}')
    return val_predictions, score, elapsed


# Define base classifiers for the stacking ensemble (both explicitly seeded)
estimators = [
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                          n_estimators=100, max_depth=6, learning_rate=0.006,
                          random_state=RANDOM_STATE)),
    ('ada', AdaBoostClassifier(n_estimators=100, learning_rate=0.01,
                               random_state=RANDOM_STATE))
]

# Stacking classifier with Logistic Regression as the final estimator
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000)
)

# Bagging classifier with Logistic Regression as base estimator.
# The base estimator is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, whereas the first positional slot works on every version.
bagging_clf = BaggingClassifier(
    LogisticRegression(max_iter=1000),
    n_estimators=10,
    random_state=RANDOM_STATE
)

# Define pipelines for stacking and bagging classifiers.
# NOTE: both pipelines hold the same `preprocessor` object; each fit() refits
# it, which is harmless here because both are trained on the same X_train.
stacking_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('classifier', stacking_clf)])
bagging_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('classifier', bagging_clf)])

# Train and evaluate both candidates on the same validation split
y_pred_stacking, f1_stacking, training_time = fit_and_score('Stacking', stacking_pipeline)
y_pred_bagging, f1_bagging, training_time = fit_and_score('Bagging', bagging_pipeline)

# Determine the best model (ties go to the bagging classifier)
if f1_stacking > f1_bagging:
    best_model = stacking_pipeline
    print('Stacking Classifier is the best model')
else:
    best_model = bagging_pipeline
    print('Bagging Classifier is the best model')

# Predict on the test set with the best model
test_preds = best_model.predict(test_data[columns_to_keep])

Stacking Classifier Training Time: 68.82 seconds
Stacking Classifier F1 Score: 0.9989406779661018




Bagging Classifier Training Time: 9.49 seconds
Bagging Classifier F1 Score: 0.9999243284146804
Bagging Classifier is the best model


In [None]:
# Build the single-column submission table from the test-set predictions
# and write it to disk without the row index.
submission = pd.DataFrame(test_preds, columns=['attack'])
submission.to_csv(path_or_buf='submission.csv', index=False)