In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [4]:
# Load the labelled training data and the test data.
# NOTE: these are absolute, machine-specific locations; edit the two constants
# below when running the notebook elsewhere.
TRAIN_CSV = "C:/Users/deepi/Downloads/Train_Data.csv"
TEST_CSV = "C:/Users/deepi/Downloads/Test_Data.csv"

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)


In [5]:
# Data Exploration: preview the first rows and the column/dtype summary of
# both frames.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print(train_df.head())
train_df.info()
print(test_df.head())
test_df.info()

   duration protocoltype      service flag  srcbytes  dstbytes  land  \
0         0          tcp  netbios_dgm  REJ         0         0     0   
1         0          tcp         smtp   SF      1239       400     0   
2         0          tcp         http   SF       222       945     0   
3         0          tcp         http   SF       235      1380     0   
4         0          tcp    uucp_path  REJ         0         0     0   

   wrongfragment  urgent  hot  ...  dsthostsamesrvrate  dsthostdiffsrvrate  \
0              0       0    0  ...                0.06                0.06   
1              0       0    0  ...                0.45                0.04   
2              0       0    0  ...                1.00                0.00   
3              0       0    0  ...                1.00                0.00   
4              0       0    0  ...                0.01                0.08   

   dsthostsamesrcportrate  dsthostsrvdiffhostrate  dsthostserrorrate  \
0                    0.00 

In [6]:
# Encode categorical variables as integer codes.
# One LabelEncoder per column is fitted on the training frame only and kept in
# `label_encoders`. Both frames are overwritten in place with the integer codes.
label_encoders = {}
categorical_cols = ['protocoltype', 'service', 'flag']
for col in categorical_cols:
    encoder = LabelEncoder()
    train_df[col] = encoder.fit_transform(train_df[col])

    # LabelEncoder.transform raises ValueError for a category that was not
    # present during fit. Encode the test column against the fitted classes
    # instead: positions match LabelEncoder's codes for known categories, and
    # any value outside the fitted classes receives the sentinel code -1.
    test_codes = pd.Categorical(test_df[col], categories=encoder.classes_).codes
    unseen_count = int((test_codes == -1).sum())
    if unseen_count:
        print(f"{col}: {unseen_count} test rows have a category unseen in training; encoded as -1")
    test_df[col] = test_codes.astype(np.int64)

    label_encoders[col] = encoder


In [7]:
# Pull the target column out of the training frame; every remaining column is
# used as a feature.
target_col = 'attack'
y = train_df[target_col]
X = train_df.drop(columns=[target_col])

In [8]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# partition the same on every run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts


In [9]:
# Standardise the features. The scaler learns its statistics from the training
# split only and is then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [10]:
# Fit a random forest on the scaled training split; the seed makes the fitted
# forest repeatable across runs.
forest_params = {"random_state": 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train_scaled, y_train)

In [11]:
# Evaluate the model on the validation split: accuracy, per-class
# precision/recall/F1, and ROC AUC.
y_pred = model.predict(X_val_scaled)

# ROC AUC measures how well the model *ranks* examples, so it needs a
# continuous score. Hard 0/1 predictions collapse the curve to one threshold.
# Use the predicted probability of the positive class, i.e. the second
# predict_proba column, which corresponds to model.classes_[1].
y_val_proba = model.predict_proba(X_val_scaled)[:, 1]

print("Validation Accuracy:", model.score(X_val_scaled, y_val))
print("Classification Report:\n", classification_report(y_val, y_pred))
print("ROC AUC Score:", roc_auc_score(y_val, y_val_proba))

Validation Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     10762
           1       1.00      1.00      1.00      6607

    accuracy                           1.00     17369
   macro avg       1.00      1.00      1.00     17369
weighted avg       1.00      1.00      1.00     17369

ROC AUC Score: 1.0


In [12]:
# Make predictions on the test set.
# Select exactly the feature columns used for training, in training order.
# Passing the whole frame to the scaler fails on a feature-name mismatch if the
# test file carries an extra column or orders its columns differently.
X_test = test_df[X.columns]
X_test_scaled = scaler.transform(X_test)
test_predictions = model.predict(X_test_scaled)

In [14]:
# Build the submission table: one row per test row, identified by the test
# frame's index, with the numeric prediction translated to its attack name.
# The table is written to submission.csv without the frame index.
attack_names = {0: 'normal', 1: 'neptune'}
submission = (
    pd.DataFrame({'id': test_df.index, 'attack': test_predictions})
    .assign(attack=lambda frame: frame['attack'].map(attack_names))
)
submission.to_csv('submission.csv', index=False)