In [None]:
!pip3 install pandas scikit-learn

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the dataset
url = 'UNSW_NB15_training-set.csv'  # Update this path if needed
data = pd.read_csv(url)

# Display dataset info to understand its structure
print(data.info())
print(data.head())

# Strip white spaces from column names
data.columns = data.columns.str.strip()

# Convert categorical features to numerical using Label Encoding
label_encoder = LabelEncoder()

# Let's inspect the dataset columns and update the categorical columns
categorical_cols = ['proto', 'service', 'state']  # Adjust based on actual columns

for col in categorical_cols:
    if col in data.columns:
        data[col] = label_encoder.fit_transform(data[col])
    else:
        print(f"Column '{col}' not found in the dataset.")

# Now let's check if 'Label' exists or if there's another target column for classification
if 'Label' in data.columns:
    y = data['Label']
else:
    # Assuming a target column like 'attack_cat' or something relevant for classification
    print("Assuming 'attack_cat' as target variable based on dataset structure.")
    y = data['attack_cat'] if 'attack_cat' in data.columns else None

if y is None:
    print("No target column found for classification.")
else:
    # Drop irrelevant or identifier columns (e.g., 'id') and the target column
    X = data.drop(['id', 'Label', 'attack_cat'], axis=1, errors='ignore')

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize the numerical features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print("Preprocessing complete.")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32164 entries, 0 to 32163
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 32164 non-null  int64  
 1   dur                32164 non-null  float64
 2   proto              32164 non-null  object 
 3   service            32164 non-null  object 
 4   state              32164 non-null  object 
 5   spkts              32164 non-null  int64  
 6   dpkts              32164 non-null  int64  
 7   sbytes             32164 non-null  int64  
 8   dbytes             32164 non-null  int64  
 9   rate               32164 non-null  float64
 10  sttl               32164 non-null  int64  
 11  dttl               32164 non-null  int64  
 12  sload              32164 non-null  float64
 13  dload              32164 non-null  float64
 14  sloss              32164 non-null  int64  
 15  dloss              32164 non-null  int64  
 16  sinpkt             321

In [9]:
# Import necessary libraries for modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import joblib
import pickle
from sklearn.preprocessing import StandardScaler

# Assume 'scaler' is your StandardScaler instance used for scaling during training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler to a file
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# 1. Model Selection: Choosing Random Forest Classifier for demonstration
model = RandomForestClassifier(random_state=42)

# 2. Model Training
model.fit(X_train_scaled, y_train)

# 3. Model Evaluation
y_pred = model.predict(X_test_scaled)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# 4. Hyperparameter Tuning
# Define parameters for GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
                           scoring='f1', cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best parameters from grid search
print("Best parameters found: ", grid_search.best_params_)

# 5. Using the best model for predictions
best_model = grid_search.best_estimator_
best_y_pred = best_model.predict(X_test_scaled)

# Final Evaluation
print("Final Confusion Matrix:")
print(confusion_matrix(y_test, best_y_pred))

print("\nFinal Classification Report:")
print(classification_report(y_test, best_y_pred))

# 6. Save the model
joblib.dump(best_model, 'cybersecurity_model.pkl')


Confusion Matrix:
[[   0    0   24    6    0    0    0    0    0    0]
 [   0    1    2    3    0    0    0    0    0    0]
 [   0    0  257  337    1    0    0    2    4    0]
 [   0    1  304  804    2    0    0   21    6    0]
 [   0    0   41   30  144    0    0    2    0    0]
 [   0    0    2   20    1 2234    0    0    0    0]
 [   0    0    0    0    0    0 1803    0    0    0]
 [   0    0   46   57    0    0    0  243    0    0]
 [   0    0    3    9    2    1    0    2   13    0]
 [   0    0    1    4    0    0    0    0    0    0]]

Classification Report:
                precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00        30
      Backdoor       0.50      0.17      0.25         6
           DoS       0.38      0.43      0.40       601
      Exploits       0.63      0.71      0.67      1138
       Fuzzers       0.96      0.66      0.78       217
       Generic       1.00      0.99      0.99      2257
        Normal       1.00      1.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best parameters found:  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Final Confusion Matrix:
[[   0    0   29    1    0    0    0    0    0    0]
 [   0    0    2    4    0    0    0    0    0    0]
 [   0    0  451  144    1    0    0    5    0    0]
 [   0    0  339  782    3    0    0   13    1    0]
 [   0    0   61    9  143    0    0    4    0    0]
 [   0    0    2   22    1 2232    0    0    0    0]
 [   0    0    0    0    0    0 1803    0    0    0]
 [   0    0   55   48    0    0    0  243    0    0]
 [   0    0    1   18    0    0    0    6    5    0]
 [   0    0    0    5    0    0    0    0    0    0]]

Final Classification Report:
                precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00        30
      Backdoor       0.00      0.00      0.00         6
           DoS       0.48      0.75      0.59       601
      Exploits       0.76      0.69      0.72      1138
       Fuzzers       0.97      0.66      0.78  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['cybersecurity_model.pkl']