## The analysis is divided into three parts:\n","1. Training SVM with all features\n","2. Comparing different kernels\n","3. Visualizing decision boundaries using selected features\n"

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import time
from datetime import datetime

In [3]:
np.random.seed(42)


In [4]:
df= pd.read_csv(r"C:\Users\17063\Downloads\kddcup99_csv 1.csv")
print(df.head())

print(df.tail())

   duration protocol_type service flag  src_bytes  dst_bytes  land  \
0         0           tcp    http   SF        181       5450     0   
1         0           tcp    http   SF        239        486     0   
2         0           tcp    http   SF        235       1337     0   
3         0           tcp    http   SF        219       1337     0   
4         0           tcp    http   SF        217       2032     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                   9   
1               0       0    0  ...                  19   
2               0       0    0  ...                  29   
3               0       0    0  ...                  39   
4               0       0    0  ...                  49   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                     1.0                     0.0   
1                     1.0                     0.0   
2                     1.0                     0.0   
3           

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import os

def examine_dataset(file_path):
    """
    Examine the dataset structure and print relevant information
    """
    try:
        df = pd.read_csv(file_path)
        print("\nDataset Information:")
        print("-" * 50)
        print(f"Number of rows: {len(df)}")
        print(f"Number of columns: {len(df.columns)}")
        print("\nColumn names:")
        for i, col in enumerate(df.columns):
            print(f"{i+1}. {col}")
        print("\nFirst few rows:")
        print(df.head())
        return df
    except Exception as e:
        print(f"Error examining dataset: {str(e)}")
        raise

def load_and_process_kdd99(file_path):
    """
    Load and preprocess the KDD99 dataset with flexible column handling
    """
    try:
        print("Loading and examining dataset...")
        df = examine_dataset(file_path)
        
        # Check if 'attack_type' column exists or find alternative
        attack_col = None
        possible_attack_columns = ['attack_type', 'attack', 'class', 'label', 'type']
        
        for col in df.columns:
            # Check for exact matches
            if col.lower() in possible_attack_columns:
                attack_col = col
                break
            # Check for partial matches
            for possible_col in possible_attack_columns:
                if possible_col in col.lower():
                    attack_col = col
                    break
            if attack_col:
                break
        
        if not attack_col:
            print("\nAvailable columns:", df.columns.tolist())
            raise ValueError("Could not find attack type column. Please specify the correct column name.")
        
        print(f"\nUsing '{attack_col}' as attack type column")
        print("\nUnique values in attack column:", df[attack_col].unique())
        
        # Create binary labels (DoS vs Non-DoS)
        print("\nCreating binary labels...")
        # Adjust attack names based on your actual data
        dos_attacks = ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop',
                      'back.', 'land.', 'neptune.', 'pod.', 'smurf.', 'teardrop.',
                      'DOS', 'dos', 'DoS']
        df['binary_label'] = df[attack_col].apply(
            lambda x: 1 if str(x).strip().lower() in [a.lower() for a in dos_attacks] else 0
        )
        
        print(f"\nDoS attacks found: {df['binary_label'].sum()} instances")
        print(f"Non-DoS instances: {len(df) - df['binary_label'].sum()} instances")
        
        # Convert categorical variables to numerical
        print("\nConverting categorical variables...")
        categorical_columns = ['protocol_type', 'service', 'flag']
        # Only use categorical columns that exist in the dataset
        categorical_columns = [col for col in categorical_columns if col in df.columns]
        label_encoders = {}
        
        for column in categorical_columns:
            label_encoders[column] = LabelEncoder()
            df[column] = label_encoders[column].fit_transform(df[column])
        
        # Prepare features and target
        X = df.drop([attack_col, 'binary_label'], axis=1)
        y = df['binary_label']
        
        print("\nFeature matrix shape:", X.shape)
        print("Target vector shape:", y.shape)
        
        # Split the data
        print("\nSplitting data...")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        
        # Scale the features
        print("Scaling features...")
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        return X_train_scaled, X_test_scaled, y_train, y_test, scaler, label_encoders
        
    except Exception as e:
        print(f"\nError in data processing: {str(e)}")
        raise

def train_and_evaluate_model(X_train_scaled, X_test_scaled, y_train, y_test):
    """
    Train and evaluate the SVM model
    """
    try:
        print("\nTraining SVM model...")
        svm_model = SVC(kernel='rbf', random_state=42)
        svm_model.fit(X_train_scaled, y_train)
        
        print("Making predictions...")
        y_pred = svm_model.predict(X_test_scaled)
        
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))
        print("\nConfusion Matrix:")
        print(confusion_matrix(y_test, y_pred))
        
        return svm_model
        
    except Exception as e:
        print(f"Error in model training: {str(e)}")
        raise

def main():
    try:
        # Use the specific file path
        file_path = r"C:\Users\17063\Downloads\kddcup99_csv 1.csv"
        
        # Load and preprocess the data
        X_train_scaled, X_test_scaled, y_train, y_test, scaler, label_encoders = load_and_process_kdd99(file_path)
        
        # Train and evaluate the model
        model = train_and_evaluate_model(X_train_scaled, X_test_scaled, y_train, y_test)
        
        return model, scaler, label_encoders
        
    except Exception as e:
        print(f"\nProgram failed: {str(e)}")
        raise

if __name__ == "__main__":
    main()
      
    

Loading and examining dataset...

Dataset Information:
--------------------------------------------------
Number of rows: 494020
Number of columns: 42

Column names:
1. duration
2. protocol_type
3. service
4. flag
5. src_bytes
6. dst_bytes
7. land
8. wrong_fragment
9. urgent
10. hot
11. num_failed_logins
12. logged_in
13. lnum_compromised
14. lroot_shell
15. lsu_attempted
16. lnum_root
17. lnum_file_creations
18. lnum_shells
19. lnum_access_files
20. lnum_outbound_cmds
21. is_host_login
22. is_guest_login
23. count
24. srv_count
25. serror_rate
26. srv_serror_rate
27. rerror_rate
28. srv_rerror_rate
29. same_srv_rate
30. diff_srv_rate
31. srv_diff_host_rate
32. dst_host_count
33. dst_host_srv_count
34. dst_host_same_srv_rate
35. dst_host_diff_srv_rate
36. dst_host_same_src_port_rate
37. dst_host_srv_diff_host_rate
38. dst_host_serror_rate
39. dst_host_srv_serror_rate
40. dst_host_rerror_rate
41. dst_host_srv_rerror_rate
42. label

First few rows:
   duration protocol_type service flag 

ValueError: could not convert string to float: 'normal'

In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import time
from datetime import datetime

# Load and preprocess data (same as before)
print("Loading dataset...")
cols = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 
        'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
        'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
        'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
        'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
        'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
        'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
        'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack_type']

df= pd.read_csv(r"C:\Users\17063\Downloads\kddcup99_csv 1.csv")

print("Preprocessing data...")
# Create binary labels
dos_attacks = ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop']
df['binary_label'] = df['attack_type'].apply(lambda x: 1 if x in dos_attacks else 0)

# Convert categorical variables
categorical_columns = ['protocol_type', 'service', 'flag']
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Prepare features and target
X = df.drop(['attack_type', 'binary_label'], axis=1)
y = df['binary_label']

# Split and scale data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define kernels to test
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
results = {}

# Train and evaluate models with different kernels
for kernel in kernels:
    print(f"\nTraining SVM with {kernel} kernel...")
    start_time = time.time()
    
    # Initialize and train model
    svm_model = SVC(kernel=kernel, random_state=42)
    svm_model.fit(X_train_scaled, y_train)
    
    # Make predictions
    y_pred = svm_model.predict(X_test_scaled)
    
    # Calculate training time
    training_time = time.time() - start_time
    
    # Store results
    results[kernel] = {
        'training_time': training_time,
        'classification_report': classification_report(y_test, y_pred),
        'confusion_matrix': confusion_matrix(y_test, y_pred)
    }

# Print results
print("\n=== COMPARISON OF SVM KERNELS ===")
for kernel in kernels:
    print(f"\n{'-'*50}")
    print(f"Kernel: {kernel}")
    print(f"Training Time: {results[kernel]['training_time']:.2f} seconds")
    print("\nClassification Report:")
    print(results[kernel]['classification_report'])
    print("\nConfusion Matrix:")
    print(results[kernel]['confusion_matrix'])

Loading dataset...
Preprocessing data...


KeyError: 'attack_type'

In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load the dataset
print("Loading dataset...")
cols = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 
        'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
        'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
        'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
        'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
        'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
        'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
        'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack_type']

file_path = r"C:\Users\17063\Downloads\kddcup99_csv 1.csv"
df = pd.read_csv(file_path, names=cols, header=None, low_memory=False)
print("Dataset loaded successfully!")

# Normalize the 'attack_type' values to handle formatting issues
df['attack_type'] = df['attack_type'].str.strip()

# Create binary labels (DoS vs Non-DoS)
dos_attacks = ['back.', 'land.', 'neptune.', 'pod.', 'smurf.', 'teardrop.']
df['binary_label'] = df['attack_type'].apply(lambda x: 1 if x in dos_attacks else 0)

# Select only the two features we want
X = df[['count', 'srv_count']]
y = df['binary_label']

# Ensure no missing values in selected features
X = X.dropna()
y = y.loc[X.index]  # Ensure labels align with cleaned features

# Check class distribution
print("Class distribution before balancing:")
print(y.value_counts())

# Balance the dataset
print("Balancing dataset...")
class_counts = y.value_counts()
min_class_count = class_counts.min()

class_0 = df[df['binary_label'] == 0].sample(n=min_class_count, random_state=42)
class_1 = df[df['binary_label'] == 1].sample(n=min_class_count, random_state=42)

balanced_df = pd.concat([class_0, class_1])

# Update X and y after balancing
X = balanced_df[['count', 'srv_count']]
y = balanced_df['binary_label']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

def plot_decision_boundary(X, y, model, title):
    plt.figure(figsize=(10, 8))
    
    # Create a mesh grid
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))
    
    # Make predictions for each point in the mesh
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    # Plot the decision boundary and training points
    plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.coolwarm)
    plt.scatter(X[:, 0], X[:, 1], c=y, alpha=0.8, edgecolor='k', cmap=plt.cm.coolwarm)
    plt.xlabel('Normalized count')
    plt.ylabel('Normalized srv_count')
    plt.title(title)
    plt.colorbar()

# Train and plot Linear SVM
print("Training Linear SVM...")
linear_svm = SVC(kernel='linear', random_state=42)
linear_svm.fit(X_train, y_train)
plot_decision_boundary(X_scaled, y, linear_svm, 'Linear SVM Decision Boundary')

# Train and plot RBF SVM
print("Training RBF SVM...")
rbf_svm = SVC(kernel='rbf', random_state=42)
rbf_svm.fit(X_train, y_train)
plot_decision_boundary(X_scaled, y, rbf_svm, 'RBF SVM Decision Boundary')

# Print accuracy scores
print("\nLinear SVM accuracy:", linear_svm.score(X_test, y_test))
print("RBF SVM accuracy:", rbf_svm.score(X_test, y_test))

plt.show()



Loading dataset...
Dataset loaded successfully!
Class distribution before balancing:
0    494021
Name: binary_label, dtype: int64
Balancing dataset...


ValueError: a must be greater than 0 unless no samples are taken