In [1]:
!pip install pandas scikit-learn joblib matplotlib




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [18]:

# Define the paths to the folders
root_dir = "C:\\Users\\akars\\OneDrive\\Desktop\\os model\\archive (1)\\ADFA-IDS_DATASETS\\ADFA-LD\\ADFA-LD"
training_dir = os.path.join(root_dir, "Training_Data_Master")
attack_dir = os.path.join(root_dir, "Attack_Data_Master")
validation_dir = os.path.join(root_dir, "Validation_Data_Master")

# Read the files and labels
def read_files(directory, label):
    files = []
    labels = []
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        with open(filepath, "r") as file:
            files.append(file.read())
            labels.append(label)
    return files, labels

# Read the training data
training_files, training_labels = read_files(training_dir, 0)

# Read the attack data
attack_files = []
attack_labels = []
for subdir in os.listdir(attack_dir):
    subdir_path = os.path.join(attack_dir, subdir)
    files, labels = read_files(subdir_path, 1)
    attack_files.extend(files)
    attack_labels.extend(labels)

# Read the validation data
validation_files, validation_labels = read_files(validation_dir, 0)

# Combine the data
files = training_files + attack_files + validation_files
labels = training_labels + attack_labels + validation_labels

In [19]:

# Vectorize the data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(files)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)



In [20]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred=clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9638958858102435


In [23]:
#inorder to check weather the data was properly loaded or not
print("Training samples:", len(training_files))
print("Attack samples:", len(attack_files))
print("Validation samples:", len(validation_files))
print("Total samples:", len(files))
print("Sample training file content:\n", training_files[0][:300]) 
print("Corresponding label:", training_labels[0])
print("Shape of vectorized data (X):", X.shape)

Training samples: 833
Attack samples: 746
Validation samples: 4372
Total samples: 5951
Sample training file content:
 6 6 63 6 42 120 6 195 120 6 6 114 114 1 1 252 252 252 1 1 1 1 1 1 1 1 1 252 252 252 252 252 252 252 252 252 252 252 252 252 252 252 252 252 252 1 1 252 1 1 1 1 1 1 1 1 1 1 1 1 1 1 252 1 1 1 1 1 1 252 252 252 252 252 252 252 252 252 252 252 1 1 1 1 1 1 1 1 1 1 252 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
Corresponding label: 0
Shape of vectorized data (X): (5951, 167)


In [25]:
import joblib

In [26]:
joblib.dump(clf, 'random_forest_ids.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
print("Saved ")

Saved 
