In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib  # For saving the model


In [3]:
# Load data
data = pd.read_csv('Train_data.csv')

# Create target variable: 1 for 'attack', 0 for 'normal'
y = data.iloc[:, -1].apply(lambda x: 1 if x == 'attack' else 0)
X = data.iloc[:, :-1]  # Features

# Check the shape of X
print("Number of features in training data:", X.shape[1])
print("Feature names:", X.columns.tolist())


Number of features in training data: 41
Feature names: ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']


In [4]:
# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Build a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_columns),
        ('num', StandardScaler(), X.select_dtypes(exclude=['object']).columns)
    ]
)


In [5]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Preprocess the data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


In [6]:
# Train the Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


In [7]:
# Evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5039

    accuracy                           1.00      5039
   macro avg       1.00      1.00      1.00      5039
weighted avg       1.00      1.00      1.00      5039

[[5039]]




In [8]:
# Save the trained model
joblib.dump(model, 'random_forest_model.pkl')
joblib.dump(preprocessor, 'preprocessor.pkl')


['preprocessor.pkl']

In [22]:
import pandas as pd
import numpy as np
import joblib

# Load the pre-trained model and preprocessor
model = joblib.load('random_forest_model.pkl')
preprocessor = joblib.load('preprocessor.pkl')

# Define column names based on the training dataset
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate'
]
a=1
# Define a function for predicting input data
def predict_input_data(input_data):
    try:
        # Convert input data from CSV format to a list of features
        input_list = input_data.split(",")

        # Ensure the input data has the correct number of features
        expected_feature_count = len(column_names)
        if len(input_list) != expected_feature_count:
            print(f"Error: Expected {expected_feature_count} features but got {len(input_list)}.")
            return

        # Create a DataFrame with the appropriate column names
        input_df = pd.DataFrame([input_list], columns=column_names)
        
        # Preprocess the input data
        input_data_transformed = preprocessor.transform(input_df)
        
        # Predict using the trained model
        prediction = model.predict(input_data_transformed)

        # Output the prediction result
        if prediction[0]==1:
            a=1
        else:
            a=0
        print("Prediction:", "attack" if prediction[0] == 1 else "normal")

    except ValueError as ve:
        print(f"Value error: {ve}")
    except Exception as e:
        print(f"Error processing input data: {e}")

input_data = input("Enter the data in CSV format (e.g., 0,tcp,ftp_data,...): ")
prediction_result = predict_input_data(input_data)
if a == 1:
    print("No threat")
else:
    print("Threat found")

Enter the data in CSV format (e.g., 0,tcp,ftp_data,...):  0,tcp,http,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0,0,1,0,0,30,255,1,0,0.03,0.04,0.03,0.01,0,0.01


Prediction: normal
No threat
