In [26]:
# Install required packages
!pip install google-generativeai pandas numpy joblib scikit-learn xgboost



In [27]:
import os
import random
import json
import pandas as pd
import numpy as np
import joblib
import pickle
from datetime import datetime
import google.generativeai as genai
from typing import Dict, Any, List, Tuple
import warnings
warnings.filterwarnings('ignore')

# Read the Gemini API key from the environment. Credentials must never be
# embedded in notebook source, where they leak through version control and
# shared copies.
# NOTE(review): the key that used to be hardcoded on this line has been
# exposed and should be revoked and replaced.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in the environment that starts "
        "the Jupyter kernel before running this notebook."
    )
genai.configure(api_key=GOOGLE_API_KEY)

# Smoke-test the Gemini API with a trivial prompt so that credential or
# connectivity problems surface here instead of in a later cell.
try:
    model = genai.GenerativeModel('gemini-2.0-flash')
    response = model.generate_content("Hello, Gemini! Can you hear me?")
    print("Gemini API test successful. Response:", response.text)
except Exception as e:
    # Best-effort check only: report the failure and let the notebook continue.
    print(f"Error connecting to Gemini API: {e}")


Gemini API test successful. Response: Yes, I can hear you! How can I help you today?



In [28]:
def load_model(model_name):
    """Load a serialized model from the ``models`` directory.

    Looks for ``models/<model_name>.joblib`` first and, only if that file is
    absent, for ``models/<model_name>.pkl``.

    Args:
        model_name: Base file name of the model; also the key into MODEL_INFO.

    Returns:
        A ``(model, feature_names)`` tuple, where ``feature_names`` is taken
        from ``MODEL_INFO[model_name]['feature_names']``.

    Raises:
        FileNotFoundError: If neither the .joblib nor the .pkl file exists.
    """
    # SECURITY: joblib.load and pickle.load execute arbitrary code embedded in
    # the file; only load model files from a trusted source.
    loaded = None
    found = False

    candidate = f"models/{model_name}.joblib"
    if os.path.exists(candidate):
        # Preferred format: joblib archive
        found = True
        loaded = joblib.load(candidate)
    else:
        # Fallback format: plain pickle
        candidate = f"models/{model_name}.pkl"
        if os.path.exists(candidate):
            found = True
            with open(candidate, 'rb') as handle:
                loaded = pickle.load(handle)

    if not found:
        raise FileNotFoundError(f"Model file for {model_name} not found in models directory")

    # Pair the model with the feature list registered for it
    return loaded, MODEL_INFO[model_name]['feature_names']

In [29]:
# Registry of the available classifiers. For each model name:
#   feature_names - the input columns the model expects, in order
#   is_pipeline   - flag stored alongside the features (not read in this cell)
MODEL_INFO = {
    'apache_model': dict(
        feature_names=['Message', 'hour', 'minute'],
        is_pipeline=True,
    ),
    'linux_log': dict(
        feature_names=['combined_text', 'hour', 'minute'],
        is_pipeline=True,
    ),
    'server_log': dict(
        feature_names=['Duration', 'Packets', 'Flows', 'Src Pt', 'Dst Pt', 'Bytes_num', 'hour', 'Proto'],
        is_pipeline=True,
    ),
    'ssh_login': dict(
        feature_names=['hour', 'minute', 'day_of_week', 'user_encoded', 'password_encoded'],
        is_pipeline=False,
    ),
    'weblog': dict(
        feature_names=['Request', 'Method', 'hour', 'minute'],
        is_pipeline=True,
    ),
}

# List every registered model together with the features it requires
for model_name, info in MODEL_INFO.items():
    print(f"\n{model_name}:\n  Features: {info['feature_names']}")


apache_model:
  Features: ['Message', 'hour', 'minute']

linux_log:
  Features: ['combined_text', 'hour', 'minute']

server_log:
  Features: ['Duration', 'Packets', 'Flows', 'Src Pt', 'Dst Pt', 'Bytes_num', 'hour', 'Proto']

ssh_login:
  Features: ['hour', 'minute', 'day_of_week', 'user_encoded', 'password_encoded']

weblog:
  Features: ['Request', 'Method', 'hour', 'minute']


In [30]:
def _first_log_line(raw_text):
    """Return the first non-empty line of a model reply, without any Markdown code fence."""
    text = raw_text.strip()
    if text.startswith('```') and text.endswith('```'):
        inner = text.strip('`')
        # In a multi-line fenced block, everything up to the first newline is
        # the fence's info string (e.g. "text", "log" or empty), not log content.
        _info, newline, rest = inner.partition('\n')
        if newline:
            inner = rest
        text = inner
    for line in text.splitlines():
        line = line.strip()
        if line:
            return line
    return ''


def generate_log_with_gemini(model_name):
    """Use the Gemini API to generate one synthetic log line for a model type.

    Args:
        model_name: Key selecting the prompt ('apache_model', 'linux_log',
            'server_log', 'ssh_login' or 'weblog'). Any other value falls back
            to a generic single-sentence prompt built from the name.

    Returns:
        The first non-empty line of Gemini's reply, with surrounding Markdown
        code fences (including a language tag such as ```text) removed.
        An empty string is returned if the reply contains no text.

    Raises:
        Whatever ``genai`` raises for a failed request or an unreadable
        response; no errors are caught here.
    """
    # Prepare prompts based on the model type with specific format instructions
    prompts = {
        'apache_model': """Generate only ONE realistic Apache web server log entry in this format:
IP_ADDRESS - - [DAY/MONTH/YEAR:HOUR:MINUTE:SECOND +TIMEZONE] "METHOD /PATH HTTP/1.1" STATUS_CODE BYTES_SENT "REFERER" "USER_AGENT"

Example: 192.168.1.1 - - [25/Mar/2023:10:15:23 +0000] "GET /index.html HTTP/1.1" 200 1024 "http://example.com" "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"

The log should include:
- A timestamp with hour and minute
- HTTP request details

IMPORTANT: Generate only a SINGLE log line, not multiple entries.""",

        'linux_log': """Generate only ONE realistic Linux system log entry in this format:
Month Day HH:MM:SS HOSTNAME SERVICE[PID]: MESSAGE

Example: Mar 25 10:15:23 server kernel[1234]: CPU temperature above threshold, throttling CPU

The log should include:
- A timestamp with hour and minute
- A service name and message

IMPORTANT: Generate only a SINGLE log line, not multiple entries.""",

        'server_log': """Generate only ONE realistic server performance log with these specific metrics:
Duration: NUMBER Packets: NUMBER Flows: NUMBER Src Port: NUMBER Dst Port: NUMBER Bytes: NUMBER Protocol: PROTOCOL_NAME

Example: Client 192.168.1.10 connected to server - Duration: 145 Packets: 23 Flows: 3 Src Port: 34567 Dst Port: 80 Bytes: 45678 Protocol: TCP

The log must include exact numeric values for:
- Duration, Packets, Flows, Src Port (Src Pt), Dst Port (Dst Pt), Bytes
- Protocol should be one of: TCP, UDP, or ICMP
- Include a timestamp with hour

IMPORTANT: Generate only a SINGLE log line, not multiple entries.""",

        'ssh_login': """Generate only ONE realistic SSH login attempt log in this format:
Month Day HH:MM:SS HOSTNAME sshd[PID]: MESSAGE

Example: Mar 25 10:15:23 server sshd[1234]: Failed password for invalid user admin from 192.168.1.100 port 22 ssh2

The log should include:
- A timestamp with hour and minute
- Day of week information
- User information
- Password status (valid/invalid)

IMPORTANT: Generate only a SINGLE log line, not multiple entries.""",

        'weblog': """Generate only ONE realistic web application log entry in this format:
[TIMESTAMP] "METHOD /PATH HTTP/1.1" STATUS_CODE RESPONSE_TIME "REFERER" IP_ADDRESS USER_ID

Example: [2023-03-25 10:15:23] "GET /login HTTP/1.1" 200 5.2ms "http://example.com" 192.168.1.100 user123

The log must include:
- HTTP method (GET, POST, PUT, DELETE)
- URL path
- A timestamp with hour and minute

IMPORTANT: Generate only a SINGLE log line, not multiple entries."""
    }

    # Get the appropriate prompt
    prompt = prompts.get(model_name, f"Generate a single realistic {model_name} log entry.")

    # Additional instructions for better output
    prompt += "\n\nIMPORTANT: Return ONLY ONE raw log text without any additional explanation, markdown formatting, or quotes. Do not generate multiple log entries."

    # Call the Gemini API
    model = genai.GenerativeModel('gemini-2.0-flash')
    response = model.generate_content(prompt)

    # Despite the instructions, the model may still wrap its reply in a code
    # fence or return several lines; reduce it to a single raw log line.
    return _first_log_line(response.text)

In [31]:
def _extract_json_object(text):
    """Return the span from the first '{' to the last '}' in text, or the stripped text if there is none."""
    start = text.find('{')
    end = text.rfind('}')
    if start != -1 and end > start:
        return text[start:end + 1]
    return text.strip()


def parse_log_to_features(log_text, model_name):
    """Parse a log message into the features needed by the specified model.

    For 'server_log' the metrics are first extracted directly with regular
    expressions; if every metric is found, that result is returned without
    calling Gemini. Otherwise (and for every other model) Gemini is asked to
    return the features as a JSON object, which is then validated: missing
    features get defaults and numeric features are coerced to int.

    Args:
        log_text: The raw log line.
        model_name: Key into MODEL_INFO selecting the required feature names.

    Returns:
        A dict of feature values. If Gemini's reply cannot be read or parsed
        as a JSON object, a dict of placeholder defaults (current time,
        generic request/metrics) restricted to the model's features is
        returned instead; those values are NOT derived from the log.

    Raises:
        KeyError: If model_name is not in MODEL_INFO.
        Errors raised by the Gemini request itself are not caught.
    """
    # Local import: `re` is not among the notebook's top-level imports.
    import re

    features = MODEL_INFO[model_name]['feature_names']

    # Feature groups shared by the default-filling and int-coercion steps
    small_int_features = ['hour', 'minute', 'day_of_week', 'user_encoded', 'password_encoded']
    metric_features = ['Duration', 'Packets', 'Flows', 'Src Pt', 'Dst Pt', 'Bytes_num']

    # If it's a server log, try to extract features directly with regex first
    if model_name == 'server_log':
        try:
            # Extract hour; fall back to the current hour when no HH:MM:SS is present
            hour = datetime.now().hour  # Default
            time_match = re.search(r'(\d{1,2}):\d{1,2}:\d{1,2}', log_text)
            if time_match:
                hour = int(time_match.group(1))

            # Extract performance metrics
            duration = re.search(r'Duration:\s*(\d+)', log_text)
            packets = re.search(r'Packets:\s*(\d+)', log_text)
            flows = re.search(r'Flows:\s*(\d+)', log_text)
            src_pt = re.search(r'Src\s*P[or]*t:\s*(\d+)', log_text)
            dst_pt = re.search(r'Dst\s*P[or]*t:\s*(\d+)', log_text)
            bytes_num = re.search(r'Bytes:\s*(\d+)', log_text)
            proto = re.search(r'Protocol:\s*(\w+)', log_text)

            # Only trust the direct extraction when every metric was found
            if all([duration, packets, flows, src_pt, dst_pt, bytes_num, proto]):
                return {
                    'Duration': int(duration.group(1)),
                    'Packets': int(packets.group(1)),
                    'Flows': int(flows.group(1)),
                    'Src Pt': int(src_pt.group(1)),
                    'Dst Pt': int(dst_pt.group(1)),
                    'Bytes_num': int(bytes_num.group(1)),
                    'hour': hour,
                    'Proto': proto.group(1)
                }
        except Exception as e:
            print(f"Error in direct extraction: {e} - falling back to Gemini")

    # If direct extraction failed or for other model types, use Gemini
    feature_explanations = {
        'apache_model': """
- 'Message': The HTTP request part (e.g., "GET /index.html HTTP/1.1")
- 'hour': Hour extracted from timestamp (0-23)
- 'minute': Minute extracted from timestamp (0-59)
        """,

        'linux_log': """
- 'combined_text': Combine the service name and message text
- 'hour': Hour extracted from timestamp (0-23)
- 'minute': Minute extracted from timestamp (0-59)
        """,

        'server_log': """
- 'Duration': Numeric value after "Duration:" (must be an integer)
- 'Packets': Numeric value after "Packets:" (must be an integer)
- 'Flows': Numeric value after "Flows:" (must be an integer)
- 'Src Pt': Numeric value of source port (must be an integer)
- 'Dst Pt': Numeric value of destination port (must be an integer)
- 'Bytes_num': Numeric value of bytes (must be an integer)
- 'hour': Hour extracted from timestamp (must be an integer 0-23)
- 'Proto': Protocol name (TCP, UDP, or ICMP) (must be a string)
        """,

        'ssh_login': """
- 'hour': Hour extracted from timestamp (0-23) (must be an integer)
- 'minute': Minute extracted from timestamp (0-59) (must be an integer)
- 'day_of_week': Day of week as number (0=Monday, 1=Tuesday, ..., 6=Sunday) (must be an integer)
- 'user_encoded': 0 for 'root', 1 for 'admin', 2 for any other user (must be an integer)
- 'password_encoded': 0 for valid password, 1 for invalid password, 2 for unknown (must be an integer)
        """,

        'weblog': """
- 'Request': The URL path portion (e.g., "/login", "/profile", etc.) (must be a string)
- 'Method': The HTTP method (GET, POST, PUT, DELETE) (must be a string)
- 'hour': Hour extracted from timestamp (0-23) (must be an integer)
- 'minute': Minute extracted from timestamp (0-59) (must be an integer)
        """
    }

    explanation = feature_explanations.get(model_name, "")

    prompt = f"""
    Parse the following log message and extract exactly these features: {features}.

    Log: {log_text}

    Feature details:{explanation}

    Return ONLY a valid JSON object with the feature names as keys matching these exact requirements:
    1. Include ONLY the specified feature names as keys
    2. For numeric features, use integer values (not strings)
    3. For text features, use strings
    4. If you can't extract a feature, use reasonable defaults

    Example output for server_log:
    {{"Duration": 145, "Packets": 23, "Flows": 3, "Src Pt": 34567, "Dst Pt": 80, "Bytes_num": 45678, "hour": 10, "Proto": "TCP"}}
    """

    # Call the Gemini API
    model = genai.GenerativeModel('gemini-2.0-flash')
    response = model.generate_content(prompt)

    # Parse the JSON response
    try:
        # Debug information
        print(f"Gemini response for feature extraction: {response.text.strip()}")

        # Isolate the JSON object itself. The reply may be wrapped in a
        # ```json fence and/or surrounded by explanatory prose; taking the
        # outermost {...} span handles all of those shapes.
        clean_text = _extract_json_object(response.text)

        features_dict = json.loads(clean_text)
        if not isinstance(features_dict, dict):
            raise ValueError("expected a JSON object with feature names as keys")

        # Verify all required features are present
        for feature in features:
            if feature not in features_dict:
                # Add default values for missing features
                if feature in small_int_features:
                    features_dict[feature] = 0
                elif feature in metric_features:
                    features_dict[feature] = 100  # Default value
                elif feature == 'Proto':
                    features_dict[feature] = 'TCP'
                else:
                    features_dict[feature] = ''

        # Ensure all numeric values are actually integers
        for feature in small_int_features + metric_features:
            if feature in features_dict:
                try:
                    features_dict[feature] = int(features_dict[feature])
                except (ValueError, TypeError):
                    # If conversion fails, set default
                    features_dict[feature] = 0 if feature in small_int_features else 100

        return features_dict
    except Exception as e:
        print(f"Error in JSON parsing: {e}")
        # Create a basic dictionary with default values
        now = datetime.now()
        defaults = {
            'Message': log_text[:50],
            'combined_text': log_text[:50],
            'Request': '/index.html',
            'Method': 'GET',
            'hour': now.hour,
            'minute': now.minute,
            'day_of_week': now.weekday(),
            'user_encoded': 2,  # Unknown
            'password_encoded': 2,  # Unknown
            'Duration': 100,
            'Packets': 10,
            'Flows': 1,
            'Src Pt': 12345,
            'Dst Pt': 80,
            'Bytes_num': 1024,
            'Proto': 'TCP'
        }
        # Filter to only include the features needed by this model
        return {k: defaults.get(k, 0) for k in features}

In [32]:
def process_random_log():
    """Run one end-to-end demo: random model -> generated log -> features -> prediction.

    Returns:
        On success, a dict with keys "model", "log", "features", "prediction"
        (as a string), "category" ("suspicious" when the prediction's string
        form is "1", otherwise "normal") and "probabilities" (class label ->
        probability; empty when unavailable).
        If classification raises, a dict with "model", "log", "features" and
        "error" (the exception message) is returned instead.
    """
    # 1. Pick one of the registered models at random
    selected_model = random.choice(list(MODEL_INFO.keys()))
    print(f"\n===== Selected Model: {selected_model} =====")

    # 2. Have Gemini produce a synthetic log line for that model type
    print("\nGenerating log message...")
    log_text = generate_log_with_gemini(selected_model)
    print(f"\nGenerated log:\n{log_text}")

    # 3. Turn the log line into the model's input features
    print("\nExtracting features...")
    features = parse_log_to_features(log_text, selected_model)
    print(f"Extracted features: {features}")

    # 4. Load the serialized classifier and its expected feature order
    print("\nLoading model...")
    classifier, feature_names = load_model(selected_model)

    # 5. Build the input frame for the classifier
    # NOTE(review): features_to_dataframe is not defined in any cell visible
    # here - confirm it is defined before this cell runs on a fresh kernel.
    print("\nPreparing features for classification...")
    features_df = features_to_dataframe(features, feature_names)
    print("Features DataFrame:")
    print(features_df)

    # 6. Predict; any failure from here on is reported in the returned dict
    print("\nClassifying log...")
    try:
        prediction = classifier.predict(features_df)[0]

        # Class probabilities are optional: not every estimator exposes them
        probabilities = {}
        if hasattr(classifier, 'predict_proba'):
            try:
                row_probs = classifier.predict_proba(features_df)[0]
                labels = classifier.classes_
                probabilities = {
                    str(label): float(row_probs[idx])
                    for idx, label in enumerate(labels)
                }
            except Exception as e:
                print(f"Could not get probabilities: {e}")

        # 7. Report the outcome
        print("\n===== Classification Result =====")
        print(f"Classification: {prediction}")
        flagged = str(prediction) == '1'
        if flagged:
            interpretation = 'Suspicious/Issue detected'
        else:
            interpretation = 'Normal log'
        print(f"Interpretation: {interpretation}")
        if probabilities:
            print("Probabilities:")
            for label, prob in probabilities.items():
                print(f"  - Class {label}: {prob:.4f}")

        outcome = {
            "model": selected_model,
            "log": log_text,
            "features": features,
            "prediction": str(prediction),
            "category": "suspicious" if flagged else "normal",
            "probabilities": probabilities,
        }
        return outcome
    except Exception as e:
        print(f"\nError during classification: {e}")
        failure = {
            "model": selected_model,
            "log": log_text,
            "features": features,
            "error": str(e),
        }
        return failure

In [33]:
# Run the whole pipeline once: pick a random model, generate a log with Gemini,
# extract its features and classify it. The returned summary dict (or error
# dict, if classification failed) is kept in `result` for inspection.
result = process_random_log()


===== Selected Model: linux_log =====

Generating log message...

Generated log:
Oct 26 14:32:17 fileserver smbd[4567]: User 'johndoe' authentication failed for service 'data'

Extracting features...
Gemini response for feature extraction: ```json
{
    "combined_text": "data authentication failed for service 'data'",
    "hour": 14,
    "minute": 32
}
```
Extracted features: {'combined_text': "data authentication failed for service 'data'", 'hour': 14, 'minute': 32}

Loading model...

Preparing features for classification...
Features DataFrame:
                                   combined_text  hour  minute
0  data authentication failed for service 'data'    14      32

Classifying log...

===== Classification Result =====
Classification: 1
Interpretation: Suspicious/Issue detected
Probabilities:
  - Class 0: 0.0087
  - Class 1: 0.9913
