In [1]:
# Necessary Libraries

import pandas as pd
from datetime import datetime ,timedelta
from collections import defaultdict, Counter
import random
import re
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [4]:
# Generate Modal Apache Logs

def generate_http_log(start_date="2017-11-29", end_date="2017-12-10", num_rows=1000):
     # Convert date range
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")
    total_seconds = int((end - start).total_seconds())

    
    # Generate random timestamps in that day
    timestamps = [
        "[" + (start + timedelta(seconds=random.randint(0, total_seconds))).strftime("%d/%b/%Y:%H:%M:%S") + "]"
        for _ in range(num_rows)
    ]

    
    # Simulate IPs (10.x.x.x range)
    ips = [f"10.{random.randint(120,140)}.{random.randint(0,5)}.{random.randint(1,255)}"
           for _ in range(num_rows)]
    
    # HTTP methods, URLs, and versions
    methods = ["GET", "POST", "HEAD"]
    urls = [
        "/login.php", "/home.php", "/process.php", "/index.html",
        "/contact.html", "/products.html", "/api/data", "/js/script.js",
        "/cs/index.html", "/cs/research.html", "/cgi-bin/formmail.cgi"
    ]
    http_versions = ["HTTP/1.1", "HTTP/2"]
    requests = [f"{random.choice(methods)} {random.choice(urls)} {random.choice(http_versions)}"
                for _ in range(num_rows)]
    
    # Status codes
    status_codes = [200, 301, 302, 404, 500]
    statuses = [random.choice(status_codes) for _ in range(num_rows)]
    
    # User agents
    user_agents = [
        "Mozilla/5.0", "curl/7.58.0", "PostmanRuntime/7.28.0",
        "Googlebot/2.1", "Wget/1.20.3", "Safari/537.36",
        "Chrome/120.0", "Edge/18.18363"
    ]
    user_agent_list = [random.choice(user_agents) for _ in range(num_rows)]
    
    # Build DataFrame
    logs = pd.DataFrame({
        "IP": ips,
        "Time": timestamps,
        "URL": requests,
        "Status": statuses,
        "User_Agent": user_agent_list
    })
    
    return logs

# Generate logs across multiple days
web_logs = generate_http_log(start_date="2017-11-29", end_date="2017-12-10", num_rows=150)

# Show preview
print(web_logs.head())

# Save to CSV
web_logs.to_csv("weblog.csv", index=False)
print("✅ weblog saved as weblog.csv")


             IP                    Time                               URL  \
0   10.135.3.84  [03/Dec/2017:21:24:02]  GET /cgi-bin/formmail.cgi HTTP/2   
1  10.130.1.113  [29/Nov/2017:19:18:42]        GET /contact.html HTTP/1.1   
2   10.129.2.31  [02/Dec/2017:21:22:50]      POST /products.html HTTP/1.1   
3  10.126.3.169  [30/Nov/2017:08:16:26]             HEAD /home.php HTTP/2   
4  10.134.3.183  [03/Dec/2017:10:29:13]        POST /products.html HTTP/2   

   Status             User_Agent  
0     301  PostmanRuntime/7.28.0  
1     500            curl/7.58.0  
2     500          Edge/18.18363  
3     302          Edge/18.18363  
4     301          Safari/537.36  
✅ weblog saved as weblog.csv


In [2]:
#Load the Logs
logs =("csic_database.csv")
print(f"\nLoading the Web logs from {logs}")
df=pd.read_csv(logs)


Loading the Web logs from csic_database.csv


In [5]:

print("\nScheme Overview:\n",df.info)
# Check missing values
print(df.isnull().sum())

# Distribution of labels
print(df["classification"].value_counts())



Scheme Overview:
 <bound method DataFrame.info of       Unnamed: 0 Method                                         User-Agent  \
0         Normal    GET  Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...   
1         Normal    GET  Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...   
2         Normal   POST  Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...   
3         Normal    GET  Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...   
4         Normal   POST  Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...   
...          ...    ...                                                ...   
61060  Anomalous    GET  Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...   
61061  Anomalous   POST  Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...   
61062  Anomalous    GET  Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...   
61063  Anomalous    GET  Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...   
61064  Anomalous    GET  Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...   

         Pra

In [12]:
# Actual column names
print(df.columns)

# Use the correct column (replace "URL" with yours if needed)
df["url_length"] = df["URL"].str.len()
df["num_question_marks"] = df["URL"].str.count(r"\?")
df["num_equals"] = df["URL"].str.count("=")
df["num_percent"] = df["URL"].str.count("%")
df["num_slash"] = df["URL"].str.count("/")
df["num_special"] = df["URL"].str.count(r"[@#$&]")

# Handle missing values
if "content_length" in df.columns:
    df["content_length"] = df["content_length"].fillna(0)

# Drop unused
df = df.drop(columns=["cookie"], errors="ignore")


Index(['Unnamed: 0', 'Method', 'User-Agent', 'Pragma', 'Cache-Control',
       'Accept', 'Accept-encoding', 'Accept-charset', 'language', 'host',
       'cookie', 'content-type', 'connection', 'lenght', 'content',
       'classification', 'URL', 'url_length', 'num_question_marks',
       'num_equals', 'num_percent', 'num_slash', 'num_special'],
      dtype='object')


In [11]:
from sklearn.preprocessing import LabelEncoder

# Encode method, host, etc.
for col in ["method", "host", "user_agent"]:
    if col in df.columns:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))


In [11]:
# Train the Isolation Forest model
# Isolation Forest learns what "normal" looks like and finds anomalies

model = IsolationForest(
    n_estimators=200,     # number of trees
    contamination=0.05,   # assume ~5% anomalies (you can adjust)
    random_state=42
)

# Build a pipeline: preprocessing -> anomaly detection
pipe = Pipeline([("pre", pre), ("model", model)])

# Train the model
pipe.fit(feat)


In [13]:
# Score each request
# Higher anomaly_score = more unusual

# Get anomaly scores
scores = pipe.named_steps["model"].score_samples(pipe.named_steps["pre"].transform(feat))

# Negative sign so higher = more anomalous
df["anomaly_score"] = -scores

# Auto threshold based on quantile
threshold = df["anomaly_score"].quantile(0.95)

# Create anomaly flag
df["is_anomaly"] = df["anomaly_score"].apply(lambda x: -1 if x >= threshold else 1)

# Sort logs by anomaly score
df = df.sort_values("anomaly_score", ascending=False)

# Show top suspicious rows
df[["IP","URL","Status","User_Agent","anomaly_score","is_anomaly"]].head(10)


Unnamed: 0,IP,URL,Status,User_Agent,anomaly_score,is_anomaly
0,10.135.3.84,GET /cgi-bin/formmail.cgi HTTP/2,301,PostmanRuntime/7.28.0,0.60604,-1
90,10.131.2.79,GET /api/data HTTP/1.1,200,Edge/18.18363,0.599767,-1
104,10.121.2.108,HEAD /api/data HTTP/2,302,PostmanRuntime/7.28.0,0.599034,-1
37,10.137.3.51,GET /contact.html HTTP/1.1,302,Googlebot/2.1,0.596833,-1
92,10.124.2.29,GET /cs/research.html HTTP/1.1,302,Chrome/120.0,0.59622,-1
108,10.120.0.200,HEAD /js/script.js HTTP/1.1,500,Edge/18.18363,0.586256,-1
8,10.130.4.243,POST /home.php HTTP/2,302,Chrome/120.0,0.584733,-1
53,10.132.2.99,POST /api/data HTTP/1.1,200,Chrome/120.0,0.576351,-1
28,10.139.3.37,POST /home.php HTTP/1.1,301,Chrome/120.0,0.575839,1
120,10.126.1.72,HEAD /cs/index.html HTTP/2,301,Googlebot/2.1,0.573039,1


In [16]:
# Save anomalies with full context
df_out = pd.concat([ids, feat, df[["anomaly_score", "is_anomaly"]]], axis=1)
df_out.to_csv("anomalies.csv", index=False)

print("✅ Anomalies saved to anomalies.csv with anomaly flag")


✅ Anomalies saved to anomalies.csv with anomaly flag
