In [11]:
import json
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [15]:
def load_data(base_path):
    """Load every labelled transaction under ``base_path`` as one text row.

    Reads ``{base_path}/clean/*.jsonl`` (label 0) and
    ``{base_path}/malware/*.jsonl`` (label 1). Each line is parsed as JSON and
    the element at index 2 is read as a mapping with optional ``rqs`` and
    ``rsp`` sub-mappings; their items are flattened into a single
    space-separated string of ``key:value`` tokens (request tokens first).

    Files are visited in sorted order. ``glob.glob`` returns matches in
    arbitrary, filesystem-dependent order, and the row order produced here
    decides how a seeded ``train_test_split`` partitions the data, so sorting
    is what makes downstream results reproducible.

    Lines that are not valid JSON or do not have the expected structure are
    skipped (deliberate best-effort parsing).

    Args:
        base_path: Directory containing the ``clean`` and ``malware`` folders.

    Returns:
        pandas.DataFrame with columns ``text`` (str) and ``label`` (0 or 1),
        one row per successfully parsed line.
    """
    rows, labels = [], []
    for label, cls in enumerate(["clean", "malware"]):
        # Sorted for a deterministic row order (see docstring).
        file_paths = sorted(glob.glob(f"{base_path}/{cls}/*.jsonl"))
        for fpath in tqdm(file_paths):
            with open(fpath, 'r', encoding='utf-8') as f:
                for line in f:
                    try:
                        obj = json.loads(line)
                        rqs = obj[2].get('rqs', {})
                        rsp = obj[2].get('rsp', {})
                        text = " ".join([
                            " ".join([f"{k}:{v}" for k, v in rqs.items()]),
                            " ".join([f"{k}:{v}" for k, v in rsp.items()])
                        ])
                    except Exception:
                        # Malformed line: skip it rather than abort the load.
                        continue
                    rows.append(text)
                    labels.append(label)
    return pd.DataFrame({"text": rows, "label": labels})

def predict_file(file_path, vectorizer, model):
    """Classify one ``.jsonl`` file as malware (1) or clean (0).

    Every line is turned into the same ``key:value`` text used for training
    (request items followed by response items of the element at index 2).
    All texts are vectorised and scored in a single batch, and the file is
    flagged as soon as any transaction is predicted as class 1.

    Lines that are not valid JSON or lack the expected structure are skipped,
    matching the best-effort parsing used when loading the training data, so
    a single bad line no longer aborts the whole prediction run.

    Args:
        file_path: Path to a ``.jsonl`` file, one JSON document per line.
        vectorizer: Fitted object exposing ``transform(list_of_str)``.
        model: Fitted object exposing ``predict(X)`` returning one label per row.

    Returns:
        int: 1 if at least one transaction is predicted as 1, otherwise 0
        (including when the file has no usable transactions).
    """
    texts = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                obj = json.loads(line)
                rqs = obj[2].get('rqs', {})
                rsp = obj[2].get('rsp', {})
                text = " ".join([
                    " ".join([f"{k}:{v}" for k, v in rqs.items()]),
                    " ".join([f"{k}:{v}" for k, v in rsp.items()])
                ])
            except Exception:
                # Malformed line: ignore it, as the training loader does.
                continue
            texts.append(text)

    if not texts:
        # Nothing to score; avoid calling transform() on an empty batch.
        return 0

    # One transform/predict call for the whole file instead of one per line.
    X = vectorizer.transform(texts)
    return int(any(pred == 1 for pred in model.predict(X)))

In [9]:
# Parse every labelled training transaction into a (text, label) frame.
df = load_data("/kaggle/input/http-malware-detection/train")

# Hold out 20% of the rows for validation, keeping the class balance.
# NOTE(review): the split is per transaction, so rows from the same source
# file can land on both sides; validation scores may be optimistic.
train_texts, val_texts, y_train, y_val = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

# Character n-grams (1-3, word-boundary aware) help catch obfuscated strings
# such as %20 or hex runs; the vocabulary is learned on the training part only.
vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(1, 3),
    lowercase=True,
    min_df=2,
    max_features=5000,
)
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)

100%|██████████| 3419/3419 [00:02<00:00, 1371.86it/s]
100%|██████████| 1016/1016 [00:00<00:00, 1312.92it/s]


In [12]:
 # Train XGBoost
# Hyper-parameters kept in one place so they are easy to read and tweak.
xgb_params = {
    "n_estimators": 300,
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "eval_metric": 'logloss',
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

In [17]:
# Score the held-out transactions before touching the test set.
y_pred = xgb_model.predict(X_val)
print("Validation F1:", f1_score(y_val, y_pred))

# One submission row per test file: the file stem is the id and the target is
# the file-level decision returned by predict_file.
test = glob.glob("/kaggle/input/http-malware-detection/test/*.jsonl")
result = [
    {'id': Path(path).stem, 'target': predict_file(path, vectorizer, xgb_model)}
    for path in test
]

result_df = pd.DataFrame(result)
result_df.to_csv('/kaggle/working/submission.csv', index=False)

Validation F1: 0.9070512820512822


In [2]:
import json
import glob
import os
from pathlib import Path
from tqdm import tqdm
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import xgboost as xgb
import numpy as np
import re
from urllib.parse import urlparse, parse_qs
# ----------------------------
# Helper: Compute text entropy (for obfuscated URLs)
# ----------------------------
def entropy(s):
    """Shannon entropy, in bits per character, of ``str(s)``.

    Character counts are gathered in a single pass and summed in
    first-occurrence order. Iterating ``set(s)`` instead would make the
    floating-point summation order depend on string hash randomisation, so
    the result could differ in the last bit between interpreter sessions.

    Args:
        s: Any value; falsy values (``None``, ``""``, ``0`` ...) yield 0.0,
           everything else is converted with ``str`` first.

    Returns:
        float: Non-negative entropy; exactly ``0.0`` (never ``-0.0``) when the
        text is empty or consists of a single repeated character.
    """
    if not s:
        return 0.0
    text = str(s)

    counts = {}
    for ch in text:
        counts[ch] = counts.get(ch, 0) + 1

    total = len(text)
    bits = 0.0
    for count in counts.values():
        p = count / total
        # p * log2(p) is <= 0, so subtracting keeps the running total >= 0.
        bits -= p * np.log2(p)
    return float(bits)

# ----------------------------
# Extract features from a single transaction
# ----------------------------
def extract_features(tx):
    """Turn one transaction mapping into a flat feature dict.

    Reads the optional ``rqs`` (request) and ``rsp`` (response) sub-mappings
    of ``tx`` and returns numeric URL statistics, the raw categorical fields,
    three binary URL flags and a lower-cased ``text`` field intended for
    TF-IDF. Missing request/response fields fall back to ``'unknown'``
    (``-1`` for the status code, ``''`` inside ``text``).

    Args:
        tx: Mapping with optional ``'rqs'`` and ``'rsp'`` mappings.

    Returns:
        dict with keys, in order: url_len, url_entropy, num_params,
        path_depth, method, host, user_agent, status, content_type,
        has_hex_in_url, is_https, is_http, text.
    """
    request = tx.get('rqs', {})
    response = tx.get('rsp', {})

    url = request.get('url', '')
    parts = urlparse(url)
    path, query = parts.path, parts.query

    # Pieces of the TF-IDF text use '' (not 'unknown') when a field is absent.
    agent_text = request.get('user-agent', '')
    ctype_text = response.get('content-type', '')

    return {
        # Request features
        'url_len': len(url),
        'url_entropy': entropy(path + query),
        'num_params': len(parse_qs(query)),
        # Number of '/' separators in the path (same as len(split('/')) - 1).
        'path_depth': path.count('/'),
        'method': request.get('method', 'unknown'),
        'host': request.get('host', 'unknown'),
        'user_agent': request.get('user-agent', 'unknown'),
        # Response features
        'status': response.get('code', -1),
        'content_type': response.get('content-type', 'unknown'),
        # Suspicious flags
        'has_hex_in_url': int(re.search(r'%[0-9A-Fa-f]{2}', url) is not None),
        'is_https': int(url.startswith('https')),
        'is_http': int(url.startswith('http://')),
        # Combined text for TF-IDF
        'text': f"{path} {query} {agent_text} {ctype_text}".lower(),
    }

# ----------------------------
# Load data and extract features per transaction
# ----------------------------
def load_data_with_features(base_path):
    """Load labelled transactions and extract one feature row per transaction.

    Reads ``{base_path}/clean/*.jsonl`` (label 0) and
    ``{base_path}/malware/*.jsonl`` (label 1); a class folder that does not
    exist is skipped. A line is used only when it parses to a list of at
    least three elements whose third element is a dict containing ``rqs`` or
    ``rsp``, and when the extracted ``url_len`` is positive.

    Files are visited in sorted order. ``glob.glob`` returns matches in
    arbitrary, filesystem-dependent order, and the row order produced here
    decides how a seeded ``train_test_split`` partitions the data, so sorting
    is what makes downstream results reproducible.

    Args:
        base_path: Directory containing the ``clean`` and ``malware`` folders.

    Returns:
        tuple: ``(features, labels)`` where ``features`` is a
        pandas.DataFrame of the dicts returned by ``extract_features`` and
        ``labels`` is a NumPy array of 0/1 values aligned with its rows.
    """
    rows = []
    labels = []

    for label, cls in enumerate(["clean", "malware"]):
        cls_path = os.path.join(base_path, cls)
        if not os.path.exists(cls_path):
            continue
        # Sorted for a deterministic row order (see docstring).
        file_paths = sorted(glob.glob(f"{cls_path}/*.jsonl"))
        for fpath in tqdm(file_paths, desc=f"Loading {cls}"):
            with open(fpath, 'r', encoding='utf-8', errors='ignore') as f:
                for line in f:
                    try:
                        obj = json.loads(line.strip())
                        if isinstance(obj, list) and len(obj) >= 3:
                            # The transaction is the third element (index 2)
                            tx_obj = obj[2]
                            if isinstance(tx_obj, dict) and ('rqs' in tx_obj or 'rsp' in tx_obj):
                                feats = extract_features(tx_obj)
                                if feats['url_len'] > 0:  # skip if no URL
                                    rows.append(feats)
                                    labels.append(label)
                    except Exception:
                        # Malformed line or transaction: skip it (best effort).
                        continue
    return pd.DataFrame(rows), np.array(labels)

# ----------------------------
# Aggregate features per file for prediction
# ----------------------------
def aggregate_file_features(file_path, vectorizer, tfidf_dim):
    """Build one file-level feature vector from all transactions in a file.

    The vector is the TF-IDF representation of the concatenated ``text`` of
    every usable transaction (one document per file, not an average of
    per-transaction vectors), followed by ten aggregated statistics:
    mean url_len, mean url_entropy, mean num_params, mean path_depth, modal
    status, count of URLs with %-hex escapes, count of https URLs, count of
    http URLs, number of transactions and number of distinct hosts.

    NOTE(review): the training matrix assembled elsewhere in this notebook
    has one row per transaction, with eight numeric columns followed by
    ``method_enc`` and ``content_type_enc``. The vector returned here has the
    same width but its last two slots (transaction count, distinct hosts) and
    its summed flags do not mean the same thing as those training columns.

    Args:
        file_path: Path to a ``.jsonl`` file, one JSON document per line.
        vectorizer: Fitted object whose ``transform([str])`` result exposes
            ``toarray()``.
        tfidf_dim: Width of the TF-IDF block; used for the all-zero fallback.

    Returns:
        numpy.ndarray: 1-D vector of length ``tfidf_dim + 10``; all zeros when
        the file contains no usable transaction.
    """
    # Names of the aggregated statistics, in output order. The fallback width
    # is derived from this tuple instead of a hard-coded 10.
    aggregate_names = (
        'url_len_mean', 'url_entropy_mean', 'num_params_mean', 'path_depth_mean',
        'status_mode', 'hex_url_count', 'https_count', 'http_count',
        'n_transactions', 'n_unique_hosts',
    )

    transactions = []
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                obj = json.loads(line.strip())
                if isinstance(obj, list) and len(obj) >= 3:
                    tx_obj = obj[2]
                    if isinstance(tx_obj, dict) and ('rqs' in tx_obj or 'rsp' in tx_obj):
                        feats = extract_features(tx_obj)
                        if feats['url_len'] > 0:
                            transactions.append(feats)
            except Exception:
                # Skip malformed lines, but let KeyboardInterrupt/SystemExit
                # propagate (a bare ``except:`` would swallow those too).
                continue

    if not transactions:
        # Return default features if no transactions
        return np.zeros(tfidf_dim + len(aggregate_names))

    df = pd.DataFrame(transactions)

    # TF-IDF of all transaction texts joined into a single document.
    text_combined = " ".join(df['text'].fillna('').values)
    tfidf_vec = vectorizer.transform([text_combined]).toarray().flatten()

    def column_mean(name):
        return df[name].mean() if name in df else 0

    # mode() drops missing values, so it is empty when every status is
    # missing; fall back to -1 instead of raising IndexError.
    status_modes = df['status'].mode() if 'status' in df else []
    status_mode = status_modes.iloc[0] if len(status_modes) > 0 else -1

    # Aggregated numerical features (same order as ``aggregate_names``).
    num_features = [
        column_mean('url_len'),
        column_mean('url_entropy'),
        column_mean('num_params'),
        column_mean('path_depth'),
        status_mode,
        df['has_hex_in_url'].sum(),
        df['is_https'].sum(),
        df['is_http'].sum(),
        len(df),  # total transactions
        df['host'].nunique()  # unique hosts
    ]

    return np.concatenate([tfidf_vec, num_features])

# ----------------------------
# Main pipeline: parse the training set
# ----------------------------

# `df` gets one feature row per usable transaction and `y` the matching
# labels (0 = clean, 1 = malware), so `y.mean()` is the malware share.
TRAIN_DIR = "/kaggle/input/http-malware-detection/train"
df, y = load_data_with_features(TRAIN_DIR)
print(f"Loaded {len(df)} transactions | Malware ratio: {y.mean():.3f}")

Loading clean: 100%|██████████| 3419/3419 [00:03<00:00, 943.82it/s]
Loading malware: 100%|██████████| 1016/1016 [00:01<00:00, 949.17it/s]

Loaded 13099 transactions | Malware ratio: 0.376





In [3]:
# Character n-gram TF-IDF (1-3, word-boundary aware) over the 'text' column.
vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(1, 3),
    lowercase=True,
    min_df=2,
    max_features=2000,
)
tfidf_matrix = vectorizer.fit_transform(df['text'].fillna('').values)
tfidf_dim = tfidf_matrix.shape[1]

# Integer-encode the two categorical columns; the fitted encoders are kept so
# the same label -> code mapping can be reused later.
le_method = LabelEncoder()
le_content_type = LabelEncoder()
df['method_enc'] = le_method.fit_transform(df['method'].fillna('unknown'))
df['content_type_enc'] = le_content_type.fit_transform(df['content_type'].fillna('unknown'))

# Dense design matrix, column order: TF-IDF block, numeric block, encoded
# categorical block.
numeric_columns = [
    'url_len', 'url_entropy', 'num_params', 'path_depth',
    'status', 'has_hex_in_url', 'is_https', 'is_http',
]
categorical_columns = ['method_enc', 'content_type_enc']
X_num = df[numeric_columns].fillna(0).values
X_cat = df[categorical_columns].values
X_combined = np.hstack([tfidf_matrix.toarray(), X_num, X_cat])

In [4]:
# Hold out 20% of the transaction rows for validation, keeping class balance.
X_train, X_val, y_train, y_val = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Hyper-parameters kept in one place so they are easy to read and tweak.
xgb_params = {
    "n_estimators": 300,
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "eval_metric": 'logloss',
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

In [6]:
# Evaluate on the held-out transactions
y_pred = xgb_model.predict(X_val)
print("Validation F1:", f1_score(y_val, y_pred))

# The model was fitted on one row per transaction with the column layout
# [TF-IDF | 8 numeric columns | method_enc | content_type_enc]. Test files are
# therefore scored transaction by transaction with exactly that layout, and a
# file is flagged when any of its transactions is predicted as malware (the
# same file-level rule predict_file uses). Feeding file-level aggregates to
# this model would put transaction counts and host counts into the slots it
# learned as encoded method / content type.
NUMERIC_COLUMNS = [
    'url_len', 'url_entropy', 'num_params', 'path_depth',
    'status', 'has_hex_in_url', 'is_https', 'is_http',
]


def _encode_like_training(encoder, values):
    """Map labels to the codes a fitted LabelEncoder assigned; unseen -> -1.

    ``encoder.transform`` raises on labels it has not seen, which is common
    for test-time content types, so the lookup is done by hand.
    """
    code_of = {label: code for code, label in enumerate(encoder.classes_)}
    return np.array([code_of.get(value, -1) for value in values], dtype=float)


def _read_transaction_features(file_path):
    """Return the feature dicts of every usable transaction in one file."""
    records = []
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                obj = json.loads(line.strip())
                if isinstance(obj, list) and len(obj) >= 3:
                    tx_obj = obj[2]
                    if isinstance(tx_obj, dict) and ('rqs' in tx_obj or 'rsp' in tx_obj):
                        feats = extract_features(tx_obj)
                        if feats['url_len'] > 0:
                            records.append(feats)
            except Exception:
                # Malformed line: skip it, as the training loader does.
                continue
    return records


# Predict test set (sorted so the submission row order is deterministic)
test_files = sorted(glob.glob("/kaggle/input/http-malware-detection/test/*.jsonl"))
results = []
for fpath in tqdm(test_files, desc="Predicting test"):
    records = _read_transaction_features(fpath)
    if records:
        tx_df = pd.DataFrame(records)
        X_text = vectorizer.transform(tx_df['text'].fillna('').values).toarray()
        X_num = tx_df[NUMERIC_COLUMNS].fillna(0).values.astype(float)
        X_cat = np.column_stack([
            _encode_like_training(le_method, tx_df['method'].fillna('unknown')),
            _encode_like_training(le_content_type, tx_df['content_type'].fillna('unknown')),
        ])
        x_file = np.hstack([X_text, X_num, X_cat])
        pred = int((xgb_model.predict(x_file) == 1).any())
    else:
        # No usable transaction in the file: nothing to score, default to clean.
        pred = 0
    results.append({'id': Path(fpath).stem, 'target': pred})

# Save
result_df = pd.DataFrame(results)
result_df.to_csv('submission_xgb.csv', index=False)
print("Submission saved.")

Validation F1: 0.8866200967221923


Predicting test: 100%|██████████| 1326/1326 [00:15<00:00, 85.41it/s]

Submission saved.



