In [1]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import urllib.parse
import math # Required for entropy calculation
from collections import Counter # Required for entropy calculation

# --- Entropy Calculation Function ---
def calculate_entropy(s):
    """Return the Shannon entropy, in bits, of the symbols in ``s``.

    Args:
        s: A string (or any sized iterable of hashable items).

    Returns:
        0 for a falsy input (empty string, ``None``); otherwise
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct symbol.
    """
    if not s:
        return 0
    total_symbols = len(s)
    weighted_log_sum = 0
    # Accumulate p * log2(p) per distinct symbol; the sign is flipped once at the end.
    for occurrences in Counter(s).values():
        share = float(occurrences) / total_symbols
        weighted_log_sum += share * math.log2(share)
    return -weighted_log_sum

# --- Data Loading and Preprocessing ---
# Both CSV files are read relative to the notebook's working directory.
data1 = pd.read_csv('urldata.csv')
data2 = pd.read_csv('malicious_phish.csv')

# Collapse the three attack types into a single 'malicious' class, then
# encode the binary label as an integer 'result' column (1 = malicious, 0 = benign).
mapping = {'phishing' : 'malicious', 'defacement' : 'malicious','malware' : 'malicious',}
data2['type'] = data2['type'].replace(mapping)
mapping2 = {'malicious': 1, 'benign': 0}
# Series.map is used for the str -> int step: doing it with .replace() depends on
# pandas silently downcasting the replaced object column, which recent pandas
# versions flag as deprecated. A label missing from mapping2 becomes NaN here and
# makes astype(int) raise, so unexpected types still fail loudly.
data2['result'] = data2['type'].map(mapping2).astype(int)

# Drop unused columns and combine
# NOTE(review): assumes urldata.csv already provides 'url' and 'result' columns — confirm.
data1.drop(['Unnamed: 0', 'label'],axis=1, inplace=True)
data2.drop(['type'],axis=1, inplace=True)
data = pd.concat([data1, data2],ignore_index=True)
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)
data.reset_index(inplace=True,drop=True)

# Under-sampling to balance classes (seeded, so the sampled rows are reproducible)
rus = RandomUnderSampler(random_state=42)
x_url = data[['url']]
y = data['result']
x_resampled, y_resampled = rus.fit_resample(x_url,y)
data = pd.concat([x_resampled,y_resampled],axis=1)
data.reset_index(inplace=True,drop=True)

# --- Feature Engineering (4 Features: char, queries, num_subdomains, entropy) ---
# 'char' is the length of each URL string in characters.
url_lengths = data['url'].str.len()
data['char'] = url_lengths

def count_query_params(url):
    """Count the '&'-separated segments in a URL's query string.

    Args:
        url: URL string to inspect.

    Returns:
        int: 0 when the URL has no query string or cannot be parsed;
        otherwise the number of pieces obtained by splitting the query on
        '&' (empty pieces, e.g. from '&&', are counted as well).
    """
    try:
        query_string = urllib.parse.urlparse(url).query
        if not query_string:
            return 0
        return len(query_string.split('&'))
    except (ValueError, TypeError, AttributeError):
        # Best effort: malformed URLs (ValueError) and non-string values
        # (TypeError / AttributeError) count as "no parameters". Everything
        # else, notably KeyboardInterrupt, propagates instead of being
        # silently turned into a 0.
        return 0
# Run the counter over every URL, then store the counts as an integer column.
query_counts = data['url'].apply(count_query_params)
data['queries'] = query_counts.astype(int)

def count_subdomains(url):
    """Count the dots in a URL's hostname as a proxy for subdomain depth.

    The value is the raw number of '.' characters in the host, so
    'www.amazon.com' yields 2 and 'example.org' yields 1.

    Args:
        url: URL string, with or without a leading 'http://' / 'https://'.

    Returns:
        int: number of dots in the hostname; 0 when the URL has no host,
        cannot be parsed, or is not a string.
    """
    try:
        # urlparse only recognises a network location after '//', so bare
        # 'host/path' URLs get a scheme prepended. The check is
        # case-insensitive so 'HTTP://...' is not given a second scheme.
        if not url.lower().startswith(('http://', 'https://')):
            url = 'http://' + url
        # .hostname drops any 'user:password@' prefix and ':port' suffix, so
        # dots or colons inside credentials cannot distort the count.
        hostname = urllib.parse.urlparse(url).hostname or ''
        return hostname.count('.')
    except (ValueError, TypeError, AttributeError):
        # Best effort: malformed URLs and non-string values score 0; other
        # exceptions (e.g. KeyboardInterrupt) are allowed to propagate.
        return 0
# Remaining per-URL features, added in this column order:
#   num_subdomains - dot count of the URL's hostname
#   entropy        - Shannon entropy of the full URL string (replaces 'has_at_symbol')
for feature_name, extractor in (
    ('num_subdomains', count_subdomains),
    ('entropy', calculate_entropy),
):
    data[feature_name] = data['url'].apply(extractor)

print('Finished Data Prep. Data Head:', data.head(), sep='\n')


  data2['result'] = data2['type'].replace(mapping2).astype(int)


Finished Data Prep. Data Head:
                                                 url  result  char  queries  \
0  https://www.amazon.com/Punch-Vincent-Gale/dp/B...       0    55        0   
1  https://www.startreklinks.net/series-movies/vo...       0    56        0   
2  http://torcache.net/torrent/00611B9CA7EDC70114...       0   109        1   
3  https://www.thirdworldtraveler.com/American_Em...       0    76        0   
4        syndicalist.org/archives/llr14-24/14f.shtml       0    43        0   

   num_subdomains   entropy  
0               2  4.738013  
1               2  4.150319  
2               1  4.876598  
3               2  4.298520  
4               1  4.332787  


In [2]:
# --- Train/Val/Test Split ---
# Model inputs are the four engineered features, 'entropy' included.
feature_columns = ['char', 'queries', 'num_subdomains', 'entropy']
x = data[feature_columns]
y = data['result']

# Stage 1: hold out 20% of the rows, stratified on the label.
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)
# Stage 2: halve the hold-out into validation and test sets, again stratified.
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)
print(f"Train samples: {len(x_train)}, Validation samples: {len(x_val)}, Test samples: {len(x_test)}")


Train samples: 506254, Validation samples: 63282, Test samples: 63282


In [3]:
# --- Scaling (Essential for RBF Kernel) ---
# The scaler is fitted on the training split only; the same fitted transform is
# then applied to all three splits, so validation/test data never influence it.
scaler = StandardScaler().fit(x_train)

x_train_scaled, x_val_scaled, x_test_scaled = (
    scaler.transform(split) for split in (x_train, x_val, x_test)
)
print('Data Scaled.')


Data Scaled.


In [4]:
# --- Model Training: SVC RBF Kernel Hyperparameter Tuning (C values) ---
# Sweep the regularisation parameter C and remember the value that gives the
# highest macro-averaged F1 on the validation split.

C_values = [0.1, 1.0, 10.0, 100.0]
print("\n--- SVC RBF Kernel Hyperparameter Tuning (C values) ---")

best_macro_f1, best_c = 0, 0

for C_val in C_values:
    # Fit an RBF-kernel SVM at this C (fit() returns the fitted estimator).
    rbf_svm = SVC(kernel='rbf', C=C_val, gamma='scale', random_state=42).fit(
        x_train_scaled, y_train
    )

    # Score the candidate on the held-out validation split.
    val_pred = rbf_svm.predict(x_val_scaled)
    val_accuracy = accuracy_score(y_val, val_pred)
    report = classification_report(y_val, val_pred, output_dict=True)
    val_macro_f1 = report['macro avg']['f1-score']

    print(f"\nResults for C = {C_val}:")
    print(f"  Validation Accuracy: {val_accuracy:.4f}")
    print(f"  Validation Macro F1: {val_macro_f1:.4f}")
    print("---------------------------------------")

    # Strict '>' means the earliest C wins when two candidates tie.
    if val_macro_f1 > best_macro_f1:
        best_macro_f1, best_c = val_macro_f1, C_val

print(f"\nBest C value (by Validation Macro F1): {best_c}")



--- SVC RBF Kernel Hyperparameter Tuning (C values) ---

Results for C = 0.1:
  Validation Accuracy: 0.6252
  Validation Macro F1: 0.6145
---------------------------------------

Results for C = 1.0:
  Validation Accuracy: 0.6297
  Validation Macro F1: 0.6199
---------------------------------------

Results for C = 10.0:
  Validation Accuracy: 0.6328
  Validation Macro F1: 0.6248
---------------------------------------

Results for C = 100.0:
  Validation Accuracy: 0.6355
  Validation Macro F1: 0.6296
---------------------------------------

Best C value (by Validation Macro F1): 100.0


In [None]:
# --- Final Evaluation (Run this after the C tuning cell) ---
# Take C straight from the tuning loop's `best_c` rather than a hand-edited
# placeholder, so the final model is always trained with the value that won
# on the validation split.
best_c_value = best_c

final_rbf_svm = SVC(kernel='rbf', C=best_c_value, gamma='scale', random_state=42)
final_rbf_svm.fit(X=x_train_scaled, y=y_train)

# Test Set Evaluation (Use this only for your final report)
test_pred = final_rbf_svm.predict(x_test_scaled)
print("\n--- FINAL TEST RESULTS (Optimal C) ---")
print("Accuracy:", accuracy_score(y_test, test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, test_pred))
print("Classification Report:\n", classification_report(y_test, test_pred))
