In [1]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC # Using SVC for RBF Kernel
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import urllib.parse 

# --- Data Loading and Preprocessing ---
# NOTE: Ensure 'urldata.csv' and 'malicious_phish.csv' are in the correct path.
data1 = pd.read_csv('urldata.csv')
data2 = pd.read_csv('malicious_phish.csv')

# Map types to binary result
mapping = {'phishing' : 'malicious', 'defacement' : 'malicious','malware' : 'malicious',}
data2['type'] = data2['type'].replace(mapping)
mapping2 = {'malicious': 1, 'benign': 0}
data2['result'] = data2['type'].replace(mapping2).astype(int)

# Drop unused columns and combine
data1.drop(['Unnamed: 0', 'label'],axis=1, inplace=True)
data2.drop(['type'],axis=1, inplace=True)
data = pd.concat([data1, data2],ignore_index=True)
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)
data.reset_index(inplace=True,drop=True)

# Under-sampling to balance classes
rus = RandomUnderSampler(random_state=42)
x_url = data[['url']]
y = data['result']
x_resampled, y_resampled = rus.fit_resample(x_url,y)
data = pd.concat([x_resampled,y_resampled],axis=1)
data.reset_index(inplace=True,drop=True)

# --- Feature Engineering (4 Features) ---
data['char'] = data['url'].str.len()

def count_query_params(url):
    try:
        query_string = urllib.parse.urlparse(url).query
        if not query_string:
            return 0
        return len(query_string.split('&'))
    except:
        return 0
data['queries'] = data['url'].apply(count_query_params).astype(int)

def count_subdomains(url):
    try:
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        netloc = urllib.parse.urlparse(url).netloc
        hostname = netloc.split(':')[0]
        return hostname.count('.')
    except:
        return 0
data['num_subdomains'] = data['url'].apply(count_subdomains)

data['has_at_symbol'] = data['url'].apply(lambda x: 1 if '@' in x else 0)
print('Finished Data Prep. Data Head:')
print(data.head())


  data2['result'] = data2['type'].replace(mapping2).astype(int)


Finished Data Prep. Data Head:
                                                 url  result  char  queries  \
0  https://www.amazon.com/Punch-Vincent-Gale/dp/B...       0    55        0   
1  https://www.startreklinks.net/series-movies/vo...       0    56        0   
2  http://torcache.net/torrent/00611B9CA7EDC70114...       0   109        1   
3  https://www.thirdworldtraveler.com/American_Em...       0    76        0   
4        syndicalist.org/archives/llr14-24/14f.shtml       0    43        0   

   num_subdomains  has_at_symbol  
0               2              0  
1               2              0  
2               1              0  
3               2              0  
4               1              0  


In [2]:
# --- Train/Val/Test Split ---
x = data[['char', 'queries', 'num_subdomains', 'has_at_symbol']]
y = data['result']

x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)
print(f"Train samples: {len(x_train)}, Validation samples: {len(x_val)}, Test samples: {len(x_test)}")


Train samples: 506254, Validation samples: 63282, Test samples: 63282


In [3]:
# --- Scaling (Essential for RBF Kernel) ---
scaler = StandardScaler()

# 1. Fit and transform ONLY the training data
x_train_scaled = scaler.fit_transform(x_train)
# 2. ONLY transform the validation and test data
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)
print('Data Scaled.')


Data Scaled.


In [4]:
# --- Model Training: SVC with RBF Kernel ---
# NOTE: RBF kernels are sensitive to C and gamma. Grid Search is recommended.
# C=1.0 and gamma='scale' are used here as a starting point.
rbf_svm = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
rbf_svm.fit(X=x_train_scaled, y=y_train)
print('RBF Model Trained.')


RBF Model Trained.


In [5]:
# --- Evaluation ---
print("\n--- Model Evaluation (SVC RBF Kernel) ---")

# Validation Set Evaluation
val_pred = rbf_svm.predict(x_val_scaled)
print("\nValidation Results:")
print("Accuracy:", accuracy_score(y_val, val_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, val_pred))
print("Classification Report:\n", classification_report(y_val, val_pred))

# Test Set Evaluation
test_pred = rbf_svm.predict(x_test_scaled)
print("\nTest Results:")
print("Accuracy:", accuracy_score(y_test, test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, test_pred))
print("Classification Report:\n", classification_report(y_test, test_pred))



--- Model Evaluation (SVC RBF Kernel) ---

Validation Results:
Accuracy: 0.6185329161530925
Confusion Matrix:
 [[26693  4948]
 [19192 12449]]
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.84      0.69     31641
           1       0.72      0.39      0.51     31641

    accuracy                           0.62     63282
   macro avg       0.65      0.62      0.60     63282
weighted avg       0.65      0.62      0.60     63282


Test Results:
Accuracy: 0.6180114408520591
Confusion Matrix:
 [[26692  4949]
 [19224 12417]]
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.84      0.69     31641
           1       0.72      0.39      0.51     31641

    accuracy                           0.62     63282
   macro avg       0.65      0.62      0.60     63282
weighted avg       0.65      0.62      0.60     63282

