In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Read the URL feature table from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='./url_features.csv')

# Show the first five rows as this cell's output.
df.head(n=5)


Unnamed: 0,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Web_Traffic,...,Mouse_Over,Right_Click,Web_Forwards,Num_Dots,HTTPS_Token,Suspicious_Words,Content_Length,Num_Images,Num_Scripts,Label
0,0,0,36,0,0,0,0,1,1,0,...,0,0,0,2,0,0,1123,0,1,1
1,0,0,48,2,0,1,0,1,1,0,...,0,0,0,2,0,0,450,0,3,1
2,0,0,175,5,0,1,0,0,1,0,...,0,0,0,2,0,0,127,0,18,1
3,0,0,32,1,0,1,0,0,1,0,...,0,0,0,2,0,0,450,0,3,1
4,0,0,36,1,0,1,0,0,1,0,...,0,0,0,1,0,0,1257,0,0,1


In [4]:
# Discard rows with missing values. Rebinding `df` (rather than inplace=True)
# leaves the originally loaded frame object untouched.
df = df.dropna()

# Separate the features from the target column.
# The preview of `df` shows an `Unnamed: 0` column that simply counts rows
# (0, 1, 2, ...): it is the row index that was saved with the CSV, not a URL
# feature. Leaving it in lets the models learn from row position, which leaks
# the label whenever the file is ordered by class, so it is removed here.
# errors='ignore' keeps the cell working if the CSV is re-exported without it.
X = df.drop(columns=['Label']).drop(columns=['Unnamed: 0'], errors='ignore')
y = df['Label']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply the same statistics
# to the test split so nothing from the held-out rows influences scaling.
# NOTE: fit_transform/transform return NumPy arrays, so X_train and X_test
# are no longer DataFrames after these two lines.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Persist the fitted scaler so the exact same transformation can be reused
# wherever the saved models are loaded.
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [5]:
# Build the five classifiers. The tree-based learners use randomness while
# training, so they get the same seed as the train/test split (42); without
# it, re-running the notebook can produce different models and metrics.
svm = SVC()
lr = LogisticRegression()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)

# Fit each classifier on the scaled training split and save it immediately,
# in the same order and under the same file names as before.
for fitted_estimator, model_path in [
    (svm, 'svm_model.pkl'),
    (lr, 'lr_model.pkl'),
    (dt, 'dt_model.pkl'),
    (rf, 'rf_model.pkl'),
    (xgb, 'xgb_model.pkl'),
]:
    fitted_estimator.fit(X_train, y_train)
    joblib.dump(fitted_estimator, model_path)






['xgb_model.pkl']

In [6]:
def evaluate_model(model, X_test, y_test):
    """Print the accuracy and the classification report for `model`.

    Args:
        model: Estimator whose ``predict`` method is called on ``X_test``.
        X_test: Feature matrix handed to ``model.predict``.
        y_test: True labels the predictions are compared against.

    Returns:
        None. Both results are written to stdout.
    """
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy}")

    report = classification_report(y_test, predictions)
    print(report)

# Display label -> saved model file, listed in the order they are reported.
model_files = [
    ("Support Vector Machine", 'svm_model.pkl'),
    ("Logistic Regression", 'lr_model.pkl'),
    ("Decision Tree", 'dt_model.pkl'),
    ("Random Forest", 'rf_model.pkl'),
    ("XGBoost", 'xgb_model.pkl'),
]

# Reload every classifier from disk so the evaluation runs on the saved
# artifacts rather than on the in-memory objects.
models = {label: joblib.load(path) for label, path in model_files}

# Visual divider printed after each model's report.
separator = "\n" + "="*50 + "\n"

for name, model in models.items():
    print(f"Evaluating {name}")
    evaluate_model(model, X_test, y_test)
    print(separator)


Evaluating Support Vector Machine
Accuracy: 0.996
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       988
           1       1.00      1.00      1.00      1012

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



Evaluating Logistic Regression
Accuracy: 0.961
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       988
           1       0.95      0.97      0.96      1012

    accuracy                           0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000



Evaluating Decision Tree
Accuracy: 0.9985
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       988
           1       1.00      1.00      1.00      1012

    accuracy                           