In [17]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [18]:
# Location of the dataset, resolved relative to the notebook's working directory.
DATASET_PATH = 'dataset.parquet'

try:
    df = pd.read_parquet(DATASET_PATH)
except FileNotFoundError:
    # Report the problem, then re-raise: every later cell needs `df`, so
    # continuing would either fail with a NameError or silently reuse a stale
    # `df` left over from an earlier kernel session.
    print("Error: Dataset not found.")
    raise
else:
    print(f"Success. Total Database Size: {len(df)} records.")

Success. Total Database Size: 11430 records.


In [19]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,https://www.todayshomeowner.com/how-to-make-ho...,82,23,0,2,7,0,0,0,0,...,1,1,0,240,8892,67860,0,1,4,legitimate
1,http://thapthan.ac.th/information/confirmation...,93,14,1,2,0,0,0,0,0,...,1,0,1,0,2996,4189860,0,1,2,phishing
2,http://app.dialoginsight.com/T/OFC4/L2S/3888/B...,121,21,1,3,0,0,0,0,0,...,1,1,0,30,2527,346022,0,1,3,phishing
3,https://www.bedslide.com,24,16,0,2,0,0,0,0,0,...,0,0,0,139,7531,1059151,0,0,4,legitimate
4,https://tabs.ultimate-guitar.com/s/sex_pistols...,73,24,0,3,1,0,0,0,0,...,0,0,0,3002,7590,635,0,1,5,legitimate


In [20]:
# Columns used as model inputs. The list order is the column order of the
# matrix selected with df[FEATURES], so keep it stable.
# NOTE(review): the frame preview also lists an `nb_or` column that is not
# selected here -- confirm the omission is intentional.
FEATURES = [
    # length_* columns and the `ip` column
    'length_url',
    'length_hostname',
    'ip',
    # nb_* columns
    'nb_dots',
    'nb_hyphens',
    'nb_at',
    'nb_qm',
    'nb_and',
    'nb_eq',
    'nb_underscore',
    'nb_tilde',
    'nb_percent',
    'nb_slash',
    'nb_star',
    'nb_colon',
    'nb_comma',
    'nb_semicolumn',
    'nb_dollar',
    'nb_space',
    'nb_www',
    'nb_com',
    'nb_dslash',
    # token / ratio / flag-style column names
    'http_in_path',
    'https_token',
    'ratio_digits_url',
    'ratio_digits_host',
    'prefix_suffix',
    'shortening_service',
    # remaining columns
    'iframe',
    'nb_hyperlinks',
    'empty_title',
]

# Label column; the frame preview shows the values 'legitimate' and 'phishing'.
TARGET = 'status'

In [21]:
# Feature matrix and label vector, selected via the configured column names.
X, y = df[FEATURES], df[TARGET]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Random forest of 100 trees, each capped at depth 15, with a fixed seed.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    random_state=42,
)
rf_model.fit(X_train, y_train)

In [22]:
# Destination of the serialized model. Defined in this cell so that the file
# actually written and the confirmation message always refer to the same path
# (the message previously referenced an undefined MODEL_PATH).
MODEL_PATH = 'testmodel.pkl'

# Score the fitted forest on the held-out split.
preds = rf_model.predict(X_test)
acc = accuracy_score(y_test, preds)

print("-" * 30)
print(f"FINAL ACCURACY: {acc*100:.2f}%")
print("-" * 30)

# Persist the trained estimator with pickle.
with open(MODEL_PATH, 'wb') as f:
    pickle.dump(rf_model, f)

print(f"Model saved successfully to: {MODEL_PATH}")

FINAL ACCURACY: 90.70%
