In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Load the URL dataset, keep only the URL text and its status label,
# and discard rows where either of the two is missing.
df = (
    pd.read_csv('/content/new_data_urls.csv')  # Path to your dataset
    .loc[:, ['url', 'status']]
    .dropna()
)

In [None]:
def extract_features(url):
    """Turn one URL string into the numeric features fed to the classifier.

    Args:
        url: The URL as a string.

    Returns:
        pd.Series indexed by feature name, in this order: url_length,
        num_dots, num_hyphens, has_https, has_at, has_ip, num_slashes,
        has_login, has_secure, has_bank. Counts are integers and every
        ``has_*`` entry is 0 or 1.
    """
    # Lower-case once; every case-insensitive check below reuses it.
    lowered = url.lower()
    features = {
        'url_length': len(url),
        'num_dots': url.count('.'),
        'num_hyphens': url.count('-'),
        # Check the scheme itself rather than searching for the substring:
        # 'https' can also appear in the host or path of a plain-HTTP URL,
        # and an upper-case 'HTTPS://' scheme must still count.
        'has_https': int(lowered.lstrip().startswith('https://')),
        'has_at': int('@' in url),
        # Four dot-separated digit groups anywhere in the URL (IPv4-looking).
        'has_ip': int(bool(re.search(r'\d+\.\d+\.\d+\.\d+', url))),
        'num_slashes': url.count('/'),
        'has_login': int('login' in lowered),
        'has_secure': int('secure' in lowered),
        'has_bank': int('bank' in lowered),
    }
    return pd.Series(features)

In [None]:
# Build the per-URL feature table and attach the target column
# (0 = safe, 1 = phishing); the label is aligned on df's index.
features_df = df['url'].apply(extract_features).assign(label=df['status'])


In [None]:
# Train-test split: hold out 20% of the rows for evaluation, with a fixed
# seed so the same rows land in each partition on every run.
y = features_df['label']
X = features_df.drop(columns=['label'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a 100-tree random forest on the training split; the fixed seed keeps
# the fitted trees identical between runs.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

In [None]:
# Score the held-out split and report overall accuracy plus per-class metrics.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)

Accuracy: 0.8685113319789297
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.80      0.85     79122
           1       0.83      0.93      0.88     85280

    accuracy                           0.87    164402
   macro avg       0.88      0.87      0.87    164402
weighted avg       0.87      0.87      0.87    164402



In [None]:
def predict_url_risk(url):
    """Classify a single URL with the trained model.

    Args:
        url: URL string to score.

    Returns:
        "Phishing" when the model predicts class 1, otherwise "Safe".
    """
    # Keep the features as a one-row DataFrame so the column names match the
    # ones the model was fitted with; a bare ndarray drops them and makes
    # scikit-learn warn about missing feature names on every call.
    feats = extract_features(url).to_frame().T
    prediction = model.predict(feats)[0]
    return "Phishing" if prediction == 1 else "Safe"

# Example: run the classifier over a handful of hand-picked URLs
test_urls = [
    "https://secure-login.bankofamerica.com",
    "http://facebook.com-security-alerts.ru",
    "https://google.com",
    "http://update.youraccount-verify.net",
    "http://facebook.com-security-alert.ru/login",
    "http://apple.com-reset-password.ga",
    "http://verify-instagram-support.com",
]

# One "<url> → <verdict>" line per sample URL.
for url in test_urls:
    verdict = predict_url_risk(url)
    print(f"{url} → {verdict}")


https://secure-login.bankofamerica.com → Safe
http://facebook.com-security-alerts.ru → Safe
https://google.com → Safe
http://update.youraccount-verify.net → Phishing
http://facebook.com-security-alert.ru/login → Safe
http://apple.com-reset-password.ga → Phishing
http://verify-instagram-support.com → Phishing




In [None]:
import joblib

# Write the fitted classifier to a pickle file in the working directory
model_path = 'phishing_model.pkl'
joblib.dump(model, model_path)




['phishing_model.pkl']

In [None]:
# Offer the saved model as a browser download when running inside Google Colab.
try:
    from google.colab import files
except ImportError:
    # The google.colab package is only importable on Colab runtimes; on any
    # other kernel skip the download instead of aborting the notebook run.
    print("google.colab is unavailable; skipping browser download of 'phishing_model.pkl'.")
else:
    files.download('phishing_model.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# prompt: download vectorizer

# Imported here so this cell does not depend on the earlier save/download
# cells having been executed first.
import joblib
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): the RandomForest in this notebook is fitted on the columns
# produced by extract_features(), not on CountVectorizer output, so this
# vectorizer must not be used to build inputs for 'phishing_model.pkl'.
# Train a CountVectorizer on the URLs
vectorizer = CountVectorizer()
vectorizer.fit(df['url'])
# Save the trained vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')
# Download the vectorizer file (only possible on a Google Colab runtime)
try:
    from google.colab import files
except ImportError:
    # Outside Colab, skip the browser download instead of aborting the run.
    print("google.colab is unavailable; skipping browser download of 'vectorizer.pkl'.")
else:
    files.download('vectorizer.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>