In [1]:
import pandas as pd

# Read the labelled URL dataset. The bare filename is resolved against the
# kernel's current working directory, so the CSV must sit wherever the
# notebook is being run from.
df = pd.read_csv("phishing_site_urls.csv")

# Preview the first rows, then show how many rows carry each label.
print(df.head(), "\nLabel Counts:", df["label"].value_counts(), sep="\n")


                                                 url       label
0         http://login.paypal.com.session-123456.com    phishing
1        http://secure.bankofamerica.verify-user.com    phishing
2  http://update-security.microsoft.com.windows12...    phishing
3                             https://www.google.com  legitimate
4                             https://www.amazon.com  legitimate

Label Counts:
label
phishing      6
legitimate    4
Name: count, dtype: int64


In [2]:
import numpy as np

# Hand-crafted lexical features for a single URL
def extract_features(url):
    """Turn one URL string into a row of six integer features.

    Parameters
    ----------
    url : str
        The URL text to describe.

    Returns
    -------
    pandas.Series
        Six positional values, in this order: total length, number of '-',
        number of '@', number of '.', number of characters for which
        ``str.isdigit()`` is true, and 1 if the text starts with "https"
        (otherwise 0).
    """
    # Tally digit characters one by one.
    digit_total = 0
    for ch in url:
        if ch.isdigit():
            digit_total += 1

    # Occurrences of each punctuation mark, in output order.
    hyphens, at_signs, dots = (url.count(mark) for mark in ('-', '@', '.'))

    # Plain prefix test on the raw text (case-sensitive).
    https_flag = int(url.startswith("https"))

    return pd.Series([len(url), hyphens, at_signs, dots, digit_total, https_flag])

# Build the feature table in one chain: one row of features per URL,
# readable column names, then the original label re-attached (aligned on
# the row index).
df_features = (
    df['url']
    .apply(extract_features)
    .set_axis(['length', 'hyphens', 'at_count', 'dots', 'digits', 'https'], axis=1)
    .assign(label=df['label'])
)

# Preview the first rows of the result
df_features.head()


Unnamed: 0,length,hyphens,at_count,dots,digits,https,label
0,42,1,0,4,6,0,phishing
1,43,1,0,3,0,0,phishing
2,51,1,0,4,3,0,phishing
3,22,0,0,2,0,1,legitimate
4,22,0,0,2,0,1,legitimate


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Separate features and label
X = df_features.drop('label', axis=1)
y = df_features['label']

# Convert labels to numbers (phishing=1, legitimate=0)
y = y.map({'phishing': 1, 'legitimate': 0})

# Series.map turns every label missing from the dict into NaN. Stop here with
# the offending values instead of letting NaN targets reach the model.
if y.isna().any():
    unexpected = sorted(df_features.loc[y.isna(), 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label values: {unexpected}")

# Split data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Random Forest model. The forest is randomised (bootstrap samples and
# feature sub-sampling), so it is seeded to make re-runs give the same result.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Show results.
# NOTE: with the 10-row dataset loaded above, the hold-out set has only 3 rows,
# so these scores are a smoke test rather than a meaningful evaluation.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



In [17]:
def test_url(url):
    """Print the feature-based model's verdict for a single URL.

    Parameters
    ----------
    url : str
        The URL to classify.

    Returns
    -------
    None
        The verdict ("Phishing" or "Legitimate") is printed, not returned.
    """
    feature_names = ['length', 'hyphens', 'at_count', 'dots', 'digits', 'https']

    # Reuse the extractor that built the training table so prediction-time
    # features cannot drift from training-time features.
    features = pd.DataFrame([extract_features(url).tolist()], columns=feature_names)

    prediction = model.predict(features)
    print("🧠 Prediction:", "Phishing" if prediction[0] == 1 else "Legitimate")


# Example calls. These must stay at cell level: indented into the function body
# they never run, and any call to test_url would recurse without end.
test_url("http://secure-update-paypal.com/login")
test_url("https://www.microsoft.com")



In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Prepare data
X = df['url']
y = df['label'].map({'phishing': 1, 'legitimate': 0})

# Series.map turns every label missing from the dict into NaN. Stop here with
# the offending values instead of letting NaN targets reach the model.
if y.isna().any():
    unexpected = sorted(df.loc[y.isna(), 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label values: {unexpected}")

# Train/test split on the RAW URLs, before any vectorisation. Fitting TF-IDF
# on the full dataset first would let the test URLs shape the vocabulary and
# IDF weights (data leakage) and make the reported scores optimistic.
urls_train, urls_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Convert URL strings into TF-IDF vectors: learn the character n-gram
# vocabulary from the training URLs only, then apply it to the test URLs.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 5))  # you can tweak this later
X_train = vectorizer.fit_transform(urls_train)
X_test = vectorizer.transform(urls_test)

# Full-dataset matrix, kept under its original name for any later cell that
# reads it. It is produced with the train-only vocabulary and is not used for
# fitting or scoring below.
X_tfidf = vectorizer.transform(X)

# Train model (seeded so re-runs give the same forest)
model_tfidf = RandomForestClassifier(random_state=42)
model_tfidf.fit(X_train, y_train)

# Predict and show results
y_pred = model_tfidf.predict(X_test)
print("🧠 TF-IDF Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


🧠 TF-IDF Model Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



In [5]:
def test_tfidf_url(url):
    """Print the TF-IDF model's verdict for a single URL.

    The URL is vectorised with the already-fitted ``vectorizer`` and passed
    to ``model_tfidf``; the result is printed and nothing is returned.
    """
    # transform() takes an iterable of documents, hence the one-element list.
    predicted_class = model_tfidf.predict(vectorizer.transform([url]))[0]

    # Class 1 is reported as phishing, anything else as legitimate.
    if predicted_class == 1:
        verdict = "Phishing"
    else:
        verdict = "Legitimate"
    print("🤖 TF-IDF Prediction:", verdict)

# Run the TF-IDF classifier on two hand-written example URLs, in order.
for sample_url in (
    "http://signin-paypal-account-update.com",
    "https://www.wikipedia.org",
):
    test_tfidf_url(sample_url)


🤖 TF-IDF Prediction: Phishing
🤖 TF-IDF Prediction: Legitimate


In [6]:
import pickle

# Save trained model and vectorizer. The context managers make sure each file
# is flushed and closed as soon as its dump finishes (a bare open() call would
# leave the handle open until garbage collection).
# NOTE: pickle files can execute arbitrary code when loaded; only load these
# artifacts from a location you trust.
with open("model_tfidf.pkl", "wb") as model_file:
    pickle.dump(model_tfidf, model_file)

with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
