In [None]:
import zipfile
import os

# Archive uploaded to the session and the directory it is unpacked into
zip_path = "/content/phishing datasets.zip"
extract_to = "/content/dataset"

# Unpack every member of the archive into the target directory
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(extract_to)

# Walk the extracted tree and print the full path of each file found
extracted_paths = (
    os.path.join(folder, filename)
    for folder, _subdirs, filenames in os.walk(extract_to)
    for filename in filenames
)
for extracted_path in extracted_paths:
    print(extracted_path)



/content/dataset/phishing datasets/new_data_urls.csv
/content/dataset/phishing datasets/Phishing_Email.csv
/content/dataset/phishing datasets/Http_extract.py
/content/dataset/phishing datasets/data_conversion.pdf
/content/dataset/phishing datasets/Email_extract.py
/content/dataset/phishing datasets/Phishing URLs.csv
/content/dataset/phishing datasets/Phone_extract.py
/content/dataset/phishing datasets/Dataset_5971.csv
/content/dataset/phishing datasets/data_visual.pdf
/content/dataset/phishing datasets/Frequency.pdf


In [None]:
import pandas as pd
import numpy as np
import re
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Read the URL dataset from the extracted archive into a DataFrame
urls_csv = "/content/dataset/phishing datasets/new_data_urls.csv"
df = pd.read_csv(urls_csv)

# Quick sanity check: size, column names, then the first rows as rich output
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns}")
df.head()


Dataset shape: (822010, 2)
Columns: Index(['url', 'status'], dtype='object')


Unnamed: 0,url,status
0,0000111servicehelpdesk.godaddysites.com,0
1,000011accesswebform.godaddysites.com,0
2,00003.online,0
3,0009servicedeskowa.godaddysites.com,0
4,000n38p.wcomhost.com,0


In [None]:
# Check for missing values
print(df.isnull().sum())

# Rename columns if needed (adjust as per your dataset)
df.columns = ['url', 'label']  # if your file already has proper headers, skip this

# Encode label (if needed): known string labels are mapped to 0/1; any value
# that is not a key of the mapping (e.g. labels that are already numeric)
# comes back as NaN from .map() and is restored from the original column.
# NOTE(review): the numeric values are used exactly as found in the file. The
# rows previewed by df.head() look like phishing hosts yet carry value 0 --
# confirm which value means "phishing" before interpreting the report below.
df['label'] = df['label'].map({'phishing':1, 'benign':0, 'malicious':1, 'legitimate':0, '1':1, '0':0}).fillna(df['label'])
df['label'] = df['label'].astype(int)

urls = df['url']
y = df['label']

# Train-test split on the raw URLs *before* any feature extraction, so that
# nothing learned from the held-out rows can influence the features.
# test_size and random_state are the same as before.
urls_train, urls_test, y_train, y_test = train_test_split(urls, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization: the vocabulary and IDF weights are fitted on the
# training URLs only; the test URLs are merely transformed with them.
# Fitting on the full corpus would leak test-set statistics into training
# and make the reported scores optimistic.
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(urls_train)
X_test = vectorizer.transform(urls_test)

# Full-corpus feature matrix, kept under its original name for any later cell
# that refers to `X`. It is not used for fitting or evaluation here.
X = vectorizer.transform(urls)

# Model training
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluation on the held-out URLs
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



url       0
status    0
dtype: int64
Accuracy: 0.8870877483242297
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.94      0.89     79122
           1       0.94      0.84      0.89     85280

    accuracy                           0.89    164402
   macro avg       0.89      0.89      0.89    164402
weighted avg       0.89      0.89      0.89    164402



In [None]:
def predict_url_risk(url, phishing_label=0, fitted_vectorizer=None, fitted_model=None):
    """Classify a single URL and return a human-readable verdict.

    Args:
        url: The URL string to classify.
        phishing_label: The class value that stands for "phishing". The
            notebook trains on the dataset's ``status`` values unchanged, and
            the status-0 rows previewed by ``df.head()`` are phishing-style
            hosts, so 0 is used as the default.
            NOTE(review): confirm against the dataset's documentation; pass 1
            if the model was trained with 1 meaning phishing.
        fitted_vectorizer: Optional fitted vectorizer exposing ``transform``.
            Defaults to the notebook-level ``vectorizer``.
        fitted_model: Optional fitted classifier exposing ``predict``.
            Defaults to the notebook-level ``model``.

    Returns:
        "⚠️ Phishing" when the predicted class equals ``phishing_label``,
        otherwise "✅ Safe".
    """
    # Fall back to the notebook globals only when no explicit object is given,
    # so the function also works with any other fitted vectorizer/model pair.
    active_vectorizer = vectorizer if fitted_vectorizer is None else fitted_vectorizer
    active_model = model if fitted_model is None else fitted_model

    # transform() expects an iterable of documents, hence the one-item list.
    vec = active_vectorizer.transform([url])
    pred = active_model.predict(vec)[0]
    return "⚠️ Phishing" if pred == phishing_label else "✅ Safe"

# Demo: run the classifier on a single suspicious-looking URL
sample_url = "http://amazon-prime.membership-billing-check.com"
print(predict_url_risk(sample_url))


✅ Safe
