<a href="https://colab.research.google.com/github/Darshu1724/MiniProject-Generations/blob/main/MiniProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [4]:
# =============================================================================
# COMPLETE PHISHING URL DETECTOR CLASS (FIXED)
# =============================================================================
import cv2
import numpy as np
import pandas as pd
import re
import os
from PIL import Image, ImageEnhance, ImageFilter
import pytesseract
from urllib.parse import urlparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from google.colab import files
import glob

class CompletePhishingDetector:
    def __init__(self):
        self.model = None
        self.vectorizer = TfidfVectorizer(max_features=100)
        self.phishing_patterns = [
            r'paypa[sl]?[-_.]*(security|login|verify|account)',
            r'secure[-_.]*(bank|paypal|amazon)',
            r'(bit\\.ly|t\\.co|tinyurl)',
            r'login[-_.]*(microsoft|facebook|amazon)',
            r'verify[-_.]*(account|payment|security)'
        ]

    def create_training_data(self):
        """‚úÖ FIXED: Creates exactly 2000 samples"""
        # 6 phishing + 5 benign = 11 URLs ‚Üí balanced multiplication
        phishing_urls = [
            "https://paypa1.com/verify", "http://secure-bank-login.com",
            "https://bit.ly/urgent-update", "https://arnazon-order.net",
            "http://192.168.1.1/admin", "https://faceb00k-security.ru"
        ]  # 6 phishing

        benign_urls = [
            "https://google.com", "https://github.com", "https://wikipedia.org",
            "https://stackoverflow.com", "https://bbc.com"
        ]  # 5 benign

        # Create exactly 1000 each (total 2000)
        phishing = phishing_urls * 167  # 6*167 = 1002
        benign = benign_urls * 200      # 5*200 = 1000

        # Trim to exact match
        urls = phishing[:1000] + benign[:1000]
        labels = ['phishing'] * 1000 + ['benign'] * 1000

        df = pd.DataFrame({'url': urls, 'label': labels})
        df.to_csv('/content/training_data.csv', index=False)
        print(f"‚úÖ Training data created: {len(df)} samples")
        print(df['label'].value_counts())
        return '/content/training_data.csv'

    def train_model(self, dataset_path=None):
        """Train ML model"""
        if dataset_path is None:
            dataset_path = self.create_training_data()

        df = pd.read_csv(dataset_path)
        print(f"üìä Dataset shape: {df.shape}")

        X = df['url'].fillna('')
        y = df['label'].map({'phishing': 1, 'benign': 0})

        print(f"Labels distribution:\n{y.value_counts()}")

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        X_train_vec = self.vectorizer.fit_transform(X_train)
        X_test_vec = self.vectorizer.transform(X_test)

        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.model.fit(X_train_vec, y_train)

        y_pred = self.model.predict(X_test_vec)
        print("\nüìà Model Accuracy:")
        print(classification_report(y_test, y_pred))

    def supercharged_ocr(self, image_path):
        """Advanced OCR for URL extraction"""
        if not os.path.exists(image_path):
            return []

        img = cv2.imread(image_path)
        if img is None:
            return []

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Multiple preprocessing
        processed = [
            gray,
            cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1],
            cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
        ]

        all_text = ""
        for proc_img in processed:
            pil_img = Image.fromarray(proc_img)
            enhanced = ImageEnhance.Contrast(pil_img.filter(ImageFilter.SHARPEN)).enhance(2.5)
            text = pytesseract.image_to_string(enhanced, config='--psm 6')
            all_text += text + " "

        # Extract URLs
        url_patterns = [
            r'https?://[^\s<>"]{8,}',
            r'www\.[^\s<>"]{8,}',
            r'\b[a-zA-Z0-9-]+\.(com|net|org|ru|info|co|ly)[^\s]*',
            r'(bit\.ly|t\.co|tinyurl)[^\s]*',
            r'(paypal|bank|amazon)[-_.]*(security|login|verify)[^\s]*'
        ]

        urls = []
        for pattern in url_patterns:
            urls.extend(re.findall(pattern, all_text, re.IGNORECASE))

        return list(set([u.strip() for u in urls if len(u) > 8]))

    def classify_url(self, url):
        """Classify single URL"""
        if self.model is None:
            score = sum(1 for pattern in self.phishing_patterns if re.search(pattern, url, re.IGNORECASE))
            return min(score / len(self.phishing_patterns), 1.0)

        try:
            url_vec = self.vectorizer.transform([url])
            prob = self.model.predict_proba(url_vec)[0]
            return prob[1] if len(prob) > 1 else 0.5
        except:
            return 0.5

    def analyze_image(self, image_path):
        """Complete analysis pipeline"""
        urls = self.supercharged_ocr(image_path)

        if not urls:
            return {
                'status': 'no_urls',
                'message': 'No URLs detected',
                'image': os.path.basename(image_path)
            }

        results = []
        spam_count = 0

        for url in urls:
            spam_prob = self.classify_url(url)
            is_spam = spam_prob > 0.6
            if is_spam:
                spam_count += 1

            results.append({
                'url': url,
                'spam_probability': round(spam_prob, 3),
                'is_spam': is_spam
            })

        return {
            'status': 'analyzed',
            'image': os.path.basename(image_path),
            'total_urls': len(urls),
            'spam_urls': spam_count,
            'has_spam': spam_count > 0,
            'urls': results
        }

# =============================================================================
# INITIALIZE & TRAIN (RUN THIS)
# =============================================================================
detector = CompletePhishingDetector()
detector.train_model()
print("üöÄ Detector ready!")


‚úÖ Training data created: 2000 samples
label
phishing    1000
benign      1000
Name: count, dtype: int64
üìä Dataset shape: (2000, 2)
Labels distribution:
label
1    1000
0    1000
Name: count, dtype: int64

üìà Model Accuracy:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       201
           1       1.00      1.00      1.00       199

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400

üöÄ Detector ready!


In [10]:
# =============================================================================
# STEP 3: UPLOAD DATASET (3 OPTIONS)
# =============================================================================
print("Choose ONE method to upload your images:")

def upload_and_analyze():
    print("\nüì§ OPTION 1: Direct Upload (Recommended)")
    print("Run this:")
    uploaded = files.upload()

    print("\nüì§ OPTION 2: Google Drive (Large datasets)")
    print("Run this:")
    print("from google.colab import drive")
    print("drive.mount('/content/drive')")

    print("\nüì§ OPTION 3: ZIP Upload")
    print("zip your folder ‚Üí upload ‚Üí run:")
    print("!unzip your_dataset.zip -d /content/dataset/")

    return list(uploaded.keys())

# Quick upload
uploaded_files = upload_and_analyze()


Choose ONE method to upload your images:

üì§ OPTION 1: Direct Upload (Recommended)
Run this:


Saving spam9.jpg to spam9.jpg
Saving spam8.jpg to spam8.jpg
Saving spam7.jpg to spam7.jpg
Saving spam6.jpg to spam6.jpg

üì§ OPTION 2: Google Drive (Large datasets)
Run this:
from google.colab import drive
drive.mount('/content/drive')

üì§ OPTION 3: ZIP Upload
zip your folder ‚Üí upload ‚Üí run:
!unzip your_dataset.zip -d /content/dataset/


In [11]:
# =============================================================================
# STEP 4: BATCH PROCESS ALL IMAGES
# =============================================================================
def process_dataset(image_paths):
    """Process all uploaded images"""
    results = []

    for i, img_path in enumerate(image_paths):
        print(f"\nüîç [{i+1}/{len(image_paths)}] {os.path.basename(img_path)}")
        result = detector.analyze_image(img_path)
        results.append(result)

        if result['status'] == 'analyzed':

            print(f"   ‚úÖ Found {result['total_urls']} URLs")
            if result['has_spam']:
                print(f"   üö® SPAM DETECTED: {result['spam_urls']}/{result['total_urls']}")
                for url in result['urls'][:2]:
                    if url['is_spam']:
                        print(f"      ‚Üí {url['url']} ({url['spam_probability']})")
            print()

    # Create summary table
    df = pd.DataFrame([{
        'image': r['image'],
        'status': r['status'],
        'urls_found': r.get('total_urls', 0),
        'spam_detected': r.get('has_spam', False),
        'spam_count': r.get('spam_urls', 0)
    } for r in results])

    print("üìä FINAL SUMMARY")
    print("="*60)
    print(df.to_string(index=False))

    # Save results
    df.to_csv('/content/phishing_detection_results.csv', index=False)
    files.download('/content/phishing_detection_results.csv')

    return df

# AUTO-DETECT UPLOADED IMAGES
image_files = [f for f in os.listdir('.') if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
if image_files:
    print(f"üéØ Found {len(image_files)} images: {image_files[:3]}")
    results_df = process_dataset(image_files)
else:
    print("‚ùå No images found. Upload using STEP 3 first!")


üéØ Found 9 images: ['spam3.jpg', 'spam1.jpg', 'spam7.jpg']

üîç [1/9] spam3.jpg

üîç [2/9] spam1.jpg
   ‚úÖ Found 2 URLs
   üö® SPAM DETECTED: 1/2
      ‚Üí https://bit.ly/3ZR4H3P (0.89)


üîç [3/9] spam7.jpg
   ‚úÖ Found 4 URLs


üîç [4/9] spam5.jpg
   ‚úÖ Found 2 URLs


üîç [5/9] spam9.jpg
   ‚úÖ Found 3 URLs


üîç [6/9] spam8.jpg
   ‚úÖ Found 3 URLs


üîç [7/9] spam2.jpg

üîç [8/9] spam6.jpg
   ‚úÖ Found 4 URLs


üîç [9/9] spam4.jpg
   ‚úÖ Found 2 URLs

üìä FINAL SUMMARY
    image   status  urls_found  spam_detected  spam_count
spam3.jpg  no_urls           0          False           0
spam1.jpg analyzed           2           True           1
spam7.jpg analyzed           4          False           0
spam5.jpg analyzed           2          False           0
spam9.jpg analyzed           3          False           0
spam8.jpg analyzed           3          False           0
spam2.jpg  no_urls           0          False           0
spam6.jpg analyzed           4          Fals

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>