In [13]:
# ====================================================
# WhatsApp Scam Detection - Baseline Model
# ====================================================

# 1. Imports
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# 2. Load dataset
df = pd.read_csv("/kaggle/input/whatsapp-scam/whatsapp_scam_dataset.csv")

print("Dataset shape:", df.shape)
print(df.head())

# 3. Define features & labels
X = df["message"]                # input text
y = df["scam_type"]              # multiclass labels (phishing, fake loan, etc.)

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Text vectorization
vectorizer = TfidfVectorizer(
    stop_words="english", 
    max_features=5000,    # keep it light
    ngram_range=(1,2)     # unigrams + bigrams
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 6. Train model
model = LogisticRegression(max_iter=200, class_weight="balanced")
model.fit(X_train_vec, y_train)

# 7. Evaluate
y_pred = model.predict(X_test_vec)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 8. Test with new messages
sample_msgs = [
    "Click here to win 10000 instantly"
]

sample_vec = vectorizer.transform(sample_msgs)
predictions = model.predict(sample_vec)

for msg, pred in zip(sample_msgs, predictions):
    print(f"\nMessage: {msg}\nPredicted Scam Type: {pred}")

# ====================================================
# 9. Save the model and vectorizer
# ====================================================
joblib.dump(model, "scam_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

print("\n✅ Model and vectorizer saved successfully!")


Dataset shape: (10000, 7)
   id                      scam_type  \
0   1   Phishing Scam (Link Sharing)   
1   2      Fake Discount/Refund Scam   
2   3        Fake Loan Approval Scam   
3   4  WhatsApp Account Hacking Scam   
4   5           Fake E-commerce Scam   

                                             message  \
0  Your Axis account is at risk. Click here to ve...   
1  You are eligible for a ₹53597 refund from Chad...   
2  You are pre-approved for a ₹77178 loan. Pay ₹6...   
3  Hey, this is Bhamini. I accidentally sent my O...   
4  Get Smartwatch for just ₹82437. DM on WhatsApp...   

                                         description language  \
0  Phishing links mimic legitimate websites and t...  English   
1  Scammers impersonate companies and request ban...  English   
2  Scammers offer fake loans and ask for an upfro...  English   
3  A scammer pretends to be a friend and tricks v...  English   
4  Fraudsters advertise too-good-to-be-true deals...  English   

    t

In [14]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predictions
y_pred = model.predict(X_test_vec)

# Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {acc:.2f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Model Accuracy: 1.00

Classification Report:
                                precision    recall  f1-score   support

Cryptocurrency Investment Scam       1.00      1.00      1.00       147
    Fake Charity/Donation Scam       1.00      1.00      1.00       142
     Fake Discount/Refund Scam       1.00      1.00      1.00       137
          Fake E-commerce Scam       1.00      1.00      1.00       149
           Fake Job Offer Scam       1.00      1.00      1.00       149
       Fake Loan Approval Scam       1.00      1.00      1.00       138
   Fake Technical Support Scam       1.00      1.00      1.00       142
       Friend in Distress Scam       1.00      1.00      1.00       133
  Phishing Scam (Link Sharing)       1.00      1.00      1.00       144
     SIM Card Replacement Scam       1.00      1.00      1.00       149
               Tax Refund Scam       1.00      1.00      1.00       133
                      UPI Scam       1.00      1.00      1.00       145
 WhatsApp Account

In [8]:
!pip install python-whois pyzbar

Collecting pyzbar
  Downloading pyzbar-0.1.9-py2.py3-none-any.whl.metadata (10 kB)
Downloading pyzbar-0.1.9-py2.py3-none-any.whl (32 kB)
Installing collected packages: pyzbar
Successfully installed pyzbar-0.1.9


In [15]:
import re
import socket
import whois
import datetime
from urllib.parse import urlparse

# -------------------------------
# 1. URL Extraction Helper
# -------------------------------
def extract_url(text):
    url_pattern = r"(https?://\S+|www\.\S+)"
    urls = re.findall(url_pattern, text)
    return urls[0] if urls else None

# -------------------------------
# 2. URL Intelligence Module
# -------------------------------
def check_url_risk(url):
    score = 0
    reasons = []
    parsed = urlparse(url)
    
    # Rule 1: Check HTTPS
    if parsed.scheme != "https":
        score += 30
        reasons.append("No HTTPS detected")
    
    # Rule 2: Suspicious domain (too long or has numbers)
    domain = parsed.netloc
    if any(char.isdigit() for char in domain) or len(domain) > 30:
        score += 20
        reasons.append("Suspicious domain name")
    
    # Rule 3: Domain age
    try:
        domain_info = whois.whois(domain)
        if isinstance(domain_info.creation_date, list):
            creation_date = domain_info.creation_date[0]
        else:
            creation_date = domain_info.creation_date
        
        if creation_date:
            age_days = (datetime.datetime.now() - creation_date).days
            if age_days < 180:  # less than 6 months old
                score += 30
                reasons.append("Newly registered domain")
    except Exception as e:
        score += 10
        reasons.append("Domain info unavailable")
    
    # Final decision
    risk_level = "Low"
    if score >= 60:
        risk_level = "High"
    elif score >= 30:
        risk_level = "Medium"
    
    return {
        "risk_score": score,
        "risk_level": risk_level,
        "reasons": reasons
    }

# -------------------------------
# 3. Hybrid Fraud Detection
# -------------------------------
def analyze_message(text, model, vectorizer):
    url = extract_url(text)
    
    if url:  # Case 1: URL present
        url_result = check_url_risk(url)
        return {
            "message": text,
            "type": "URL Analysis",
            "url": url,
            "risk_score": url_result["risk_score"],
            "risk_level": url_result["risk_level"],
            "details": url_result["reasons"]
        }
    
    else:  # Case 2: No URL → run NLP scam classifier
        vec = vectorizer.transform([text])
        pred = model.predict(vec)[0]
        prob = model.predict_proba(vec).max() * 100
        
        return {
            "message": text,
            "type": "Text Classification",
            "predicted_label": pred,
            "confidence": round(prob, 2),
            "risk_level": "High" if prob > 80 else "Medium" if prob > 50 else "Low"
        }

# -------------------------------
# 4. Example Usage
# -------------------------------
sample_msgs = [
    "Click here to get 10x returns in one week",
    "https://axisbank.com/secure-login",
    "https://ax1sbnk-login.in/verify",
    "Hey, I accidentally sent you an OTP, please share it with me"
]

for msg in sample_msgs:
    result = analyze_message(msg, model, vectorizer)
    print("\n", result)



 {'message': 'Click here to get 10x returns in one week', 'type': 'Text Classification', 'predicted_label': 'Phishing Scam (Link Sharing)', 'confidence': 19.36, 'risk_level': 'Low'}

 {'message': 'https://axisbank.com/secure-login', 'type': 'URL Analysis', 'url': 'https://axisbank.com/secure-login', 'risk_score': 0, 'risk_level': 'Low', 'details': []}

 {'message': 'https://ax1sbnk-login.in/verify', 'type': 'URL Analysis', 'url': 'https://ax1sbnk-login.in/verify', 'risk_score': 20, 'risk_level': 'Low', 'details': ['Suspicious domain name']}

 {'message': 'Hey, I accidentally sent you an OTP, please share it with me', 'type': 'Text Classification', 'predicted_label': 'WhatsApp Account Hacking Scam', 'confidence': 97.24, 'risk_level': 'High'}


In [16]:
import re
import os
import cv2
import joblib
import whois
import datetime
import easyocr
from urllib.parse import urlparse

# ===============================
# Load pretrained text scam model
# ===============================
vectorizer = joblib.load("vectorizer.pkl")  # Your saved TF-IDF vectorizer
model = joblib.load("scam_model.pkl")      # Your trained text classifier

# ===============================
# URL Intelligence Module
# ===============================
def is_suspicious_url(url):
    try:
        domain = urlparse(url).netloc

        # 1. IP instead of domain
        if re.match(r"^\d{1,3}(\.\d{1,3}){3}$", domain):
            return True

        # 2. Too many subdomains
        if domain.count('.') > 3:
            return True

        # 3. WHOIS check
        try:
            w = whois.whois(domain)
            if w.creation_date:
                creation_date = w.creation_date[0] if isinstance(w.creation_date, list) else w.creation_date
                age_days = (datetime.datetime.now() - creation_date).days
                if age_days < 90:  # very new domain
                    return True
        except:
            return True  # if WHOIS fails, assume suspicious

        # 4. Suspicious TLDs
        if any(domain.endswith(tld) for tld in [".xyz", ".top", ".biz", ".shop", ".online"]):
            return True

        return False
    except:
        return True  # default suspicious if parsing fails


# ===============================
# Text Classification
# ===============================
def classify_text_message(text):
    X = vectorizer.transform([text])
    prediction = model.predict(X)[0]
    return prediction


# ===============================
# QR Code Check (OpenCV)
# ===============================
def check_qr_codes(image_path):
    img = cv2.imread(image_path)
    qr_detector = cv2.QRCodeDetector()

    data, points, _ = qr_detector.detectAndDecode(img)
    results = []

    if data:
        if data.startswith("http"):
            suspicious = is_suspicious_url(data)
            results.append({"qr_data": data, "suspicious": suspicious})
        else:
            results.append({"qr_data": data, "suspicious": False})

    return results


# ===============================
# OCR + Image Analysis
# ===============================
reader = easyocr.Reader(['en'])

def analyze_image(image_path):
    result = {"ocr_text": "", "text_prediction": None, "qr_results": []}

    # OCR text
    ocr_results = reader.readtext(image_path)
    extracted_text = " ".join([res[1] for res in ocr_results])
    result["ocr_text"] = extracted_text

    if extracted_text.strip():
        result["text_prediction"] = classify_text_message(extracted_text)

    # QR Code check
    qr_analysis = check_qr_codes(image_path)
    result["qr_results"] = qr_analysis

    return result


# ===============================
# Unified Fraud Detection
# ===============================
def detect_fraud(message, image_path=None):
    output = {
        "message": message,
        "url_flags": [],
        "text_prediction": None,
        "image_analysis": None
    }

    # 1. Text-based classification
    if message.strip():
        output["text_prediction"] = classify_text_message(message)

    # 2. Check for URLs inside message
    urls = re.findall(r'(https?://\S+)', message)
    for url in urls:
        suspicious = is_suspicious_url(url)
        output["url_flags"].append({"url": url, "suspicious": suspicious})

    # 3. Image-based analysis (OCR + QR)
    if image_path and os.path.exists(image_path):
        output["image_analysis"] = analyze_image(image_path)

    return output


In [18]:
# Example 1: Text with URL
msg1 = "Congratulations! You have won ₹1,00,000. Click here http://win-big-today.biz"
print(detect_fraud(msg1))

# Example 2: Safe text
msg2 = "Let's meet at 6 pm near the cafe."
print(detect_fraud(msg2))

# Example 3: Image with QR or scam text
msg3 = "Scan this QR to claim your prize!"
print(detect_fraud(msg3, image_path="/kaggle/input/scam-qr-png/WhatsApp Image 2025-09-09 at 13.07.59_522543d6.jpg"))

{'message': 'Congratulations! You have won ₹1,00,000. Click here http://win-big-today.biz', 'url_flags': [{'url': 'http://win-big-today.biz', 'suspicious': True}], 'text_prediction': 'Phishing Scam (Link Sharing)', 'image_analysis': None}
{'message': "Let's meet at 6 pm near the cafe.", 'url_flags': [], 'text_prediction': 'Friend in Distress Scam', 'image_analysis': None}
{'message': 'Scan this QR to claim your prize!', 'url_flags': [], 'text_prediction': 'Friend in Distress Scam', 'image_analysis': {'ocr_text': "PhonePe ACCEPTED HERE Scan & Using PhonePe App 4 Rajarshi Somvanshi 2025, All rights reserved, PhonePe Ltd (Formerly known as 'PhonePe Private Ltd') Pay", 'text_prediction': 'Fake Job Offer Scam', 'qr_results': []}}


In [20]:
import re
from collections import defaultdict, Counter

def parse_whatsapp_chat(file_path):
    messages = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Pattern: "05/09/25, 16:59 - Name: Message"
            match = re.match(r'(\d{2}/\d{2}/\d{2}), (\d{2}:\d{2}) - (.*?): (.*)', line)
            if match:
                date, time, user, msg = match.groups()
                messages.append({'date': date, 'time': time, 'user': user, 'message': msg})
            else:
                # Handle media-only lines or continuation of previous message
                if messages:
                    messages[-1]['message'] += " " + line.strip()
    return messages

def detect_bot_behavior(messages, admin_name):
    user_messages = defaultdict(list)
    for msg in messages:
        user_messages[msg['user']].append(msg['message'])

    bot_scores = {}
    for user, msgs in user_messages.items():
        if user == admin_name:
            continue
        repetitive_praise = sum(1 for m in msgs if any(word in m.lower() for word in ['good', 'great', 'thanks', 'ok', 'okay', 'bruh', 'ya']))
        links = sum(1 for m in msgs if re.search(r'http[s]?://', m))
        media_spam = sum(1 for m in msgs if '<media omitted>' in m.lower())
        high_volume = len(msgs)
        bot_scores[user] = {
            'repetitive_praise': repetitive_praise,
            'links': links,
            'media_spam': media_spam,
            'message_count': high_volume
        }
    
    return bot_scores

def summarize_chat(messages):
    # Join only the actual text messages, ignoring media
    all_text = ' '.join([m['message'] for m in messages if '<media omitted>' not in m['message'].lower()]).lower()
    keywords = Counter(re.findall(r'\b\w+\b', all_text))
    most_common = [k for k, v in keywords.most_common(5)]
    return f"Chat mainly discussed topics related to: {', '.join(most_common[:5])}."

# === Example usage ===
file_path = '/kaggle/input/whatsap-chats-downloaded/WhatsApp Chat with TriDevs.txt'  # your .txt file
admin_name = 'K₹SNA (Vibhu)'     # input the admin name

all_messages = parse_whatsapp_chat(file_path)
bot_analysis = detect_bot_behavior(all_messages, admin_name)
summary = summarize_chat(all_messages)

print("Bot-like behavior analysis:")
for user, scores in bot_analysis.items():
    print(f"{user}: {scores}")

print("\nChat Summary:")
print(summary)


Bot-like behavior analysis:
Aditya Kumar Singh VIT: {'repetitive_praise': 71, 'links': 2, 'media_spam': 141, 'message_count': 556}
Rajarshi Somvanshi: {'repetitive_praise': 149, 'links': 22, 'media_spam': 90, 'message_count': 850}

Chat Summary:
Chat mainly discussed topics related to: the, to, bhai, self, and.
