In [1]:
# ====================================================
# WhatsApp Scam Detection - Baseline Model
# ====================================================

# 1. Imports
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# 2. Load dataset
df = pd.read_csv("/kaggle/input/whatsapp-scam/whatsapp_scam_dataset.csv")

print("Dataset shape:", df.shape)
print(df.head())

# 3. Define features & labels
X = df["message"]                # input text
y = df["scam_type"]              # multiclass labels (phishing, fake loan, etc.)

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Text vectorization
vectorizer = TfidfVectorizer(
    stop_words="english", 
    max_features=5000,    # keep it light
    ngram_range=(1,2)     # unigrams + bigrams
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 6. Train model
model = LogisticRegression(max_iter=200, class_weight="balanced")
model.fit(X_train_vec, y_train)

# 7. Evaluate
y_pred = model.predict(X_test_vec)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 8. Test with new messages
sample_msgs = [
    "Click here to win 10000 instantly"
]

sample_vec = vectorizer.transform(sample_msgs)
predictions = model.predict(sample_vec)

for msg, pred in zip(sample_msgs, predictions):
    print(f"\nMessage: {msg}\nPredicted Scam Type: {pred}")

# ====================================================
# 9. Save the model and vectorizer
# ====================================================
joblib.dump(model, "scam_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

print("\n✅ Model and vectorizer saved successfully!")


Dataset shape: (10000, 7)
   id                      scam_type  \
0   1   Phishing Scam (Link Sharing)   
1   2      Fake Discount/Refund Scam   
2   3        Fake Loan Approval Scam   
3   4  WhatsApp Account Hacking Scam   
4   5           Fake E-commerce Scam   

                                             message  \
0  Your Axis account is at risk. Click here to ve...   
1  You are eligible for a ₹53597 refund from Chad...   
2  You are pre-approved for a ₹77178 loan. Pay ₹6...   
3  Hey, this is Bhamini. I accidentally sent my O...   
4  Get Smartwatch for just ₹82437. DM on WhatsApp...   

                                         description language  \
0  Phishing links mimic legitimate websites and t...  English   
1  Scammers impersonate companies and request ban...  English   
2  Scammers offer fake loans and ask for an upfro...  English   
3  A scammer pretends to be a friend and tricks v...  English   
4  Fraudsters advertise too-good-to-be-true deals...  English   

    t

In [2]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predictions
y_pred = model.predict(X_test_vec)

# Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {acc:.2f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Model Accuracy: 1.00

Classification Report:
                                precision    recall  f1-score   support

Cryptocurrency Investment Scam       1.00      1.00      1.00       147
    Fake Charity/Donation Scam       1.00      1.00      1.00       142
     Fake Discount/Refund Scam       1.00      1.00      1.00       137
          Fake E-commerce Scam       1.00      1.00      1.00       149
           Fake Job Offer Scam       1.00      1.00      1.00       149
       Fake Loan Approval Scam       1.00      1.00      1.00       138
   Fake Technical Support Scam       1.00      1.00      1.00       142
       Friend in Distress Scam       1.00      1.00      1.00       133
  Phishing Scam (Link Sharing)       1.00      1.00      1.00       144
     SIM Card Replacement Scam       1.00      1.00      1.00       149
               Tax Refund Scam       1.00      1.00      1.00       133
                      UPI Scam       1.00      1.00      1.00       145
 WhatsApp Account

In [3]:
!pip install python-whois pyzbar

Collecting python-whois
  Downloading python_whois-0.9.5-py3-none-any.whl.metadata (2.6 kB)
Collecting pyzbar
  Downloading pyzbar-0.1.9-py2.py3-none-any.whl.metadata (10 kB)
Downloading python_whois-0.9.5-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyzbar-0.1.9-py2.py3-none-any.whl (32 kB)
Installing collected packages: pyzbar, python-whois
Successfully installed python-whois-0.9.5 pyzbar-0.1.9


In [4]:
import re
import socket
import whois
import datetime
from urllib.parse import urlparse

# -------------------------------
# 1. URL Extraction Helper
# -------------------------------
def extract_url(text):
    url_pattern = r"(https?://\S+|www\.\S+)"
    urls = re.findall(url_pattern, text)
    return urls[0] if urls else None

# -------------------------------
# 2. URL Intelligence Module
# -------------------------------
def check_url_risk(url):
    score = 0
    reasons = []
    parsed = urlparse(url)
    
    # Rule 1: Check HTTPS
    if parsed.scheme != "https":
        score += 30
        reasons.append("No HTTPS detected")
    
    # Rule 2: Suspicious domain (too long or has numbers)
    domain = parsed.netloc
    if any(char.isdigit() for char in domain) or len(domain) > 30:
        score += 20
        reasons.append("Suspicious domain name")
    
    # Rule 3: Domain age
    try:
        domain_info = whois.whois(domain)
        if isinstance(domain_info.creation_date, list):
            creation_date = domain_info.creation_date[0]
        else:
            creation_date = domain_info.creation_date
        
        if creation_date:
            age_days = (datetime.datetime.now() - creation_date).days
            if age_days < 180:  # less than 6 months old
                score += 30
                reasons.append("Newly registered domain")
    except Exception as e:
        score += 10
        reasons.append("Domain info unavailable")
    
    # Final decision
    risk_level = "Low"
    if score >= 60:
        risk_level = "High"
    elif score >= 30:
        risk_level = "Medium"
    
    return {
        "risk_score": score,
        "risk_level": risk_level,
        "reasons": reasons
    }

# -------------------------------
# 3. Hybrid Fraud Detection
# -------------------------------
def analyze_message(text, model, vectorizer):
    url = extract_url(text)
    
    if url:  # Case 1: URL present
        url_result = check_url_risk(url)
        return {
            "message": text,
            "type": "URL Analysis",
            "url": url,
            "risk_score": url_result["risk_score"],
            "risk_level": url_result["risk_level"],
            "details": url_result["reasons"]
        }
    
    else:  # Case 2: No URL → run NLP scam classifier
        vec = vectorizer.transform([text])
        pred = model.predict(vec)[0]
        prob = model.predict_proba(vec).max() * 100
        
        return {
            "message": text,
            "type": "Text Classification",
            "predicted_label": pred,
            "confidence": round(prob, 2),
            "risk_level": "High" if prob > 80 else "Medium" if prob > 50 else "Low"
        }

# -------------------------------
# 4. Example Usage
# -------------------------------
sample_msgs = [
    "Click here to get 10x returns in one week",
    "https://axisbank.com/secure-login",
    "https://ax1sbnk-login.in/verify",
    "Hey, I accidentally sent you an OTP, please share it with me"
]

for msg in sample_msgs:
    result = analyze_message(msg, model, vectorizer)
    print("\n", result)



 {'message': 'Click here to get 10x returns in one week', 'type': 'Text Classification', 'predicted_label': 'Phishing Scam (Link Sharing)', 'confidence': 19.36, 'risk_level': 'Low'}

 {'message': 'https://axisbank.com/secure-login', 'type': 'URL Analysis', 'url': 'https://axisbank.com/secure-login', 'risk_score': 0, 'risk_level': 'Low', 'details': []}

 {'message': 'https://ax1sbnk-login.in/verify', 'type': 'URL Analysis', 'url': 'https://ax1sbnk-login.in/verify', 'risk_score': 20, 'risk_level': 'Low', 'details': ['Suspicious domain name']}

 {'message': 'Hey, I accidentally sent you an OTP, please share it with me', 'type': 'Text Classification', 'predicted_label': 'WhatsApp Account Hacking Scam', 'confidence': 97.24, 'risk_level': 'High'}


In [5]:
import re
import os
import cv2
import joblib
import whois
import datetime
import easyocr
from urllib.parse import urlparse

# ===============================
# Load pretrained text scam model
# ===============================
vectorizer = joblib.load("vectorizer.pkl")  # Your saved TF-IDF vectorizer
model = joblib.load("scam_model.pkl")      # Your trained text classifier

# ===============================
# URL Intelligence Module
# ===============================
def is_suspicious_url(url):
    try:
        domain = urlparse(url).netloc

        # 1. IP instead of domain
        if re.match(r"^\d{1,3}(\.\d{1,3}){3}$", domain):
            return True

        # 2. Too many subdomains
        if domain.count('.') > 3:
            return True

        # 3. WHOIS check
        try:
            w = whois.whois(domain)
            if w.creation_date:
                creation_date = w.creation_date[0] if isinstance(w.creation_date, list) else w.creation_date
                age_days = (datetime.datetime.now() - creation_date).days
                if age_days < 90:  # very new domain
                    return True
        except:
            return True  # if WHOIS fails, assume suspicious

        # 4. Suspicious TLDs
        if any(domain.endswith(tld) for tld in [".xyz", ".top", ".biz", ".shop", ".online"]):
            return True

        return False
    except:
        return True  # default suspicious if parsing fails


# ===============================
# Text Classification
# ===============================
def classify_text_message(text):
    X = vectorizer.transform([text])
    prediction = model.predict(X)[0]
    return prediction


# ===============================
# QR Code Check (OpenCV)
# ===============================
def check_qr_codes(image_path):
    img = cv2.imread(image_path)
    qr_detector = cv2.QRCodeDetector()

    data, points, _ = qr_detector.detectAndDecode(img)
    results = []

    if data:
        if data.startswith("http"):
            suspicious = is_suspicious_url(data)
            results.append({"qr_data": data, "suspicious": suspicious})
        else:
            results.append({"qr_data": data, "suspicious": False})

    return results


# ===============================
# OCR + Image Analysis
# ===============================
reader = easyocr.Reader(['en'])

def analyze_image(image_path):
    result = {"ocr_text": "", "text_prediction": None, "qr_results": []}

    # OCR text
    ocr_results = reader.readtext(image_path)
    extracted_text = " ".join([res[1] for res in ocr_results])
    result["ocr_text"] = extracted_text

    if extracted_text.strip():
        result["text_prediction"] = classify_text_message(extracted_text)

    # QR Code check
    qr_analysis = check_qr_codes(image_path)
    result["qr_results"] = qr_analysis

    return result


# ===============================
# Unified Fraud Detection
# ===============================
def detect_fraud(message, image_path=None):
    output = {
        "message": message,
        "url_flags": [],
        "text_prediction": None,
        "image_analysis": None
    }

    # 1. Text-based classification
    if message.strip():
        output["text_prediction"] = classify_text_message(message)

    # 2. Check for URLs inside message
    urls = re.findall(r'(https?://\S+)', message)
    for url in urls:
        suspicious = is_suspicious_url(url)
        output["url_flags"].append({"url": url, "suspicious": suspicious})

    # 3. Image-based analysis (OCR + QR)
    if image_path and os.path.exists(image_path):
        output["image_analysis"] = analyze_image(image_path)

    return output


In [6]:
# Example 1: Text with URL
msg1 = "Congratulations! You have won ₹1,00,000. Click here http://win-big-today.biz"
print(detect_fraud(msg1))

# Example 2: Safe text
msg2 = "Let's meet at 6 pm near the cafe."
print(detect_fraud(msg2))

# Example 3: Image with QR or scam text
msg3 = "Scan this QR to claim your prize!"
print(detect_fraud(msg3, image_path="/kaggle/input/scam-qr-png/WhatsApp Image 2025-09-09 at 13.07.59_522543d6.jpg"))

{'message': 'Congratulations! You have won ₹1,00,000. Click here http://win-big-today.biz', 'url_flags': [{'url': 'http://win-big-today.biz', 'suspicious': True}], 'text_prediction': 'Phishing Scam (Link Sharing)', 'image_analysis': None}
{'message': "Let's meet at 6 pm near the cafe.", 'url_flags': [], 'text_prediction': 'Friend in Distress Scam', 'image_analysis': None}
{'message': 'Scan this QR to claim your prize!', 'url_flags': [], 'text_prediction': 'Friend in Distress Scam', 'image_analysis': {'ocr_text': "PhonePe ACCEPTED HERE Scan & Using PhonePe App 4 Rajarshi Somvanshi 2025, All rights reserved, PhonePe Ltd (Formerly known as 'PhonePe Private Ltd') Pay", 'text_prediction': 'Fake Job Offer Scam', 'qr_results': []}}


In [7]:
import re
from collections import defaultdict, Counter

def parse_whatsapp_chat(file_path):
    messages = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Pattern: "05/09/25, 16:59 - Name: Message"
            match = re.match(r'(\d{2}/\d{2}/\d{2}), (\d{2}:\d{2}) - (.*?): (.*)', line)
            if match:
                date, time, user, msg = match.groups()
                messages.append({'date': date, 'time': time, 'user': user, 'message': msg})
            else:
                # Handle media-only lines or continuation of previous message
                if messages:
                    messages[-1]['message'] += " " + line.strip()
    return messages

def detect_bot_behavior(messages, admin_name):
    user_messages = defaultdict(list)
    for msg in messages:
        user_messages[msg['user']].append(msg['message'])

    bot_scores = {}
    for user, msgs in user_messages.items():
        if user == admin_name:
            continue
        repetitive_praise = sum(1 for m in msgs if any(word in m.lower() for word in ['good', 'great', 'thanks', 'ok', 'okay', 'bruh', 'ya']))
        links = sum(1 for m in msgs if re.search(r'http[s]?://', m))
        media_spam = sum(1 for m in msgs if '<media omitted>' in m.lower())
        high_volume = len(msgs)
        bot_scores[user] = {
            'repetitive_praise': repetitive_praise,
            'links': links,
            'media_spam': media_spam,
            'message_count': high_volume
        }
    
    return bot_scores

def summarize_chat(messages):
    # Join only the actual text messages, ignoring media
    all_text = ' '.join([m['message'] for m in messages if '<media omitted>' not in m['message'].lower()]).lower()
    keywords = Counter(re.findall(r'\b\w+\b', all_text))
    most_common = [k for k, v in keywords.most_common(5)]
    return f"Chat mainly discussed topics related to: {', '.join(most_common[:5])}."

# === Example usage ===
file_path = '/kaggle/input/whatsap-chats-downloaded/WhatsApp Chat with TriDevs.txt'  # your .txt file
admin_name = 'K₹SNA (Vibhu)'     # input the admin name

all_messages = parse_whatsapp_chat(file_path)
bot_analysis = detect_bot_behavior(all_messages, admin_name)
summary = summarize_chat(all_messages)

print("Bot-like behavior analysis:")
for user, scores in bot_analysis.items():
    print(f"{user}: {scores}")

print("\nChat Summary:")
print(summary)


Bot-like behavior analysis:
Aditya Kumar Singh VIT: {'repetitive_praise': 71, 'links': 2, 'media_spam': 141, 'message_count': 556}
Rajarshi Somvanshi: {'repetitive_praise': 149, 'links': 22, 'media_spam': 90, 'message_count': 850}

Chat Summary:
Chat mainly discussed topics related to: the, to, bhai, self, and.


In [8]:
# ==============================================================================
# SECTION 1: IMPORTS & INITIAL SETUP
# ==============================================================================
import os
import re
import glob
import json
import datetime
from collections import Counter, defaultdict
from urllib.parse import urlparse

# Suppress warnings and initialize libraries that are verbose on startup
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import warnings
warnings.filterwarnings("ignore")

import joblib
import pandas as pd
import numpy as np
import cv2
import easyocr
import whois
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

print("✅ All libraries loaded successfully.")

# Define paths for the writable Kaggle directory
WORKING_DIR = "/kaggle/working/"
VECTORIZER_PATH = os.path.join(WORKING_DIR, "vectorizer.pkl")
MODEL_PATH = os.path.join(WORKING_DIR, "scam_model.pkl")

# ==============================================================================
# SECTION 2: PLACEHOLDER MODEL CREATION
# ==============================================================================
def create_placeholder_models():
    """Creates dummy model and vectorizer files in /kaggle/working/ if they don't exist."""
    if not os.path.exists(VECTORIZER_PATH) or not os.path.exists(MODEL_PATH):
        print("🔧 Model files not found. Creating placeholder models...")
        dummy_data = {
            'message': ['claim your prize now', 'hi how are you', 'free money click here', 'send me the otp', 'let us meet tomorrow'],
            'scam_type': ['lottery_scam', 'not_scam', 'lottery_scam', 'hacking_scam', 'not_scam']
        }
        df = pd.DataFrame(dummy_data)
        vectorizer = TfidfVectorizer()
        X_vec = vectorizer.fit_transform(df['message'])
        model = LogisticRegression()
        model.fit(X_vec, df['scam_type'])
        joblib.dump(vectorizer, VECTORIZER_PATH)
        joblib.dump(model, MODEL_PATH)
        print(f"✅ Placeholder models created and saved in {WORKING_DIR}.")

# ==============================================================================
# SECTION 3: CORE INTELLIGENCE MODULES
# ==============================================================================
def check_url_risk(url):
    """Analyzes a URL for suspicious characteristics."""
    score, reasons = 0, []
    try:
        domain = urlparse(url).netloc
        if not url.startswith("https"): score += 30; reasons.append("URL is not secure (HTTP)")
        if re.match(r"^\d{1,3}(\.\d{1,3}){3}$", domain): score += 40; reasons.append("URL uses an IP address")
        if any(domain.endswith(tld) for tld in [".xyz", ".top", ".biz", ".shop"]): score += 25; reasons.append("URL uses a suspicious TLD")
        try:
            domain_info = whois.whois(domain)
            creation_date = domain_info.creation_date
            if creation_date:
                creation_date = creation_date[0] if isinstance(creation_date, list) else creation_date
                age_days = (datetime.datetime.now() - creation_date).days
                if age_days < 180: score += 30; reasons.append(f"Domain is very new ({age_days} days old)")
        except Exception: score += 10; reasons.append("WHOIS lookup failed")
    except Exception: return {"url": url, "risk_level": "High", "risk_score": 100, "reasons": ["URL is malformed"]}
    risk_level = "High" if score >= 60 else "Medium" if score >= 30 else "Low"
    return {"url": url, "risk_level": risk_level, "risk_score": min(score, 100), "reasons": reasons}

def check_qr_code(image_path):
    """Detects QR codes in an image and analyzes the decoded data."""
    try:
        img = cv2.imread(image_path)
        qr_detector = cv2.QRCodeDetector()
        data, _, _ = qr_detector.detectAndDecode(img)
        if data:
            if data.startswith("http"):
                return {"qr_data": data, "analysis": check_url_risk(data)}
            else:
                return {"qr_data": data, "analysis": {"risk_level": "Low", "reasons": ["QR contains non-URL text"]}}
    except Exception as e:
        return {"error": f"QR processing failed: {str(e)}"}
    return None

def analyze_image_content(image_path, reader, text_model, text_vectorizer):
    """Performs OCR and QR analysis on an image."""
    results = {"ocr_text": "", "text_scam_prediction": "N/A", "qr_analysis": check_qr_code(image_path)}
    try:
        extracted_text = " ".join(reader.readtext(image_path, detail=0, paragraph=True))
        if extracted_text.strip():
            results["ocr_text"] = extracted_text
            vec = text_vectorizer.transform([extracted_text])
            results["text_scam_prediction"] = text_model.predict(vec)[0]
    except Exception as e:
        results["ocr_text"] = f"OCR failed: {str(e)}"
    return results

def verify_sebi_id(sebi_id):
    """!! PLACEHOLDER !! Verifies a SEBI registration ID."""
    print(f"⚠️  Running placeholder for SEBI ID: {sebi_id}")
    return {"sebi_id": sebi_id, "status": "Verified (Dummy)" if len(sebi_id) > 5 else "Not Found (Dummy)"}

def analyze_stock_mentions(text):
    """!! PLACEHOLDER !! Finds stock symbols and checks for suspicious patterns."""
    stock_symbols = re.findall(r'\b[A-Z]{3,}\b', text)
    if not stock_symbols: return None
    print(f"⚠️  Running placeholder for stock analysis on: {stock_symbols}")
    suspicious_patterns = ["Message contains pump-and-dump keywords."] if any(kw in text.lower() for kw in ["guaranteed", "10x", "insider"]) else []
    return {"mentioned_stocks": list(set(stock_symbols)), "suspicious_patterns": suspicious_patterns}

def parse_whatsapp_chat(file_path):
    """Parses a WhatsApp .txt chat file into a list of messages."""
    messages = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            match = re.match(r'(\d{2}/\d{2}/\d{2,4}), (\d{2}:\d{2})\s-\s(.*?):\s(.*)', line)
            if match:
                messages.append({'user': match.group(3).strip(), 'message': match.group(4).strip()})
    return messages

def detect_bot_behavior(messages):
    """Analyzes chat messages for bot-like activity."""
    user_counts = Counter(msg['user'] for msg in messages)
    if not user_counts: return {"error": "No messages found."}
    admin_name = user_counts.most_common(1)[0][0]
    bot_indicators = defaultdict(lambda: {"praise_count": 0})
    for msg in messages:
        user = msg['user']
        if admin_name.lower() in msg['message'].lower() and any(praise in msg['message'].lower() for praise in ['good', 'great', 'thanks', 'amazing']):
            bot_indicators[user]["praise_count"] += 1
    potential_bots = {u: d for u, d in bot_indicators.items() if d["praise_count"] > 2 and u != admin_name}
    return {"detected_admin": admin_name, "potential_bot_activity": potential_bots or "None detected."}

# ==============================================================================
# SECTION 4: THE MAIN PIPELINE
# ==============================================================================
def run_full_pipeline(chat_file_path, image_file_paths):
    """Main function to run the entire fraud detection pipeline."""
    try:
        vectorizer = joblib.load(VECTORIZER_PATH)
        model = joblib.load(MODEL_PATH)
        reader = easyocr.Reader(['en'])
    except Exception as e: return {"error": f"Failed to load models or OCR: {e}"}

    final_report = {
        "summary": {}, "chat_analysis": {},
        "message_by_message_analysis": [], "image_analysis": []
    }

    print(f"💬 Analyzing chat file: {os.path.basename(chat_file_path)}...")
    messages = parse_whatsapp_chat(chat_file_path)
    final_report["chat_analysis"] = detect_bot_behavior(messages)

    print(f"🔬 Performing deep analysis on {len(messages)} messages...")
    scam_predictions = []
    for msg_data in messages:
        text = msg_data["message"]
        analysis = {"text_scam_prediction": model.predict(vectorizer.transform([text]))[0]}
        scam_predictions.append(analysis["text_scam_prediction"])
        if re.search(r'(https?://\S+)', text):
            analysis["url_analysis"] = [check_url_risk(url) for url in re.findall(r'(https?://\S+)', text)]
        if re.search(r'SEBI\s*:\s*([A-Z0-9]+)', text, re.I):
            analysis["sebi_id_verification"] = [verify_sebi_id(sid) for sid in re.findall(r'SEBI\s*:\s*([A-Z0-9]+)', text, re.I)]
        if analyze_stock_mentions(text):
            analysis["stock_mention_analysis"] = analyze_stock_mentions(text)
        if len(analysis) > 1:
            final_report["message_by_message_analysis"].append({"user": msg_data["user"], "message": text, "analysis": analysis})

    print(f"🖼️  Analyzing {len(image_file_paths)} image file(s)...")
    for img_path in image_file_paths:
        final_report["image_analysis"].append({
            "file_name": os.path.basename(img_path),
            "analysis": analyze_image_content(img_path, reader, model, vectorizer)
        })

    final_report["summary"] = {
        "total_messages_analyzed": len(messages),
        "total_images_analyzed": len(image_file_paths),
        "scam_types_detected": dict(Counter(p for p in scam_predictions if p != "not_scam"))
    }
    
    print("\n✅ Pipeline finished successfully!")
    return final_report

# ==============================================================================
# SECTION 5: SCRIPT EXECUTION FOR KAGGLE
# ==============================================================================

# --- 🔽 IMPORTANT: SET YOUR FILE PATHS HERE 🔽 ---

# 1. Replace with the path to your uploaded chat TXT file
chat_file = "/kaggle/input/whatsap-chats-downloaded/WhatsApp Chat with TriDevs.txt" # <-- CHANGE THIS

# 2. Replace with the path(s) to your uploaded PNG/JPG image(s)
#    Put file paths inside the square brackets, separated by commas.
image_files = [] # <-- CHANGE THIS, e.g., ["/kaggle/input/my-images/scam-image.png"]

# --- 🔼 NO MORE CHANGES NEEDED BELOW THIS LINE 🔼 ---

# Create dummy models if real ones are not available in /kaggle/working/
create_placeholder_models()

# Run the main pipeline
if os.path.exists(chat_file):
    results = run_full_pipeline(chat_file_path=chat_file, image_file_paths=image_files)
    
    # Print the final report as a clean JSON
    print("\n" + "="*50)
    print("          FINAL FRAUD DETECTION REPORT")
    print("="*50 + "\n")
    print(json.dumps(results, indent=4))
else:
    print(f"❌ ERROR: The chat file was not found at the path: {chat_file}")
    print("Please check the path in the 'SCRIPT EXECUTION FOR KAGGLE' section.")

✅ All libraries loaded successfully.
💬 Analyzing chat file: WhatsApp Chat with TriDevs.txt...
🔬 Performing deep analysis on 2246 messages...
⚠️  Running placeholder for stock analysis on: ['SNA', 'NLP']
⚠️  Running placeholder for stock analysis on: ['SNA', 'NLP']
⚠️  Running placeholder for stock analysis on: ['RAG']
⚠️  Running placeholder for stock analysis on: ['RAG']
⚠️  Running placeholder for stock analysis on: ['SNA', 'VIT']
⚠️  Running placeholder for stock analysis on: ['SNA', 'VIT']
⚠️  Running placeholder for stock analysis on: ['SNA']
⚠️  Running placeholder for stock analysis on: ['SNA']


2025-09-09 16:52:19,216 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['PDF']
⚠️  Running placeholder for stock analysis on: ['PDF']
⚠️  Running placeholder for stock analysis on: ['SNA']
⚠️  Running placeholder for stock analysis on: ['SNA']
⚠️  Running placeholder for stock analysis on: ['SNA']
⚠️  Running placeholder for stock analysis on: ['SNA']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['SNA', 'VIT']
⚠️  Running placeholder for stock analysis on: ['SNA', 'VIT']
⚠️  Running placeholder for stock analysis on: ['SNA']
⚠️  Running placeholder for stock analysis on: ['SNA']
⚠️  Running placeholder for stock analysis on: ['SNA', 'VIT']
⚠️  Running placeholder for stock analysis on: ['SNA', 'VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock ana

2025-09-09 16:52:20,785 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


⚠️  Running placeholder for stock analysis on: ['SNA', 'VIT']
⚠️  Running placeholder for stock analysis on: ['SNA', 'VIT']
⚠️  Running placeholder for stock analysis on: ['RAG']
⚠️  Running placeholder for stock analysis on: ['RAG']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['SNA']
⚠️  Running placeholder for stock analysis on: ['SNA']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['RANGA', 'BHAI']
⚠️  Running placeholder for stock analysi

2025-09-09 16:52:21,507 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['SNA']
⚠️  Running placeholder for stock analysis on: ['SNA']
⚠️  Running placeholder for stock analysis on: ['MKC']
⚠️  Running placeholder for stock analysis on: ['MKC']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Runnin

2025-09-09 16:52:28,273 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


⚠️  Running placeholder for stock analysis on: ['VIT']
⚠️  Running placeholder for stock analysis on: ['VIT']


2025-09-09 16:52:39,564 - whois.whois - ERROR - Error trying to connect to socket: closing socket - timed out


⚠️  Running placeholder for stock analysis on: ['HKUST', 'NISL']
⚠️  Running placeholder for stock analysis on: ['HKUST', 'NISL']


2025-09-09 16:52:50,453 - whois.whois - ERROR - Error trying to connect to socket: closing socket - timed out


🖼️  Analyzing 0 image file(s)...

✅ Pipeline finished successfully!

          FINAL FRAUD DETECTION REPORT

{
    "summary": {
        "total_messages_analyzed": 2246,
        "total_images_analyzed": 0,
        "scam_types_detected": {
            "Friend in Distress Scam": 2092,
            "Fake Job Offer Scam": 80,
            "Fake Discount/Refund Scam": 14,
            "Fake Technical Support Scam": 23,
            "Fake Charity/Donation Scam": 1,
            "Phishing Scam (Link Sharing)": 21,
            "Fake E-commerce Scam": 6,
            "Tax Refund Scam": 1,
            "WhatsApp Account Hacking Scam": 8
        }
    },
    "chat_analysis": {
        "detected_admin": "Rajarshi Somvanshi",
        "potential_bot_activity": "None detected."
    },
    "message_by_message_analysis": [
        {
            "user": "Rajarshi Somvanshi",
            "message": "@\u2068K\u20b9SNA (Vibhu)\u2069 hm jab NLP ka use krenge to data train to hm specific function ka hi krenge",
    

In [9]:
# ==============================================================================
# SECTION 1: IMPORTS & INITIAL SETUP
# ==============================================================================
import os
import re
import glob
import json
import datetime
from collections import Counter, defaultdict
from urllib.parse import urlparse
from io import StringIO

# Suppress warnings and initialize libraries that are verbose on startup
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import warnings
warnings.filterwarnings("ignore")

import joblib
import pandas as pd
import numpy as np
import cv2
import easyocr
import whois
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

print("✅ All libraries loaded successfully.")

# Define paths for the writable Kaggle directory
WORKING_DIR = "/kaggle/working/"
VECTORIZER_PATH = os.path.join(WORKING_DIR, "vectorizer.pkl")
MODEL_PATH = os.path.join(WORKING_DIR, "scam_model.pkl")

# ==============================================================================
# SECTION 2: PLACEHOLDER MODEL CREATION
# ==============================================================================
def create_placeholder_models():
    """Creates dummy model and vectorizer files in /kaggle/working/ if they don't exist."""
    if not os.path.exists(VECTORIZER_PATH) or not os.path.exists(MODEL_PATH):
        print("🔧 Model files not found. Creating placeholder models...")
        dummy_data = {
            'message': ['claim your prize now', 'hi how are you', 'free money click here', 'send me the otp', 'let us meet tomorrow'],
            'scam_type': ['lottery_scam', 'not_scam', 'lottery_scam', 'hacking_scam', 'not_scam']
        }
        df = pd.DataFrame(dummy_data)
        vectorizer = TfidfVectorizer()
        X_vec = vectorizer.fit_transform(df['message'])
        model = LogisticRegression()
        model.fit(X_vec, df['scam_type'])
        joblib.dump(vectorizer, VECTORIZER_PATH)
        joblib.dump(model, MODEL_PATH)
        print(f"✅ Placeholder models created and saved in {WORKING_DIR}.")

# ==============================================================================
# SECTION 3: CORE INTELLIGENCE MODULES
# ==============================================================================
def check_url_risk(url):
    """Analyzes a URL for suspicious characteristics."""
    score, reasons = 0, []
    try:
        domain = urlparse(url).netloc
        if not url.startswith("https"): score += 30; reasons.append("URL is not secure (HTTP)")
        if re.match(r"^\d{1,3}(\.\d{1,3}){3}$", domain): score += 40; reasons.append("URL uses an IP address")
        if any(domain.endswith(tld) for tld in [".xyz", ".top", ".biz", ".shop"]): score += 25; reasons.append("URL uses a suspicious TLD")
        try:
            domain_info = whois.whois(domain)
            creation_date = domain_info.creation_date
            if creation_date:
                creation_date = creation_date[0] if isinstance(creation_date, list) else creation_date
                age_days = (datetime.datetime.now() - creation_date).days
                if age_days < 180: score += 30; reasons.append(f"Domain is very new ({age_days} days old)")
        except Exception: score += 10; reasons.append("WHOIS lookup failed")
    except Exception: return {"url": url, "risk_level": "High", "risk_score": 100, "reasons": ["URL is malformed"]}
    risk_level = "High" if score >= 60 else "Medium" if score >= 30 else "Low"
    return {"url": url, "risk_level": risk_level, "risk_score": min(score, 100), "reasons": reasons}

def check_qr_code(image_path):
    """Detects QR codes in an image and analyzes the decoded data."""
    try:
        img = cv2.imread(image_path)
        qr_detector = cv2.QRCodeDetector()
        data, _, _ = qr_detector.detectAndDecode(img)
        if data:
            if data.startswith("http"):
                return {"qr_data": data, "analysis": check_url_risk(data)}
            else:
                return {"qr_data": data, "analysis": {"risk_level": "Low", "reasons": ["QR contains non-URL text"]}}
    except Exception as e:
        return {"error": f"QR processing failed: {str(e)}"}
    return None

def analyze_image_content(image_path, reader, text_model, text_vectorizer):
    """Performs OCR and QR analysis on an image."""
    results = {"ocr_text": "", "text_scam_prediction": "N/A", "qr_analysis": check_qr_code(image_path)}
    try:
        extracted_text = " ".join(reader.readtext(image_path, detail=0, paragraph=True))
        if extracted_text.strip():
            results["ocr_text"] = extracted_text
            vec = text_vectorizer.transform([extracted_text])
            results["text_scam_prediction"] = text_model.predict(vec)[0]
    except Exception as e:
        results["ocr_text"] = f"OCR failed: {str(e)}"
    return results

def verify_sebi_id(sebi_id):
    """!! PLACEHOLDER !! Verifies a SEBI registration ID."""
    if not sebi_id or not sebi_id.strip():
        return {"sebi_id": "Not Provided", "status": "N/A"}
    print(f"⚠️  Running placeholder for SEBI ID: {sebi_id}")
    return {"sebi_id": sebi_id, "status": "Verified (Dummy)" if len(sebi_id) > 10 and "INA" in sebi_id else "Not Found (Dummy)"}

def analyze_stock_mentions(text):
    """!! PLACEHOLDER !! Finds stock symbols mentioned in chat text."""
    stock_symbols = re.findall(r'\b[A-Z]{3,}\b', text)
    if not stock_symbols: return None
    print(f"⚠️  Running placeholder for stock mention analysis on: {stock_symbols}")
    suspicious_patterns = ["Message contains pump-and-dump keywords."] if any(kw in text.lower() for kw in ["guaranteed", "10x", "insider"]) else []
    return {"mentioned_stocks": list(set(stock_symbols)), "suspicious_patterns": suspicious_patterns}

def parse_whatsapp_chat(file_path):
    """Parses a WhatsApp .txt chat file into a list of messages."""
    messages = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            match = re.match(r'(\d{2}/\d{2}/\d{2,4}), (\d{2}:\d{2})\s-\s(.*?):\s(.*)', line)
            if match:
                messages.append({'user': match.group(3).strip(), 'message': match.group(4).strip()})
    return messages

def detect_bot_behavior(messages):
    """Analyzes chat messages for bot-like activity."""
    user_counts = Counter(msg['user'] for msg in messages)
    if not user_counts: return {"error": "No messages found."}
    admin_name = user_counts.most_common(1)[0][0]
    bot_indicators = defaultdict(lambda: {"praise_count": 0})
    for msg in messages:
        user = msg['user']
        if admin_name.lower() in msg['message'].lower() and any(praise in msg['message'].lower() for praise in ['good', 'great', 'thanks', 'amazing']):
            bot_indicators[user]["praise_count"] += 1
    potential_bots = {u: d for u, d in bot_indicators.items() if d["praise_count"] > 2 and u != admin_name}
    return {"detected_admin": admin_name, "potential_bot_activity": potential_bots or "None detected."}

# --- NEW: Bulk Deal Scraping Functions ---
def get_bse_bulk_deals():
    """Fetches daily bulk deal data from the BSE website."""
    try:
        url = "https://www.bseindia.com/markets/equity/EQReports/bulk_deals.aspx"
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        df_list = pd.read_html(StringIO(response.text))
        for df in df_list:
            if 'Security Name' in df.columns: return df
        return pd.DataFrame()
    except Exception as e:
        print(f"❌ Error fetching BSE bulk deal data: {e}")
        return pd.DataFrame()

def find_trades_by_entity(entity_name):
    """Searches daily bulk deals for transactions by a specific entity."""
    if not entity_name or not entity_name.strip():
        return {"error": "No entity name provided."}
    print(f"🔎 Searching for bulk deals by: '{entity_name}'...")
    deals_df = get_bse_bulk_deals()
    if deals_df.empty: return {"status": "Data not available or could not be parsed", "trades": []}
    entity_trades = deals_df[deals_df['Client Name'].str.contains(entity_name, case=False, na=False)].copy()
    if entity_trades.empty:
        return {"status": f"No bulk deals found for '{entity_name}' in today's data.", "trades": []}
    else:
        return {"status": f"Found {len(entity_trades)} bulk deal(s) for '{entity_name}'.", "trades": entity_trades.to_dict('records')}

# ==============================================================================
# SECTION 4: THE MAIN PIPELINE
# ==============================================================================
def run_full_pipeline(chat_file_path, image_folder_path, sebi_id_to_check, entity_name_to_track):
    """Main function to run the entire fraud detection pipeline."""
    try:
        vectorizer = joblib.load(VECTORIZER_PATH)
        model = joblib.load(MODEL_PATH)
        reader = easyocr.Reader(['en'])
    except Exception as e: return {"error": f"Failed to load models or OCR: {e}"}

    final_report = {
        "summary": {}, "manual_verifications": {}, "bulk_deal_analysis": {},
        "chat_analysis": {}, "message_by_message_analysis": [], "image_analysis": []
    }

    # Perform manual, one-time verifications
    final_report["manual_verifications"]["sebi_id_analysis"] = verify_sebi_id(sebi_id_to_check)
    final_report["bulk_deal_analysis"] = find_trades_by_entity(entity_name_to_track)
    
    # Analyze the chat file
    print(f"💬 Analyzing chat file: {os.path.basename(chat_file_path)}...")
    messages = parse_whatsapp_chat(chat_file_path)
    final_report["chat_analysis"] = detect_bot_behavior(messages)

    print(f"🔬 Performing deep analysis on {len(messages)} messages...")
    scam_predictions = []
    for msg_data in messages:
        text = msg_data["message"]
        analysis = {"text_scam_prediction": model.predict(vectorizer.transform([text]))[0]}
        scam_predictions.append(analysis["text_scam_prediction"])
        if re.search(r'(https?://\S+)', text):
            analysis["url_analysis"] = [check_url_risk(url) for url in re.findall(r'(https?://\S+)', text)]
        if analyze_stock_mentions(text):
            analysis["stock_mention_analysis"] = analyze_stock_mentions(text)
        if len(analysis) > 1:
            final_report["message_by_message_analysis"].append({"user": msg_data["user"], "message": text, "analysis": analysis})

    # Find and analyze all images in the specified folder
    image_files = glob.glob(os.path.join(image_folder_path, "*[.png|.jpg|.jpeg]"))
    print(f"🖼️  Analyzing {len(image_files)} image file(s) from folder...")
    for img_path in image_files:
        final_report["image_analysis"].append({
            "file_name": os.path.basename(img_path),
            "analysis": analyze_image_content(img_path, reader, model, vectorizer)
        })

    final_report["summary"] = {
        "total_messages_analyzed": len(messages),
        "total_images_analyzed": len(image_files),
        "scam_types_detected": dict(Counter(p for p in scam_predictions if p != "not_scam"))
    }
    
    print("\n✅ Pipeline finished successfully!")
    return final_report

# ==============================================================================
# SECTION 5: SCRIPT EXECUTION FOR KAGGLE
# ==============================================================================

# --- 🔽 IMPORTANT: SET YOUR FILE PATHS AND INPUTS HERE 🔽 ---

# 1. Replace with the path to your uploaded chat TXT file
chat_file = "/kaggle/input/whatsap-chats-downloaded/WhatsApp Chat with TriDevs.txt" # <-- CHANGE THIS

# 2. Replace with the path to your folder containing the images
image_folder = "/kaggle/input/scam-qr-png" # <-- CHANGE THIS

# 3. Enter the SEBI Registration ID you want to verify
sebi_id_input = "INZ000048660" # <-- CHANGE THIS (or leave as "" if none)

# 4. Enter the registered NAME of the entity to track in bulk deals
entity_name_input = "SUNLIGHT BROKING LLP" # <-- CHANGE THIS (e.g., "Goldman Sachs")

# --- 🔼 NO MORE CHANGES NEEDED BELOW THIS LINE 🔼 ---

# Create dummy models if real ones are not available in /kaggle/working/
create_placeholder_models()

# Run the main pipeline
if os.path.exists(chat_file):
    results = run_full_pipeline(
        chat_file_path=chat_file,
        image_folder_path=image_folder,
        sebi_id_to_check=sebi_id_input,
        entity_name_to_track=entity_name_input
    )
    
    # Print the final report as a clean JSON
    print("\n" + "="*50)
    print("          FINAL FRAUD DETECTION REPORT")
    print("="*50 + "\n")
    print(json.dumps(results, indent=4))
else:
    print(f"❌ ERROR: The chat file was not found at the path: {chat_file}")
    print("Please check the path in the 'SCRIPT EXECUTION FOR KAGGLE' section.")

✅ All libraries loaded successfully.
⚠️  Running placeholder for SEBI ID: INZ000048660
🔎 Searching for bulk deals by: 'SUNLIGHT BROKING LLP'...
💬 Analyzing chat file: WhatsApp Chat with TriDevs.txt...
🔬 Performing deep analysis on 2246 messages...
⚠️  Running placeholder for stock mention analysis on: ['SNA', 'NLP']
⚠️  Running placeholder for stock mention analysis on: ['SNA', 'NLP']
⚠️  Running placeholder for stock mention analysis on: ['RAG']
⚠️  Running placeholder for stock mention analysis on: ['RAG']
⚠️  Running placeholder for stock mention analysis on: ['SNA', 'VIT']
⚠️  Running placeholder for stock mention analysis on: ['SNA', 'VIT']
⚠️  Running placeholder for stock mention analysis on: ['SNA']
⚠️  Running placeholder for stock mention analysis on: ['SNA']


2025-09-09 16:52:56,155 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['PDF']
⚠️  Running placeholder for stock mention analysis on: ['PDF']
⚠️  Running placeholder for stock mention analysis on: ['SNA']
⚠️  Running placeholder for stock mention analysis on: ['SNA']
⚠️  Running placeholder for stock mention analysis on: ['SNA']
⚠️  Running placeholder for stock mention analysis on: ['SNA']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['SNA', 'VIT']
⚠️  Running placeholder for stock mention analysis on: ['SNA', 'VIT']
⚠️  Running placeholder for stock mention analysis on: ['SNA']
⚠️  Running placeholder for stock mention analysis on: ['SNA']
⚠️  Running placeholder for stock mention analysis on: ['SNA', 'VIT']
⚠️  Running placeholder for stock 

2025-09-09 16:52:57,661 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


⚠️  Running placeholder for stock mention analysis on: ['SNA', 'VIT']
⚠️  Running placeholder for stock mention analysis on: ['SNA', 'VIT']
⚠️  Running placeholder for stock mention analysis on: ['RAG']
⚠️  Running placeholder for stock mention analysis on: ['RAG']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['SNA']
⚠️  Running placeholder for stock mention analysis on: ['SNA']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention

2025-09-09 16:52:58,410 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['SNA']
⚠️  Running placeholder for stock mention analysis on: ['SNA']
⚠️  Running placeholder for stock mention analysis on: ['MKC']
⚠️  Running placeholder for stock mention analysis on: 

2025-09-09 16:53:05,516 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


⚠️  Running placeholder for stock mention analysis on: ['VIT']
⚠️  Running placeholder for stock mention analysis on: ['VIT']


2025-09-09 16:53:16,959 - whois.whois - ERROR - Error trying to connect to socket: closing socket - timed out


⚠️  Running placeholder for stock mention analysis on: ['HKUST', 'NISL']
⚠️  Running placeholder for stock mention analysis on: ['HKUST', 'NISL']


2025-09-09 16:53:27,581 - whois.whois - ERROR - Error trying to connect to socket: closing socket - timed out


🖼️  Analyzing 1 image file(s) from folder...

✅ Pipeline finished successfully!

          FINAL FRAUD DETECTION REPORT

{
    "summary": {
        "total_messages_analyzed": 2246,
        "total_images_analyzed": 1,
        "scam_types_detected": {
            "Friend in Distress Scam": 2092,
            "Fake Job Offer Scam": 80,
            "Fake Discount/Refund Scam": 14,
            "Fake Technical Support Scam": 23,
            "Fake Charity/Donation Scam": 1,
            "Phishing Scam (Link Sharing)": 21,
            "Fake E-commerce Scam": 6,
            "Tax Refund Scam": 1,
            "WhatsApp Account Hacking Scam": 8
        }
    },
    "manual_verifications": {
        "sebi_id_analysis": {
            "sebi_id": "INZ000048660",
            "status": "Not Found (Dummy)"
        }
    },
    "bulk_deal_analysis": {
        "status": "No bulk deals found for 'SUNLIGHT BROKING LLP' in today's data.",
        "trades": []
    },
    "chat_analysis": {
        "detected_admi

In [10]:
import requests
import pandas as pd
from io import StringIO

def get_bulk_deal_client_frequency():
    """
    Fetches daily bulk deal data from the BSE and calculates the frequency
    of each client name involved in the trades.
    """
    print("Fetching daily bulk deal data from BSE...")
    try:
        url = "https://www.bseindia.com/markets/equity/EQReports/bulk_deals.aspx"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        all_tables = pd.read_html(StringIO(response.text))
        
        deals_df = None
        for table in all_tables:
            if 'Client Name' in table.columns:
                deals_df = table
                break
        
        if deals_df is None or deals_df.empty:
            print("❌ Could not find or parse the bulk deals table today.")
            return None

        # Use value_counts() to get the frequency of each name
        name_counts = deals_df['Client Name'].value_counts()
        print("✅ Successfully calculated client name frequencies.")
        return name_counts

    except Exception as e:
        print(f"❌ An error occurred: {e}")
        return None

if __name__ == "__main__":
    client_frequencies = get_bulk_deal_client_frequency()
    
    if client_frequencies is not None and not client_frequencies.empty:
        print("\n" + "="*50)
        print("Frequency of Client Names in Today's Bulk Deals")
        print("="*50)
        # The result from value_counts() is a pandas Series
        for name, count in client_frequencies.items():
            print(f"- {name}: {count} trade(s)")
    else:
        print("\nNo bulk deals to analyze today.")

Fetching daily bulk deal data from BSE...
✅ Successfully calculated client name frequencies.

Frequency of Client Names in Today's Bulk Deals
- IRAGE BROKING SERVICES LLP: 6 trade(s)
- NEO APEX VENTURE LLP: 5 trade(s)
- MANSI SHARE AND STOCK BROKING PRIVATE LIMITED: 4 trade(s)
- NEO APEX SHARE BROKING SERVICES LLP: 4 trade(s)
- PRAS INVESTMENT PRIVATE LIMITED: 3 trade(s)
- B N RATHI SECURITIES LIMITED: 2 trade(s)
- VARANGA PROPERTIES PRIVATE LIMITED: 2 trade(s)
- SHAILESH DHAMELIYA: 2 trade(s)
- SHARE INDIA SECURITIES LIMITED: 2 trade(s)
- VIKRAMKUMAR KARANRAJ SAKARIA HUF: 2 trade(s)
- PRASHANT GUPTA: 2 trade(s)
- NIRAJ RAJNIKANT SHAH: 2 trade(s)
- NEOMILE CORPORATE ADVISORY PRIVATE LIMITED: 2 trade(s)
- GURVINDER SINGH: 2 trade(s)
- QE SECURITIES LLP: 2 trade(s)
- SAROJDEVI P GUPTA: 2 trade(s)
- AKSHAY PALIWAL: 2 trade(s)
- PARNIT VENTURES PRIVATE LIMITED: 2 trade(s)
- SYLPH TECHNOLOGIES LIMITED: 2 trade(s)
- F3 ADVISORS PRIVATE LIMITED: 2 trade(s)
- CHAUHAN NAGJIBHAI CHANDUBHAI: 2 tr

In [12]:
import requests
import pandas as pd
import re
from io import StringIO
from datetime import datetime

def get_bse_bulk_deals():
    """Fetches and parses daily bulk deal data from the BSE website."""
    print("Fetching daily bulk deal data...")
    try:
        url = "https://www.bseindia.com/markets/equity/EQReports/bulk_deals.aspx"
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        all_tables = pd.read_html(StringIO(response.text))
        for table in all_tables:
            if 'Client Name' in table.columns and 'Deal Date' in table.columns:
                print("✅ Bulk deal data fetched.")
                table.columns = table.columns.str.strip()
                # Convert date columns to datetime objects for comparison
                table['Deal Date'] = pd.to_datetime(table['Deal Date'], format='%d/%m/%Y')
                return table
        return pd.DataFrame()
    except Exception as e:
        print(f"❌ Error fetching BSE bulk deal data: {e}")
        return pd.DataFrame()

def parse_chat_for_advisor_mentions(file_path, advisor_name):
    """
    Parses a chat log to find all stock mentions made by a specific advisor.
    A stock is assumed to be any word in all caps with 3 or more letters.
    """
    print(f"Parsing chat log for messages by '{advisor_name}'...")
    stock_mentions = []
    stock_pattern = re.compile(r'\b[A-Z]{3,}\b')
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Match lines that contain a date, time, and the specific advisor's name
            match = re.match(r'(\d{2}/\d{2}/\d{2,4}),\s\d{2}:\d{2}\s-\s(.*?):\s(.*)', line)
            if match and advisor_name.lower() in match.group(2).lower():
                date_str, user, message = match.groups()
                message_date = datetime.strptime(date_str, '%d/%m/%y') # Assuming dd/mm/yy format
                
                # Find all potential stock symbols in the message
                symbols = stock_pattern.findall(message)
                for symbol in symbols:
                    stock_mentions.append({
                        "date": message_date,
                        "stock_symbol": symbol,
                        "message": message.strip()
                    })
    print(f"✅ Found {len(stock_mentions)} stock mentions by the advisor.")
    return stock_mentions

def cross_correlation_engine(chat_file, advisor_name):
    """
    The main engine to correlate chat mentions with bulk trades.
    """
    # Step 1: Get all recent bulk trades
    trades_df = get_bse_bulk_deals()
    if trades_df.empty:
        return ["Could not retrieve bulk deal data to perform analysis."]

    # Filter for trades made only by our target advisor
    advisor_trades = trades_df[trades_df['Client Name'].str.contains(advisor_name, case=False, na=False)].copy()
    if advisor_trades.empty:
        return [f"No recent bulk deals found for '{advisor_name}'. No correlation possible."]
    
    # Step 2: Get all stock mentions by the advisor from the chat log
    advisor_mentions = parse_chat_for_advisor_mentions(chat_file, advisor_name)
    if not advisor_mentions:
        return [f"No stock mentions found for '{advisor_name}' in the chat log."]
        
    # Step 3: Correlate mentions and trades to find red flags
    print("\nCorrelating trades and chat messages...")
    red_flags = []
    
    for mention in advisor_mentions:
        for _, trade in advisor_trades.iterrows():
            # Check if the mentioned stock name is part of the security name in the trade data
            if mention['stock_symbol'].lower() in trade['Security Name'].lower():
                time_delta = mention['date'] - trade['Deal Date']
                
                # --- Red Flag 1: Front-Running ---
                # Promotion happens within 7 days AFTER a bulk BUY
                if trade['Deal Type'].lower() == 'buy' and 0 <= time_delta.days <= 7:
                    flag = (f"🚨 POTENTIAL FRONT-RUNNING DETECTED:\n"
                            f"  -> Advisor promoted '{mention['stock_symbol']}' on {mention['date'].date()}\n"
                            f"  -> This was just {time_delta.days} day(s) AFTER their bulk BUY of {trade['Quantity']} shares on {trade['Deal Date'].date()}.\n")
                    red_flags.append(flag)

                # --- Red Flag 2: Pump & Dump ---
                # A bulk SELL happens within 30 days AFTER a promotion
                if trade['Deal Type'].lower() == 'sell' and 0 <= time_delta.days <= 30:
                    flag = (f"🚨 POTENTIAL PUMP & DUMP DETECTED:\n"
                            f"  -> Advisor made a bulk SELL of {trade['Quantity']} shares of {trade['Security Name']} on {trade['Deal Date'].date()}\n"
                            f"  -> This was {time_delta.days} day(s) AFTER they promoted '{mention['stock_symbol']}' on {mention['date'].date()}.\n")
                    red_flags.append(flag)

    return red_flags if red_flags else ["No suspicious correlations found between chat messages and recent bulk deals."]

if __name__ == "__main__":
    # --- 🔽 INPUTS FOR THE ANALYSIS 🔽 ---
    # 1. Path to your chat log file
    chat_log_file = "/kaggle/input/whatsap-chats-downloaded/WhatsApp Chat with TriDevs.txt" #<-- CHANGE THIS
    
    # 2. The exact name of the advisor as it appears in the chat and bulk deal data
    advisor_name_to_investigate = "Rajarshi Somvanshi" #<-- CHANGE THIS
    
    # --- Run the engine ---
    if not os.path.exists(chat_log_file):
        print(f"❌ ERROR: Chat file not found at '{chat_log_file}'")
    else:
        suspicious_activities = cross_correlation_engine(chat_log_file, advisor_name_to_investigate)
        
        print("\n" + "="*80)
        print("          Cross-Correlation Analysis Report")
        print("="*80)
        for activity in suspicious_activities:
            print(activity)

Fetching daily bulk deal data...
✅ Bulk deal data fetched.

          Cross-Correlation Analysis Report
No recent bulk deals found for 'Rajarshi Somvanshi'. No correlation possible.
