In [2]:
!pip install python-whois pyzbar

Collecting python-whois
  Downloading python_whois-0.9.5-py3-none-any.whl.metadata (2.6 kB)
Collecting pyzbar
  Downloading pyzbar-0.1.9-py2.py3-none-any.whl.metadata (10 kB)
Downloading python_whois-0.9.5-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading pyzbar-0.1.9-py2.py3-none-any.whl (32 kB)
Installing collected packages: pyzbar, python-whois
Successfully installed python-whois-0.9.5 pyzbar-0.1.9


In [4]:
import requests
import pandas as pd
from io import StringIO

def get_bulk_deal_client_frequency(timeout=30):
    """
    Fetches daily bulk deal data from the BSE and calculates the frequency
    of each client name involved in the trades.

    Args:
        timeout: Seconds to wait for the BSE server before giving up.

    Returns:
        The pandas Series produced by ``value_counts()`` on the
        'Client Name' column (client name -> number of rows), or None when
        the page cannot be fetched/parsed or no matching table is found.
    """
    print("Fetching daily bulk deal data from BSE...")
    try:
        url = "https://www.bseindia.com/markets/equity/EQReports/bulk_deals.aspx"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        all_tables = pd.read_html(StringIO(response.text))

        deals_df = None
        for table in all_tables:
            # Scraped headers may carry stray whitespace; compare on the
            # stripped names so the lookup below cannot miss the column.
            normalized = [str(col).strip() for col in table.columns]
            if 'Client Name' in normalized:
                deals_df = table.copy()
                deals_df.columns = normalized
                break

        if deals_df is None or deals_df.empty:
            print("❌ Could not find or parse the bulk deals table today.")
            return None

        # Use value_counts() to get the frequency of each name
        name_counts = deals_df['Client Name'].value_counts()
        print("✅ Successfully calculated client name frequencies.")
        return name_counts

    except Exception as e:
        # Best-effort scraper: report the problem and signal failure with None.
        print(f"❌ An error occurred: {e}")
        return None

if __name__ == "__main__":
    client_frequencies = get_bulk_deal_client_frequency()

    # Handle the "nothing to show" case first, then print the report.
    if client_frequencies is None or client_frequencies.empty:
        print("\nNo bulk deals to analyze today.")
    else:
        divider = "=" * 50
        print("\n" + divider)
        print("Frequency of Client Names in Today's Bulk Deals")
        print(divider)
        # value_counts() yields a Series; items() walks (name, count) pairs.
        for name, count in client_frequencies.items():
            print(f"- {name}: {count} trade(s)")

Fetching daily bulk deal data from BSE...
✅ Successfully calculated client name frequencies.

Frequency of Client Names in Today's Bulk Deals
- IRAGE BROKING SERVICES LLP: 6 trade(s)
- NEO APEX VENTURE LLP: 5 trade(s)
- MANSI SHARE AND STOCK BROKING PRIVATE LIMITED: 4 trade(s)
- NEO APEX SHARE BROKING SERVICES LLP: 4 trade(s)
- PRAS INVESTMENT PRIVATE LIMITED: 3 trade(s)
- PARNIT VENTURES PRIVATE LIMITED: 2 trade(s)
- B N RATHI SECURITIES LIMITED: 2 trade(s)
- AKSHAY PALIWAL: 2 trade(s)
- SYLPH TECHNOLOGIES LIMITED: 2 trade(s)
- SAROJDEVI P GUPTA: 2 trade(s)
- BHATIA NIKHIL MURLIDHAR: 2 trade(s)
- ISHAAN TRADEFIN LLP: 2 trade(s)
- F3 ADVISORS PRIVATE LIMITED: 2 trade(s)
- QE SECURITIES LLP: 2 trade(s)
- VARANGA PROPERTIES PRIVATE LIMITED: 2 trade(s)
- PRASHANT GUPTA: 2 trade(s)
- VIKRAMKUMAR KARANRAJ SAKARIA HUF: 2 trade(s)
- CHAUHAN NAGJIBHAI CHANDUBHAI: 2 trade(s)
- KAMLESH NAVINCHANDRA SHAH: 2 trade(s)
- SHAILESH DHAMELIYA: 2 trade(s)
- SHARE INDIA SECURITIES LIMITED: 2 trade(s)
- N

In [5]:
import requests
import pandas as pd
import re
from io import StringIO
from datetime import datetime

def get_bse_bulk_deals(timeout=30):
    """Fetches and parses daily bulk deal data from the BSE website.

    Args:
        timeout: Seconds to wait for the BSE server before giving up.

    Returns:
        The first scraped table that has both 'Client Name' and 'Deal Date'
        columns, with stripped column names and 'Deal Date' converted to
        datetimes. An empty DataFrame is returned when no such table exists
        or when fetching/parsing fails.
    """
    print("Fetching daily bulk deal data...")
    try:
        url = "https://www.bseindia.com/markets/equity/EQReports/bulk_deals.aspx"
        headers = {'User-Agent': 'Mozilla/5.0'}
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        all_tables = pd.read_html(StringIO(response.text))
        for table in all_tables:
            # Strip header whitespace BEFORE testing for the columns; the
            # membership test would otherwise miss padded header names.
            normalized = [str(col).strip() for col in table.columns]
            if 'Client Name' in normalized and 'Deal Date' in normalized:
                print("✅ Bulk deal data fetched.")
                table = table.copy()
                table.columns = normalized
                # Convert date columns to datetime objects for comparison
                table['Deal Date'] = pd.to_datetime(table['Deal Date'], format='%d/%m/%Y')
                return table
        return pd.DataFrame()
    except Exception as e:
        print(f"❌ Error fetching BSE bulk deal data: {e}")
        return pd.DataFrame()

def parse_chat_for_advisor_mentions(file_path, advisor_name):
    """
    Parses a chat log to find all stock mentions made by a specific advisor.
    A stock is assumed to be any word in all caps with 3 or more letters.

    Args:
        file_path: Path to a UTF-8 chat export whose message lines look like
            ``dd/mm/yy, HH:MM - user: message`` (the year may also be 4 digits).
        advisor_name: Case-insensitive substring matched against the sender.

    Returns:
        A list of dicts with keys "date" (datetime), "stock_symbol" and
        "message", one entry per all-caps word found in a matching message.
    """
    print(f"Parsing chat log for messages by '{advisor_name}'...")
    stock_mentions = []
    stock_pattern = re.compile(r'\b[A-Z]{3,}\b')
    line_pattern = re.compile(r'(\d{2}/\d{2}/\d{2,4}),\s\d{2}:\d{2}\s-\s(.*?):\s(.*)')
    wanted = advisor_name.lower()

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Match lines that contain a date, time, and the specific advisor's name
            match = line_pattern.match(line)
            if not match or wanted not in match.group(2).lower():
                continue
            date_str, _user, message = match.groups()

            # The line regex accepts 2- to 4-digit years, so try both year
            # formats instead of assuming dd/mm/yy.
            message_date = None
            for date_format in ('%d/%m/%y', '%d/%m/%Y'):
                try:
                    message_date = datetime.strptime(date_str, date_format)
                    break
                except ValueError:
                    continue
            if message_date is None:
                # Not a real calendar date (e.g. 99/99/24): skip the line
                # rather than aborting the whole parse.
                continue

            # Find all potential stock symbols in the message
            for symbol in stock_pattern.findall(message):
                stock_mentions.append({
                    "date": message_date,
                    "stock_symbol": symbol,
                    "message": message.strip()
                })
    print(f"✅ Found {len(stock_mentions)} stock mentions by the advisor.")
    return stock_mentions

def cross_correlation_engine(chat_file, advisor_name):
    """
    The main engine to correlate chat mentions with bulk trades.

    Args:
        chat_file: Path to the chat export to scan.
        advisor_name: Name matched (case-insensitive, literal substring)
            against both the bulk-deal 'Client Name' and the chat sender.

    Returns:
        A list of human-readable strings: red-flag descriptions, or a single
        explanatory message when no analysis/correlation was possible.
    """
    # Step 1: Get all recent bulk trades
    trades_df = get_bse_bulk_deals()
    if trades_df.empty:
        return ["Could not retrieve bulk deal data to perform analysis."]

    # Filter for trades made only by our target advisor.
    # regex=False: names such as "X (HUF)" contain regex metacharacters and
    # must be matched literally.
    advisor_trades = trades_df[
        trades_df['Client Name'].str.contains(advisor_name, case=False, na=False, regex=False)
    ].copy()
    if advisor_trades.empty:
        return [f"No recent bulk deals found for '{advisor_name}'. No correlation possible."]

    # Step 2: Get all stock mentions by the advisor from the chat log
    advisor_mentions = parse_chat_for_advisor_mentions(chat_file, advisor_name)
    if not advisor_mentions:
        return [f"No stock mentions found for '{advisor_name}' in the chat log."]

    # Step 3: Correlate mentions and trades to find red flags
    print("\nCorrelating trades and chat messages...")
    red_flags = []

    for mention in advisor_mentions:
        symbol = mention['stock_symbol'].lower()
        for _, trade in advisor_trades.iterrows():
            # str(...) guards against NaN cells, which are floats and have no .lower()
            security_name = str(trade['Security Name']).lower()
            # Check if the mentioned stock name is part of the security name in the trade data
            if symbol not in security_name:
                continue
            # NOTE(review): the feed may encode deal types differently
            # (e.g. single letters) - confirm against the live table.
            deal_type = str(trade['Deal Type']).strip().lower()

            # --- Red Flag 1: Front-Running ---
            # Promotion happens within 7 days AFTER a bulk BUY
            time_delta = mention['date'] - trade['Deal Date']
            if deal_type == 'buy' and 0 <= time_delta.days <= 7:
                flag = (f"🚨 POTENTIAL FRONT-RUNNING DETECTED:\n"
                        f"  -> Advisor promoted '{mention['stock_symbol']}' on {mention['date'].date()}\n"
                        f"  -> This was just {time_delta.days} day(s) AFTER their bulk BUY of {trade['Quantity']} shares on {trade['Deal Date'].date()}.\n")
                red_flags.append(flag)

            # --- Red Flag 2: Pump & Dump ---
            # A bulk SELL happens within 30 days AFTER a promotion, so the
            # window is measured from the promotion to the trade (the
            # opposite direction of the front-running delta above).
            sell_delay = trade['Deal Date'] - mention['date']
            if deal_type == 'sell' and 0 <= sell_delay.days <= 30:
                flag = (f"🚨 POTENTIAL PUMP & DUMP DETECTED:\n"
                        f"  -> Advisor made a bulk SELL of {trade['Quantity']} shares of {trade['Security Name']} on {trade['Deal Date'].date()}\n"
                        f"  -> This was {sell_delay.days} day(s) AFTER they promoted '{mention['stock_symbol']}' on {mention['date'].date()}.\n")
                red_flags.append(flag)

    return red_flags if red_flags else ["No suspicious correlations found between chat messages and recent bulk deals."]

if __name__ == "__main__":
    # `os` is not imported by this cell's import block; import it here so the
    # existence check below also works on a freshly restarted kernel.
    import os

    # --- 🔽 INPUTS FOR THE ANALYSIS 🔽 ---
    # 1. Path to your chat log file
    chat_log_file = "/kaggle/input/whatsap-chats-downloaded/WhatsApp Chat with TriDevs.txt" #<-- CHANGE THIS
    
    # 2. The exact name of the advisor as it appears in the chat and bulk deal data
    advisor_name_to_investigate = "NEO APEX VENTURE LLP" #<-- CHANGE THIS
    
    # --- Run the engine ---
    if not os.path.exists(chat_log_file):
        print(f"❌ ERROR: Chat file not found at '{chat_log_file}'")
    else:
        suspicious_activities = cross_correlation_engine(chat_log_file, advisor_name_to_investigate)
        
        print("\n" + "="*80)
        print("          Cross-Correlation Analysis Report")
        print("="*80)
        for activity in suspicious_activities:
            print(activity)

Fetching daily bulk deal data...
✅ Bulk deal data fetched.
Parsing chat log for messages by 'NEO APEX VENTURE LLP'...
✅ Found 0 stock mentions by the advisor.

          Cross-Correlation Analysis Report
No stock mentions found for 'NEO APEX VENTURE LLP' in the chat log.


In [3]:

!pip install pandas scikit-learn joblib opencv-python easyocr python-whois requests beautifulsoup4 html5lib yfinance -q


import os
import re
import glob
import json
import datetime
from collections import Counter, defaultdict
from urllib.parse import urlparse
from io import StringIO

# --- Suppress Warnings ---
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import warnings
warnings.filterwarnings("ignore")

# --- Core Data Science & ML Imports ---
import joblib
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# --- Image & Web Scraping Imports ---
import cv2
import easyocr
import whois
import requests
import yfinance as yf

print("✅ All libraries loaded successfully.")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m85.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m69.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:

def train_scam_classifier(
    dataset_path="/kaggle/input/whatsapp-scam/whatsapp_scam_dataset.csv",
    output_dir="/kaggle/working/",
):
    """
    Loads the dataset, trains a text classifier, and saves the model
    and vectorizer to ``output_dir`` (the /kaggle/working/ directory by
    default).

    Args:
        dataset_path: CSV file with a "message" text column and a
            "scam_type" label column.
        output_dir: Directory that receives "vectorizer.pkl" and
            "scam_model.pkl"; created if it does not exist.

    Returns:
        True when both artifacts were written, False if any step failed
        (the error is printed rather than raised).
    """
    print("🚀 Starting model training process...")
    try:
        vectorizer_path = os.path.join(output_dir, "vectorizer.pkl")
        model_path = os.path.join(output_dir, "scam_model.pkl")

        # Load dataset; rows without text or label cannot be vectorized or
        # stratified, so drop them up front.
        df = pd.read_csv(dataset_path).dropna(subset=["message", "scam_type"])

        # Define features & labels
        X = df["message"].astype(str)
        y = df["scam_type"]

        # Train-test split (only the training part is used here; the held-out
        # 20% is discarded by this function).
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

        # Text vectorization
        vectorizer = TfidfVectorizer(stop_words="english", max_features=5000, ngram_range=(1,2))
        X_train_vec = vectorizer.fit_transform(X_train)

        # Train model
        model = LogisticRegression(max_iter=200, class_weight="balanced")
        model.fit(X_train_vec, y_train)

        # Save the model and vectorizer
        os.makedirs(output_dir, exist_ok=True)
        joblib.dump(vectorizer, vectorizer_path)
        joblib.dump(model, model_path)

        print(f"✅ Model and vectorizer saved successfully to {output_dir}")
        return True

    except Exception as e:
        print(f"❌ An error occurred during model training: {e}")
        print(f"    Ensure '{dataset_path}' is added to your notebook.")
        return False

# Run the training so vectorizer.pkl / scam_model.pkl exist in /kaggle/working/
# before run_analysis_pipeline tries to load them.
train_scam_classifier()

🚀 Starting model training process...
✅ Model and vectorizer saved successfully to /kaggle/working/


True

In [24]:
# ==============================================================================
# SECTION 3: COMPILED ANALYSIS PIPELINE
# ==============================================================================

# --- Helper Function: URL Intelligence ---
def check_url_risk(url):
    """Score a URL with a few heuristic risk checks.

    Points are added for a non-HTTPS scheme (+30), an IPv4-literal host
    (+40), a suspicious TLD (+25), a domain registered less than 180 days
    ago (+30) and a failed WHOIS lookup (+10).

    Returns:
        dict with "url", "risk_level" ("High" >= 60, "Medium" >= 30, else
        "Low"), "risk_score" (capped at 100) and "reasons". URLs that cannot
        be parsed or have no host are reported as High / 100 / malformed.
    """
    score, reasons = 0, []
    try:
        parsed = urlparse(url)
        # .hostname drops any port / userinfo and lower-cases the host, so
        # "1.2.3.4:8080" and "Example.XYZ:443" are still recognised below.
        domain = parsed.hostname
        if not domain:
            raise ValueError("URL has no host component")

        # Compare the parsed scheme instead of a raw string prefix.
        if parsed.scheme != "https":
            score += 30
            reasons.append("URL is not secure (HTTP)")
        if re.match(r"^\d{1,3}(\.\d{1,3}){3}$", domain):
            score += 40
            reasons.append("URL uses an IP address")
        if any(domain.endswith(tld) for tld in [".xyz", ".top", ".biz", ".shop"]):
            score += 25
            reasons.append("URL uses a suspicious TLD")

        try:
            domain_info = whois.whois(domain)
            creation_date = domain_info.creation_date
            if creation_date:
                if isinstance(creation_date, list):
                    creation_date = creation_date[0]
                # Use a "now" with the same tz-awareness as the WHOIS value;
                # mixing naive and aware datetimes raises TypeError, which
                # would be misreported as a failed lookup.
                now = datetime.datetime.now(getattr(creation_date, "tzinfo", None))
                age_days = (now - creation_date).days
                if age_days < 180:
                    score += 30
                    reasons.append(f"Domain is very new ({age_days} days old)")
        except Exception:
            # WHOIS is best-effort (network, rate limits, odd registries).
            score += 10
            reasons.append("WHOIS lookup failed")
    except Exception:
        return {"url": url, "risk_level": "High", "risk_score": 100, "reasons": ["URL is malformed"]}

    risk_level = "High" if score >= 60 else "Medium" if score >= 30 else "Low"
    return {"url": url, "risk_level": risk_level, "risk_score": min(score, 100), "reasons": reasons}

# --- Helper Function: Image & QR Intelligence ---
def check_qr_code(image_path):
    """Decode a QR code from an image file and assess its payload.

    Returns None when the image cannot be read, no QR payload is decoded, or
    any error occurs. Otherwise returns {"qr_data", "analysis"}, where the
    analysis is the URL risk report for payloads starting with "http" and a
    fixed low-risk note for anything else.
    """
    try:
        image = cv2.imread(image_path)
        if image is None:
            return None

        payload, _, _ = cv2.QRCodeDetector().detectAndDecode(image)
        if not payload:
            return None

        if payload.startswith("http"):
            verdict = check_url_risk(payload)
        else:
            verdict = {"risk_level": "Low", "reasons": ["QR contains non-URL text"]}
        return {"qr_data": payload, "analysis": verdict}
    except Exception:
        return None

def analyze_image_content(image_path, reader, text_model, text_vectorizer):
    """Run QR analysis and OCR-based text classification on one image.

    Args:
        image_path: Path of the image file.
        reader: OCR object exposing ``readtext(path, detail=0, paragraph=True)``.
        text_model: Classifier exposing ``predict``.
        text_vectorizer: Vectorizer exposing ``transform``.

    Returns:
        dict with "ocr_text", "text_scam_prediction" ("N/A" when no text was
        classified) and "qr_analysis". When OCR or classification fails, the
        defaults are kept and an "ocr_error" key describes the failure.
    """
    results = {"ocr_text": "", "text_scam_prediction": "N/A", "qr_analysis": check_qr_code(image_path)}
    try:
        extracted_text = " ".join(reader.readtext(image_path, detail=0, paragraph=True))
        if extracted_text.strip():
            results["ocr_text"] = extracted_text
            vec = text_vectorizer.transform([extracted_text])
            results["text_scam_prediction"] = text_model.predict(vec)[0]
    except Exception as exc:
        # Still best-effort, but record why this image produced no prediction
        # instead of hiding the failure completely.
        results["ocr_error"] = f"{type(exc).__name__}: {exc}"
    return results

# --- Helper Function: SEBI & Stock Mention (Placeholders) ---
def verify_sebi_id(sebi_id):
    """Placeholder SEBI registration check (no real lookup is performed).

    Blank or missing IDs yield a "Not Provided" record. Any other ID is
    reported as "Verified (Dummy)" when it is longer than 10 characters and
    contains "INA", otherwise as "Not Found (Dummy)".
    """
    if not (sebi_id and sebi_id.strip()):
        return {"sebi_id": "Not Provided", "status": "N/A"}

    print(f"⚠️  Running placeholder for SEBI ID: {sebi_id}")
    passes_dummy_rule = len(sebi_id) > 10 and "INA" in sebi_id
    if passes_dummy_rule:
        status = "Verified (Dummy)"
    else:
        status = "Not Found (Dummy)"
    return {"sebi_id": sebi_id, "status": status}

def analyze_stock_mentions(text):
    """Extract candidate stock symbols and pump-and-dump keywords from text.

    A candidate symbol is any standalone all-caps word of 3+ letters.

    Returns:
        None when the text has no candidate symbols; otherwise a dict with
        "mentioned_stocks" (unique symbols, sorted so the report is
        deterministic between runs) and "suspicious_patterns" (a one-element
        list when the text contains "guaranteed", "10x" or "insider",
        case-insensitively; empty otherwise).
    """
    stock_symbols = re.findall(r'\b[A-Z]{3,}\b', text)
    if not stock_symbols:
        return None

    lowered = text.lower()
    has_hype_keyword = any(kw in lowered for kw in ["guaranteed", "10x", "insider"])
    suspicious_patterns = ["Message contains pump-and-dump keywords."] if has_hype_keyword else []
    # sorted(): plain list(set(...)) order varies with string hash seeding.
    return {"mentioned_stocks": sorted(set(stock_symbols)), "suspicious_patterns": suspicious_patterns}

# --- Helper Function: Chat Analysis ---
def parse_chat_for_advisor_mentions(file_path, advisor_name):
    """Collect all-caps "stock symbol" mentions posted by a given sender.

    Lines are expected to look like ``dd/mm/yy, HH:MM - user: message``
    (2- or 4-digit year). ``advisor_name`` is matched as a case-insensitive
    substring of the sender. Returns one dict per symbol occurrence with keys
    "date" (datetime), "stock_symbol" and "message".
    """
    line_re = re.compile(r'(\d{2}/\d{2}/\d{2,4}),\s\d{2}:\d{2}\s-\s(.*?):\s(.*)')
    symbol_re = re.compile(r'\b[A-Z]{3,}\b')
    wanted_sender = advisor_name.lower()
    collected = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            hit = line_re.match(line)
            if hit is None or wanted_sender not in hit.group(2).lower():
                continue

            date_str, message = hit.group(1), hit.group(3)
            # Two-digit years first; fall back to four-digit years.
            try:
                message_date = datetime.datetime.strptime(date_str, '%d/%m/%y')
            except ValueError:
                message_date = datetime.datetime.strptime(date_str, '%d/%m/%Y')

            collected.extend(
                {"date": message_date, "stock_symbol": symbol, "message": message.strip()}
                for symbol in symbol_re.findall(message)
            )
    return collected

def detect_bot_behavior(file_path):
    """Count messages per sender and report the most active one.

    A sender is whatever sits between the first "- " and the following ":"
    on a line. Returns {"error": "No users found."} when no line matches,
    otherwise {"detected_admin": <most frequent sender>,
    "message_counts": {sender: count}}.
    """
    sender_re = re.compile(r'.*?-\s(.*?):')
    user_counts = Counter()

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            hit = sender_re.match(line)
            if hit:
                user_counts[hit.group(1).strip()] += 1

    if not user_counts:
        return {"error": "No users found."}

    admin_name, _ = user_counts.most_common(1)[0]
    return {"detected_admin": admin_name, "message_counts": dict(user_counts)}

# --- Helper Function: Bulk Deal & Cross-Correlation ---
def get_bse_bulk_deals(timeout=30):
    """Download the BSE bulk-deals page and return its deals table.

    Args:
        timeout: Seconds to wait for the BSE server before giving up.

    Returns:
        The first scraped table having both 'Client Name' and 'Deal Date'
        columns, with stripped column names and 'Deal Date' parsed as
        datetimes; an empty DataFrame when nothing matches or on any error.
    """
    try:
        url = "https://www.bseindia.com/markets/equity/EQReports/bulk_deals.aspx"
        headers = {'User-Agent': 'Mozilla/5.0'}
        # Without a timeout a stalled connection would block the pipeline.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        for table in pd.read_html(StringIO(response.text)):
            # Normalise header whitespace before looking for the columns, and
            # require 'Deal Date' too since it is parsed right below.
            normalized = [str(col).strip() for col in table.columns]
            if 'Client Name' in normalized and 'Deal Date' in normalized:
                table = table.copy()
                table.columns = normalized
                table['Deal Date'] = pd.to_datetime(table['Deal Date'], format='%d/%m/%Y')
                return table
    except Exception as e:
        # Best-effort: callers treat an empty frame as "no data", but say why.
        print(f"❌ Error fetching BSE bulk deal data: {e}")
        return pd.DataFrame()
    return pd.DataFrame()

def cross_correlation_engine(chat_file, advisor_name, trades_df):
    """Correlate an advisor's chat promotions with their bulk deals.

    Args:
        chat_file: Path of the chat export to scan.
        advisor_name: Matched as a literal, case-insensitive substring of the
            bulk-deal 'Client Name' and of the chat sender.
        trades_df: Bulk-deal table with 'Client Name', 'Security Name',
            'Deal Type' and 'Deal Date' columns.

    Returns:
        A list of strings: red-flag descriptions, or one explanatory message
        when there is nothing to correlate.
    """
    if trades_df.empty or not advisor_name:
        return ["Bulk deal data not available or no advisor name provided."]

    # regex=False: client names may contain regex metacharacters such as
    # parentheses, which must be matched literally.
    is_advisor = trades_df['Client Name'].str.contains(advisor_name, case=False, na=False, regex=False)
    advisor_trades = trades_df[is_advisor]
    if advisor_trades.empty:
        return [f"No bulk deals found for '{advisor_name}'."]

    advisor_mentions = parse_chat_for_advisor_mentions(chat_file, advisor_name)
    if not advisor_mentions:
        return [f"No stock mentions by '{advisor_name}' found in chat."]

    red_flags = []
    for mention in advisor_mentions:
        symbol = mention['stock_symbol'].lower()
        for _, trade in advisor_trades.iterrows():
            # str(...) guards against NaN cells, which have no .lower()
            if symbol not in str(trade['Security Name']).lower():
                continue
            deal_type = str(trade['Deal Type']).strip().lower()

            # Front-running: promotion 0-7 days AFTER a bulk buy.
            promo_delay = mention['date'] - trade['Deal Date']
            if deal_type == 'buy' and 0 <= promo_delay.days <= 7:
                red_flags.append(f"🚨 POTENTIAL FRONT-RUNNING: Advisor promoted '{mention['stock_symbol']}' on {mention['date'].date()} just {promo_delay.days} day(s) AFTER a bulk BUY.")

            # Pump & dump: bulk sell 0-30 days AFTER the promotion, so this
            # delta runs from the promotion to the trade.
            sell_delay = trade['Deal Date'] - mention['date']
            if deal_type == 'sell' and 0 <= sell_delay.days <= 30:
                red_flags.append(f"🚨 POTENTIAL PUMP & DUMP: Advisor made a bulk SELL of {trade['Security Name']} {sell_delay.days} day(s) AFTER promoting '{mention['stock_symbol']}'.")
    return red_flags if red_flags else ["No suspicious correlations found."]

# --- THE MAIN PIPELINE FUNCTION ---
def _parse_all_chat_messages(file_path):
    """Return the text of every recognisable chat message, one entry per line."""
    line_patterns = (
        # "dd/mm/yy, HH:MM - user: message"
        re.compile(r'(\d{2}/\d{2}/\d{2,4}),\s\d{2}:\d{2}\s-\s(.*?):\s(.*)'),
        # "YYYY-MM-DD HH:MM:SS+HH:MM - user: message"
        re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}[+-]\d{2}:\d{2}) - (.*?): (.*)'),
    )
    texts = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            for pattern in line_patterns:
                match = pattern.match(line)
                if match:
                    texts.append(match.group(3).strip())
                    break
    return texts


def run_analysis_pipeline(chat_file_path, image_folder_path, sebi_id_to_check, entity_name_to_track):
    """Orchestrates the entire fraud detection and analysis process.

    Args:
        chat_file_path: Path of the chat export to analyze.
        image_folder_path: Folder scanned for .png/.jpg/.jpeg images.
        sebi_id_to_check: SEBI registration ID for the placeholder check.
        entity_name_to_track: Name correlated between chat and bulk deals.

    Returns:
        A dict-like report with the keys "manual_verifications",
        "bulk_deal_analysis", "cross_correlation_analysis", "chat_analysis",
        "message_by_message_analysis", "image_analysis" and "summary"; or
        {"error": ...} when the model, vectorizer or OCR reader cannot load.
    """
    print("🚀 Starting Full Analysis Pipeline...")
    WORKING_DIR = "/kaggle/working/"
    VECTORIZER_PATH = os.path.join(WORKING_DIR, "vectorizer.pkl")
    MODEL_PATH = os.path.join(WORKING_DIR, "scam_model.pkl")
    try:
        vectorizer = joblib.load(VECTORIZER_PATH)
        model = joblib.load(MODEL_PATH)
        reader = easyocr.Reader(['en'], gpu=False) # Set gpu=False for Kaggle CPU environment
    except Exception as e:
        return {"error": f"Failed to load models or OCR: {e}"}

    final_report = defaultdict(dict)

    # 1. Perform one-time analyses
    print("📈 Analyzing market and chat-level data...")
    final_report["manual_verifications"]["sebi_id_analysis"] = verify_sebi_id(sebi_id_to_check)
    bulk_deals_df = get_bse_bulk_deals()
    final_report["bulk_deal_analysis"] = {"trades_found_today": len(bulk_deals_df)}
    final_report["cross_correlation_analysis"] = cross_correlation_engine(chat_file_path, entity_name_to_track, bulk_deals_df)
    final_report["chat_analysis"] = detect_bot_behavior(chat_file_path)

    # 2. Perform message-by-message and image analysis
    print("🔬 Analyzing individual messages and images...")
    # Read every message exactly once. The advisor-mention parser yields one
    # record per all-caps word, which duplicates messages that have several
    # such words and drops messages that have none.
    messages = _parse_all_chat_messages(chat_file_path)
    scam_predictions = []
    final_report["message_by_message_analysis"] = []
    for text in messages:
        analysis = {"text_scam_prediction": model.predict(vectorizer.transform([text]))[0]}
        scam_predictions.append(analysis["text_scam_prediction"])
        urls = re.findall(r'(https?://\S+)', text)
        if urls:
            analysis["url_analysis"] = [check_url_risk(url) for url in urls]
        stock_analysis = analyze_stock_mentions(text)
        if stock_analysis:
            analysis["stock_mention_analysis"] = stock_analysis
        # Only keep messages where something beyond the bare prediction was found.
        if len(analysis) > 1:
            final_report["message_by_message_analysis"].append({"message": text, "analysis": analysis})

    # "*[.png|.jpg|.jpeg]" is a glob character class (it matches any name
    # ending in one of those single characters), so filter by real extension.
    image_extensions = (".png", ".jpg", ".jpeg")
    image_files = sorted(
        path
        for path in glob.glob(os.path.join(image_folder_path, "*"))
        if os.path.splitext(path)[1].lower() in image_extensions
    )
    final_report["image_analysis"] = []
    for img_path in image_files:
        final_report["image_analysis"].append({"file_name": os.path.basename(img_path), "analysis": analyze_image_content(img_path, reader, model, vectorizer)})

    # 3. Compile summary
    final_report["summary"] = {
        "total_messages_analyzed": len(messages),
        "total_images_analyzed": len(image_files),
        "scam_types_detected": dict(Counter(p for p in scam_predictions if p != "not_scam"))
    }
    print("\n✅ Pipeline finished successfully!")
    return final_report

In [25]:
import os
# Inputs for this run.
chat_file = "/kaggle/input/sebi-sample-input/dataset/stock_market_Trader_Public_Group/chat/Extracted messages....txt"
image_folder = "/kaggle/input/scam-qr-png/"
sebi_id_input = "INZ000048660"
entity_name_input = "NEO APEX VENTURE LLP"

# Bail out early when the chat export is missing; otherwise run and report.
if not os.path.exists(chat_file):
    print(f"❌ ERROR: The chat file was not found at the path: {chat_file}")
else:
    results = run_analysis_pipeline(
        chat_file,
        image_folder,
        sebi_id_input,
        entity_name_input,
    )

    # Print the final report as a clean JSON
    divider = "=" * 50
    print("\n" + divider)
    print("          FINAL FRAUD DETECTION REPORT")
    print(divider + "\n")
    print(json.dumps(results, indent=4))

🚀 Starting Full Analysis Pipeline...
📈 Analyzing market and chat-level data...
⚠️  Running placeholder for SEBI ID: INZ000048660
🔬 Analyzing individual messages and images...

✅ Pipeline finished successfully!

          FINAL FRAUD DETECTION REPORT

{
    "manual_verifications": {
        "sebi_id_analysis": {
            "sebi_id": "INZ000048660",
            "status": "Not Found (Dummy)"
        }
    },
    "bulk_deal_analysis": {
        "trades_found_today": 123
    },
    "cross_correlation_analysis": [
        "No stock mentions by 'NEO APEX VENTURE LLP' found in chat."
    ],
    "chat_analysis": {
        "detected_admin": "vibhugupta555",
        "message_counts": {
            "Axin852000": 6,
            "Cryptohackfi": 28,
            "kalamisg010": 8,
            "Dabba_cliets_data_provider": 11,
            "Vikas_9010": 14,
            "None": 115,
            "MEPAYhao": 6,
            "cryptus455": 25,
            "vibhugupta555": 201,
            "Mcxgoldliveresearc

In [4]:
import os
import re
import json
import joblib
import whois
import cv2
import easyocr
import requests
import datetime
import pandas as pd
from io import StringIO
from urllib.parse import urlparse
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import glob


class FraudDetectionPipeline:
    """
    A self-contained class to run the multi-modal scam detection pipeline.
    """

    def __init__(self, model_path, vectorizer_path):
        """Load ML model, vectorizer, and OCR reader."""
        print("🚀 Initializing Fraud Detection Pipeline...")
        try:
            self.model = joblib.load(model_path)
            self.vectorizer = joblib.load(vectorizer_path)
            self.ocr_reader = easyocr.Reader(['en'], gpu=False)  # always CPU in Kaggle
            print("✅ Pipeline initialized successfully.")
        except Exception as e:
            raise IOError(
                f"❌ Failed to load models. Ensure files exist at the specified paths. Error: {e}"
            )

    def _check_url_risk(self, url):
        score, reasons = 0, []
        try:
            domain = urlparse(url).netloc
            if not url.startswith("https"):
                score += 30
                reasons.append("URL is not secure (HTTP)")
            if re.match(r"^\d{1,3}(\.\d{1,3}){3}$", domain):
                score += 40
                reasons.append("URL uses an IP address")
            if any(domain.endswith(tld) for tld in [".xyz", ".top", ".biz", ".shop"]):
                score += 25
                reasons.append("URL uses a suspicious TLD")

            try:
                domain_info = whois.whois(domain)
                creation_date = domain_info.creation_date
                if creation_date:
                    creation_date = (
                        creation_date[0]
                        if isinstance(creation_date, list)
                        else creation_date
                    )
                    age_days = (datetime.datetime.now() - creation_date).days
                    if age_days < 180:
                        score += 30
                        reasons.append(f"Domain is very new ({age_days} days old)")
            except Exception:
                score += 10
                reasons.append("WHOIS lookup failed")

        except Exception:
            return {
                "url": url,
                "risk_level": "High",
                "risk_score": 100,
                "reasons": ["URL is malformed"],
            }

        risk_level = "High" if score >= 60 else "Medium" if score >= 30 else "Low"
        return {
            "url": url,
            "risk_level": risk_level,
            "risk_score": min(score, 100),
            "reasons": reasons,
        }

    def _check_qr_code(self, image_path):
        try:
            img = cv2.imread(image_path)
            if img is None:
                return None
            qr_detector = cv2.QRCodeDetector()
            data, _, _ = qr_detector.detectAndDecode(img)
            if data:
                return {
                    "qr_data": data,
                    "analysis": self._check_url_risk(data)
                    if data.startswith("http")
                    else {"risk_level": "Low", "reasons": ["QR contains non-URL text"]},
                }
        except Exception:
            return None
        return None

    def _analyze_image_content(self, image_path):
        results = {
            "ocr_text": "",
            "text_scam_prediction": "N/A",
            "qr_analysis": self._check_qr_code(image_path),
        }
        try:
            extracted_text = " ".join(
                self.ocr_reader.readtext(image_path, detail=0, paragraph=True)
            )
            if extracted_text.strip():
                results["ocr_text"] = extracted_text
                vec = self.vectorizer.transform([extracted_text])
                results["text_scam_prediction"] = self.model.predict(vec)[0]
        except Exception:
            pass
        return results


    def _verify_sebi_id(self, sebi_id):
        if not sebi_id or not sebi_id.strip():
            return {"sebi_id": "Not Provided", "status": "N/A"}
        print(f"⚠️  Running placeholder for SEBI ID: {sebi_id}")
        return {
            "sebi_id": sebi_id,
            "status": "Verified (Dummy)"
            if len(sebi_id) > 10 and "INA" in sebi_id
            else "Not Found (Dummy)",
        }

    def _analyze_stock_mentions(self, text):
        stock_symbols = re.findall(r"\b[A-Z]{3,}\b", text)
        if not stock_symbols:
            return None
        suspicious_patterns = (
            ["Message contains pump-and-dump keywords."]
            if any(kw in text.lower() for kw in ["guaranteed", "10x", "insider"])
            else []
        )
        return {
            "mentioned_stocks": list(set(stock_symbols)),
            "suspicious_patterns": suspicious_patterns,
        }

    def _parse_chat_for_messages(self, chat_file_path, user_filter=""):
        messages = []
        # Updated regex for your dataset format
        pattern = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}[+]\d{2}:\d{2}) - (.*?): (.*)')
    
        with open(chat_file_path, "r", encoding="utf-8") as f:
            for line in f:
                match = pattern.match(line.strip())
                if match:
                    timestamp, user, message = match.groups()
                    if user_filter == "" or user_filter.lower() in user.lower():
                        messages.append({
                            "date": timestamp,
                            "user": user,
                            "message": message
                        })
        return messages

    def _detect_bot_behavior(self, file_path):
        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        users = [
            re.match(r".*?-\s(.*?):", line).group(1).strip()
            for line in lines
            if re.match(r".*?-\s(.*?):", line)
        ]
        if not users:
            return {"error": "No users found."}
        user_counts = Counter(users)
        admin_name = user_counts.most_common(1)[0][0]
        return {"detected_admin": admin_name, "message_counts": dict(user_counts)}

    def _get_bse_bulk_deals(self):
        try:
            url = "https://www.bseindia.com/markets/equity/EQReports/bulk_deals.aspx"
            headers = {"User-Agent": "Mozilla/5.0"}
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            for table in pd.read_html(StringIO(response.text)):
                if "Client Name" in table.columns:
                    table.columns = table.columns.str.strip()
                    table["Deal Date"] = pd.to_datetime(
                        table["Deal Date"], format="%d/%m/%Y"
                    )
                    return table
        except Exception:
            return pd.DataFrame()
        return pd.DataFrame()

    def _cross_correlation_engine(self, chat_file, advisor_name, trades_df):
        if trades_df.empty or not advisor_name:
            return ["Bulk deal data not available or no advisor name provided."]

        advisor_trades = trades_df[
            trades_df["Client Name"].str.contains(advisor_name, case=False, na=False)
        ]
        if advisor_trades.empty:
            return [f"No bulk deals found for '{advisor_name}'."]

        advisor_mentions = self._parse_chat_for_messages(chat_file, user_filter=advisor_name)
        if not advisor_mentions:
            return [f"No stock mentions by '{advisor_name}' found in chat."]

        red_flags = []
        for mention in advisor_mentions:
            for symbol in mention["stock_symbols"]:
                for _, trade in advisor_trades.iterrows():
                    if symbol.lower() in trade["Security Name"].lower():
                        time_delta = mention["date"] - trade["Deal Date"]
                        if trade["Deal Type"].lower() == "buy" and 0 <= time_delta.days <= 7:
                            red_flags.append(
                                f"🚨 POTENTIAL FRONT-RUNNING: Advisor promoted '{symbol}' on {mention['date'].date()} just {time_delta.days} day(s) AFTER a bulk BUY."
                            )
                        if trade["Deal Type"].lower() == "sell" and 0 <= time_delta.days <= 30:
                            red_flags.append(
                                f"🚨 POTENTIAL PUMP & DUMP: Advisor made a bulk SELL of {trade['Security Name']} {time_delta.days} day(s) AFTER promoting '{symbol}'."
                            )
        return red_flags if red_flags else ["No suspicious correlations found."]

    def _check_insider_trading(self, trades_df, insider_list, scam_messages):
        if trades_df.empty:
            return ["No bulk deal data available."]
        
        red_flags = []
        for _, trade in trades_df.iterrows():
            client = trade["Client Name"].lower()
            for insider in insider_list:
                if insider.lower() in client:
                    # Check overlap with scam messages
                    for msg in scam_messages:
                        for stock in msg.get("analysis", {}).get("stock_mention_analysis", {}).get("mentioned_stocks", []):
                            if stock.lower() in trade["Security Name"].lower():
                                red_flags.append(
                                    f"🚨 Insider {trade['Client Name']} traded {trade['Security Name']} "
                                    f"({trade['Deal Type']}) while scam messages circulated."
                                )
        return red_flags if red_flags else ["No insider red flags detected."]

        from transformers import pipeline
    import yfinance as yf

    def _sentiment_price_check(self, scam_messages):
        sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
        alerts = []
        for msg in scam_messages:
            text = msg["message"]
            stock_info = msg.get("analysis", {}).get("stock_mention_analysis")
            if not stock_info:
                continue
            
            sentiment = sentiment_analyzer(text[:250])[0]  # truncate long texts
            label = sentiment["label"].lower()
            for stock in stock_info["mentioned_stocks"]:
                try:
                    data = yf.download(stock + ".NS", period="5d", interval="1d")
                    if not data.empty:
                        change = data["Close"].iloc[-1] - data["Close"].iloc[-2]
                        if "positive" in label and change < 0:
                            alerts.append(f"🚨 Positive sentiment for {stock} but price fell → suspicious hype.")
                        if "negative" in label and change > 0:
                            alerts.append(f"🚨 Negative sentiment for {stock} but price rose → suspicious suppression.")
                except Exception:
                    continue
        return alerts if alerts else ["No suspicious sentiment/price mismatches."]


    def _social_trading_correlation(self, stock_symbols):
        # 🔎 Mock implementation — replace with Twitter/Telegram APIs
        trending_stocks = {"XYZ": ["#XYZstock trending on Twitter"], "ABC": ["Telegram hype detected"]}
        
        alerts = []
        for stock in stock_symbols:
            if stock in trending_stocks:
                alerts.append(
                    f"🚨 Coordinated manipulation risk: {stock} is trending online AND in bulk deals."
                )
        return alerts if alerts else ["No social + trading manipulation risk found."]
        

    def _portfolio_risk_alerts(self, holdings, scam_messages, trades_df):
        alerts = []
        for stock in holdings:
            for msg in scam_messages:
                stock_info = msg.get("analysis", {}).get("stock_mention_analysis")
                if stock_info and stock in stock_info["mentioned_stocks"]:
                    alerts.append(f"🚨 Scam messages detected for your holding: {stock}.")
            if not trades_df.empty and trades_df["Security Name"].str.contains(stock, case=False).any():
                alerts.append(f"🚨 Bulk deals detected for your holding: {stock}.")
        return alerts if alerts else ["No risks detected for portfolio."]

    
    
    def run(self, chat_file_path, image_folder_path, sebi_id_to_check, entity_name_to_track,
            insider_list=None, user_holdings=None):
        """Run the full fraud detection pipeline.

        Args:
            chat_file_path: Path of the exported chat text file.
            image_folder_path: Folder scanned for ``*.png``, ``*.jpg`` and
                ``*.jpeg`` images.
            sebi_id_to_check: SEBI registration id passed to
                ``_verify_sebi_id``.
            entity_name_to_track: Advisor/entity name used for the bulk-deal
                cross-correlation.
            insider_list: Insider names for ``_check_insider_trading``.
                Defaults to the notebook's example names.
            user_holdings: Portfolio symbols for ``_portfolio_risk_alerts``.
                Defaults to the notebook's example holdings.

        Returns:
            A ``defaultdict(dict)`` report with one key per analysis step and
            a final "summary" entry.
        """
        # Example values kept as defaults so existing callers see no change.
        if insider_list is None:
            insider_list = ["Promoter XYZ", "Director ABC"]
        if user_holdings is None:
            user_holdings = ["XYZ", "ABC"]  # Example

        print("🚀 Starting Full Analysis Pipeline...")
        final_report = defaultdict(dict)

        # 1. Chat-level + market analysis
        print("📈 Analyzing market and chat-level data...")
        final_report["manual_verifications"]["sebi_id_analysis"] = self._verify_sebi_id(sebi_id_to_check)
        bulk_deals_df = self._get_bse_bulk_deals()
        final_report["bulk_deal_analysis"] = {"trades_found_today": len(bulk_deals_df)}
        final_report["cross_correlation_analysis"] = self._cross_correlation_engine(
            chat_file_path, entity_name_to_track, bulk_deals_df
        )
        final_report["chat_analysis"] = self._detect_bot_behavior(chat_file_path)

        # 2. Message-by-message analysis
        print("🔬 Analyzing individual messages and images...")
        all_messages = self._parse_chat_for_messages(chat_file_path, user_filter="")  # all users
        url_pattern = re.compile(r"(https?://\S+)")
        scam_predictions = []
        mentioned_stocks = set()
        message_reports = []
        final_report["message_by_message_analysis"] = message_reports

        for msg_data in all_messages:
            text = msg_data["message"]
            analysis = {
                "text_scam_prediction": self.model.predict(self.vectorizer.transform([text]))[0]
            }
            scam_predictions.append(analysis["text_scam_prediction"])

            # Scan for URLs once and reuse the matches.
            urls = url_pattern.findall(text)
            if urls:
                analysis["url_analysis"] = [self._check_url_risk(url) for url in urls]

            stock_analysis = self._analyze_stock_mentions(text)
            if stock_analysis:
                analysis["stock_mention_analysis"] = stock_analysis
                mentioned_stocks.update(stock_analysis["mentioned_stocks"])

            message_reports.append(
                {
                    "date": msg_data["date"],
                    "user": msg_data["user"],
                    "message": text,
                    "analysis": analysis,
                }
            )

        # 3. Market-manipulation checks built on the per-message results
        final_report["insider_trading_flags"] = self._check_insider_trading(
            bulk_deals_df, insider_list=insider_list, scam_messages=message_reports
        )
        final_report["sentiment_price_flags"] = self._sentiment_price_check(message_reports)
        # Sorted so the report is reproducible from run to run.
        final_report["social_trading_flags"] = self._social_trading_correlation(sorted(mentioned_stocks))
        final_report["portfolio_alerts"] = self._portfolio_risk_alerts(
            user_holdings, message_reports, bulk_deals_df
        )

        # 4. Image analysis
        image_files = sorted(
            glob.glob(os.path.join(image_folder_path, "*.png"))
            + glob.glob(os.path.join(image_folder_path, "*.jpg"))
            + glob.glob(os.path.join(image_folder_path, "*.jpeg"))
        )
        final_report["image_analysis"] = [
            {
                "file_name": os.path.basename(img_path),
                "analysis": self._analyze_image_content(img_path),
            }
            for img_path in image_files
        ]

        # 5. Summary
        final_report["summary"] = {
            "total_messages_analyzed": len(all_messages),
            "total_images_analyzed": len(image_files),
            "scam_types_detected": dict(
                Counter(p for p in scam_predictions if p != "not_scam")
            ),
        }

        print("\n✅ Pipeline finished successfully!")
        return final_report


In [None]:
# Trained artifacts written by the training step.
WORKING_DIR = "/kaggle/working/"
MODEL_PATH = os.path.join(WORKING_DIR, "scam_model.pkl")
VECTORIZER_PATH = os.path.join(WORKING_DIR, "vectorizer.pkl")

# Input data paths
image_folder = "/kaggle/input/sebi-sample-input/dataset/stock_market_Trader_Public_Group/images"
chat_file = "/kaggle/input/sebi-sample-input/dataset/stock_market_Trader_Public_Group/chat/Extracted messages....txt"

# Entity information
sebi_id_input = "INZ000048660"
entity_name_input = "NEO APEX VENTURE LLP"

try:
    # 1. Load the trained model, vectorizer and OCR reader.
    fraud_detector = FraudDetectionPipeline(
        model_path=MODEL_PATH,
        vectorizer_path=VECTORIZER_PATH,
    )

    # 2. Run the analysis only when the chat export is actually present.
    if not os.path.exists(chat_file):
        print(f"❌ ERROR: The chat file was not found at the path: {chat_file}")
    else:
        results = fraud_detector.run(
            chat_file_path=chat_file,
            image_folder_path=image_folder,
            sebi_id_to_check=sebi_id_input,
            entity_name_to_track=entity_name_input,
        )

        # 3. Print the final report.
        print("\n" + "=" * 50)
        print("                 FINAL FRAUD DETECTION REPORT")
        print("=" * 50 + "\n")
        # default=str renders values json cannot serialise natively.
        print(json.dumps(results, indent=4, default=str))

except IOError as load_error:
    # Raised when the model artifacts cannot be loaded.
    print(load_error)
    print("Please run the 'train_scam_classifier()' function first to generate model files.")

In [12]:
import pandas as pd

# Load the NSE equity list and standardize its column names (the raw headers
# may carry stray whitespace).
stock_df = pd.read_csv("/kaggle/input/nse-stocks/EQUITY_L.csv").rename(columns=str.strip)  # Replace with your CSV path

# Lookup structures: upper-case tickers, and lower-case company name -> ticker.
ticker_set = set(stock_df['SYMBOL'].str.upper())
company_name_map = {
    name: symbol
    for name, symbol in zip(
        stock_df['NAME OF COMPANY'].str.lower(),
        stock_df['SYMBOL'].str.upper(),
    )
}

In [14]:
import pandas as pd

# Load the downloaded insider-trading disclosures and standardize the column
# names (strip surrounding whitespace from every header).
insider_df = pd.read_csv(
    "/kaggle/input/insider-trading/CF-Insider-Trading-equities-20-Sep-2025.csv"
).rename(columns=str.strip)

# Parse the trade date; unparseable values become NaT instead of raising.
insider_df['DATE OF ALLOTMENT/ACQUISITION FROM'] = pd.to_datetime(
    insider_df['DATE OF ALLOTMENT/ACQUISITION FROM'],
    errors='coerce',
)

In [27]:
import os
import re
import json
import joblib
import whois
import cv2
import easyocr
import requests
import datetime
import pandas as pd
from io import StringIO
from urllib.parse import urlparse
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import glob
from transformers import pipeline
import yfinance as yf

class FraudDetectionPipeline:
    """
    Multi-modal scam detection pipeline.

    Combines a text scam classifier, URL / QR-code heuristics, image OCR,
    stock-mention detection, insider-trade matching and a sentiment-vs-price
    check. ``run`` ties the chat-level steps together into one report.
    """

    def __init__(self, model_path, vectorizer_path, stock_csv_path):
        """Load the classifier, vectorizer, OCR reader and the stock list.

        Args:
            model_path: joblib file holding the trained scam classifier.
            vectorizer_path: joblib file holding the fitted text vectorizer.
            stock_csv_path: CSV with "SYMBOL" and "NAME OF COMPANY" columns.

        Raises:
            IOError: If the model, vectorizer or OCR reader cannot be loaded.
        """
        print("🚀 Initializing Fraud Detection Pipeline...")
        try:
            # NOTE: joblib.load unpickles the file; only load trusted artifacts.
            self.model = joblib.load(model_path)
            self.vectorizer = joblib.load(vectorizer_path)
            self.ocr_reader = easyocr.Reader(['en'], gpu=False)
            print("✅ Pipeline initialized successfully.")
        except Exception as e:
            # "from e" keeps the original traceback attached for debugging.
            raise IOError(f"❌ Failed to load models. Error: {e}") from e

        # Load stock list
        stock_df = pd.read_csv(stock_csv_path)
        self.ticker_set = set(stock_df["SYMBOL"].str.upper().tolist())
        self.company_name_map = dict(zip(
            stock_df["NAME OF COMPANY"].str.lower(),
            stock_df["SYMBOL"].str.upper()
        ))

    # --- URL Risk ---
    def _check_url_risk(self, url):
        """Score a URL with simple phishing heuristics.

        Returns:
            ``{"url", "risk_level", "risk_score", "reasons"}`` where the level
            is "High" (score >= 60), "Medium" (>= 30) or "Low". A URL that
            cannot be parsed is reported as High / 100.
        """
        score, reasons = 0, []
        try:
            # hostname drops any port or credentials that netloc would keep,
            # so "1.2.3.4:8080" is still recognised as a raw IP address.
            domain = urlparse(url).hostname or ""
            if not url.startswith("https"):
                score += 30
                reasons.append("URL is not secure (HTTP)")
            if re.match(r"^\d{1,3}(\.\d{1,3}){3}$", domain):
                score += 40
                reasons.append("URL uses an IP address")
            if any(domain.endswith(tld) for tld in [".xyz", ".top", ".biz", ".shop"]):
                score += 25
                reasons.append("Suspicious TLD")
            try:
                info = whois.whois(domain)
                cd = info.creation_date
                if cd:
                    cd = cd[0] if isinstance(cd, list) else cd
                    # Use the creation date's own timezone so aware and naive
                    # datetimes are never mixed in the subtraction.
                    age_days = (datetime.datetime.now(cd.tzinfo) - cd).days
                    if age_days < 180:
                        score += 30
                        reasons.append(f"Domain is very new ({age_days} days)")
            except Exception:
                score += 10
                reasons.append("WHOIS lookup failed")
        except Exception:
            return {"url": url, "risk_level": "High", "risk_score": 100, "reasons": ["URL malformed"]}
        level = "High" if score >= 60 else "Medium" if score >= 30 else "Low"
        return {"url": url, "risk_level": level, "risk_score": min(score, 100), "reasons": reasons}

    # --- QR Code ---
    def _check_qr_code(self, image_path):
        """Decode a QR code from an image and risk-score its payload.

        Returns:
            ``{"qr_data", "analysis"}`` when a QR code is decoded, else None
            (also None when the image cannot be read or decoding fails).
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                return None
            qr = cv2.QRCodeDetector()
            data, _, _ = qr.detectAndDecode(img)
            if data:
                return {
                    "qr_data": data,
                    "analysis": self._check_url_risk(data) if data.startswith("http") else {"risk_level":"Low","reasons":["Non-URL text"]}
                }
        except Exception:
            return None
        return None

    # --- Image Content Analysis ---
    def _analyze_image_content(self, image_path):
        """OCR an image, classify the extracted text and check for a QR code.

        Returns:
            Dict with "ocr_text", "text_scam_prediction" ("N/A" when no text
            was found) and "qr_analysis"; "ocr_error" is added when OCR or
            classification raised.
        """
        results = {"ocr_text": "", "text_scam_prediction":"N/A", "qr_analysis": self._check_qr_code(image_path)}
        try:
            text = " ".join(self.ocr_reader.readtext(image_path, detail=0, paragraph=True))
            if text.strip():
                results["ocr_text"] = text
                results["text_scam_prediction"] = self.model.predict(self.vectorizer.transform([text]))[0]
        except Exception as exc:
            # Still best effort, but the failure is recorded, not hidden.
            results["ocr_error"] = f"{type(exc).__name__}: {exc}"
        return results

    # --- SEBI ID ---
    def _verify_sebi_id(self, sebi_id):
        """Dummy SEBI check: ids longer than 10 chars containing "INA" pass."""
        if not sebi_id or not sebi_id.strip():
            return {"sebi_id":"Not Provided","status":"N/A"}
        return {"sebi_id": sebi_id, "status": "Verified (Dummy)" if len(sebi_id)>10 and "INA" in sebi_id else "Not Found (Dummy)"}

    # --- Stock Detection ---
    def detect_stocks_in_text(self, text):
        """Return the symbols whose company name or ticker occurs in ``text``.

        Company names match as lower-case substrings; tickers match whole
        words case-insensitively. The order of the result is unspecified.
        """
        found, text_lower = set(), text.lower()
        for name, symbol in self.company_name_map.items():
            if name in text_lower:
                found.add(symbol)
        for w in re.findall(r'\b\w+\b', text):
            if w.upper() in self.ticker_set:
                found.add(w.upper())
        return list(found)

    def _analyze_stock_mentions(self, text):
        """Return mentioned stocks plus a pump-and-dump keyword note, or None."""
        stocks = self.detect_stocks_in_text(text)
        if not stocks:
            return None
        suspicious = ["Message contains pump-and-dump keywords."] if any(k in text.lower() for k in ["guaranteed","10x","insider"]) else []
        return {"mentioned_stocks": stocks, "suspicious_patterns": suspicious}

    # --- Chat Parsing ---
    def _parse_chat_for_messages(self, chat_file_path, user_filter=""):
        """Parse ``YYYY-MM-DD HH:MM:SS[+HH:MM] - user: text`` lines.

        Args:
            chat_file_path: UTF-8 chat export.
            user_filter: Keep only senders containing this text
                (case-insensitive); "" keeps everyone.

        Returns:
            List of ``{"date", "user", "message"}`` dicts; the date stays a
            string exactly as written in the file.
        """
        msgs, pattern = [], re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(?:[+]\d{2}:\d{2})?) - (.*?): (.*)')
        with open(chat_file_path,"r",encoding="utf-8") as f:
            for line in f:
                m = pattern.match(line.strip())
                if m:
                    ts, user, msg = m.groups()
                    if user_filter=="" or user_filter.lower() in user.lower():
                        msgs.append({"date":ts,"user":user,"message":msg})
        return msgs

    # --- Bot Detection ---
    def _detect_bot_behavior(self, file_path):
        """Count messages per sender; the most active one is the "admin"."""
        sender_re = re.compile(r".*?-\s(.*?):")
        counts = Counter()
        with open(file_path,"r",encoding="utf-8") as f:
            for line in f:
                # Match once per line instead of once to test and once to read.
                hit = sender_re.match(line)
                if hit:
                    counts[hit.group(1).strip()] += 1
        if not counts:
            return {"error":"No users found."}
        return {"detected_admin":counts.most_common(1)[0][0], "message_counts":dict(counts)}

    # --- BSE Bulk Deals ---
    def _get_bse_bulk_deals(self, timeout=30):
        """Download the BSE bulk-deals table; empty DataFrame on any failure.

        Args:
            timeout: Seconds to wait for the HTTP request.
        """
        try:
            url="https://www.bseindia.com/markets/equity/EQReports/bulk_deals.aspx"
            headers={"User-Agent":"Mozilla/5.0"}
            resp=requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()
            for table in pd.read_html(StringIO(resp.text)):
                # Strip headers first so " Client Name " is still found.
                cleaned = [str(col).strip() for col in table.columns]
                if "Client Name" not in cleaned:
                    continue
                table.columns = cleaned
                table["Deal Date"]=pd.to_datetime(table["Deal Date"], format="%d/%m/%Y")
                return table
        except Exception as exc:
            print(f"⚠️ Could not fetch BSE bulk deals: {exc}")
            return pd.DataFrame()
        return pd.DataFrame()

    # --- Insider Trading ---
    def _check_insider_trading(self, scam_messages, insider_df, lookback_days=7):
        """Match recent insider trades against stocks named in messages.

        Args:
            scam_messages: Message dicts with optional
                ``["analysis"]["stock_mention_analysis"]["mentioned_stocks"]``.
            insider_df: Insider disclosures with "SYMBOL" and
                "DATE OF ALLOTMENT/ACQUISITION FROM" columns.
            lookback_days: Only trades dated within this many days of now
                are considered.

        Returns:
            Alert strings (duplicates removed), or a single "no red flags"
            entry.
        """
        date_col = 'DATE OF ALLOTMENT/ACQUISITION FROM'
        if insider_df.empty or 'SYMBOL' not in insider_df.columns or date_col not in insider_df.columns:
            return ["No insider red flags detected."]

        # Loop-invariant work hoisted out of the per-message/per-stock loops.
        symbols = insider_df['SYMBOL'].astype(str).str.upper()
        trade_dates = pd.to_datetime(insider_df[date_col], errors="coerce")
        recent = trade_dates >= pd.Timestamp.now()-pd.Timedelta(days=lookback_days)

        alerts=[]
        for msg in scam_messages:
            stock_info = msg.get("analysis",{}).get("stock_mention_analysis")
            if not stock_info:
                continue
            for stock in stock_info["mentioned_stocks"]:
                relevant = insider_df[(symbols == stock.upper()) & recent]
                for _,trade in relevant.iterrows():
                    alert = f"🚨 Insider Alert: {trade.get('NAME OF THE ACQUIRER/DISPOSER','Unknown')} ({trade.get('CATEGORY OF PERSON','')}) traded {trade.get('SYMBOL','')} ({trade.get('ACQUISITION/DISPOSAL TRANSACTION TYPE','')}) on {pd.to_datetime(trade[date_col]).date()} while scam messages mentioning this stock were circulating."
                    # A stock named in many messages would repeat the alert.
                    if alert not in alerts:
                        alerts.append(alert)
        return alerts if alerts else ["No insider red flags detected."]

    # --- Sentiment vs Price ---
    def _sentiment_price_check(self, scam_messages):
        """Flag messages whose sentiment contradicts the latest price move.

        Prices for all mentioned stocks are downloaded in one batch; the
        sentiment model is only loaded when at least one stock is mentioned.
        """
        all_stocks=set()
        for msg in scam_messages:
            si=msg.get("analysis",{}).get("stock_mention_analysis")
            if si:
                all_stocks.update(si["mentioned_stocks"])
        if not all_stocks:
            return ["No suspicious sentiment/price mismatches."]

        analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
        # This checkpoint reports LABEL_0/1/2 (negative/neutral/positive per
        # its model card) rather than words, so translate before matching.
        label_names = {"label_0": "negative", "label_1": "neutral", "label_2": "positive"}

        tickers=[s+".NS" for s in all_stocks]
        try:
            price_data=yf.download(tickers, period="5d", interval="1d", group_by='ticker', progress=False)
        except Exception as exc:
            print(f"⚠️ Price download failed: {exc}")
            price_data=pd.DataFrame()

        alerts=[]
        for msg in scam_messages:
            si=msg.get("analysis",{}).get("stock_mention_analysis")
            if not si:
                continue
            raw_label = analyzer(msg["message"][:250])[0]["label"].lower()
            sentiment = label_names.get(raw_label, raw_label)
            for stock in si["mentioned_stocks"]:
                try:
                    if stock+".NS" in price_data:
                        df=price_data[stock+".NS"]
                        if not df.empty and len(df["Close"])>1:
                            change = df["Close"].iloc[-1]-df["Close"].iloc[-2]
                            if "positive" in sentiment and change<0:
                                alerts.append(f"🚨 Positive sentiment for {stock} but price fell → suspicious hype.")
                            if "negative" in sentiment and change>0:
                                alerts.append(f"🚨 Negative sentiment for {stock} but price rose → suspicious suppression.")
                except Exception:
                    # Price frame for this ticker is unusable; skip it.
                    continue
        return alerts if alerts else ["No suspicious sentiment/price mismatches."]

    # --- Run Pipeline ---
    def run(self, chat_file_path, image_folder_path, sebi_id_to_check, entity_name_to_track, insider_csv_path=None):
        """Run the chat-level analysis and return the report.

        Args:
            chat_file_path: Path of the exported chat text file.
            image_folder_path: Accepted for interface compatibility; this
                version of ``run`` does not analyse images.
            sebi_id_to_check: SEBI registration id for ``_verify_sebi_id``.
            entity_name_to_track: Accepted for interface compatibility; not
                used by this version of ``run``.
            insider_csv_path: Optional insider-trading CSV; when missing, the
                insider check reports no red flags.

        Returns:
            ``defaultdict(dict)`` with "manual_verifications",
            "bulk_deal_analysis", "message_by_message_analysis",
            "insider_trading_flags" and "sentiment_price_flags".
        """
        print("🚀 Starting Full Analysis Pipeline...")
        final_report = defaultdict(dict)
        final_report["manual_verifications"] = {}

        # Chat & market
        final_report["manual_verifications"]["sebi_id_analysis"]=self._verify_sebi_id(sebi_id_to_check)
        bulk_df=self._get_bse_bulk_deals()
        final_report["bulk_deal_analysis"]={"trades_found_today":len(bulk_df)}

        # Messages
        msgs=self._parse_chat_for_messages(chat_file_path)
        final_report["message_by_message_analysis"]=[]
        for msg in msgs:
            text=msg["message"]
            analysis={"text_scam_prediction": self.model.predict(self.vectorizer.transform([text]))[0]}
            urls=re.findall(r"(https?://\S+)", text)
            if urls:
                analysis["url_analysis"]=[self._check_url_risk(u) for u in urls]
            stock_analysis=self._analyze_stock_mentions(text)
            if stock_analysis:
                analysis["stock_mention_analysis"]=stock_analysis
            final_report["message_by_message_analysis"].append({"date":msg["date"],"user":msg["user"],"message":text,"analysis":analysis})

        # Insider trading
        insider_df=pd.read_csv(insider_csv_path) if insider_csv_path and os.path.exists(insider_csv_path) else pd.DataFrame()
        required_cols=['SYMBOL','DATE OF ALLOTMENT/ACQUISITION FROM','NAME OF THE ACQUIRER/DISPOSER','CATEGORY OF PERSON','ACQUISITION/DISPOSAL TRANSACTION TYPE']
        for col in required_cols:
            if col not in insider_df.columns:
                insider_df[col]=None
        final_report["insider_trading_flags"]=self._check_insider_trading(final_report["message_by_message_analysis"], insider_df)

        # Sentiment
        final_report["sentiment_price_flags"]=self._sentiment_price_check(final_report["message_by_message_analysis"])

        print("\n✅ Pipeline finished successfully!")
        return final_report


In [16]:
import os
import re
import glob
import joblib
import datetime
import pandas as pd
from collections import defaultdict, Counter
from urllib.parse import urlparse
import whois
import cv2
import easyocr
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import yfinance as yf

class FraudDetectionPipeline:
    """
    Enhanced multi-modal fraud detection pipeline with:
    - Chat analysis
    - Stock mention detection
    - Stock prediction verification (optimized)
    - URL and QR risk detection
    - Image OCR analysis
    - Insider trading alerts
    - Sentiment vs price check
    """

    def __init__(self, model_path, vectorizer_path, stock_csv_path):
        print("🚀 Initializing Fraud Detection Pipeline...")
        try:
            self.model = joblib.load(model_path)
            self.vectorizer = joblib.load(vectorizer_path)
            self.ocr_reader = easyocr.Reader(['en'], gpu=False)
            print("✅ Pipeline initialized successfully.")
        except Exception as e:
            raise IOError(f"❌ Failed to load models. Error: {e}")

        # Load stock symbols and company mapping
        stock_df = pd.read_csv(stock_csv_path)
        self.ticker_set = set(stock_df["SYMBOL"].str.upper().tolist())
        self.company_name_map = dict(zip(
            stock_df["NAME OF COMPANY"].str.lower(),
            stock_df["SYMBOL"].str.upper()
        ))

    # ------------------ URL Risk Analysis ------------------
    def _check_url_risk(self, url):
        score, reasons = 0, []
        try:
            domain = urlparse(url).netloc
            if not url.startswith("https"):
                score += 30
                reasons.append("URL is not secure (HTTP)")
            if re.match(r"^\d{1,3}(\.\d{1,3}){3}$", domain):
                score += 40
                reasons.append("URL uses an IP address")
            if any(domain.endswith(tld) for tld in [".xyz", ".top", ".biz", ".shop"]):
                score += 25
                reasons.append("URL uses a suspicious TLD")
            try:
                domain_info = whois.whois(domain)
                creation_date = domain_info.creation_date
                if creation_date:
                    creation_date = creation_date[0] if isinstance(creation_date, list) else creation_date
                    age_days = (datetime.datetime.now() - creation_date).days
                    if age_days < 180:
                        score += 30
                        reasons.append(f"Domain is very new ({age_days} days old)")
            except Exception:
                score += 10
                reasons.append("WHOIS lookup failed")
        except Exception:
            return {"url": url, "risk_level": "High", "risk_score": 100, "reasons": ["URL is malformed"]}
        risk_level = "High" if score >= 60 else "Medium" if score >= 30 else "Low"
        return {"url": url, "risk_level": risk_level, "risk_score": min(score, 100), "reasons": reasons}

    # ------------------ QR Code Analysis ------------------
    def _check_qr_code(self, image_path):
        try:
            img = cv2.imread(image_path)
            if img is None:
                return None
            qr_detector = cv2.QRCodeDetector()
            data, _, _ = qr_detector.detectAndDecode(img)
            if data:
                return {
                    "qr_data": data,
                    "analysis": self._check_url_risk(data) if data.startswith("http") else {"risk_level": "Low", "reasons": ["QR contains non-URL text"]}
                }
        except Exception:
            return None
        return None

    # ------------------ Image Content Analysis ------------------
    def _analyze_image_content(self, image_path):
        results = {"ocr_text": "", "text_scam_prediction": "N/A", "qr_analysis": self._check_qr_code(image_path)}
        try:
            extracted_text = " ".join(self.ocr_reader.readtext(image_path, detail=0, paragraph=True))
            if extracted_text.strip():
                results["ocr_text"] = extracted_text
                vec = self.vectorizer.transform([extracted_text])
                results["text_scam_prediction"] = self.model.predict(vec)[0]
        except Exception:
            pass
        return results

    # ------------------ SEBI ID Verification ------------------
    def _verify_sebi_id(self, sebi_id):
        if not sebi_id or not sebi_id.strip():
            return {"sebi_id": "Not Provided", "status": "N/A"}
        return {"sebi_id": sebi_id, "status": "Verified (Dummy)" if len(sebi_id) > 10 and "INA" in sebi_id else "Not Found (Dummy)"}

    # ------------------ Stock Detection in Text ------------------
    def detect_stocks_in_text(self, text):
        """Return the symbols whose company name or ticker occurs in ``text``.

        Company names from ``self.company_name_map`` match as lower-case
        substrings; tickers from ``self.ticker_set`` match whole words,
        case-insensitively.

        Args:
            text: Message text to scan.

        Returns:
            List of unique upper-case symbols (order unspecified).
        """
        lowered = text.lower()
        symbols = set()

        # Pass 1: company names appearing anywhere in the text.
        symbols.update(
            symbol
            for company, symbol in self.company_name_map.items()
            if company in lowered
        )

        # Pass 2: individual words that are themselves listed tickers.
        symbols.update(
            token.upper()
            for token in re.findall(r'\b\w+\b', text)
            if token.upper() in self.ticker_set
        )
        return list(symbols)

    def _analyze_stock_mentions(self, text):
        detected_stocks = self.detect_stocks_in_text(text)
        if not detected_stocks:
            return None
        suspicious_patterns = []
        for kw in ["guaranteed", "10x", "insider"]:
            if kw.lower() in text.lower():
                suspicious_patterns.append("Message contains pump-and-dump keywords.")
                break
        return {"mentioned_stocks": detected_stocks, "suspicious_patterns": suspicious_patterns}

    # ------------------ Stock Prediction Verification ------------------
    def _verify_stock_predictions_batch(self, messages, horizon_days=3):
        """
        Batch verification for all stocks mentioned in messages.
        """
        # 1. Collect all unique stocks
        all_stocks = set()
        for msg in messages:
            stock_info = msg.get("analysis", {}).get("stock_mention_analysis")
            if stock_info:
                all_stocks.update(stock_info["mentioned_stocks"])

        if not all_stocks:
            return {}

        # 2. Fetch Yahoo Finance data in one go per stock
        yf_data = {}
        for stock in all_stocks:
            try:
                ticker = stock + ".NS"
                min_date = min(pd.to_datetime(msg["date"]) for msg in messages)
                max_date = max(pd.to_datetime(msg["date"]) + pd.Timedelta(days=horizon_days) for msg in messages)
                yf_data[stock] = yf.download(ticker, start=min_date.date(), end=max_date.date(), progress=False)
            except Exception:
                yf_data[stock] = pd.DataFrame()

        # 3. Verify each message prediction
        # 3. Verify each message prediction
        results = {}
        for msg in messages:
            text = msg["message"]
            stock_info = msg.get("analysis", {}).get("stock_mention_analysis")
            if not stock_info:
                continue
        
            verifications = []
            for stock in stock_info["mentioned_stocks"]:
                df = yf_data.get(stock)
                if df is None or df.empty:
                    continue
        
                # Handle multi-index or single ticker DataFrame
                if isinstance(df.columns, pd.MultiIndex):
                    # Multi-ticker DataFrame: pick the 'Close' column for the specific stock
                    try:
                        close_prices = df["Close"][stock + ".NS"]
                    except KeyError:
                        continue
                else:
                    # Single-ticker DataFrame
                    close_prices = df["Close"]
        
                if len(close_prices) > 1:
                    change = close_prices.iloc[-1] - close_prices.iloc[0]
                    actual_direction = "UP" if change > 0 else "DOWN"
                    predicted_up = bool(re.search(r'\b(up|increase|gain|rally|soar|\+)\b', text, re.IGNORECASE))
                    predicted_down = bool(re.search(r'\b(down|fall|drop|loss|-)\b', text, re.IGNORECASE))
                    match = (predicted_up and change > 0) or (predicted_down and change < 0)
        
                    verifications.append({
                        "stock": stock,
                        "predicted_text": text,
                        "actual_change": change,
                        "actual_direction": actual_direction,
                        "prediction_correct": match
                    })
        
            if verifications:
                results[msg["date"]] = verifications
        
        return results


    # ------------------ Chat Parsing ------------------
    def _parse_chat_for_messages(self, chat_file_path, user_filter=""):
        messages = []
        pattern = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(?:[+]\d{2}:\d{2})?) - (.*?): (.*)')
        with open(chat_file_path, "r", encoding="utf-8") as f:
            for line in f:
                match = pattern.match(line.strip())
                if match:
                    timestamp, user, message = match.groups()
                    if user_filter == "" or user_filter.lower() in user.lower():
                        messages.append({"date": timestamp, "user": user, "message": message})
        return messages

    # ------------------ BSE Bulk Deals ------------------
    def _get_bse_bulk_deals(self):
        """
        Download the BSE bulk-deals page and return its deals table.

        Best-effort: any download or parsing failure is reported on stdout and
        an empty DataFrame is returned, so callers can always use ``len()``.

        Returns:
            The first HTML table that has a "Client Name" column (headers
            stripped of surrounding whitespace, "Deal Date" parsed as
            day/month/year when present), or an empty DataFrame.
        """
        url = "https://www.bseindia.com/markets/equity/EQReports/bulk_deals.aspx"
        headers = {"User-Agent": "Mozilla/5.0"}
        try:
            # Without a timeout an unresponsive server would hang the whole pipeline.
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            tables = pd.read_html(StringIO(response.text))
        except Exception as exc:
            print(f"⚠️ Could not fetch BSE bulk deals: {exc}")
            return pd.DataFrame()

        for table in tables:
            # Normalise headers BEFORE looking for the column, so a header such
            # as "Client Name " is still recognised.
            table.columns = [str(col).strip() for col in table.columns]
            if "Client Name" not in table.columns:
                continue
            if "Deal Date" in table.columns:
                # Unparseable dates become NaT instead of discarding the table.
                table["Deal Date"] = pd.to_datetime(
                    table["Deal Date"], format="%d/%m/%Y", errors="coerce"
                )
            return table
        return pd.DataFrame()

    # ------------------ Insider Trading ------------------
    def _check_insider_trading(self, scam_messages, insider_df, lookback_days=7):
        alerts = []
        for msg in scam_messages:
            text = msg['message']
            stock_info = msg.get('analysis', {}).get('stock_mention_analysis')
            if not stock_info:
                continue
            for stock in stock_info['mentioned_stocks']:
                relevant_trades = insider_df[
                    (insider_df['SYMBOL'].str.upper() == stock.upper()) &
                    (pd.to_datetime(insider_df['DATE OF ALLOTMENT/ACQUISITION FROM']) >= pd.Timestamp.now() - pd.Timedelta(days=lookback_days))
                ]
                for _, trade in relevant_trades.iterrows():
                    alerts.append(
                        f"🚨 Insider Alert: {trade['NAME OF THE ACQUIRER/DISPOSER']} "
                        f"({trade['CATEGORY OF PERSON']}) traded {trade['SYMBOL']} "
                        f"({trade['ACQUISITION/DISPOSAL TRANSACTION TYPE']}) on "
                        f"{pd.to_datetime(trade['DATE OF ALLOTMENT/ACQUISITION FROM']).date()} "
                        f"while scam messages mentioning this stock were circulating."
                    )
        return alerts if alerts else ["No insider red flags detected."]

    # ------------------ Sentiment vs Price ------------------
    def _sentiment_price_check(self, scam_messages):
        sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
        alerts = []
        all_stocks = set()
        for msg in scam_messages:
            stock_info = msg.get("analysis", {}).get("stock_mention_analysis")
            if stock_info:
                all_stocks.update(stock_info["mentioned_stocks"])
        if not all_stocks:
            return ["No suspicious sentiment/price mismatches."]
        tickers = [s + ".NS" for s in all_stocks]
        try:
            price_data = yf.download(tickers, period="5d", interval="1d", group_by='ticker', progress=False)
        except Exception:
            price_data = pd.DataFrame()
        for msg in scam_messages:
            text = msg["message"]
            stock_info = msg.get("analysis", {}).get("stock_mention_analysis")
            if not stock_info:
                continue
            sentiment = sentiment_analyzer(text[:250])[0]
            label = sentiment["label"].lower()
            for stock in stock_info["mentioned_stocks"]:
                try:
                    if stock + ".NS" in price_data:
                        df = price_data[stock + ".NS"]
                        if not df.empty and len(df["Close"]) > 1:
                            change = df["Close"].iloc[-1] - df["Close"].iloc[-2]
                            if "positive" in label and change < 0:
                                alerts.append(f"🚨 Positive sentiment for {stock} but price fell → suspicious hype.")
                            if "negative" in label and change > 0:
                                alerts.append(f"🚨 Negative sentiment for {stock} but price rose → suspicious suppression.")
                except Exception:
                    continue
        return alerts if alerts else ["No suspicious sentiment/price mismatches."]

    # ------------------ Full Pipeline ------------------
    def run(self, chat_file_path, image_folder_path, sebi_id_to_check, insider_csv_path=None):
        """
        Run every analysis stage and collect the results in one report.

        Args:
            chat_file_path: Chat export to analyse.
            image_folder_path: Folder whose files are passed to the image analysis.
            sebi_id_to_check: SEBI registration id to verify.
            insider_csv_path: Optional CSV of insider-trading disclosures; when
                missing, the insider check runs on an empty table.

        Returns:
            Dict-like report with the keys "manual_verifications",
            "bulk_deal_analysis", "message_by_message_analysis",
            "insider_trading_flags", "sentiment_price_flags" and "image_analysis".
        """
        print("🚀 Starting Full Analysis Pipeline...")
        final_report = defaultdict(dict)

        # SEBI verification
        final_report["manual_verifications"] = {
            "sebi_id_analysis": self._verify_sebi_id(sebi_id_to_check)
        }

        # Bulk deals
        bulk_deals_df = self._get_bse_bulk_deals()
        final_report["bulk_deal_analysis"] = {"trades_found_today": len(bulk_deals_df)}

        # Messages
        analyzed_messages = []
        for msg_data in self._parse_chat_for_messages(chat_file_path):
            text = msg_data["message"]
            # Cast to str: the classifier returns a numpy scalar, which json.dumps
            # cannot serialise.
            prediction = self.model.predict(self.vectorizer.transform([text]))[0]
            analysis = {"text_scam_prediction": str(prediction)}

            # URL analysis
            urls = re.findall(r"(https?://\S+)", text)
            if urls:
                analysis["url_analysis"] = [self._check_url_risk(url) for url in urls]

            # Stock mentions
            stock_analysis = self._analyze_stock_mentions(text)
            if stock_analysis:
                analysis["stock_mention_analysis"] = stock_analysis

            analyzed_messages.append({
                "date": msg_data["date"],
                "user": msg_data["user"],
                "message": text,
                "analysis": analysis
            })
        final_report["message_by_message_analysis"] = analyzed_messages

        # Stock prediction verification (batched). Results are keyed by the
        # timestamp only, so keep just the entries produced from this message's
        # own text; otherwise messages sent in the same second inherit each
        # other's verifications.
        predictions = self._verify_stock_predictions_batch(analyzed_messages)
        for msg in analyzed_messages:
            msg["analysis"]["stock_prediction_verification"] = [
                verification
                for verification in predictions.get(msg["date"], [])
                if verification.get("predicted_text", msg["message"]) == msg["message"]
            ]

        # Insider trading
        if insider_csv_path and os.path.exists(insider_csv_path):
            insider_df = pd.read_csv(insider_csv_path)
        else:
            insider_df = pd.DataFrame()
        required_cols = ['SYMBOL', 'DATE OF ALLOTMENT/ACQUISITION FROM', 'NAME OF THE ACQUIRER/DISPOSER',
                         'CATEGORY OF PERSON', 'ACQUISITION/DISPOSAL TRANSACTION TYPE']
        for col in required_cols:
            if col not in insider_df.columns:
                insider_df[col] = None
        final_report["insider_trading_flags"] = self._check_insider_trading(analyzed_messages, insider_df)

        # Sentiment vs price
        final_report["sentiment_price_flags"] = self._sentiment_price_check(analyzed_messages)

        # Image OCR + QR analysis: regular files only, in a stable order.
        image_files = sorted(
            path for path in glob.glob(os.path.join(image_folder_path, "*"))
            if os.path.isfile(path)
        )
        final_report["image_analysis"] = [
            {"file_name": os.path.basename(img_path), "analysis": self._analyze_image_content(img_path)}
            for img_path in image_files
        ]

        print("✅ Pipeline finished successfully!")
        return final_report


In [None]:
import os
from collections import Counter

# Trained artifacts live in the Kaggle working directory.
WORKING_DIR = "/kaggle/working/"
MODEL_PATH = os.path.join(WORKING_DIR, "scam_model.pkl")
VECTORIZER_PATH = os.path.join(WORKING_DIR, "vectorizer.pkl")

# NSE equity master list used to recognise tickers and company names.
STOCK_PATH = "/kaggle/input/nse-stocks/EQUITY_L.csv"

# Sample chat group to analyse: exported chat text plus its shared images.
_GROUP_DIR = "/kaggle/input/sebi-sample-input/dataset/stock_market_Trader_Public_Group"
chat_file = f"{_GROUP_DIR}/chat/Extracted messages....txt"
image_folder = f"{_GROUP_DIR}/images"

# Entity under review.
sebi_id_input, entity_name_input = "INZ000048660", "NEO APEX VENTURE LLP"


def print_pipeline_report(report, max_messages=5):
    print("\n" + "="*70)
    print("🚨 FRAUD DETECTION & STOCK VERIFICATION PIPELINE REPORT")
    print("="*70 + "\n")

    # --- SEBI ID Verification ---
    sebi_report = report.get("manual_verifications", {}).get("sebi_id_analysis", {})
    print("🔹 SEBI ID Verification:")
    print(f"   SEBI ID: {sebi_report.get('sebi_id', 'N/A')}")
    print(f"   Status: {sebi_report.get('status', 'N/A')}\n")

    # --- Bulk Deals ---
    bulk_deals_count = report.get("bulk_deal_analysis", {}).get("trades_found_today", 0)
    print(f"🔹 BSE/NSE Bulk Deals Found Today: {bulk_deals_count}\n")

    # --- Message by Message Analysis ---
    print("🔹 Chat Messages Analysis:")
    messages = report.get("message_by_message_analysis", [])
    for msg in messages[:max_messages]:
        print(f"  [{msg['date']}] {msg['user']}: {msg['message']}")
        analysis = msg.get("analysis", {})

        # Scam Prediction
        print(f"     - Scam Prediction: {analysis.get('text_scam_prediction', 'N/A')}")

        # URL analysis
        if "url_analysis" in analysis:
            for url_res in analysis["url_analysis"]:
                print(f"     - URL Risk: {url_res['url']} -> {url_res['risk_level']} ({url_res['risk_score']}%)")
                if url_res.get("reasons"):
                    print(f"       Reasons: {', '.join(url_res['reasons'])}")

        # Stock mentions
        if "stock_mention_analysis" in analysis:
            stocks = analysis["stock_mention_analysis"].get("mentioned_stocks", [])
            print(f"     - Mentioned Stocks: {', '.join(stocks) if stocks else 'None'}")
            if analysis["stock_mention_analysis"].get("suspicious_patterns"):
                print(f"       Suspicious Patterns: {', '.join(analysis['stock_mention_analysis']['suspicious_patterns'])}")

        # Stock prediction verification
        if "stock_prediction_verification" in analysis and analysis["stock_prediction_verification"]:
            print(f"     - Stock Prediction Verification:")
            for ver in analysis["stock_prediction_verification"]:
                # Extract predicted direction from text
                predicted_up = bool(re.search(r'\b(up|increase|gain|rally|soar|\+)\b', ver['predicted_text'], re.IGNORECASE))
                predicted_down = bool(re.search(r'\b(down|fall|drop|loss|-)\b', ver['predicted_text'], re.IGNORECASE))
                predicted_direction = "UP" if predicted_up else "DOWN" if predicted_down else "N/A"
        
                print(f"       * {ver['stock']}: Predicted: {predicted_direction}, "
                      f"Actual: {ver['actual_direction']} ({ver['actual_change']:.2f}) -> Correct: {ver['prediction_correct']}")
        print("")

    if len(messages) > max_messages:
        print(f"  ...and {len(messages) - max_messages} more messages.\n")

    # --- Insider Trading Flags ---
    print("🔹 Insider Trading Alerts:")
    insider_alerts = report.get("insider_trading_flags", [])
    if insider_alerts:
        for alert in insider_alerts:
            print(f"   - {alert}")
    else:
        print("   - None")
    print("")

    # --- Sentiment vs Price Flags ---
    print("🔹 Sentiment vs Price Mismatches:")
    sentiment_alerts = report.get("sentiment_price_flags", [])
    if sentiment_alerts:
        for alert in sentiment_alerts:
            print(f"   - {alert}")
    else:
        print("   - None")
    print("")

    # --- Image Analysis ---
    print("🔹 Image OCR & QR Analysis:")
    for img in report.get("image_analysis", []):
        print(f"   - {img['file_name']}:")
        print(f"       OCR Text Length: {len(img['analysis'].get('ocr_text', ''))}")
        print(f"       Text Scam Prediction: {img['analysis'].get('text_scam_prediction', 'N/A')}")
        qr = img['analysis'].get("qr_analysis")
        if qr:
            print(f"       QR Risk: {qr.get('risk_level', 'N/A')}")
            if qr.get("reasons"):
                print(f"       QR Reasons: {', '.join(qr['reasons'])}")
        print("")

    # --- Summary ---
    print("="*70)
    print("✅ End of Report")
    print("="*70 + "\n")


# Main execution
# Main execution: build the pipeline, analyse the sample group, print the report.
try:
    fraud_detector = FraudDetectionPipeline(
        model_path=MODEL_PATH,
        vectorizer_path=VECTORIZER_PATH,
        stock_csv_path=STOCK_PATH,
    )

    chat_file_missing = not os.path.exists(chat_file)
    if chat_file_missing:
        print(f"❌ ERROR: The chat file was not found at the path: {chat_file}")
    else:
        results = fraud_detector.run(
            chat_file_path=chat_file,
            image_folder_path=image_folder,
            sebi_id_to_check=sebi_id_input,
        )
        print_pipeline_report(results, max_messages=10)

except IOError as e:
    # The pipeline constructor raises IOError when the model files cannot be loaded.
    print(e)
    print("Please run the 'train_scam_classifier()' function first to generate model files.")


In [15]:
import os
import re
import glob
import joblib
import datetime
import pandas as pd
from io import StringIO
from collections import defaultdict, Counter
from urllib.parse import urlparse
import whois
import cv2
import easyocr
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import yfinance as yf

class FraudDetectionPipeline:
    """
    Multi-modal fraud detection pipeline for stock-tip chat groups.

    Stages:
    - Chat parsing and per-message scam classification
    - Stock mention detection
    - Stock prediction verification against downloaded prices
    - URL and QR risk detection
    - Image OCR analysis
    - Insider trading alerts
    - Sentiment vs price check
    - User prediction accuracy evaluation
    """

    # Directional cues used to read a price prediction out of free text. A sign
    # only counts when it directly prefixes a number and is not glued to a
    # preceding word or number ("+5%" / "-5%" count; "long-term", "100-120" do not).
    _UP_PATTERN = re.compile(r'\b(?:up|increase|gain|rally|soar)\b|(?<![\w.])\+\d', re.IGNORECASE)
    _DOWN_PATTERN = re.compile(r'\b(?:down|fall|drop|loss)\b|(?<![\w.])-\d', re.IGNORECASE)

    # Chat export: "YYYY-MM-DD HH:MM:SS[+HH:MM|-HH:MM] - user: text".
    _CHAT_HEADER = re.compile(
        r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(?:[+-]\d{2}:\d{2})?) - (.*?): (.*)'
    )
    _CHAT_TIMESTAMP = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')

    _SUSPICIOUS_TLDS = (".xyz", ".top", ".biz", ".shop")
    _PUMP_KEYWORDS = ("guaranteed", "10x", "insider")

    # cardiffnlp/twitter-roberta-base-sentiment reports generic label names;
    # per its model card 0 = negative, 1 = neutral, 2 = positive.
    _SENTIMENT_LABEL_ALIASES = {"label_0": "negative", "label_1": "neutral", "label_2": "positive"}

    _HTTP_TIMEOUT_SECONDS = 15

    def __init__(self, model_path, vectorizer_path, stock_csv_path):
        """
        Load the classifier, vectorizer, OCR reader and the stock master list.

        Args:
            model_path: Path to the joblib-serialised scam classifier.
            vectorizer_path: Path to the joblib-serialised text vectorizer.
            stock_csv_path: CSV with "SYMBOL" and "NAME OF COMPANY" columns.

        Raises:
            IOError: If the model, vectorizer or OCR reader cannot be loaded.
        """
        print("🚀 Initializing Fraud Detection Pipeline...")
        try:
            # NOTE: joblib.load unpickles its input - only load trusted files.
            self.model = joblib.load(model_path)
            self.vectorizer = joblib.load(vectorizer_path)
            self.ocr_reader = easyocr.Reader(['en'], gpu=False)
            print("✅ Pipeline initialized successfully.")
        except Exception as e:
            raise IOError(f"❌ Failed to load models. Error: {e}") from e

        # Load stock symbols and company mapping. Rows with a missing symbol or
        # name are dropped: a NaN key would crash the substring search in
        # detect_stocks_in_text.
        stock_df = pd.read_csv(stock_csv_path).dropna(subset=["SYMBOL", "NAME OF COMPANY"])
        symbols = stock_df["SYMBOL"].astype(str).str.strip().str.upper()
        names = stock_df["NAME OF COMPANY"].astype(str).str.strip().str.lower()
        self.ticker_set = set(symbols)
        self.company_name_map = dict(zip(names, symbols))

    # ------------------ Shared helpers ------------------
    @staticmethod
    def _collect_mentioned_stocks(messages):
        """Return the set of all stock symbols mentioned across analysed messages."""
        mentioned = set()
        for msg in messages:
            stock_info = msg.get("analysis", {}).get("stock_mention_analysis")
            if stock_info:
                mentioned.update(stock_info["mentioned_stocks"])
        return mentioned

    @classmethod
    def _predicted_direction(cls, text):
        """Return "UP", "DOWN" or None for the direction predicted in `text` (UP wins ties)."""
        if cls._UP_PATTERN.search(text):
            return "UP"
        if cls._DOWN_PATTERN.search(text):
            return "DOWN"
        return None

    @staticmethod
    def _close_series(price_frame, ticker=None):
        """
        Extract the non-NaN closing prices from a downloaded price frame.

        Handles both flat columns and MultiIndex columns. When `ticker` is
        given and the columns are a MultiIndex, every level is searched for
        it, because the level holding the ticker depends on how the frame
        was requested.

        Returns:
            Series of closes with a timezone-naive index, or None when no
            close data is available.
        """
        if price_frame is None or price_frame.empty:
            return None
        frame = price_frame
        columns = frame.columns
        if isinstance(columns, pd.MultiIndex) and ticker is not None:
            for level in range(columns.nlevels):
                if ticker in columns.get_level_values(level):
                    frame = frame.xs(ticker, level=level, axis=1)
                    break
            else:
                return None
        if "Close" not in frame.columns:
            return None
        closes = frame["Close"]
        if isinstance(closes, pd.DataFrame):
            if closes.shape[1] == 0:
                return None
            closes = closes.iloc[:, 0]
        closes = closes.dropna()
        if closes.empty:
            return None
        if getattr(closes.index, "tz", None) is not None:
            # Message timestamps are compared as naive values.
            closes = closes.copy()
            closes.index = closes.index.tz_localize(None)
        return closes

    # ------------------ URL Risk Analysis ------------------
    def _check_url_risk(self, url):
        """
        Score a URL with simple heuristics (scheme, IP host, TLD, domain age).

        Returns:
            Dict with "url", "risk_level" ("Low"/"Medium"/"High"),
            "risk_score" (0-100) and "reasons".
        """
        score, reasons = 0, []
        try:
            parsed = urlparse(url)
            # hostname excludes credentials and port, so "host.xyz:8080" is
            # still recognised as a .xyz domain.
            host = parsed.hostname or ""
            if parsed.scheme.lower() != "https":
                score += 30
                reasons.append("URL is not secure (HTTP)")
            if re.match(r"^\d{1,3}(\.\d{1,3}){3}$", host):
                score += 40
                reasons.append("URL uses an IP address")
            if host.endswith(self._SUSPICIOUS_TLDS):
                score += 25
                reasons.append("URL uses a suspicious TLD")
            try:
                creation_date = whois.whois(host).creation_date
                if isinstance(creation_date, list):
                    creation_date = creation_date[0] if creation_date else None
                if creation_date:
                    # Use a "now" of matching awareness; subtracting a tz-aware
                    # date from a naive one raises and would be misreported as
                    # a failed lookup.
                    now = datetime.datetime.now(creation_date.tzinfo)
                    age_days = (now - creation_date).days
                    if age_days < 180:
                        score += 30
                        reasons.append(f"Domain is very new ({age_days} days old)")
            except Exception:
                score += 10
                reasons.append("WHOIS lookup failed")
        except (ValueError, TypeError, AttributeError):
            return {"url": url, "risk_level": "High", "risk_score": 100, "reasons": ["URL is malformed"]}
        risk_level = "High" if score >= 60 else "Medium" if score >= 30 else "Low"
        return {"url": url, "risk_level": risk_level, "risk_score": min(score, 100), "reasons": reasons}

    # ------------------ QR Code Analysis ------------------
    def _check_qr_code(self, image_path):
        """
        Decode a QR code from an image and risk-score its payload.

        Returns:
            ``{"qr_data", "analysis"}`` when a QR code is decoded, otherwise
            None (also when the image cannot be read).
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                return None
            data, _, _ = cv2.QRCodeDetector().detectAndDecode(img)
        except Exception:
            return None
        if not data:
            return None
        if data.startswith("http"):
            analysis = self._check_url_risk(data)
        else:
            analysis = {"risk_level": "Low", "reasons": ["QR contains non-URL text"]}
        return {"qr_data": data, "analysis": analysis}

    # ------------------ Image Content Analysis ------------------
    def _analyze_image_content(self, image_path):
        """
        OCR an image, classify the extracted text and analyse any QR code.

        Returns:
            Dict with "ocr_text", "text_scam_prediction" and "qr_analysis";
            "ocr_error" is added when OCR or classification failed.
        """
        results = {"ocr_text": "", "text_scam_prediction": "N/A", "qr_analysis": self._check_qr_code(image_path)}
        try:
            extracted_text = " ".join(self.ocr_reader.readtext(image_path, detail=0, paragraph=True))
            if extracted_text.strip():
                results["ocr_text"] = extracted_text
                prediction = self.model.predict(self.vectorizer.transform([extracted_text]))[0]
                # Cast to str so the report stays JSON-serialisable.
                results["text_scam_prediction"] = str(prediction)
        except Exception as exc:
            # Best-effort: keep the pipeline running but record why OCR failed.
            results["ocr_error"] = str(exc)
        return results

    # ------------------ SEBI ID Verification ------------------
    def _verify_sebi_id(self, sebi_id):
        """Placeholder SEBI check: ids longer than 10 chars containing "INA" pass."""
        if not sebi_id or not sebi_id.strip():
            return {"sebi_id": "Not Provided", "status": "N/A"}
        looks_valid = len(sebi_id) > 10 and "INA" in sebi_id
        return {"sebi_id": sebi_id, "status": "Verified (Dummy)" if looks_valid else "Not Found (Dummy)"}

    # ------------------ Stock Detection in Text ------------------
    def detect_stocks_in_text(self, text):
        """
        Find stock symbols referenced in `text`.

        A symbol is detected when its company name occurs in the lower-cased
        text, or when a word of the text equals a known ticker
        (case-insensitive).

        Returns:
            Sorted list of unique symbols (sorted so results are reproducible).
        """
        text_lower = text.lower()
        found_stocks = {
            symbol for name, symbol in self.company_name_map.items() if name in text_lower
        }
        for word in re.findall(r'\b\w+\b', text):
            candidate = word.upper()
            if candidate in self.ticker_set:
                found_stocks.add(candidate)
        return sorted(found_stocks)

    def _analyze_stock_mentions(self, text):
        """
        Return the stocks mentioned in `text` plus pump-and-dump keyword flags.

        Returns:
            ``{"mentioned_stocks", "suspicious_patterns"}`` or None when no
            stock is mentioned.
        """
        detected_stocks = self.detect_stocks_in_text(text)
        if not detected_stocks:
            return None
        text_lower = text.lower()
        suspicious_patterns = []
        if any(keyword in text_lower for keyword in self._PUMP_KEYWORDS):
            suspicious_patterns.append("Message contains pump-and-dump keywords.")
        return {"mentioned_stocks": detected_stocks, "suspicious_patterns": suspicious_patterns}

    # ------------------ Stock Prediction Verification ------------------
    def _verify_stock_predictions_batch(self, messages, horizon_days=3):
        """
        Compare directional predictions in messages with the realised price move.

        For every message that mentions stocks, the close on the first trading
        bar on/after the message's calendar day is compared with the last close
        within `horizon_days` calendar days.

        Args:
            messages: Analysed message dicts ("date", "message", "analysis").
            horizon_days: Length of the evaluation window in calendar days.

        Returns:
            Dict mapping message timestamp -> list of verification dicts with
            "stock", "predicted_text", "predicted_direction" ("UP"/"DOWN"/None),
            "actual_change", "actual_direction" ("UP"/"DOWN"/"FLAT") and
            "prediction_correct". Messages sharing a timestamp share one list;
            use "predicted_text" to tell their entries apart.
        """
        mentioned = self._collect_mentioned_stocks(messages)
        if not mentioned:
            return {}

        horizon = pd.Timedelta(days=horizon_days)
        try:
            # Strip timezones before comparing: mixing aware and naive
            # timestamps raises. Computed once - it does not depend on the stock.
            message_days = [pd.to_datetime(msg["date"]).tz_localize(None).normalize() for msg in messages]
        except (ValueError, TypeError):
            return {}
        window_start = min(message_days).date()
        # yfinance treats `end` as exclusive, so add a day to keep the last bar.
        window_end = (max(message_days) + horizon + pd.Timedelta(days=1)).date()

        closes_by_stock = {}
        for stock in sorted(mentioned):
            try:
                frame = yf.download(stock + ".NS", start=window_start, end=window_end, progress=False)
                closes_by_stock[stock] = self._close_series(frame)
            except Exception:
                closes_by_stock[stock] = None

        results = {}
        for msg in messages:
            stock_info = msg.get("analysis", {}).get("stock_mention_analysis")
            if not stock_info:
                continue
            text = msg["message"]
            msg_date = msg["date"]

            # Daily bars are stamped at midnight; start from the message's
            # calendar day so that day's bar is part of the window.
            start_date = pd.to_datetime(msg_date).tz_localize(None).normalize()
            end_date = start_date + horizon

            predicted_up = bool(self._UP_PATTERN.search(text))
            predicted_down = bool(self._DOWN_PATTERN.search(text))
            predicted_direction = "UP" if predicted_up else "DOWN" if predicted_down else None

            verifications = []
            for stock in stock_info["mentioned_stocks"]:
                closes = closes_by_stock.get(stock)
                if closes is None:
                    continue
                window = closes[(closes.index >= start_date) & (closes.index <= end_date)]
                if len(window) < 2:
                    continue

                change = float(window.iloc[-1] - window.iloc[0])
                went_up, went_down = change > 0, change < 0
                match = (predicted_up and went_up) or (predicted_down and went_down)

                verifications.append({
                    "stock": stock,
                    "predicted_text": text,
                    "predicted_direction": predicted_direction,
                    "actual_change": change,
                    "actual_direction": "UP" if went_up else "DOWN" if went_down else "FLAT",
                    # Plain Python bool keeps the report JSON-serialisable.
                    "prediction_correct": bool(match)
                })
            if verifications:
                # Extend instead of overwrite: messages sent in the same second
                # must not replace each other's results.
                results.setdefault(msg_date, []).extend(verifications)
        return results

    # ------------------ Chat Parsing ------------------
    def _parse_chat_for_messages(self, chat_file_path, user_filter=""):
        """
        Parse an exported chat log into a list of message dicts.

        Lines that follow a message header and do not begin with a timestamp
        are appended to that message (joined by a newline). Timestamped lines
        without a ``<user>: `` part are skipped.

        Args:
            chat_file_path: Path to the UTF-8 chat export (a leading BOM is tolerated).
            user_filter: Case-insensitive substring a user name must contain;
                the empty string keeps every user.

        Returns:
            List of ``{"date", "user", "message"}`` dicts in file order.
        """
        wanted_user = user_filter.lower()
        messages = []
        current = None  # message that continuation lines attach to (None = drop them)
        with open(chat_file_path, "r", encoding="utf-8-sig") as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                match = self._CHAT_HEADER.match(line)
                if match:
                    timestamp, user, message = match.groups()
                    if wanted_user in user.lower():
                        current = {"date": timestamp, "user": user, "message": message}
                        messages.append(current)
                    else:
                        current = None
                elif self._CHAT_TIMESTAMP.match(line):
                    current = None
                elif current is not None:
                    current["message"] += "\n" + line
        return messages

    # ------------------ BSE Bulk Deals ------------------
    def _get_bse_bulk_deals(self):
        """
        Download the BSE bulk-deals page and return its deals table.

        Best-effort: failures are reported on stdout and yield an empty DataFrame.

        Returns:
            The first HTML table with a "Client Name" column (headers stripped,
            "Deal Date" parsed as day/month/year when present), or an empty
            DataFrame.
        """
        url = "https://www.bseindia.com/markets/equity/EQReports/bulk_deals.aspx"
        headers = {"User-Agent": "Mozilla/5.0"}
        try:
            # Without a timeout an unresponsive server would hang the pipeline.
            response = requests.get(url, headers=headers, timeout=self._HTTP_TIMEOUT_SECONDS)
            response.raise_for_status()
            tables = pd.read_html(StringIO(response.text))
        except Exception as exc:
            print(f"⚠️ Could not fetch BSE bulk deals: {exc}")
            return pd.DataFrame()

        for table in tables:
            # Normalise headers before looking for the column.
            table.columns = [str(col).strip() for col in table.columns]
            if "Client Name" not in table.columns:
                continue
            if "Deal Date" in table.columns:
                table["Deal Date"] = pd.to_datetime(
                    table["Deal Date"], format="%d/%m/%Y", errors="coerce"
                )
            return table
        return pd.DataFrame()

    # ------------------ Insider Trading ------------------
    def _check_insider_trading(self, scam_messages, insider_df, lookback_days=7):
        """
        Flag recent insider trades in stocks that the analysed messages mention.

        Args:
            scam_messages: Analysed message dicts.
            insider_df: Insider-trading disclosures (see `run` for the
                required columns).
            lookback_days: Only trades dated within this many days before now
                are reported.

        Returns:
            List of distinct alert strings in first-seen order, or
            ``["No insider red flags detected."]``.
        """
        no_flags = ["No insider red flags detected."]
        if insider_df is None or insider_df.empty:
            return no_flags

        # Loop-invariant: parse once. Unparseable dates become NaT and never match.
        symbols = insider_df['SYMBOL'].str.upper()
        trade_dates = pd.to_datetime(
            insider_df['DATE OF ALLOTMENT/ACQUISITION FROM'], errors="coerce"
        )
        is_recent = trade_dates >= pd.Timestamp.now() - pd.Timedelta(days=lookback_days)

        alerts = []
        seen = set()
        for msg in scam_messages:
            stock_info = msg.get('analysis', {}).get('stock_mention_analysis')
            if not stock_info:
                continue
            for stock in stock_info['mentioned_stocks']:
                relevant_trades = insider_df[(symbols == stock.upper()) & is_recent]
                for _, trade in relevant_trades.iterrows():
                    alert = (
                        f"🚨 Insider Alert: {trade['NAME OF THE ACQUIRER/DISPOSER']} "
                        f"({trade['CATEGORY OF PERSON']}) traded {trade['SYMBOL']} "
                        f"({trade['ACQUISITION/DISPOSAL TRANSACTION TYPE']}) on "
                        f"{pd.to_datetime(trade['DATE OF ALLOTMENT/ACQUISITION FROM']).date()} "
                        f"while scam messages mentioning this stock were circulating."
                    )
                    if alert not in seen:
                        seen.add(alert)
                        alerts.append(alert)
        return alerts if alerts else no_flags

    # ------------------ Sentiment vs Price ------------------
    def _sentiment_price_check(self, scam_messages):
        """
        Flag messages whose sentiment contradicts the latest daily price move.

        Returns:
            List of alert strings;
            ``["No suspicious sentiment/price mismatches."]`` when there are
            none; ``["Could not fetch price data for sentiment check."]`` when
            the price download raised.
        """
        no_mismatch = ["No suspicious sentiment/price mismatches."]

        mentioned = self._collect_mentioned_stocks(scam_messages)
        if not mentioned:
            # Nothing to compare: return before paying for a model load.
            return no_mismatch

        tickers = [s + ".NS" for s in sorted(mentioned)]
        try:
            price_data = yf.download(tickers, period="5d", interval="1d", group_by='ticker', progress=False)
        except Exception:
            return ["Could not fetch price data for sentiment check."]
        if price_data is None or price_data.empty:
            return no_mismatch

        # A flat (non-MultiIndex) frame can only belong to a single requested ticker.
        flat_columns = not isinstance(price_data.columns, pd.MultiIndex)
        changes = {}
        for stock in mentioned:
            changes[stock] = None
            if flat_columns and len(tickers) != 1:
                continue
            try:
                closes = self._close_series(price_data, stock + ".NS")
            except (KeyError, IndexError, TypeError, ValueError):
                continue
            if closes is not None and len(closes) > 1:
                changes[stock] = float(closes.iloc[-1] - closes.iloc[-2])
        if all(change is None for change in changes.values()):
            return no_mismatch

        sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")

        alerts = []
        for msg in scam_messages:
            stock_info = msg.get("analysis", {}).get("stock_mention_analysis")
            if not stock_info:
                continue
            raw_label = sentiment_analyzer(msg["message"][:250])[0]["label"].lower()
            label = self._SENTIMENT_LABEL_ALIASES.get(raw_label, raw_label)
            for stock in stock_info["mentioned_stocks"]:
                change = changes.get(stock)
                if change is None:
                    continue
                if "positive" in label and change < 0:
                    alerts.append(f"🚨 Positive sentiment for {stock} but price fell → suspicious hype.")
                if "negative" in label and change > 0:
                    alerts.append(f"🚨 Negative sentiment for {stock} but price rose → suspicious suppression.")
        return alerts if alerts else no_mismatch

    # ------------------ User Prediction Accuracy Evaluation ------------------
    def _evaluate_user_prediction_accuracy(self, analyzed_messages, min_predictions=3, accuracy_threshold=40.0):
        """
        Compute each user's prediction accuracy and flag potential scammers.

        Verifications that explicitly carry ``"predicted_direction": None``
        (the message named a stock but predicted no direction) are ignored:
        counting them as wrong would penalise users for merely mentioning a
        stock.

        Args:
            analyzed_messages: Messages carrying "stock_prediction_verification".
            min_predictions: Minimum number of predictions before a user can be flagged.
            accuracy_threshold: Users below this accuracy (percent) are flagged.

        Returns:
            Dict mapping user -> accuracy statistics and flag.
        """
        print("📊 Evaluating user prediction accuracy...")
        user_stats = defaultdict(lambda: {'correct': 0, 'incorrect': 0})

        for msg in analyzed_messages:
            verifications = msg.get("analysis", {}).get("stock_prediction_verification", [])
            for verification in verifications:
                if "predicted_direction" in verification and verification["predicted_direction"] is None:
                    continue
                outcome = 'correct' if verification["prediction_correct"] else 'incorrect'
                user_stats[msg["user"]][outcome] += 1

        accuracy_report = {}
        for user, stats in user_stats.items():
            total_predictions = stats['correct'] + stats['incorrect']
            if total_predictions == 0:
                continue
            accuracy = (stats['correct'] / total_predictions) * 100
            is_potential_scammer = (total_predictions >= min_predictions and accuracy < accuracy_threshold)
            accuracy_report[user] = {
                "correct_predictions": stats['correct'],
                "incorrect_predictions": stats['incorrect'],
                "total_predictions": total_predictions,
                "accuracy_percent": round(accuracy, 2),
                "is_potential_scammer": is_potential_scammer,
                "reason": f"Flagged due to low accuracy ({accuracy:.2f}%) across {total_predictions} predictions." if is_potential_scammer else "Not flagged."
            }
        print("✅ User accuracy evaluation complete.")
        return accuracy_report

    # ------------------ Full Pipeline ------------------
    def run(self, chat_file_path, image_folder_path, sebi_id_to_check, insider_csv_path=None):
        """
        Run every analysis stage and collect the results in one report.

        Only messages containing a URL or a stock mention are analysed and
        reported; all files in `image_folder_path` are analysed.

        Args:
            chat_file_path: Chat export to analyse.
            image_folder_path: Folder whose files are passed to the image analysis.
            sebi_id_to_check: SEBI registration id to verify.
            insider_csv_path: Optional CSV of insider-trading disclosures.

        Returns:
            Dict-like, JSON-serialisable report.
        """
        print("🚀 Starting Full Analysis Pipeline...")
        final_report = defaultdict(dict)
        final_report["manual_verifications"] = {
            "sebi_id_analysis": self._verify_sebi_id(sebi_id_to_check)
        }
        final_report["bulk_deal_analysis"] = {"trades_found_today": len(self._get_bse_bulk_deals())}

        analyzed_messages = []
        for msg_data in self._parse_chat_for_messages(chat_file_path):
            text = msg_data["message"]
            urls = re.findall(r"(https?://\S+)", text)
            stock_analysis = self._analyze_stock_mentions(text)

            # Only messages with a URL or a stock mention are worth analysing.
            if not urls and not stock_analysis:
                continue

            prediction = self.model.predict(self.vectorizer.transform([text]))[0]
            analysis = {"text_scam_prediction": str(prediction)}
            if urls:
                analysis["url_analysis"] = [self._check_url_risk(url) for url in urls]
            if stock_analysis:
                analysis["stock_mention_analysis"] = stock_analysis

            analyzed_messages.append({
                "date": msg_data["date"],
                "user": msg_data["user"],
                "message": text,
                "analysis": analysis
            })
        final_report["message_by_message_analysis"] = analyzed_messages

        # Verification results are keyed by timestamp; keep only the entries
        # derived from this message's own text so messages sent in the same
        # second do not inherit each other's results.
        predictions = self._verify_stock_predictions_batch(analyzed_messages)
        for msg in analyzed_messages:
            msg["analysis"]["stock_prediction_verification"] = [
                verification
                for verification in predictions.get(msg["date"], [])
                if verification.get("predicted_text", msg["message"]) == msg["message"]
            ]

        if insider_csv_path and os.path.exists(insider_csv_path):
            insider_df = pd.read_csv(insider_csv_path)
        else:
            insider_df = pd.DataFrame()
        required_cols = ['SYMBOL', 'DATE OF ALLOTMENT/ACQUISITION FROM', 'NAME OF THE ACQUIRER/DISPOSER',
                         'CATEGORY OF PERSON', 'ACQUISITION/DISPOSAL TRANSACTION TYPE']
        for col in required_cols:
            if col not in insider_df.columns:
                insider_df[col] = None
        final_report["insider_trading_flags"] = self._check_insider_trading(analyzed_messages, insider_df)

        final_report["sentiment_price_flags"] = self._sentiment_price_check(analyzed_messages)

        # Image analysis: regular files only, in a stable order.
        image_files = sorted(
            path for path in glob.glob(os.path.join(image_folder_path, "*"))
            if os.path.isfile(path)
        )
        final_report["image_analysis"] = [
            {"file_name": os.path.basename(img_path), "analysis": self._analyze_image_content(img_path)}
            for img_path in image_files
        ]

        final_report["user_prediction_accuracy_report"] = self._evaluate_user_prediction_accuracy(analyzed_messages)

        print("✅ Pipeline finished successfully!")
        return final_report

In [None]:
import os
import json # Import the json library
from collections import Counter

# Define paths
WORKING_DIR = "/kaggle/working/"
VECTORIZER_PATH = os.path.join(WORKING_DIR, "vectorizer.pkl")
MODEL_PATH = os.path.join(WORKING_DIR, "scam_model.pkl")
STOCK_PATH = "/kaggle/input/nse-stocks/EQUITY_L.csv"

# Input data paths
chat_file = "/kaggle/input/sebi-sample-input/dataset/stock_market_Trader_Public_Group/chat/Extracted messages....txt"
image_folder = "/kaggle/input/sebi-sample-input/dataset/stock_market_Trader_Public_Group/images"

# Entity information
sebi_id_input = "INZ000048660"
# NOTE(review): entity_name_input is assigned here but is not passed to `run()`
# below -- confirm whether the pipeline is supposed to receive it.
entity_name_input = "NEO APEX VENTURE LLP"

# Main execution: build the pipeline, run it on the chat export and image
# folder, and print the resulting report as indented JSON.
try:
    # Resolve the class on its own so that ONLY a genuinely missing definition
    # triggers the "not defined" message. A NameError raised from inside the
    # pipeline itself is reported by the generic handler further down instead
    # of being misattributed to a missing class.
    pipeline_cls = FraudDetectionPipeline
except NameError:
    print("❌ ERROR: `FraudDetectionPipeline` class is not defined. Please ensure its code has been executed.")
else:
    try:
        fraud_detector = pipeline_cls(
            model_path=MODEL_PATH,
            vectorizer_path=VECTORIZER_PATH,
            stock_csv_path=STOCK_PATH,
        )

        if os.path.exists(chat_file):
            results = fraud_detector.run(
                chat_file_path=chat_file,
                image_folder_path=image_folder,
                sebi_id_to_check=sebi_id_input,
            )

            # Print the full report as JSON. `default=str` stringifies any value
            # the json module cannot encode natively, so a single exotic value
            # does not discard the whole report with a TypeError.
            print("\n" + "="*70)
            print("🚨 FULL PIPELINE REPORT (JSON)")
            print("="*70 + "\n")
            print(json.dumps(results, indent=4, default=str))
            print("\n" + "="*70)
            print("✅ End of Report")
            print("="*70 + "\n")

        else:
            print(f"❌ ERROR: The chat file was not found at the path: {chat_file}")

    except IOError as e:
        print(e)
        print("Please make sure the model and vectorizer files exist. You may need to train them first.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

🚀 Initializing Fraud Detection Pipeline...
✅ Pipeline initialized successfully.
🚀 Starting Full Analysis Pipeline...
