In [3]:
!pip install python-whois pyzbar

Collecting python-whois
  Downloading python_whois-0.9.5-py3-none-any.whl.metadata (2.6 kB)
Collecting pyzbar
  Downloading pyzbar-0.1.9-py2.py3-none-any.whl.metadata (10 kB)
Downloading python_whois-0.9.5-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyzbar-0.1.9-py2.py3-none-any.whl (32 kB)
Installing collected packages: pyzbar, python-whois
Successfully installed python-whois-0.9.5 pyzbar-0.1.9


In [4]:
import requests
import pandas as pd
from io import StringIO

def get_bulk_deal_client_frequency(timeout=30):
    """
    Fetches daily bulk deal data from the BSE and calculates the frequency
    of each client name involved in the trades.

    Args:
        timeout: Seconds to wait for the BSE server before giving up.

    Returns:
        A pandas Series mapping each client name to its number of rows in
        the bulk deals table (most frequent first), or None when the page
        could not be fetched or no table with a 'Client Name' column exists.
    """
    print("Fetching daily bulk deal data from BSE...")
    try:
        url = "https://www.bseindia.com/markets/equity/EQReports/bulk_deals.aspx"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # An explicit timeout keeps the cell from hanging forever when the
        # server accepts the connection but never answers.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        all_tables = pd.read_html(StringIO(response.text))

        deals_df = None
        for table in all_tables:
            # Scraped header labels may carry stray whitespace; normalise them
            # before looking for the column we need.
            table = table.rename(columns=lambda label: str(label).strip())
            if 'Client Name' in table.columns:
                deals_df = table
                break

        if deals_df is None or deals_df.empty:
            print("❌ Could not find or parse the bulk deals table today.")
            return None

        # Use value_counts() to get the frequency of each name
        name_counts = deals_df['Client Name'].value_counts()
        print("✅ Successfully calculated client name frequencies.")
        return name_counts

    except Exception as e:
        # Best-effort scraper: report the problem and let the caller handle None.
        print(f"❌ An error occurred: {e}")
        return None

if __name__ == "__main__":
    # Fetch the per-client counts once, then decide what to display.
    client_frequencies = get_bulk_deal_client_frequency()
    nothing_to_report = client_frequencies is None or client_frequencies.empty

    if nothing_to_report:
        print("\nNo bulk deals to analyze today.")
    else:
        divider = "=" * 50
        print("\n" + divider)
        print("Frequency of Client Names in Today's Bulk Deals")
        print(divider)
        # value_counts() yields a Series: index = client name, value = row count.
        for name, count in client_frequencies.items():
            print(f"- {name}: {count} trade(s)")

Fetching daily bulk deal data from BSE...
✅ Successfully calculated client name frequencies.

Frequency of Client Names in Today's Bulk Deals
- IRAGE BROKING SERVICES LLP: 6 trade(s)
- NEO APEX VENTURE LLP: 5 trade(s)
- MANSI SHARE AND STOCK BROKING PRIVATE LIMITED: 4 trade(s)
- NEO APEX SHARE BROKING SERVICES LLP: 4 trade(s)
- PRAS INVESTMENT PRIVATE LIMITED: 3 trade(s)
- PARNIT VENTURES PRIVATE LIMITED: 2 trade(s)
- B N RATHI SECURITIES LIMITED: 2 trade(s)
- AKSHAY PALIWAL: 2 trade(s)
- SYLPH TECHNOLOGIES LIMITED: 2 trade(s)
- SAROJDEVI P GUPTA: 2 trade(s)
- BHATIA NIKHIL MURLIDHAR: 2 trade(s)
- ISHAAN TRADEFIN LLP: 2 trade(s)
- F3 ADVISORS PRIVATE LIMITED: 2 trade(s)
- QE SECURITIES LLP: 2 trade(s)
- VARANGA PROPERTIES PRIVATE LIMITED: 2 trade(s)
- PRASHANT GUPTA: 2 trade(s)
- VIKRAMKUMAR KARANRAJ SAKARIA HUF: 2 trade(s)
- CHAUHAN NAGJIBHAI CHANDUBHAI: 2 trade(s)
- KAMLESH NAVINCHANDRA SHAH: 2 trade(s)
- SHAILESH DHAMELIYA: 2 trade(s)
- SHARE INDIA SECURITIES LIMITED: 2 trade(s)
- N

In [5]:
import requests
import pandas as pd
import re
from io import StringIO
from datetime import datetime

def get_bse_bulk_deals(timeout=30):
    """
    Fetches and parses daily bulk deal data from the BSE website.

    Args:
        timeout: Seconds to wait for the BSE server before giving up.

    Returns:
        The first scraped table that has both 'Client Name' and 'Deal Date'
        columns, with 'Deal Date' converted to datetimes. An empty DataFrame
        is returned when no such table exists or any step fails.
    """
    print("Fetching daily bulk deal data...")
    try:
        url = "https://www.bseindia.com/markets/equity/EQReports/bulk_deals.aspx"
        headers = {'User-Agent': 'Mozilla/5.0'}
        # An explicit timeout keeps the cell from hanging on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        all_tables = pd.read_html(StringIO(response.text))
        for table in all_tables:
            # Strip header whitespace BEFORE the membership test; stripping
            # afterwards would never rescue a padded ' Client Name ' header.
            table = table.rename(columns=lambda label: str(label).strip())
            if 'Client Name' in table.columns and 'Deal Date' in table.columns:
                print("✅ Bulk deal data fetched.")
                # Convert date columns to datetime objects for comparison
                table['Deal Date'] = pd.to_datetime(table['Deal Date'], format='%d/%m/%Y')
                return table
        return pd.DataFrame()
    except Exception as e:
        # Best-effort scraper: callers treat an empty frame as "no data".
        print(f"❌ Error fetching BSE bulk deal data: {e}")
        return pd.DataFrame()

def parse_chat_for_advisor_mentions(file_path, advisor_name):
    """
    Parses a chat log to find all stock mentions made by a specific advisor.
    A stock is assumed to be any word in all caps with 3 or more letters.

    Args:
        file_path: Path to a UTF-8 chat export whose message lines look like
            'dd/mm/yy, HH:MM - Sender: text' (2- or 4-digit year).
        advisor_name: Case-insensitive substring to look for in the sender.

    Returns:
        A list with one dict per (message, symbol) pair, each holding
        'date' (datetime at midnight), 'stock_symbol' and 'message'.
    """
    # Imported locally under an alias: another cell runs `import datetime`,
    # which rebinds the global name to the module and would break a bare
    # `datetime.strptime` call here.
    from datetime import datetime as _datetime

    print(f"Parsing chat log for messages by '{advisor_name}'...")
    stock_mentions = []
    stock_pattern = re.compile(r'\b[A-Z]{3,}\b')
    # Date, time, sender and message of a single exported chat line.
    line_pattern = re.compile(r'(\d{2}/\d{2}/\d{2,4}),\s\d{2}:\d{2}\s-\s(.*?):\s(.*)')
    unparseable_dates = 0

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            match = line_pattern.match(line)
            if not match or advisor_name.lower() not in match.group(2).lower():
                continue
            date_str, _user, message = match.groups()

            # The line pattern admits 2- to 4-digit years, so choose the
            # strptime directive from the actual width instead of assuming %y.
            year_part = date_str.rsplit('/', 1)[-1]
            date_format = '%d/%m/%Y' if len(year_part) == 4 else '%d/%m/%y'
            try:
                message_date = _datetime.strptime(date_str, date_format)
            except ValueError:
                # Impossible date for dd/mm ordering (or a 3-digit year):
                # skip the line but keep count so the problem is visible.
                unparseable_dates += 1
                continue

            # Find all potential stock symbols in the message
            for symbol in stock_pattern.findall(message):
                stock_mentions.append({
                    "date": message_date,
                    "stock_symbol": symbol,
                    "message": message.strip()
                })

    if unparseable_dates:
        print(f"⚠️ Skipped {unparseable_dates} message(s) whose date could not be parsed as dd/mm/yy.")
    print(f"✅ Found {len(stock_mentions)} stock mentions by the advisor.")
    return stock_mentions

def cross_correlation_engine(chat_file, advisor_name):
    """
    The main engine to correlate chat mentions with bulk trades.

    Args:
        chat_file: Path to the exported chat log.
        advisor_name: Name matched (case-insensitively, as literal text)
            against both the chat sender and the 'Client Name' column.

    Returns:
        A list of human-readable strings: either one entry per red flag, or
        a single explanatory message when nothing could be correlated.
    """
    # Step 1: Get all recent bulk trades
    trades_df = get_bse_bulk_deals()
    if trades_df.empty:
        return ["Could not retrieve bulk deal data to perform analysis."]

    # Filter for trades made only by our target advisor.
    # regex=False: the name is literal text, so characters such as '.', '('
    # or '+' in a company name must not be interpreted as a pattern.
    is_advisor = trades_df['Client Name'].str.contains(advisor_name, case=False, na=False, regex=False)
    advisor_trades = trades_df[is_advisor].copy()
    if advisor_trades.empty:
        return [f"No recent bulk deals found for '{advisor_name}'. No correlation possible."]

    # Step 2: Get all stock mentions by the advisor from the chat log
    advisor_mentions = parse_chat_for_advisor_mentions(chat_file, advisor_name)
    if not advisor_mentions:
        return [f"No stock mentions found for '{advisor_name}' in the chat log."]

    # Step 3: Correlate mentions and trades to find red flags
    print("\nCorrelating trades and chat messages...")
    red_flags = []

    for mention in advisor_mentions:
        symbol = mention['stock_symbol']
        for _, trade in advisor_trades.iterrows():
            # str() guards against NaN cells, which pandas represents as floats.
            security_name = str(trade['Security Name'])
            # Check if the mentioned stock name is part of the security name in the trade data
            if symbol.lower() not in security_name.lower():
                continue

            # NOTE(review): assumes 'Deal Type' holds the words buy/sell — confirm against the live table.
            deal_type = str(trade['Deal Type']).strip().lower()
            days_promo_after_trade = (mention['date'] - trade['Deal Date']).days
            days_trade_after_promo = (trade['Deal Date'] - mention['date']).days

            # --- Red Flag 1: Front-Running ---
            # Promotion happens within 7 days AFTER a bulk BUY
            if deal_type == 'buy' and 0 <= days_promo_after_trade <= 7:
                flag = (f"🚨 POTENTIAL FRONT-RUNNING DETECTED:\n"
                        f"  -> Advisor promoted '{symbol}' on {mention['date'].date()}\n"
                        f"  -> This was just {days_promo_after_trade} day(s) AFTER their bulk BUY of {trade['Quantity']} shares on {trade['Deal Date'].date()}.\n")
                red_flags.append(flag)

            # --- Red Flag 2: Pump & Dump ---
            # A bulk SELL happens within 30 days AFTER a promotion, i.e. the
            # trade date minus the mention date must be in [0, 30]. Using the
            # mention-minus-trade delta here would flag the opposite ordering.
            if deal_type == 'sell' and 0 <= days_trade_after_promo <= 30:
                flag = (f"🚨 POTENTIAL PUMP & DUMP DETECTED:\n"
                        f"  -> Advisor made a bulk SELL of {trade['Quantity']} shares of {security_name} on {trade['Deal Date'].date()}\n"
                        f"  -> This was {days_trade_after_promo} day(s) AFTER they promoted '{symbol}' on {mention['date'].date()}.\n")
                red_flags.append(flag)

    return red_flags if red_flags else ["No suspicious correlations found between chat messages and recent bulk deals."]

if __name__ == "__main__":
    # `os` is used for the existence check below, but this cell's import list
    # does not include it; import it here so the cell works on a fresh kernel.
    import os

    # --- 🔽 INPUTS FOR THE ANALYSIS 🔽 ---
    # 1. Path to your chat log file
    chat_log_file = "/kaggle/input/whatsap-chats-downloaded/WhatsApp Chat with TriDevs.txt" #<-- CHANGE THIS

    # 2. The exact name of the advisor as it appears in the chat and bulk deal data
    advisor_name_to_investigate = "NEO APEX VENTURE LLP" #<-- CHANGE THIS

    # --- Run the engine ---
    if not os.path.exists(chat_log_file):
        print(f"❌ ERROR: Chat file not found at '{chat_log_file}'")
    else:
        suspicious_activities = cross_correlation_engine(chat_log_file, advisor_name_to_investigate)

        print("\n" + "="*80)
        print("          Cross-Correlation Analysis Report")
        print("="*80)
        for activity in suspicious_activities:
            print(activity)

Fetching daily bulk deal data...
✅ Bulk deal data fetched.
Parsing chat log for messages by 'NEO APEX VENTURE LLP'...
✅ Found 0 stock mentions by the advisor.

          Cross-Correlation Analysis Report
No stock mentions found for 'NEO APEX VENTURE LLP' in the chat log.


In [6]:
# ==============================================================================
# SECTION 1: INSTALLATIONS & IMPORTS
# ==============================================================================

# Install required packages
!pip install pandas scikit-learn joblib opencv-python easyocr python-whois requests beautifulsoup4 html5lib yfinance -q

# --- Standard Library Imports ---
import os
import re
import glob
import json
import datetime
from collections import Counter, defaultdict
from urllib.parse import urlparse
from io import StringIO

# --- Suppress Warnings ---
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import warnings
warnings.filterwarnings("ignore")

# --- Core Data Science & ML Imports ---
import joblib
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# --- Image & Web Scraping Imports ---
import cv2
import easyocr
import whois
import requests
import yfinance as yf

print("✅ All libraries loaded successfully.")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m95.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m76.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
# ==============================================================================
# SECTION 2: BASELINE MODEL TRAINING
# ==============================================================================

def train_scam_classifier():
    """
    Fit the baseline scam-text classifier and persist it.

    Reads the WhatsApp scam CSV, fits a TF-IDF vectorizer and a logistic
    regression on the 80% training split, and writes both objects to
    /kaggle/working/ with joblib.

    Returns:
        True when both artifacts were written, False if any step raised.
    """
    print("🚀 Starting model training process...")
    try:
        # Writable Kaggle directory that receives both artifacts.
        WORKING_DIR = "/kaggle/working/"
        vectorizer_file = os.path.join(WORKING_DIR, "vectorizer.pkl")
        model_file = os.path.join(WORKING_DIR, "scam_model.pkl")

        dataset = pd.read_csv("/kaggle/input/whatsapp-scam/whatsapp_scam_dataset.csv")
        messages = dataset["message"]
        labels = dataset["scam_type"]

        # Only the training side of the stratified split is used below.
        train_messages, _, train_labels, _ = train_test_split(
            messages,
            labels,
            test_size=0.2,
            random_state=42,
            stratify=labels,
        )

        tfidf = TfidfVectorizer(stop_words="english", max_features=5000, ngram_range=(1,2))
        train_matrix = tfidf.fit_transform(train_messages)

        classifier = LogisticRegression(max_iter=200, class_weight="balanced")
        classifier.fit(train_matrix, train_labels)

        # Persist vectorizer first, then the model.
        for artifact, destination in ((tfidf, vectorizer_file), (classifier, model_file)):
            joblib.dump(artifact, destination)

        print(f"✅ Model and vectorizer saved successfully to {WORKING_DIR}")
        return True

    except Exception as e:
        print(f"❌ An error occurred during model training: {e}")
        print("    Ensure '/kaggle/input/whatsapp-scam/whatsapp_scam_dataset.csv' is added to your notebook.")
        return False

# Run the training now so later cells can load the saved artifacts.
# The bare call is the cell's last expression, so its boolean result
# (True on success, False on failure) is shown as the cell output.
train_scam_classifier()

🚀 Starting model training process...
✅ Model and vectorizer saved successfully to /kaggle/working/


True

In [8]:
# ==============================================================================
# SECTION 3: COMPILED ANALYSIS PIPELINE
# ==============================================================================

# --- Helper Function: URL Intelligence ---
def check_url_risk(url):
    """
    Score a URL with simple heuristics and a WHOIS domain-age lookup.

    Points are added for: a scheme other than https (+30), a bare IPv4
    host (+40), a suspicious TLD (+25), a domain younger than 180 days
    (+30) and a failed WHOIS lookup (+10).

    Args:
        url: The URL string to inspect.

    Returns:
        A dict with 'url', 'risk_level' ("Low" / "Medium" / "High"),
        'risk_score' (capped at 100) and 'reasons' (list of strings).
        Input that cannot be parsed yields a High / 100 "malformed" result.
    """
    score, reasons = 0, []
    try:
        if not isinstance(url, str):
            raise TypeError("url must be a string")
        parsed = urlparse(url)
        # .hostname drops any user-info and port and is lower-cased, so
        # 'http://1.2.3.4:8080/' is still recognised as an IP address and
        # 'EXAMPLE.XYZ' still matches the TLD list.
        domain = parsed.hostname or ""
        if parsed.scheme != "https":
            score += 30
            reasons.append("URL is not secure (HTTP)")
        if re.match(r"^\d{1,3}(\.\d{1,3}){3}$", domain):
            score += 40
            reasons.append("URL uses an IP address")
        if domain.endswith((".xyz", ".top", ".biz", ".shop")):
            score += 25
            reasons.append("URL uses a suspicious TLD")
        try:
            domain_info = whois.whois(domain)
            creation_date = domain_info.creation_date
            if creation_date:
                if isinstance(creation_date, list):
                    creation_date = creation_date[0]
                # Use "now" in the creation date's own timezone (None for a
                # naive value): subtracting a naive now() from a tz-aware
                # datetime raises TypeError and would be mis-reported below.
                now = datetime.datetime.now(creation_date.tzinfo)
                age_days = (now - creation_date).days
                if age_days < 180:
                    score += 30
                    reasons.append(f"Domain is very new ({age_days} days old)")
        except Exception:
            # Best-effort lookup: network or parsing problems only add a small penalty.
            score += 10
            reasons.append("WHOIS lookup failed")
    except Exception:
        return {"url": url, "risk_level": "High", "risk_score": 100, "reasons": ["URL is malformed"]}
    risk_level = "High" if score >= 60 else "Medium" if score >= 30 else "Low"
    return {"url": url, "risk_level": risk_level, "risk_score": min(score, 100), "reasons": reasons}

# --- Helper Function: Image & QR Intelligence ---
def check_qr_code(image_path):
    """
    Decode a QR code from an image file and assess its payload.

    Returns:
        None when the image cannot be read, holds no decodable QR code, or
        any step raises. Otherwise a dict with 'qr_data' (the decoded text)
        and 'analysis': the check_url_risk() result for payloads starting
        with "http", or a fixed low-risk note for anything else.
    """
    try:
        frame = cv2.imread(image_path)
        if frame is None:
            return None

        payload, _points, _rectified = cv2.QRCodeDetector().detectAndDecode(frame)
        if not payload:
            return None

        if payload.startswith("http"):
            verdict = check_url_risk(payload)
        else:
            verdict = {"risk_level": "Low", "reasons": ["QR contains non-URL text"]}
        return {"qr_data": payload, "analysis": verdict}
    except Exception:
        return None

def analyze_image_content(image_path, reader, text_model, text_vectorizer):
    """
    Run QR analysis, OCR and text classification on one image.

    Args:
        image_path: Path of the image file.
        reader: Object with a readtext(path, detail=0, paragraph=True)
            method returning text fragments (the pipeline passes an
            easyocr.Reader).
        text_model: Fitted classifier exposing predict().
        text_vectorizer: Fitted vectorizer exposing transform().

    Returns:
        A dict with 'ocr_text', 'text_scam_prediction' ("N/A" when no text
        was extracted) and 'qr_analysis'. If OCR or classification raises,
        the fields set so far are kept and an 'analysis_error' entry
        describes the failure instead of hiding it.
    """
    results = {"ocr_text": "", "text_scam_prediction": "N/A", "qr_analysis": check_qr_code(image_path)}
    try:
        extracted_text = " ".join(reader.readtext(image_path, detail=0, paragraph=True))
        if extracted_text.strip():
            results["ocr_text"] = extracted_text
            vec = text_vectorizer.transform([extracted_text])
            results["text_scam_prediction"] = text_model.predict(vec)[0]
    except Exception as exc:
        # Still best-effort (one bad image must not abort the run), but the
        # reason is surfaced in the report rather than silently dropped.
        results["analysis_error"] = f"{type(exc).__name__}: {exc}"
    return results

# --- Helper Function: SEBI & Stock Mention (Placeholders) ---
def verify_sebi_id(sebi_id):
    """
    Placeholder SEBI registration check (no real lookup is performed).

    Returns:
        {"sebi_id": "Not Provided", "status": "N/A"} for a missing or blank
        ID. Otherwise the ID is echoed back with a dummy status: "Verified
        (Dummy)" when it is longer than 10 characters and contains "INA",
        else "Not Found (Dummy)".
    """
    has_value = bool(sebi_id) and bool(sebi_id.strip())
    if not has_value:
        return {"sebi_id": "Not Provided", "status": "N/A"}

    print(f"⚠️  Running placeholder for SEBI ID: {sebi_id}")

    if len(sebi_id) > 10 and "INA" in sebi_id:
        status = "Verified (Dummy)"
    else:
        status = "Not Found (Dummy)"
    return {"sebi_id": sebi_id, "status": status}

def analyze_stock_mentions(text):
    """
    Extract candidate stock symbols and flag pump-and-dump wording.

    A symbol is any standalone run of three or more capital letters.

    Args:
        text: The message text to scan.

    Returns:
        None when no symbol is present; otherwise a dict with
        'mentioned_stocks' (unique symbols in order of first appearance)
        and 'suspicious_patterns' (a one-item list when the text contains
        "guaranteed", "10x" or "insider", case-insensitively; else empty).
    """
    stock_symbols = re.findall(r'\b[A-Z]{3,}\b', text)
    if not stock_symbols:
        return None

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # report is identical from run to run (set ordering of strings is not).
    unique_symbols = list(dict.fromkeys(stock_symbols))

    lowered = text.lower()
    has_keyword = any(kw in lowered for kw in ("guaranteed", "10x", "insider"))
    suspicious_patterns = ["Message contains pump-and-dump keywords."] if has_keyword else []
    return {"mentioned_stocks": unique_symbols, "suspicious_patterns": suspicious_patterns}

# --- Helper Function: Chat Analysis ---
def parse_chat_for_advisor_mentions(file_path, advisor_name):
    """
    Collect all-caps "stock symbol" mentions from one sender's chat lines.

    Lines must look like 'dd/mm/yy, HH:MM - Sender: text'. A line is kept
    when advisor_name occurs (case-insensitively) in the sender, and every
    standalone run of 3+ capital letters in its text yields one record.

    Returns:
        A list of dicts with 'date' (datetime), 'stock_symbol' and
        'message' (the stripped message text).
    """
    line_re = re.compile(r'(\d{2}/\d{2}/\d{2,4}),\s\d{2}:\d{2}\s-\s(.*?):\s(.*)')
    symbol_re = re.compile(r'\b[A-Z]{3,}\b')

    def _to_datetime(date_text):
        # Two-digit year first; fall back to a four-digit year.
        try:
            return datetime.datetime.strptime(date_text, '%d/%m/%y')
        except ValueError:
            return datetime.datetime.strptime(date_text, '%d/%m/%Y')

    collected = []
    with open(file_path, 'r', encoding='utf-8') as chat:
        for raw_line in chat:
            parsed = line_re.match(raw_line)
            if parsed is None:
                continue
            if advisor_name.lower() not in parsed.group(2).lower():
                continue
            date_text, _sender, body = parsed.groups()
            sent_on = _to_datetime(date_text)
            collected.extend(
                {"date": sent_on, "stock_symbol": ticker, "message": body.strip()}
                for ticker in symbol_re.findall(body)
            )
    return collected

def detect_bot_behavior(file_path):
    """
    Count messages per sender in a chat export and name the busiest one.

    A sender is whatever sits between the first '- ' and the next ':' on
    a line; lines without that shape are ignored.

    Returns:
        {"error": "No users found."} when no line yields a sender, else a
        dict with 'detected_admin' (the sender with the most messages) and
        'message_counts' (sender -> number of messages).
    """
    sender_re = re.compile(r'.*?-\s(.*?):')
    tally = Counter()
    with open(file_path, 'r', encoding='utf-8') as chat:
        for raw_line in chat:
            hit = sender_re.match(raw_line)
            if hit:
                tally[hit.group(1).strip()] += 1

    if not tally:
        return {"error": "No users found."}

    busiest_sender, _ = tally.most_common(1)[0]
    return {"detected_admin": busiest_sender, "message_counts": dict(tally)}

# --- Helper Function: Bulk Deal & Cross-Correlation ---
def get_bse_bulk_deals(timeout=30):
    """
    Download the BSE bulk deals page and return its deals table.

    Args:
        timeout: Seconds to wait for the BSE server before giving up.

    Returns:
        The first scraped table containing 'Client Name' and 'Deal Date'
        columns, with 'Deal Date' parsed as dd/mm/YYYY datetimes. An empty
        DataFrame is returned when no such table exists or a step fails.
    """
    try:
        url = "https://www.bseindia.com/markets/equity/EQReports/bulk_deals.aspx"
        headers = {'User-Agent': 'Mozilla/5.0'}
        # An explicit timeout keeps the pipeline from hanging on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        for table in pd.read_html(StringIO(response.text)):
            # Normalise header whitespace BEFORE testing for the columns;
            # stripping afterwards cannot rescue a padded header.
            table = table.rename(columns=lambda label: str(label).strip())
            if 'Client Name' in table.columns and 'Deal Date' in table.columns:
                table['Deal Date'] = pd.to_datetime(table['Deal Date'], format='%d/%m/%Y')
                return table
    except Exception as e:
        # Best-effort: callers treat an empty frame as "no data", but the
        # cause is printed so a scraping failure is not mistaken for a quiet day.
        print(f"❌ Error fetching BSE bulk deal data: {e}")
        return pd.DataFrame()
    return pd.DataFrame()

def cross_correlation_engine(chat_file, advisor_name, trades_df):
    """
    Correlate an advisor's chat promotions with their bulk deals.

    Args:
        chat_file: Path to the exported chat log.
        advisor_name: Name matched (case-insensitively, as literal text)
            against the chat sender and the 'Client Name' column.
        trades_df: Bulk deals table with 'Client Name', 'Security Name',
            'Deal Type' and 'Deal Date' (datetime) columns.

    Returns:
        A list of strings: one per red flag, or a single explanatory
        message when there is nothing to correlate.
    """
    if trades_df.empty or not advisor_name:
        return ["Bulk deal data not available or no advisor name provided."]

    # regex=False: the name is literal text, so '.', '(' or '+' in a company
    # name must not be interpreted as a pattern.
    is_advisor = trades_df['Client Name'].str.contains(advisor_name, case=False, na=False, regex=False)
    advisor_trades = trades_df[is_advisor]
    if advisor_trades.empty:
        return [f"No bulk deals found for '{advisor_name}'."]

    advisor_mentions = parse_chat_for_advisor_mentions(chat_file, advisor_name)
    if not advisor_mentions:
        return [f"No stock mentions by '{advisor_name}' found in chat."]

    red_flags = []
    for mention in advisor_mentions:
        symbol = mention['stock_symbol']
        for _, trade in advisor_trades.iterrows():
            # str() guards against NaN cells, which pandas represents as floats.
            security_name = str(trade['Security Name'])
            if symbol.lower() not in security_name.lower():
                continue

            # NOTE(review): assumes 'Deal Type' holds the words buy/sell — confirm against the live table.
            deal_type = str(trade['Deal Type']).strip().lower()
            days_promo_after_trade = (mention['date'] - trade['Deal Date']).days
            days_trade_after_promo = (trade['Deal Date'] - mention['date']).days

            # Promotion within 7 days AFTER a bulk buy.
            if deal_type == 'buy' and 0 <= days_promo_after_trade <= 7:
                red_flags.append(f"🚨 POTENTIAL FRONT-RUNNING: Advisor promoted '{symbol}' on {mention['date'].date()} just {days_promo_after_trade} day(s) AFTER a bulk BUY.")
            # Bulk sell within 30 days AFTER a promotion: the trade must come
            # after the mention, so the delta is trade date minus mention date.
            if deal_type == 'sell' and 0 <= days_trade_after_promo <= 30:
                red_flags.append(f"🚨 POTENTIAL PUMP & DUMP: Advisor made a bulk SELL of {security_name} {days_trade_after_promo} day(s) AFTER promoting '{symbol}'.")
    return red_flags if red_flags else ["No suspicious correlations found."]

# --- THE MAIN PIPELINE FUNCTION ---
def run_analysis_pipeline(chat_file_path, image_folder_path, sebi_id_to_check, entity_name_to_track):
    """
    Orchestrates the entire fraud detection and analysis process.

    Args:
        chat_file_path: Path to the exported chat log.
        image_folder_path: Folder scanned for .png / .jpg / .jpeg images.
        sebi_id_to_check: SEBI registration ID passed to verify_sebi_id().
        entity_name_to_track: Advisor name for the cross-correlation step.

    Returns:
        A dict-like report with the keys 'manual_verifications',
        'bulk_deal_analysis', 'cross_correlation_analysis', 'chat_analysis',
        'message_by_message_analysis', 'image_analysis' and 'summary'; or
        {"error": ...} when the saved model, vectorizer or OCR reader
        cannot be loaded.
    """
    print("🚀 Starting Full Analysis Pipeline...")
    WORKING_DIR = "/kaggle/working/"
    VECTORIZER_PATH = os.path.join(WORKING_DIR, "vectorizer.pkl")
    MODEL_PATH = os.path.join(WORKING_DIR, "scam_model.pkl")
    try:
        vectorizer = joblib.load(VECTORIZER_PATH)
        model = joblib.load(MODEL_PATH)
        reader = easyocr.Reader(['en'], gpu=False) # Set gpu=False for Kaggle CPU environment
    except Exception as e:
        return {"error": f"Failed to load models or OCR: {e}"}

    final_report = defaultdict(dict)

    # 1. Perform one-time analyses
    print("📈 Analyzing market and chat-level data...")
    final_report["manual_verifications"]["sebi_id_analysis"] = verify_sebi_id(sebi_id_to_check)
    bulk_deals_df = get_bse_bulk_deals()
    final_report["bulk_deal_analysis"] = {"trades_found_today": len(bulk_deals_df)}
    final_report["cross_correlation_analysis"] = cross_correlation_engine(chat_file_path, entity_name_to_track, bulk_deals_df)
    final_report["chat_analysis"] = detect_bot_behavior(chat_file_path)

    # 2. Perform message-by-message and image analysis
    print("🔬 Analyzing individual messages and images...")
    # Every chat message exactly once. Reusing the advisor-mention parser
    # here would return one entry per all-caps word, dropping messages
    # without such a word and repeating messages that contain several.
    messages = _read_all_chat_messages(chat_file_path)
    scam_predictions = []
    final_report["message_by_message_analysis"] = []
    for text in messages:
        prediction = model.predict(vectorizer.transform([text]))[0]
        analysis = {"text_scam_prediction": prediction}
        scam_predictions.append(prediction)

        urls = re.findall(r'(https?://\S+)', text)
        if urls:
            analysis["url_analysis"] = [check_url_risk(url) for url in urls]

        stock_analysis = analyze_stock_mentions(text)
        if stock_analysis:
            analysis["stock_mention_analysis"] = stock_analysis

        # Only messages with something beyond the bare prediction are listed.
        if len(analysis) > 1:
            final_report["message_by_message_analysis"].append({"message": text, "analysis": analysis})

    image_files = _list_image_files(image_folder_path)
    final_report["image_analysis"] = []
    for img_path in image_files:
        final_report["image_analysis"].append({"file_name": os.path.basename(img_path), "analysis": analyze_image_content(img_path, reader, model, vectorizer)})

    # 3. Compile summary
    final_report["summary"] = {
        "total_messages_analyzed": len(messages),
        "total_images_analyzed": len(image_files),
        "scam_types_detected": dict(Counter(p for p in scam_predictions if p != "not_scam"))
    }
    print("\n✅ Pipeline finished successfully!")
    return final_report


def _read_all_chat_messages(chat_file_path):
    """Return the stripped text of every 'dd/mm/yy, HH:MM - Sender: text' line, in file order."""
    line_pattern = re.compile(r'\d{2}/\d{2}/\d{2,4},\s\d{2}:\d{2}\s-\s.*?:\s(.*)')
    with open(chat_file_path, 'r', encoding='utf-8') as f:
        return [hit.group(1).strip() for hit in map(line_pattern.match, f) if hit]


def _list_image_files(image_folder_path):
    """Return the sorted paths of .png/.jpg/.jpeg files directly inside the folder."""
    # A glob like "*[.png|.jpg|.jpeg]" is a character class (any name ending
    # in one of those characters), so filter on the real extension instead.
    wanted_extensions = {".png", ".jpg", ".jpeg"}
    return sorted(
        path
        for path in glob.glob(os.path.join(image_folder_path, "*"))
        if os.path.splitext(path)[1].lower() in wanted_extensions
    )

In [None]:
# ==============================================================================
# SECTION 4: CONFIGURE AND RUN
# ==============================================================================

# --- 🔽 IMPORTANT: SET YOUR FILE PATHS AND INPUTS HERE 🔽 ---

import os

# 1. Path to your uploaded chat TXT file
chat_file = "/kaggle/input/whatsap-chats-downloaded/WhatsApp Chat with TriDevs.txt"

# 2. Path to your folder containing the images
image_folder = "/kaggle/input/scam-qr-png/"

# 3. The SEBI Registration ID you want to verify (can be left blank)
sebi_id_input = "INZ000048660"

# 4. The registered NAME of the advisor to track in chats and bulk deals
entity_name_input = "NEO APEX VENTURE LLP"

# --- 🔼 NO MORE CHANGES NEEDED BELOW THIS LINE 🔼 ---

# Bail out early when the chat export is missing; otherwise run the pipeline
# and print its report as indented JSON.
if not os.path.exists(chat_file):
    print(f"❌ ERROR: The chat file was not found at the path: {chat_file}")
else:
    pipeline_inputs = dict(
        chat_file_path=chat_file,
        image_folder_path=image_folder,
        sebi_id_to_check=sebi_id_input,
        entity_name_to_track=entity_name_input,
    )
    results = run_analysis_pipeline(**pipeline_inputs)

    rule = "=" * 50
    print("\n" + rule)
    print("          FINAL FRAUD DETECTION REPORT")
    print(rule + "\n")
    print(json.dumps(results, indent=4))

In [11]:
# Import all necessary libraries at the top
import os
import re
import json
import joblib
import whois
import cv2
import easyocr
import requests
import datetime
import pandas as pd
from io import StringIO
from urllib.parse import urlparse
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import glob

# ==============================================================================
# SECTION 1: THE PIPELINE CLASS DEFINITION
# ==============================================================================

class FraudDetectionPipeline:
    """
    A self-contained class to run the multi-modal scam detection pipeline.

    This class loads all necessary models and provides a single 'run' method
    to perform the analysis on the input files. The chat file is expected to
    contain lines shaped like ``dd/mm/yy, HH:MM - user: message``; lines that
    do not match that shape are ignored by the message parser.
    """

    # Lower-case file extensions that `run` treats as images.
    IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")
    # Top-level domains that add to a URL's risk score.
    SUSPICIOUS_TLDS = (".xyz", ".top", ".biz", ".shop")

    # Compiled once because they are applied to every chat line / message.
    _CHAT_LINE_RE = re.compile(r'(\d{2}/\d{2}/\d{2,4}),\s\d{2}:\d{2}\s-\s(.*?):\s(.*)')
    _CHAT_USER_RE = re.compile(r'.*?-\s(.*?):')
    _STOCK_SYMBOL_RE = re.compile(r'\b[A-Z]{3,}\b')
    _URL_RE = re.compile(r'(https?://\S+)')
    _IPV4_RE = re.compile(r"^\d{1,3}(\.\d{1,3}){3}$")

    def __init__(self, model_path, vectorizer_path):
        """
        Initializes the pipeline by loading the ML model, vectorizer, and OCR reader.

        Args:
            model_path: Path to the joblib-serialized classifier.
            vectorizer_path: Path to the joblib-serialized text vectorizer.

        Raises:
            IOError: If the model, vectorizer or OCR reader cannot be loaded.
        """
        print("🚀 Initializing Fraud Detection Pipeline...")
        try:
            # NOTE(security): joblib.load unpickles its input, which can execute
            # arbitrary code. Only point these paths at files you produced yourself.
            self.model = joblib.load(model_path)
            self.vectorizer = joblib.load(vectorizer_path)
            # Initialize OCR reader (it will download models on first run)
            self.ocr_reader = easyocr.Reader(['en'], gpu=False)
        except Exception as e:
            raise IOError(f"❌ Failed to load models. Ensure files exist at the specified paths. Error: {e}") from e
        print("✅ Pipeline initialized successfully.")

    # ------------------------------------------------------------------
    # URL / QR / image helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _days_since(moment):
        """Return whole days elapsed since `moment` (naive or tz-aware datetime)."""
        # Build "now" with the same tzinfo so aware and naive values can both
        # be subtracted without a TypeError.
        return (datetime.datetime.now(moment.tzinfo) - moment).days

    def _domain_age_days(self, domain):
        """
        Look up `domain` via WHOIS and return its age in days.

        Returns None when the record carries no creation date. Any lookup or
        parsing error propagates to the caller.
        """
        creation_date = whois.whois(domain).creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0] if creation_date else None
        if not creation_date:
            return None
        return self._days_since(creation_date)

    def _check_url_risk(self, url):
        """
        Score a URL with simple heuristics.

        Points are added for a non-HTTPS scheme (30), an IPv4 host (40), a
        suspicious TLD (25), a domain younger than 180 days (30) and a failed
        WHOIS lookup (10).

        Returns:
            dict with keys "url", "risk_level" ("Low"/"Medium"/"High"),
            "risk_score" (capped at 100) and "reasons" (list of str).
        """
        score, reasons = 0, []
        try:
            parsed = urlparse(url)
            # hostname (unlike netloc) excludes any port / userinfo and is
            # lower-cased, so the IP and TLD checks also work for
            # "http://1.2.3.4:8080/" or "HTTP://EXAMPLE.XYZ/".
            domain = parsed.hostname or ""
            if parsed.scheme != "https":
                score += 30
                reasons.append("URL is not secure (HTTP)")
            if self._IPV4_RE.match(domain):
                score += 40
                reasons.append("URL uses an IP address")
            if domain.endswith(self.SUSPICIOUS_TLDS):
                score += 25
                reasons.append("URL uses a suspicious TLD")
            try:
                age_days = self._domain_age_days(domain)
                if age_days is not None and age_days < 180:
                    score += 30
                    reasons.append(f"Domain is very new ({age_days} days old)")
            except Exception:
                # Best effort: an unreachable or unparsable WHOIS record only
                # nudges the score instead of failing the whole check.
                score += 10
                reasons.append("WHOIS lookup failed")
        except Exception:
            return {"url": url, "risk_level": "High", "risk_score": 100, "reasons": ["URL is malformed"]}
        risk_level = "High" if score >= 60 else "Medium" if score >= 30 else "Low"
        return {"url": url, "risk_level": risk_level, "risk_score": min(score, 100), "reasons": reasons}

    def _check_qr_code(self, image_path):
        """
        Decode a QR code from the image, if any.

        Returns:
            None when the image cannot be read, holds no decodable QR code, or
            decoding fails; otherwise a dict with "qr_data" and "analysis"
            (a URL risk report when the payload starts with "http").
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                return None
            data, _, _ = cv2.QRCodeDetector().detectAndDecode(img)
            if not data:
                return None
            if data.startswith("http"):
                analysis = self._check_url_risk(data)
            else:
                analysis = {"risk_level": "Low", "reasons": ["QR contains non-URL text"]}
            return {"qr_data": data, "analysis": analysis}
        except Exception:
            return None

    def _analyze_image_content(self, image_path):
        """
        Run QR detection and OCR on one image and classify any extracted text.

        Returns:
            dict with "ocr_text", "text_scam_prediction" ("N/A" when no text
            was extracted or OCR failed) and "qr_analysis".
        """
        results = {"ocr_text": "", "text_scam_prediction": "N/A", "qr_analysis": self._check_qr_code(image_path)}
        try:
            extracted_text = " ".join(self.ocr_reader.readtext(image_path, detail=0, paragraph=True))
            if extracted_text.strip():
                results["ocr_text"] = extracted_text
                vec = self.vectorizer.transform([extracted_text])
                results["text_scam_prediction"] = self.model.predict(vec)[0]
        except Exception as e:
            # Still best effort, but report the failure instead of hiding it.
            print(f"⚠️  OCR/text analysis failed for {image_path}: {e}")
        return results

    def _list_image_files(self, image_folder_path):
        """Return the sorted paths of files in the folder with an image extension."""
        # A bracket expression such as "*[.png|.jpg]" is a character class in
        # glob syntax, not an alternation, so filter on the extension instead.
        candidates = glob.glob(os.path.join(glob.escape(image_folder_path), "*"))
        return sorted(
            path for path in candidates
            if os.path.isfile(path)
            and os.path.splitext(path)[1].lower() in self.IMAGE_EXTENSIONS
        )

    # ------------------------------------------------------------------
    # Entity / text helpers
    # ------------------------------------------------------------------

    def _verify_sebi_id(self, sebi_id):
        """
        Placeholder SEBI ID check: no real registry lookup is performed.

        Returns a dict with "sebi_id" and a dummy "status".
        """
        if not sebi_id or not sebi_id.strip():
            return {"sebi_id": "Not Provided", "status": "N/A"}
        print(f"⚠️  Running placeholder for SEBI ID: {sebi_id}")
        looks_valid = len(sebi_id) > 10 and "INA" in sebi_id
        return {"sebi_id": sebi_id, "status": "Verified (Dummy)" if looks_valid else "Not Found (Dummy)"}

    def _analyze_stock_mentions(self, text):
        """
        Collect upper-case tokens of 3+ letters as candidate stock symbols.

        Returns None when there are none; otherwise a dict with the unique
        symbols (sorted, so reports are reproducible) and any keyword flags.
        """
        stock_symbols = self._STOCK_SYMBOL_RE.findall(text)
        if not stock_symbols:
            return None
        lowered = text.lower()
        suspicious_patterns = []
        if any(kw in lowered for kw in ("guaranteed", "10x", "insider")):
            suspicious_patterns.append("Message contains pump-and-dump keywords.")
        return {"mentioned_stocks": sorted(set(stock_symbols)), "suspicious_patterns": suspicious_patterns}

    @staticmethod
    def _parse_chat_date(date_str):
        """Parse dd/mm/yy or dd/mm/yyyy; return None when neither format fits."""
        for fmt in ('%d/%m/%y', '%d/%m/%Y'):
            try:
                return datetime.datetime.strptime(date_str, fmt)
            except ValueError:
                continue
        return None

    def _parse_chat_for_messages(self, file_path, user_filter=""):
        """
        Parse chat lines into message records.

        Args:
            file_path: UTF-8 chat export.
            user_filter: Case-insensitive substring the sender must contain;
                an empty string keeps every sender.

        Returns:
            list of dicts with "date" (datetime), "user", "stock_symbols" and
            "message". Lines with an unparsable date are skipped.
        """
        wanted = (user_filter or "").lower()
        messages = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                match = self._CHAT_LINE_RE.match(line)
                if not match:
                    continue
                date_str, user, message = match.groups()
                if wanted not in user.lower():
                    continue
                message_date = self._parse_chat_date(date_str)
                if message_date is None:
                    # e.g. a 3-digit year or an impossible day/month: skip the
                    # line rather than abort the whole parse.
                    continue
                messages.append({
                    "date": message_date,
                    "user": user,
                    "stock_symbols": self._STOCK_SYMBOL_RE.findall(message),
                    "message": message.strip()
                })
        return messages

    def _detect_bot_behavior(self, file_path):
        """
        Count messages per sender and report the most frequent one as admin.

        Returns:
            {"error": ...} when no sender is found, otherwise a dict with
            "detected_admin" and "message_counts".
        """
        users = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                match = self._CHAT_USER_RE.match(line)
                if match:
                    users.append(match.group(1).strip())
        if not users:
            return {"error": "No users found."}
        user_counts = Counter(users)
        admin_name = user_counts.most_common(1)[0][0]
        return {"detected_admin": admin_name, "message_counts": dict(user_counts)}

    # ------------------------------------------------------------------
    # Market data helpers
    # ------------------------------------------------------------------

    def _get_bse_bulk_deals(self):
        """
        Download the BSE bulk-deals page and return its deals table.

        Returns:
            The first HTML table with a 'Client Name' column (headers stripped,
            'Deal Date' parsed as dd/mm/YYYY), or an empty DataFrame when the
            request or parsing fails or no such table exists.
        """
        url = "https://www.bseindia.com/markets/equity/EQReports/bulk_deals.aspx"
        headers = {'User-Agent': 'Mozilla/5.0'}
        try:
            # A timeout keeps an unresponsive server from hanging the notebook.
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            for table in pd.read_html(StringIO(response.text)):
                # Strip before testing so headers like ' Client Name ' match;
                # str() keeps non-string column labels from raising.
                stripped = [str(col).strip() for col in table.columns]
                if 'Client Name' not in stripped:
                    continue
                table.columns = stripped
                table['Deal Date'] = pd.to_datetime(table['Deal Date'], format='%d/%m/%Y')
                return table
        except Exception as e:
            print(f"⚠️  Could not fetch BSE bulk deals: {e}")
            return pd.DataFrame()
        return pd.DataFrame()

    def _cross_correlation_engine(self, chat_file, advisor_name, trades_df):
        """
        Compare the advisor's chat promotions with the advisor's bulk deals.

        Flags a BUY made 0-7 days before a promotion of the same security and
        a SELL made 0-30 days after one.

        Args:
            chat_file: Path to the chat export.
            advisor_name: Name matched (case-insensitive, literal substring)
                against chat senders and the 'Client Name' column.
            trades_df: DataFrame with 'Client Name', 'Security Name',
                'Deal Type' and 'Deal Date' columns.

        Returns:
            list of str: red-flag messages, or a single explanatory message.
        """
        if trades_df is None or trades_df.empty or not advisor_name:
            return ["Bulk deal data not available or no advisor name provided."]
        # regex=False: the name is literal text, so characters such as
        # '(' , '.' or '+' must not be interpreted as a pattern.
        is_advisor = trades_df['Client Name'].str.contains(advisor_name, case=False, na=False, regex=False)
        advisor_trades = trades_df[is_advisor]
        if advisor_trades.empty:
            return [f"No bulk deals found for '{advisor_name}'."]

        advisor_mentions = self._parse_chat_for_messages(chat_file, user_filter=advisor_name)
        if not advisor_mentions:
            return [f"No stock mentions by '{advisor_name}' found in chat."]

        red_flags = []
        for mention in advisor_mentions:
            for symbol in mention['stock_symbols']:
                needle = symbol.lower()
                for _, trade in advisor_trades.iterrows():
                    security, deal_type = trade['Security Name'], trade['Deal Type']
                    # Missing cells come through as NaN (a float); skip them.
                    if not isinstance(security, str) or not isinstance(deal_type, str):
                        continue
                    if needle not in security.lower():
                        continue
                    deal_type = deal_type.strip().lower()
                    if deal_type == 'buy':
                        # Promotion on or after the buy date.
                        days_after_buy = (mention['date'] - trade['Deal Date']).days
                        if 0 <= days_after_buy <= 7:
                            red_flags.append(f"🚨 POTENTIAL FRONT-RUNNING: Advisor promoted '{symbol}' on {mention['date'].date()} just {days_after_buy} day(s) AFTER a bulk BUY.")
                    elif deal_type == 'sell':
                        # Sell on or after the promotion date (trade - mention,
                        # not the other way round).
                        days_after_promo = (trade['Deal Date'] - mention['date']).days
                        if 0 <= days_after_promo <= 30:
                            red_flags.append(f"🚨 POTENTIAL PUMP & DUMP: Advisor made a bulk SELL of {security} {days_after_promo} day(s) AFTER promoting '{symbol}'.")
        return red_flags if red_flags else ["No suspicious correlations found."]

    # --- THE MAIN 'RUN' METHOD ---
    def run(self, chat_file_path, image_folder_path, sebi_id_to_check, entity_name_to_track):
        """
        Orchestrates the entire fraud detection and analysis process.

        Args:
            chat_file_path: Path to the chat export.
            image_folder_path: Folder scanned for .png/.jpg/.jpeg files.
            sebi_id_to_check: SEBI registration ID (may be blank).
            entity_name_to_track: Advisor name for the cross-correlation step.

        Returns:
            A dict-like report with the sections "manual_verifications",
            "bulk_deal_analysis", "cross_correlation_analysis",
            "chat_analysis", "message_by_message_analysis", "image_analysis"
            and "summary".
        """
        print("🚀 Starting Full Analysis Pipeline...")
        final_report = defaultdict(dict)

        # 1. Perform one-time analyses
        print("📈 Analyzing market and chat-level data...")
        final_report["manual_verifications"]["sebi_id_analysis"] = self._verify_sebi_id(sebi_id_to_check)
        bulk_deals_df = self._get_bse_bulk_deals()
        final_report["bulk_deal_analysis"] = {"trades_found_today": len(bulk_deals_df)}
        final_report["cross_correlation_analysis"] = self._cross_correlation_engine(chat_file_path, entity_name_to_track, bulk_deals_df)
        final_report["chat_analysis"] = self._detect_bot_behavior(chat_file_path)

        # 2. Perform message-by-message and image analysis
        print("🔬 Analyzing individual messages and images...")
        all_messages = self._parse_chat_for_messages(chat_file_path)  # Get all messages
        scam_predictions = []
        final_report["message_by_message_analysis"] = []
        for msg_data in all_messages:
            text = msg_data["message"]
            analysis = {"text_scam_prediction": self.model.predict(self.vectorizer.transform([text]))[0]}
            scam_predictions.append(analysis["text_scam_prediction"])

            urls = self._URL_RE.findall(text)
            if urls:
                analysis["url_analysis"] = [self._check_url_risk(url) for url in urls]

            stock_analysis = self._analyze_stock_mentions(text)
            if stock_analysis:
                analysis["stock_mention_analysis"] = stock_analysis

            if len(analysis) > 1:  # Only add if there's more than just the scam prediction
                final_report["message_by_message_analysis"].append({"message": text, "analysis": analysis})

        image_files = self._list_image_files(image_folder_path)
        final_report["image_analysis"] = [
            {"file_name": os.path.basename(img_path), "analysis": self._analyze_image_content(img_path)}
            for img_path in image_files
        ]

        # 3. Compile summary
        final_report["summary"] = {
            "total_messages_analyzed": len(all_messages),
            "total_images_analyzed": len(image_files),
            "scam_types_detected": dict(Counter(p for p in scam_predictions if p != "not_scam"))
        }
        print("\n✅ Pipeline finished successfully!")
        return final_report

In [None]:
# ==============================================================================
# SECTION 2: CONFIGURE AND RUN THE PIPELINE
# ==============================================================================

# First, ensure your baseline model is trained and saved.
# You only need to run this function once to create the .pkl files.
# train_scam_classifier() # Assuming this was already run and files are saved.

# --- 🔽 SET YOUR FILE PATHS AND INPUTS HERE 🔽 ---
WORKING_DIR = "/kaggle/working/"
VECTORIZER_PATH = os.path.join(WORKING_DIR, "vectorizer.pkl")
MODEL_PATH = os.path.join(WORKING_DIR, "scam_model.pkl")

# Input data paths
chat_file = "/kaggle/input/whatsap-chats-downloaded/WhatsApp Chat with TriDevs.txt"
image_folder = "/kaggle/input/scam-qr-png/"

# Entity information
sebi_id_input = "INZ000048660"
entity_name_input = "NEO APEX VENTURE LLP"

# --- 🔼 NO MORE CHANGES NEEDED BELOW THIS LINE 🔼 ---

try:
    # 1. Create an instance of the pipeline. This loads all models.
    fraud_detector = FraudDetectionPipeline(model_path=MODEL_PATH, vectorizer_path=VECTORIZER_PATH)
except IOError as e:
    # Only a model-loading failure gets the "train first" hint. The analysis
    # itself runs in the else-branch, so an unrelated OSError raised there is
    # not misreported as missing model files.
    print(e)
    print("Please run the 'train_scam_classifier()' function first to generate model files.")
else:
    # 2. Run the full analysis with one simple command.
    if os.path.exists(chat_file):
        results = fraud_detector.run(
            chat_file_path=chat_file,
            image_folder_path=image_folder,
            sebi_id_to_check=sebi_id_input,
            entity_name_to_track=entity_name_input
        )

        # 3. Print the final report.
        print("\n" + "="*50)
        print("                 FINAL FRAUD DETECTION REPORT")
        print("="*50 + "\n")
        print(json.dumps(results, indent=4, default=str)) # Use default=str to handle datetime objects
    else:
        print(f"❌ ERROR: The chat file was not found at the path: {chat_file}")