# üá±üá∞ CeylonPulse: Complete Data Collection System

**Real-Time Situational Awareness System for Sri Lanka**

This notebook contains **ALL** functionality from the Python modules - everything runs in Colab!

## Features:
- ✅ RSS Feed Scraping
- ✅ Web Scraping
- ✅ Google Trends API
- ✅ Twitter API (optional)
- ✅ Signal Detection (40 PESTLE signals)
- ✅ Mistral 7B LLM Extraction
- ✅ Data Storage (JSON)
- ✅ TensorFlow Ready

**No need for local Python files - everything is here!**


## 📦 Step 1: Install All Dependencies


In [None]:
# Install all required packages
# NOTE: %pip is an IPython/Jupyter line magic, so this cell only runs inside a
# notebook kernel (e.g. Colab), not as a plain Python script.
# -q asks pip for quiet output.
# Scraping / parsing stack (imported by the configuration cell).
%pip install -q requests beautifulsoup4 feedparser lxml
# Google Trends client (pytrends) and date parsing helpers.
%pip install -q pytrends python-dateutil
# Tabular / numeric analysis.
%pip install -q pandas numpy
# Imported by the TensorFlow preparation cell near the end of the notebook.
%pip install -q tensorflow

print("‚úÖ All packages installed successfully!")


## 🔧 Step 2: Configuration & Setup


In [None]:
import sys
import os
import json
import re
from datetime import datetime
from typing import List, Dict
from collections import Counter
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import feedparser

# Hugging Face Token (for Mistral 7B)
# SECURITY: never hard-code the token in the notebook - anything committed or
# shared with the notebook leaks it. The token that used to be written here must
# be treated as compromised and revoked in the Hugging Face account settings.
# Provide the secret through the environment instead, e.g. in Colab:
#   import os; os.environ['HUGGINGFACE_API_TOKEN'] = '<your token>'
# HF_TOKEN is accepted as a fallback name. An empty string means "no token",
# in which case requests are sent without an Authorization header.
HUGGINGFACE_API_TOKEN = (
    os.environ.get('HUGGINGFACE_API_TOKEN')
    or os.environ.get('HF_TOKEN')
    or ''
)

# Configuration
USE_LLM = True  # Set to True to use Mistral 7B
USE_GOOGLE_TRENDS = True
USE_TWITTER = False  # Set to True if you have Twitter token

print("‚úÖ Libraries imported!")
# Only claim the token is configured when one was actually found.
if HUGGINGFACE_API_TOKEN:
    print(f"‚úÖ Hugging Face token configured")
else:
    print("Hugging Face token not set - export HUGGINGFACE_API_TOKEN to authenticate LLM requests")
print(f"‚úÖ LLM Extraction: {'Enabled' if USE_LLM else 'Disabled'}")


## 📋 Step 3: Load 40 PESTLE Signals & Data Sources


In [None]:
# All 40 PESTLE Signals (from SSD)
# Canonical signal names. The same strings are used as keys of SIGNAL_KEYWORDS
# and are listed in the LLM prompt, so spelling and punctuation must stay in sync.
# Listed in PESTLE order, following the grouping used by SIGNAL_KEYWORDS:
# 8 political, 8 economic, 6 social, 5 technological, 4 legal, 9 environmental.
SIGNALS = [
    "Government Policy Announcements", "Cabinet/Parliament Decisions",
    "Government Sector Strike Warnings", "Police/Security Alerts",
    "Election-related Discussions", "Foreign Policy / International Agreements",
    "Tax Revision Rumors", "Public Protests & Demonstrations",
    "Inflation Mentions", "Fuel Shortage Mentions", "Dollar Rate Discussions",
    "Tourism Search Trend (Google Trends)", "Food Price Spikes",
    "Stock Market Volatility", "Foreign Investment News",
    "Currency Black Market Mentions", "Crime & Safety Alerts",
    "Public Sentiment (Social Media)", "Migration / Visa Interest",
    "Public Health Discussions", "Viral Social Trends",
    "Cultural Event Mentions", "Power Outages (CEB)",
    "Telecom Outages", "Cyberattack Mentions",
    "E-commerce Growth Indicators", "Digital Payments Failure Reports",
    "New Regulations Affecting Businesses", "Court Rulings Impacting Industries",
    "Import/Export Restriction Changes", "Customs/Port Delays",
    "Rainfall Alerts", "Flood Warnings", "Heat Wave Alerts",
    "Landslide Warnings", "Cyclone Updates", "Air Quality Index Changes",
    "Drought Warnings", "Water Supply Cuts (NWSDB)",
    "Coastal Erosion / Tsunami Alerts"
]

# Data Source URLs
# Mapping of source id -> {endpoint label -> URL}.
# Only the two 'rss_feed' entries are read by the cells in this notebook section;
# the remaining URLs are presumably targets for web scraping elsewhere - verify
# before relying on them.
DATA_SOURCES = {
    'ada_derana': {
        'rss_feed': 'https://www.adaderana.lk/rss.php',
        'news_page': 'https://www.adaderana.lk/news.php',
        'breaking_news': 'https://www.adaderana.lk/breaking-news',
        'business': 'https://www.adaderana.lk/business-news'
    },
    'economynext': {
        'rss_feed': 'https://economynext.com/rss',
        'main_site': 'https://economynext.com/',
        'sri_lanka_news': 'https://economynext.com/c/sri-lanka',
        'business': 'https://economynext.com/c/business'
    },
    # NOTE(review): these two URLs use plain http, unlike every other source.
    'met_department': {
        'warnings': 'http://www.meteo.gov.lk/index.php?option=com_content&view=article&id=94&Itemid=310&lang=en',
        'weather_forecast': 'http://www.meteo.gov.lk/index.php?option=com_content&view=article&id=96&Itemid=512&lang=en'
    },
    'central_bank': {
        'main_site': 'https://www.cbsl.gov.lk/',
        'news': 'https://www.cbsl.gov.lk/news',
        'statistics': 'https://www.cbsl.gov.lk/statistics'
    },
    'ceb': {
        'outage_notices': 'https://ceb.lk/outage-notices',
        'load_shedding': 'https://ceb.lk/load-shedding-schedule'
    },
    'nwsdb': {
        'announcements': 'https://www.waterboard.lk/announcements.html',
        'water_interruptions': 'https://www.waterboard.lk/water_interruptions.html'
    }
}

# Sanity output: counts of configured signals and top-level sources.
print(f"‚úÖ Loaded {len(SIGNALS)} PESTLE signals")
print(f"‚úÖ Configured {len(DATA_SOURCES)} data sources")


In [None]:
def scrape_rss_feed(url, source_name="Unknown"):
    """Fetch an RSS/Atom feed and return its entries as plain dicts.

    Args:
        url: Feed URL passed to ``feedparser.parse``.
        source_name: Fallback for the ``source`` field when the feed itself
            carries no title.

    Returns:
        A list with one dict per feed entry, holding the keys ``title``,
        ``link``, ``description``, ``published``, ``published_parsed``,
        ``source``, ``source_url``, ``author``, ``tags`` and ``scraped_at``.
        An empty list is returned when the feed has no entries, could not be
        fetched/parsed, or any unexpected error occurs (the problem is printed,
        never raised, so one bad feed does not stop the notebook).
    """
    try:
        feed = feedparser.parse(url)

        # feedparser does not raise on network or XML errors; it sets ``bozo``
        # and stores the cause in ``bozo_exception``. Surface that instead of
        # letting a dead feed look like a feed that simply had no news.
        # A bozo feed that still produced entries is kept (partial parses are
        # common with slightly malformed feeds).
        if feed.get('bozo') and not feed.entries:
            print(f"Warning: RSS feed {url} could not be read: {feed.get('bozo_exception')}")
            return []

        # The feed title is the same for every entry; look it up once.
        feed_title = feed.feed.get('title', source_name)

        articles = []
        for entry in feed.entries:
            articles.append({
                'title': entry.get('title', ''),
                'link': entry.get('link', ''),
                'description': entry.get('description', ''),
                'published': entry.get('published', ''),
                'published_parsed': entry.get('published_parsed'),
                'source': feed_title,
                'source_url': url,
                'author': entry.get('author', ''),
                'tags': [tag.get('term', '') for tag in entry.get('tags', [])],
                # Naive UTC timestamp in ISO-8601 form (format kept for
                # compatibility with previously collected data).
                'scraped_at': datetime.utcnow().isoformat()
            })

        return articles
    except Exception as e:
        # Best-effort scraping: report and continue with the other sources.
        print(f"‚ùå Error scraping RSS feed {url}: {str(e)}")
        return []

# Pull the RSS feed of each news source, reporting the per-source count as we go
print("Scraping RSS feeds...")

# Ada Derana
ada_feed_url = DATA_SOURCES['ada_derana']['rss_feed']
ada_articles = scrape_rss_feed(ada_feed_url, 'Ada Derana')
print(f"‚úÖ Scraped {len(ada_articles)} articles from Ada Derana")

# EconomyNext
econ_feed_url = DATA_SOURCES['economynext']['rss_feed']
econ_articles = scrape_rss_feed(econ_feed_url, 'EconomyNext')
print(f"‚úÖ Scraped {len(econ_articles)} articles from EconomyNext")

# Combined article list used by every later cell (Ada Derana first, then EconomyNext)
all_articles = [*ada_articles, *econ_articles]

print(f"\nüìä Total articles scraped: {len(all_articles)}")


In [None]:
if not USE_GOOGLE_TRENDS:
    trends = []
    print("‚ö†Ô∏è Google Trends disabled")
else:
    try:
        from pytrends.request import TrendReq

        def get_google_trends(geo='LK'):
            """Return up to 20 trending searches as dicts, or [] on any error.

            Each dict carries ``rank`` (1-based), ``keyword``, ``geo``,
            ``source`` and ``scraped_at``. ``geo`` is lower-cased and passed to
            pytrends as the ``pn`` argument.
            """
            try:
                client = TrendReq(hl='en-US', tz=360)
                trending = client.trending_searches(pn=geo.lower())
                top_values = trending[0].head(20).values

                return [
                    {
                        'rank': position,
                        'keyword': item[0] if isinstance(item, list) else str(item),
                        'geo': geo,
                        'source': 'Google Trends',
                        'scraped_at': datetime.utcnow().isoformat()
                    }
                    for position, item in enumerate(top_values, start=1)
                ]
            except Exception as e:
                print(f"‚ö†Ô∏è Error getting Google Trends: {str(e)}")
                return []

        # Fetch the trending searches for Sri Lanka
        trends = get_google_trends('LK')
        print(f"‚úÖ Retrieved {len(trends)} trending searches from Google Trends")

        # Show a short preview table when anything came back
        if trends:
            df_trends = pd.DataFrame(trends)
            print("\nüìà Top 10 Trending Searches in Sri Lanka:")
            preview = df_trends[['rank', 'keyword']].head(10)
            print(preview.to_string(index=False))
    except Exception as e:
        # Covers a missing pytrends install as well as failures while printing
        print(f"‚ö†Ô∏è Google Trends not available: {str(e)}")
        trends = []


## 🎯 Step 6: Signal Detection (Keyword-based from SSD)


In [None]:
# Signal keywords mapping (from SSD - Signal Specification Document)
# Keys are the canonical signal names; values are lower-case phrases that are
# matched as whole words/phrases against the lower-cased article text.
SIGNAL_KEYWORDS = {
    # Political Signals
    "Government Policy Announcements": ["policy", "tax", "cabinet approves", "budget", "government policy"],
    "Cabinet/Parliament Decisions": ["cabinet decision", "parliament decision", "cabinet meeting", "parliament approves"],
    "Government Sector Strike Warnings": ["strike", "trade union", "government sector strike", "union warning"],
    "Police/Security Alerts": ["police alert", "security alert", "police warning", "security threat"],
    "Election-related Discussions": ["election", "voting", "poll", "election campaign"],
    "Foreign Policy / International Agreements": ["foreign policy", "international agreement", "bilateral agreement"],
    "Tax Revision Rumors": ["tax revision", "tax increase", "tax cut", "tax change"],
    "Public Protests & Demonstrations": ["protest", "demonstration", "rally", "march", "protesters"],
    
    # Economic Signals
    "Inflation Mentions": ["inflation", "price increase", "cost of living", "inflation rate", "cpi"],
    "Fuel Shortage Mentions": ["fuel shortage", "petrol shortage", "diesel shortage", "fuel crisis", "fuel queues"],
    "Dollar Rate Discussions": ["dollar rate", "usd rate", "exchange rate", "rupee dollar", "currency rate"],
    "Tourism Search Trend (Google Trends)": ["tourism", "tourist", "visitor", "travel sri lanka", "hotel booking"],
    "Food Price Spikes": ["food price", "rice price", "vegetable price", "price spike"],
    "Stock Market Volatility": ["stock market", "cse", "share market", "market volatility"],
    "Foreign Investment News": ["foreign investment", "fdi", "foreign direct investment"],
    "Currency Black Market Mentions": ["black market", "underground market", "illegal currency"],
    
    # Social Signals
    "Crime & Safety Alerts": ["crime", "robbery", "theft", "murder", "safety alert"],
    "Public Sentiment (Social Media)": ["public sentiment", "social media", "twitter", "facebook"],
    "Migration / Visa Interest": ["migration", "emigration", "visa", "immigration"],
    "Public Health Discussions": ["disease", "outbreak", "epidemic", "health alert"],
    "Viral Social Trends": ["viral", "trending", "social media trend"],
    "Cultural Event Mentions": ["cultural event", "festival", "celebration"],
    
    # Technological Signals
    "Power Outages (CEB)": ["power outage", "power cut", "load shedding", "ceb", "electricity cut"],
    "Telecom Outages": ["telecom outage", "internet outage", "network outage"],
    "Cyberattack Mentions": ["cyberattack", "cyber attack", "hacking", "data breach"],
    "E-commerce Growth Indicators": ["e-commerce", "online shopping", "digital commerce"],
    "Digital Payments Failure Reports": ["payment failure", "digital payment", "payment system down"],
    
    # Legal Signals
    "New Regulations Affecting Businesses": ["regulation", "new regulation", "business regulation"],
    "Court Rulings Impacting Industries": ["court ruling", "court decision", "legal ruling"],
    "Import/Export Restriction Changes": ["import restriction", "export restriction", "import ban"],
    "Customs/Port Delays": ["customs delay", "port delay", "customs clearance"],
    
    # Environmental Signals
    "Rainfall Alerts": ["rainfall", "heavy rain", "rain alert", "rainfall warning", "monsoon"],
    "Flood Warnings": ["flood", "flooding", "flood warning", "flood alert", "flash flood"],
    "Heat Wave Alerts": ["heat wave", "heatwave", "extreme heat", "high temperature"],
    "Landslide Warnings": ["landslide", "landslide warning", "mudslide"],
    "Cyclone Updates": ["cyclone", "tropical cyclone", "storm", "cyclone warning"],
    "Air Quality Index Changes": ["air quality", "aqi", "air pollution"],
    "Drought Warnings": ["drought", "drought warning", "water shortage"],
    "Water Supply Cuts (NWSDB)": ["water supply cut", "water cut", "water interruption", "nwsdb"],
    "Coastal Erosion / Tsunami Alerts": ["tsunami", "tsunami alert", "tsunami warning", "coastal erosion"]
}


def _source_supports_signal(signal_lower, source_lower):
    """Return True when the publishing source by itself implies the signal.

    Both arguments must already be lower-cased. Matching is by substring:
    CEB/electricity sources back the power-outage signal, NWSDB/water-board
    sources back the water-supply signal, and sources containing "met"
    (which also covers "meteorological") back rainfall, flood and cyclone
    signals.
    """
    if "power outage" in signal_lower or "ceb" in signal_lower:
        if "ceb" in source_lower or "electricity" in source_lower:
            return True
    if "water supply" in signal_lower or "nwsdb" in signal_lower:
        if "nwsdb" in source_lower or "water board" in source_lower:
            return True
    if any(term in signal_lower for term in ("rainfall", "flood", "cyclone")):
        # NOTE: "met" is a plain substring test, so any source name containing
        # those three letters qualifies (kept for backward compatibility).
        if "met" in source_lower or "meteorological" in source_lower:
            return True
    return False


def detect_signals(text, title="", source=""):
    """Detect signals from text using keyword matching (SSD-based).

    Args:
        text: Article body/description. ``None`` is treated as empty.
        title: Article title, searched together with ``text``. ``None`` is
            treated as empty.
        source: Name of the publishing source, used only for the
            source-specific rules. ``None`` is treated as empty.

    Returns:
        A list of dicts sorted by descending confidence (ties keep the order of
        SIGNAL_KEYWORDS). Each dict has ``signal_name``, ``confidence``
        (0.5 base + 0.15 per matched keyword + 0.2 for a source match, capped
        at 1.0 and rounded to 2 decimals), ``matched_keywords`` (at most 5) and
        ``source_specific_match``. Returns ``[]`` when both text and title are
        empty.
    """
    # Normalise missing values up front: scraped records may carry None, which
    # would otherwise crash on .lower() or leak the word "none" into the text.
    text = text or ""
    title = title or ""
    if not text and not title:
        return []

    full_text = f"{title} {text}".lower()
    # Loop-invariant, so compute once rather than once per signal.
    source_lower = (source or "").lower()
    detected = []

    for signal_name, keywords in SIGNAL_KEYWORDS.items():
        # \b anchors make this a whole-word/phrase match ("tax" does not hit "taxi").
        matches = [
            keyword
            for keyword in keywords
            if re.search(r'\b' + re.escape(keyword.lower()) + r'\b', full_text)
        ]

        # Source-specific detection
        source_match = _source_supports_signal(signal_name.lower(), source_lower)

        if matches or source_match:
            confidence = min(0.5 + (len(matches) * 0.15) + (0.2 if source_match else 0), 1.0)
            detected.append({
                'signal_name': signal_name,
                'confidence': round(confidence, 2),
                'matched_keywords': matches[:5],
                'source_specific_match': source_match
            })

    # Sort by confidence (stable sort keeps dictionary order among ties)
    detected.sort(key=lambda x: x['confidence'], reverse=True)
    return detected

# Run keyword detection over every scraped article and annotate it in place
print("Detecting signals in articles...")
for article in all_articles:
    signals = detect_signals(
        text=article.get('description', ''),
        title=article.get('title', ''),
        source=article.get('source', ''),
    )
    article.update(detected_signals=signals, signal_count=len(signals))

# Articles for which at least one signal was found
flagged_articles = [a for a in all_articles if a.get('detected_signals')]
articles_with_signals = len(flagged_articles)
print(f"‚úÖ Signal detection completed!")
print(f"üìä Articles with signals: {articles_with_signals} / {len(all_articles)}")

# Show sample: the first flagged article and its three strongest signals
if flagged_articles:
    sample = flagged_articles[0]
    top_signal_names = [s['signal_name'] for s in sample['detected_signals'][:3]]
    print(f"\nüìù Sample detection:")
    print(f"   Title: {sample['title'][:60]}...")
    print(f"   Signals: {top_signal_names}")


In [None]:
if USE_LLM:
    MISTRAL_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
    API_URL = f"https://api-inference.huggingface.co/models/{MISTRAL_MODEL}"
    
    def extract_signals_mistral(text, title=""):
        """Extract signals using Mistral 7B Instruct.

        Sends the title and the first 1000 characters of ``text`` to the
        Hugging Face inference endpoint and parses the JSON object found in
        the reply.

        Args:
            text: Article body/description; ``None`` is treated as empty.
            title: Article title.

        Returns:
            A list of signal dicts as produced by the model, restricted to
            entries that are dicts whose ``signal_name`` is one of SIGNALS.
            Returns ``[]`` on HTTP errors, while the model is loading (503),
            when no JSON can be found/parsed, or on any request error (the
            problem is printed, never raised).
        """
        # The prompt must list every signal: the model is told that names must
        # match the list exactly, so omitted signals could never be returned.
        prompt = f"""Analyze this news article and extract relevant signals from the 40 PESTLE signals.

Title: {title}
Content: {(text or '')[:1000]}

Available signals: {', '.join(SIGNALS)}

For each relevant signal found, return JSON:
{{"signals": [
  {{
    "signal_name": "Signal Name (must match exactly from list)",
    "confidence": 0.8,
    "pestle_category": "Political/Economic/Social/Technological/Legal/Environmental",
    "swot_category": "Threat/Opportunity/Weakness/Strength",
    "severity_estimate": 0.7,
    "key_phrases": ["phrase1", "phrase2"]
  }}
]}}

Only return valid JSON, nothing else."""
        
        # Format for Mistral Instruct
        formatted_prompt = f"<s>[INST] {prompt} [/INST]"
        
        # Send the Authorization header only when a token is configured.
        headers = {}
        if HUGGINGFACE_API_TOKEN:
            headers["Authorization"] = f"Bearer {HUGGINGFACE_API_TOKEN}"
        
        payload = {
            "inputs": formatted_prompt,
            "parameters": {
                "max_new_tokens": 1000,
                "temperature": 0.3,
                "return_full_text": False
            }
        }
        
        try:
            response = requests.post(API_URL, headers=headers, json=payload, timeout=90)
            
            if response.status_code == 200:
                result = response.json()
                if isinstance(result, list) and len(result) > 0:
                    content = result[0].get('generated_text', '')
                else:
                    content = str(result)
                
                # Extract JSON from response (greedy: first '{' to last '}')
                json_match = re.search(r'\{.*\}', content, re.DOTALL)
                if not json_match:
                    return []
                parsed = json.loads(json_match.group())

                # Model output is untrusted: keep only well-formed entries that
                # name a known signal, so callers can rely on dicts with a
                # valid 'signal_name'.
                candidates = parsed.get('signals', []) if isinstance(parsed, dict) else []
                if not isinstance(candidates, list):
                    return []
                return [
                    candidate
                    for candidate in candidates
                    if isinstance(candidate, dict)
                    and isinstance(candidate.get('signal_name'), str)
                    and candidate['signal_name'] in SIGNALS
                ]
            elif response.status_code == 503:
                print("‚ö†Ô∏è Model is loading, please wait 30-60 seconds and try again")
                return []
            else:
                print(f"‚ö†Ô∏è API error: {response.status_code}")
                return []
        except Exception as e:
            # Best-effort: network errors and malformed JSON end up here.
            print(f"‚ö†Ô∏è LLM extraction error: {str(e)}")
            return []
    
    # Extract signals using LLM (test on first 5 articles)
    print("Extracting signals using Mistral 7B...")
    print("(First request may take 30-60 seconds - model loading)")
    
    llm_extracted_count = 0
    for i, article in enumerate(all_articles[:5]):  # Test on first 5
        text = article.get('description', '')
        title = article.get('title', '')
        
        if not (text or title):
            continue

        llm_signals = extract_signals_mistral(text, title)
        if not llm_signals:
            continue

        # Merge with keyword-detected signals
        existing_signals = article.get('detected_signals', [])
        existing_names = {s['signal_name'] for s in existing_signals}
        
        added_count = 0
        for llm_sig in llm_signals:
            signal_name = llm_sig.get('signal_name')
            if signal_name in existing_names:
                continue
            existing_signals.append({
                'signal_name': signal_name,
                'confidence': llm_sig.get('confidence', 0.0),
                'detection_method': 'llm',
                'pestle_category': llm_sig.get('pestle_category', ''),
                'swot_category': llm_sig.get('swot_category', ''),
                'severity_estimate': llm_sig.get('severity_estimate', 0.0)
            })
            # Track the name so a signal the model repeats is added only once.
            existing_names.add(signal_name)
            added_count += 1
        
        article['detected_signals'] = existing_signals
        article['signal_count'] = len(existing_signals)
        llm_extracted_count += 1
        # Report what was actually added, not everything the model returned.
        print(f"  ‚úÖ Article {i+1}: Extracted {added_count} additional signals")
    
    print(f"\n‚úÖ LLM extraction completed on {llm_extracted_count} articles")
else:
    print("‚ö†Ô∏è LLM extraction disabled")


In [None]:
# Combine all data
# Articles first, then Google Trends rows; the two record types have different keys.
all_data = all_articles + trends

# Save to JSON
# The timestamp (local time) makes each run write a new file instead of overwriting.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_file = f'/content/collected_data_{timestamp}.json'

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(all_data, f, indent=2, ensure_ascii=False)

print(f"‚úÖ Saved {len(all_data)} items to {output_file}")

# Also save to Drive if mounted
# The Drive copy is optional, so failures are reported but never abort the cell.
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
    
    drive_file = f'/content/drive/MyDrive/CeylonPulse/data/collected_data_{timestamp}.json'
    os.makedirs(os.path.dirname(drive_file), exist_ok=True)
    with open(drive_file, 'w', encoding='utf-8') as f:
        json.dump(all_data, f, indent=2, ensure_ascii=False)
    print(f"‚úÖ Also saved to Drive: {drive_file}")
except ImportError:
    # Not running inside Colab: google.colab is unavailable.
    print("‚ö†Ô∏è Drive not mounted (optional)")
except Exception as e:
    # A bare `except:` would also swallow KeyboardInterrupt/SystemExit and hide
    # the real cause (mount refused, write error, ...); catch Exception only
    # and say what went wrong.
    print("‚ö†Ô∏è Drive not mounted (optional)")
    print(f"   Reason: {e}")

# Create DataFrame for analysis
df = pd.DataFrame(all_data)
print(f"\nüìä Data Summary:")
print(f"Total items: {len(df)}")
if 'source' in df.columns:
    print(f"\nSources:")
    print(df['source'].value_counts())

# Signal statistics
if 'detected_signals' in df.columns:
    # Flatten the per-item signal lists; items without signals (e.g. trend
    # rows) contribute nothing.
    all_signals = [
        signal
        for item in all_data
        for signal in (item.get('detected_signals') or [])
    ]
    
    if all_signals:
        signal_counts = Counter(s['signal_name'] for s in all_signals)
        print(f"\nüìà Top 10 Detected Signals:")
        for signal, count in signal_counts.most_common(10):
            print(f"   {signal}: {count}")


## 🧠 Step 9: Prepare for TensorFlow (NLP Preprocessing)


In [None]:
# Import TensorFlow
# Imported here rather than in the setup cell; `keras` is not referenced in the
# cells visible in this section.
import tensorflow as tf
from tensorflow import keras

# Report the installed version and whether TensorFlow can see at least one GPU.
print(f"‚úÖ TensorFlow {tf.__version__} imported")
print(f"GPU Available: {len(tf.config.list_physical_devices('GPU')) > 0}")

# Text preprocessing for TensorFlow
def preprocess_text(text):
    """Normalise raw text: drop URLs and punctuation, then lower-case.

    Args:
        text: The raw string. Anything that is not a non-empty ``str``
            (``None``, or the NaN that pandas uses for rows lacking the column,
            such as Google Trends rows without a description) yields ``""``.

    Returns:
        The cleaned string. Only ASCII letters, digits and whitespace survive,
        so non-Latin scripts are removed entirely; inner whitespace is not
        collapsed.
    """
    # Guard against missing values: re.sub raises TypeError on non-strings, and
    # NaN is truthy so a plain `if not text` check does not catch it.
    if not isinstance(text, str) or not text:
        return ""
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove special characters (keep alphanumeric and spaces)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Lowercase and strip
    return text.lower().strip()

# Clean the first available text column ('description' preferred, else 'text')
for text_column in ('description', 'text'):
    if text_column in df.columns:
        df['processed_text'] = df[text_column].apply(preprocess_text)
        break

print("‚úÖ Text preprocessing completed - ready for TensorFlow models!")
print(f"\nSample processed text:")
# Preview at most 200 characters of the first processed row, when there is one
has_processed_rows = 'processed_text' in df.columns and len(df) > 0
if has_processed_rows:
    sample = df['processed_text'].iloc[0]
    print(f"   {sample[:200]}...")


## 📊 Step 10: Summary & Statistics


In [None]:
# Final report: what was collected, what was detected, and where it was stored
banner = "=" * 60
print(banner)
print("CeylonPulse Data Collection Summary")
print(banner)
print(f"‚úÖ Total items collected: {len(all_data)}")
print(f"   - Articles from RSS: {len(all_articles)}")
print(f"   - Trends from Google: {len(trends)}")
print(f"\n‚úÖ Signal Detection:")
print(f"   - Articles with signals: {articles_with_signals}")
# Count every detection across all articles (keyword-based plus LLM-added)
total_detections = sum(len(a.get('detected_signals', [])) for a in all_articles)
print(f"   - Total signal detections: {total_detections}")
print(f"\n‚úÖ Data Storage:")
print(f"   - Saved to: {output_file}")
file_size_kb = os.path.getsize(output_file) / 1024
print(f"   - File size: {file_size_kb:.1f} KB")
print(f"\n‚úÖ Next Steps:")
print("   - Review collected data")
print("   - Proceed to Step 3: NLP Preprocessing (SBERT embeddings)")
print("   - Proceed to Step 4: Deep Learning Models (BERT, LSTM)")
print(banner)

# Display sample data: the first collected item and up to three of its signals
if all_data:
    print(f"\nüìù Sample Article:")
    sample = all_data[0]
    sample_title = sample.get('title', 'N/A')
    print(f"   Title: {sample_title[:70]}...")
    print(f"   Source: {sample.get('source', 'N/A')}")
    sample_signals = sample.get('detected_signals')
    if sample_signals:
        print(f"   Signals: {[s['signal_name'] for s in sample_signals[:3]]}")
