In [3]:
import os
import json
import asyncio
import logging
from datetime import datetime
from pathlib import Path
from dotenv import load_dotenv
from telethon import TelegramClient
from telethon.tl.functions.messages import GetHistoryRequest
from telethon.tl.types import MessageMediaPhoto, MessageMediaDocument
import requests
from bs4 import BeautifulSoup

# Load environment variables, then read the two Telegram credentials from them
load_dotenv()

API_ID = os.getenv("TELEGRAM_API_ID")
API_HASH = os.getenv("TELEGRAM_API_HASH")

# Only report whether each credential is present; the values are never printed
print("✅ Environment loaded successfully!")
print("API_ID: " + ("✓" if API_ID else "✗"))
print("API_HASH: " + ("✓" if API_HASH else "✗"))

✅ Environment loaded successfully!
API_ID: ✓
API_HASH: ✓


In [4]:
# Setup comprehensive logging
def setup_logging():
    """Configure and return the ``telegram_scraper`` logger.

    Creates ``../data/logs`` if needed and attaches two handlers:

    * a file handler (DEBUG and above) writing to a per-day
      ``telegram_scraper_YYYYMMDD.log`` file, UTF-8 encoded;
    * a console handler (INFO and above).

    The function is safe to call repeatedly (e.g. when a notebook cell is
    re-run): handlers left over from a previous call are removed and closed
    first, so each record is emitted once per destination instead of once per
    call.

    Returns:
        logging.Logger: the configured ``telegram_scraper`` logger.
    """
    log_dir = Path("../data/logs")
    log_dir.mkdir(parents=True, exist_ok=True)

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    logger = logging.getLogger('telegram_scraper')
    logger.setLevel(logging.DEBUG)

    # logging.getLogger returns the same object on every call, so without this
    # cleanup each re-run would stack another pair of handlers on it.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # File handler for all logs; explicit UTF-8 so non-ASCII text in log
    # messages does not depend on the platform's default encoding.
    file_handler = logging.FileHandler(
        log_dir / f"telegram_scraper_{datetime.now().strftime('%Y%m%d')}.log",
        encoding="utf-8",
    )
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    # Console handler for important logs
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    return logger

# Create directory structure for organized data storage
def create_directory_structure(base_date):
    """Ensure the date-partitioned raw-data directories exist.

    Args:
        base_date: Partition name appended to both raw-data roots.

    Returns:
        tuple: ``(messages_path, images_path)`` as ``Path`` objects, both
        guaranteed to exist on return.
    """
    partitions = (
        Path("../data/raw/telegram_messages") / base_date,
        Path("../data/raw/telegram_images") / base_date,
    )

    # Existing directories are left alone; missing parents are created.
    for partition in partitions:
        partition.mkdir(parents=True, exist_ok=True)

    return partitions

# Module-level logger shared by the cells that follow
logger = setup_logging()
logger.info("Telegram scraper notebook initialized")

# Output is partitioned by the date on which the notebook runs
today = f"{datetime.now():%Y-%m-%d}"
messages_dir, images_dir = create_directory_structure(today)

# Summary of where this run writes its data
print(
    "📁 Directory structure created:",
    f"   Messages: {messages_dir}",
    f"   Images: {images_dir}",
    "📝 Logs: data/logs/",
    sep="\n",
)

2025-07-13 23:11:11,076 - telegram_scraper - INFO - Telegram scraper notebook initialized


📁 Directory structure created:
   Messages: ..\data\raw\telegram_messages\2025-07-13
   Images: ..\data\raw\telegram_images\2025-07-13
📝 Logs: data/logs/


In [5]:
# Ethiopian Medical Telegram Channels Discovery
def discover_ethiopian_medical_channels():
    """Return the curated set of verified Ethiopian medical Telegram channels.

    The list is hard-coded; nothing is fetched from the network. Each call
    builds a fresh dictionary, so callers may modify the result freely.

    Returns:
        dict: channel username -> metadata dict with the keys ``name``,
        ``url``, ``category``, ``verified`` and ``priority``.
    """
    # Core verified Ethiopian medical channels
    return {
        "lobelia4cosmetics": {
            "name": "Lobelia Pharmacy and Cosmetics",
            "url": "https://t.me/lobelia4cosmetics",
            "category": "pharmacy_cosmetics",
            "verified": True,
            "priority": "high"
        },
        "tikvahpharma": {
            "name": "Tikvah Pharma",
            "url": "https://t.me/tikvahpharma",
            "category": "pharmacy",
            "verified": True,
            "priority": "high"
        },
        "CheMed123": {
            "name": "CheMed",
            "url": "https://t.me/CheMed123",
            "category": "medical_equipment",
            "verified": True,
            "priority": "high"
        }
    }

# Build the channel catalogue once; later cells read it from this name
verified_channels = discover_ethiopian_medical_channels()

print("🔍 Ethiopian Medical Telegram Channels Discovered:")
print(f"✅ Verified channels: {len(verified_channels)}")

# One summary line per channel
for username, info in verified_channels.items():
    print(f"   📋 {username}: {info['name']} ({info['category']})")

# Scrape targets are the usernames, in catalogue order
channels_to_scrape = [*verified_channels]
print(f"\n🎯 Channels selected for scraping: {channels_to_scrape}")

🔍 Ethiopian Medical Telegram Channels Discovered:
✅ Verified channels: 3
   📋 lobelia4cosmetics: Lobelia Pharmacy and Cosmetics (pharmacy_cosmetics)
   📋 tikvahpharma: Tikvah Pharma (pharmacy)
   📋 CheMed123: CheMed (medical_equipment)

🎯 Channels selected for scraping: ['lobelia4cosmetics', 'tikvahpharma', 'CheMed123']


In [6]:
# Initialize Telegram Client
client = TelegramClient("anon", API_ID, API_HASH)

# Start the client asynchronously
async def start_client():
    await client.start()
    me = await client.get_me()
    logger.info("Telegram client started successfully")
    print("✅ Client started successfully!")
    print(f"👤 Connected as: {me.first_name}")
    return client

# Run the async function
await start_client()

print(f"🎯 Ready to scrape {len(channels_to_scrape)} verified channels")

2025-07-13 23:16:44,009 - telegram_scraper - INFO - Telegram client started successfully


Signed in successfully as Emnet; remember to not break the ToS or you will risk an account ban!
✅ Client started successfully!
👤 Connected as: Emnet
🎯 Ready to scrape 3 verified channels


In [7]:
async def download_media(message, channel_name, images_path):
    """Download the photo or document attached to a message, if there is one.

    Args:
        message: Message object; its ``media``, ``id`` and ``date`` attributes
            are read.
        channel_name: Name of the per-channel sub-directory and prefix of the
            generated file name.
        images_path: ``Path`` under which the ``channel_name`` directory is
            created.

    Returns:
        list[dict]: Empty when the message has no photo/document media or the
        download failed; otherwise a single dict with the keys ``type``
        (``'photo'`` or ``'document'``), ``filename``, ``path`` and ``size``.

    Failures are logged and never raised, so one bad attachment does not
    abort a whole scraping run.
    """
    media_info = []

    try:
        media = message.media
        # Guard clause: nothing attached, or a media kind we do not collect.
        if not media or not isinstance(media, (MessageMediaPhoto, MessageMediaDocument)):
            return media_info

        # Create channel-specific directory. parents=True so the function
        # does not depend on the caller having created ``images_path`` already;
        # without it a missing parent made every download fail silently.
        channel_images_path = images_path / channel_name
        channel_images_path.mkdir(parents=True, exist_ok=True)

        # Generate filename (no extension is supplied here)
        timestamp = message.date.strftime("%Y%m%d_%H%M%S")
        filename = f"{channel_name}_{message.id}_{timestamp}"

        # Download media; a failure here is a warning, not an error
        try:
            path = await client.download_media(
                media,
                file=str(channel_images_path / filename)
            )
            if path:
                # Record the path returned by the client, not the requested
                # stem, since that is the file that actually exists on disk.
                media_info.append({
                    'type': 'photo' if isinstance(media, MessageMediaPhoto) else 'document',
                    'filename': os.path.basename(path),
                    'path': str(path),
                    'size': os.path.getsize(path) if os.path.exists(path) else 0
                })
                logger.debug(f"Downloaded media: {path}")
        except Exception as e:
            logger.warning(f"Failed to download media for message {message.id}: {e}")

    except Exception as e:
        logger.error(f"Failed to process media for message {message.id}: {e}")

    return media_info

def _serialize_reactions(reactions):
    """Reduce a message's reactions object to a JSON-safe list of counts.

    Returns ``None`` when there are no reactions, otherwise a list of
    ``{'reaction': <label>, 'count': <int>}`` dicts. Attributes are read
    defensively with ``getattr`` so an unexpected shape degrades to a string
    label instead of raising.
    """
    if reactions is None:
        return None

    summary = []
    for item in getattr(reactions, 'results', None) or []:
        reaction = getattr(item, 'reaction', None)
        label = getattr(reaction, 'emoticon', None)
        if label is None:
            # presumably a custom-emoji reaction identified by a document id
            # — TODO confirm against the Telethon type definitions
            document_id = getattr(reaction, 'document_id', None)
            label = f"custom:{document_id}" if document_id is not None else str(reaction)
        summary.append({'reaction': label, 'count': getattr(item, 'count', 0)})
    return summary


async def scrape_channel_messages(channel_username, limit=1000):
    """Scrape a channel's messages (and their media) and save them as JSON.

    Uses the notebook-level ``client``, ``logger``, ``images_dir``,
    ``messages_dir`` and ``today``. Output goes to
    ``messages_dir / f"{channel_username}_{today}.json"``.

    Args:
        channel_username: Username of the channel to scrape.
        limit: Maximum number of messages requested from the client.

    Returns:
        tuple: ``(messages, media_count)`` where ``messages`` is the list of
        per-message dicts that was written to disk and ``media_count`` the
        number of media files downloaded. On any failure the error is logged
        and printed and ``([], 0)`` is returned; nothing is raised.
    """
    logger.info(f"Starting to scrape channel: {channel_username}")

    try:
        # Get the channel entity
        channel = await client.get_entity(channel_username)
        channel_info = {
            'username': channel_username,
            'title': channel.title,
            'id': channel.id,
            # NOTE(review): the recorded run logged None for both of these;
            # confirm whether the entity returned here carries them at all.
            'participants_count': getattr(channel, 'participants_count', None),
            'description': getattr(channel, 'about', None)
        }

        print(f"🔄 Scraping channel: {channel.title}")
        logger.info(f"Channel info: {channel_info}")

        messages = []
        media_count = 0

        async for message in client.iter_messages(channel, limit=limit):
            # Collect media if present (download_media returns a list)
            media_info = await download_media(message, channel_username, images_dir)
            media_count += len(media_info)

            messages.append({
                'id': message.id,
                'date': message.date.isoformat(),
                'text': message.text,
                'views': message.views,
                'forwards': message.forwards,
                'replies': message.replies.replies if message.replies else 0,
                # The raw reactions object is not JSON-serializable; storing it
                # as-is made json.dump fail for any channel with reactions.
                'reactions': _serialize_reactions(getattr(message, 'reactions', None)),
                'media': media_info,
                'has_media': bool(message.media),
                'channel': channel_username,
                'channel_info': channel_info,
                'scraped_at': datetime.now().isoformat()
            })

        # Save messages to partitioned structure
        filename = messages_dir / f"{channel_username}_{today}.json"
        payload = {
            'channel_info': channel_info,
            'scrape_metadata': {
                'scraped_at': datetime.now().isoformat(),
                'total_messages': len(messages),
                'media_files': media_count,
                'scraper_version': '2.0'
            },
            'messages': messages
        }
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)

        logger.info(f"Successfully scraped {len(messages)} messages from {channel_username}")
        print(f"✅ Scraped {len(messages)} messages, {media_count} media files")
        print(f"💾 Saved to: {filename}")

        return messages, media_count

    except Exception as e:
        # exc_info keeps the traceback in the log; the console line stays short
        logger.error(f"Error scraping {channel_username}: {str(e)}", exc_info=True)
        print(f"❌ Error scraping {channel_username}: {str(e)}")
        return [], 0

# Test with one channel first
# Smoke test: scrape a small sample (limit=50) from a single channel before
# running the full set. scrape_channel_messages catches its own exceptions and
# returns ([], 0) on failure, so a failed run shows up here as "0 messages"
# rather than as a traceback.
print("🧪 Testing with lobelia4cosmetics channel...")
test_messages, test_media = await scrape_channel_messages("lobelia4cosmetics", limit=50)
print(f"✅ Test completed: {len(test_messages)} messages, {test_media} media files")

2025-07-13 23:18:52,978 - telegram_scraper - INFO - Starting to scrape channel: lobelia4cosmetics


🧪 Testing with lobelia4cosmetics channel...


2025-07-13 23:18:53,193 - telegram_scraper - INFO - Channel info: {'username': 'lobelia4cosmetics', 'title': 'Lobelia pharmacy and cosmetics', 'id': 1666492664, 'participants_count': None, 'description': None}


🔄 Scraping channel: Lobelia pharmacy and cosmetics


2025-07-13 23:19:23,670 - telegram_scraper - INFO - Successfully scraped 50 messages from lobelia4cosmetics


✅ Scraped 50 messages, 50 media files
💾 Saved to: ..\data\raw\telegram_messages\2025-07-13\lobelia4cosmetics_2025-07-13.json
✅ Test completed: 50 messages, 50 media files
