In [1]:
# Telegram Channel Scraping Notebook
import sys
import os
import asyncio
import nest_asyncio
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path

# Import our scraper module (from scripts directory).
# `from scripts.telegram_scraper import ...` looks for a `scripts` package
# inside each sys.path entry, so the directory that CONTAINS scripts/ (the
# project root) has to be on the path; adding only <root>/scripts is not enough.
# The original <root>/scripts entry is kept so flat imports keep working.
# NOTE(review): assumes the notebook's working directory sits one level below
# the project root (same assumption the original path made) -- confirm.
_project_root = Path.cwd().parent
for _entry in (str(_project_root), str(_project_root / "scripts")):
    if _entry not in sys.path:  # keep re-runs of this cell from piling up duplicates
        sys.path.append(_entry)
from scripts.telegram_scraper import TelegramScraper

# Pull settings from a local .env file (if one is used) into the environment.
load_dotenv()

# Jupyter needs nest_asyncio applied before the async scraper calls below.
nest_asyncio.apply()

# Paths used by the scraper, both resolved against the parent of the current
# working directory and stored as plain strings:
#   CONFIG_PATH -> <parent>/config/telegram_credentials.ini
#   OUTPUT_DIR  -> <parent>/data/raw/telegram_data
CONFIG_PATH = str(Path.cwd().parent.joinpath("config", "telegram_credentials.ini"))
OUTPUT_DIR = str(Path.cwd().parent.joinpath("data", "raw", "telegram_data"))

# Telegram channel handles to scrape, one per line, in scraping order.
# The text block is split on whitespace into a list of '@handle' strings.
CHANNELS = """
    @ZemenExpress
    @nevacomputer
    @meneshayeofficial
    @ethio_brand_collection
    @Leyueqa
    @sinayelj
    @Shewabrand
    @helloomarketethiopia
    @modernshoppingcenter
    @Fashiontera
    @kuruwear
    @gebeyaadama
    @MerttEka
    @forfreemarket
    @classybrands
    @marakibrand
    @aradabrand2
    @marakisat2
    @belaclassic
    @AwasMart
    @qnashcom
""".split()



ModuleNotFoundError: No module named 'pandas'

In [None]:
# Diagnostic cell: print the path of the Python interpreter running this kernel.
# NOTE(review): presumably added to investigate the
# "ModuleNotFoundError: No module named 'pandas'" raised by the first cell
# (i.e. to see which environment the kernel uses) -- confirm, and drop this
# cell once the environment is fixed.
import sys
print(sys.executable)

In [None]:
# Initialize scraper with custom paths
scraper = TelegramScraper(config_path=CONFIG_PATH)
scraper.data_dir = OUTPUT_DIR  # Set custom output directory

# Start client
await scraper.start_client()
print("Client connected successfully")

# Scrape all channels
results = {}
for channel in CHANNELS:
    try:
        print(f"\n{'='*50}\nScraping: {channel}\n{'='*50}")
        results[channel] = await scraper.scrape_channel(channel, limit=100)
    except Exception as e:
        print(f"❌ Failed to scrape {channel}: {str(e)}")

# Close connection
await scraper.close()

In [None]:
# Analysis Section
# Summarises the scrape, merges all messages into one DataFrame, writes it to
# CSV and saves two charts (messages per channel, media type distribution).
# Reads `results` and OUTPUT_DIR from the earlier cells.
import matplotlib.pyplot as plt

print("\n\nScraping Summary:")
for channel, data in results.items():
    if data:
        print(f"- {channel}: {len(data)} messages")

# Combine all data for analysis
all_messages = []
for channel, messages in results.items():
    if messages:
        for msg in messages:
            # Tag each record with its source channel (updates the record in place).
            msg['channel'] = channel
            all_messages.append(msg)

df = pd.DataFrame(all_messages)

if df.empty:
    # With no records the frame has no columns at all, so every df['...']
    # lookup below would raise KeyError -- stop here with a clear message.
    print("\nNo messages were scraped; skipping statistics, export and plots.")
else:
    # Basic Analysis
    print("\nMessage Statistics:")
    print(f"Total messages: {len(df)}")
    print(f"Unique channels: {df['channel'].nunique()}")
    # NOTE(review): the record schema is not visible here; 'date' is guarded the
    # same way 'media_type' is below in case the scraper does not provide it.
    if 'date' in df.columns:
        print(f"Date range: {df['date'].min()} to {df['date'].max()}")

    # Save combined dataset
    output_dir = Path(OUTPUT_DIR)
    # to_csv/savefig do not create missing folders, so make sure it exists.
    output_dir.mkdir(parents=True, exist_ok=True)
    combined_path = output_dir / "combined_messages.csv"
    df.to_csv(combined_path, index=False)
    print(f"\nSaved combined dataset to: {combined_path}")

    # Visualization Example
    # Messages per channel (sorted ascending so the largest bar ends up on top)
    channel_counts = df['channel'].value_counts().sort_values(ascending=True)
    fig, ax = plt.subplots(figsize=(10, 8))
    channel_counts.plot(kind='barh', ax=ax)
    ax.set_title('Messages per Channel')
    ax.set_xlabel('Message Count')
    fig.tight_layout()
    fig.savefig(output_dir / 'messages_per_channel.png')
    plt.show()

    # Media type distribution
    if 'media_type' in df.columns:
        media_counts = df['media_type'].value_counts()
        # value_counts() drops missing values; skip the pie if nothing is left.
        if not media_counts.empty:
            fig, ax = plt.subplots(figsize=(8, 8))
            media_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax)
            ax.set_title('Media Type Distribution')
            ax.set_ylabel('')
            fig.tight_layout()
            fig.savefig(output_dir / 'media_type_distribution.png')
            plt.show()