# Task 1: Telegram Data Scraping
This notebook demonstrates how to use the project scraping functions to extract messages and images from Telegram channels and store them in the data lake.

In [None]:
# First run only: uncomment ONE of the lines below to install the dependencies.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install telethon python-dotenv loguru pandas matplotlib nest_asyncio
# %pip install -r ../requirements.txt


In [11]:
import sys
# Make the project's `src` directory importable. The path is relative to the
# notebook's working directory, and it must be added BEFORE the
# `scrape_telegram` import below.
sys.path.append('../src')
import nest_asyncio
# Patch asyncio so that an already-running event loop (the notebook kernel has
# one) can be re-entered.
# NOTE(review): presumably needed because the scraper drives the loop itself — confirm.
nest_asyncio.apply()
# Project-local module, resolved through the `sys.path` entry added above.
from scrape_telegram import scrape_telegram_channels


## Define the channels to scrape and run the scraper

In [None]:
channels = [
    'https://t.me/lobelia4cosmetics',
    'https://t.me/tikvahpharma',
    # Add more channels as needed
]
# Optionally set a custom date string or message limit
date_str = None  # e.g., '2025-07-25'
limit = 100      # Number of messages per channel

# Run the async scraping function from a notebook
await scrape_telegram_channels(channels, date_str=date_str, limit=limit)


---
- Raw data will be saved in `data/raw/telegram_messages/YYYY-MM-DD/channel_name.json`.
- Images will be saved in a separate subfolder for each channel and day (the sample-image cell at the end of this notebook looks for folders named `*_images`).
- Check the logs for scraping progress and errors.


## Preview the scrape log and most recent data

In [None]:
import os, json, glob
from datetime import datetime
def preview_scrape_log(log_path):
    """Print the scrape log stored at ``log_path`` and return its parsed content.

    The file is read as UTF-8 JSON. The expected layout is a mapping of
    channel -> {date -> info}; every channel and every date entry is printed.
    Entries that do not follow that layout are printed as-is instead of
    raising on ``.items()``.

    Returns:
        The parsed JSON content, or ``None`` when ``log_path`` does not exist.
    """
    if not os.path.exists(log_path):
        print('No scrape log found.')
        return None

    with open(log_path, 'r', encoding='utf-8') as f:
        log = json.load(f)

    print('Scrape log:')
    if not isinstance(log, dict):
        # Unexpected top-level layout: show it rather than crash on `.items()`.
        print(f'  {log}')
        return log

    for channel, dates in log.items():
        print(f'Channel: {channel}')
        if not isinstance(dates, dict):
            print(f'  {dates}')
            continue
        for date, info in dates.items():
            print(f'  {date}: {info}')
    return log


def preview_latest_messages(messages_dir, max_messages=3, max_chars=500):
    """Preview the most recently modified JSON file below ``messages_dir``.

    The directory is searched recursively for ``*.json`` files and the one
    with the newest modification time is loaded. When it holds a list, the
    message count and the first ``max_messages`` entries are printed, each
    pretty-printed and truncated to ``max_chars`` characters. Any other JSON
    value is reported and previewed as a single truncated dump (slicing it
    like a list would raise ``TypeError``).

    Returns:
        The parsed content of the newest file, or ``None`` when the directory
        is missing or contains no JSON files.
    """
    if not os.path.exists(messages_dir):
        print('No data directory found.')
        return None

    json_paths = glob.glob(f'{messages_dir}/**/*.json', recursive=True)
    if not json_paths:
        print('No scraped data found.')
        return None

    latest_path = max(json_paths, key=os.path.getmtime)
    print(f'Previewing: {latest_path}')
    with open(latest_path, 'r', encoding='utf-8') as f:
        payload = json.load(f)

    if isinstance(payload, list):
        print(f'Number of messages: {len(payload)}')
        for msg in payload[:max_messages]:
            print(json.dumps(msg, indent=2)[:max_chars])
    else:
        print(f'Unexpected top-level JSON type: {type(payload).__name__}')
        print(json.dumps(payload, indent=2)[:max_chars])
    return payload


# Load and print the scrape log (None when no log file exists yet).
scrape_log_path = '../data/raw/scrape_log.json'
scrape_log = preview_scrape_log(scrape_log_path)

# Preview the most recent scraped data (None when nothing has been scraped yet).
data_dir = '../data/raw/telegram_messages'
data = preview_latest_messages(data_dir)


## EDA: Message Counts, Types, and Activity

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import glob, os, json
from datetime import datetime

def load_message_records(json_paths):
    """Load the messages from every file in ``json_paths`` into a flat list.

    Each file is expected to hold a JSON list of message objects. Every
    message is copied and tagged with two keys derived from its file path:
    ``channel`` (the file name without extension) and ``date`` (the name of
    the containing folder). Both keys overwrite same-named keys already
    present in the message, as the original cell did.

    Files whose top-level value is not a list are skipped with a notice, and
    list entries that are not objects are ignored, instead of raising while
    tagging them.

    Returns:
        A list of dicts, one per message.
    """
    loaded = []
    for path in json_paths:
        channel = os.path.splitext(os.path.basename(path))[0]
        date = os.path.basename(os.path.dirname(path))
        with open(path, 'r', encoding='utf-8') as f:
            msgs = json.load(f)
        if not isinstance(msgs, list):
            print(f'Skipping {path}: expected a JSON list of messages.')
            continue
        for m in msgs:
            if isinstance(m, dict):
                loaded.append({**m, 'channel': channel, 'date': date})
    return loaded


def _is_missing(value):
    """Return True for ``None`` and for float NaN.

    pandas fills keys that are absent from some records with NaN, which is
    truthy and ``is not None``, so a plain truthiness test treats it as data.
    """
    return value is None or (isinstance(value, float) and value != value)


def msg_type(row):
    """Classify one message row as 'image', 'other_media', 'text' or 'other'.

    ``row`` only needs a ``.get`` method (a dict or a pandas Series). A
    non-empty ``media`` value wins: it is 'image' when its string form
    contains 'photo' (case-insensitive), otherwise 'other_media'. Without
    media, a non-empty ``message`` gives 'text'; everything else is 'other'.
    """
    media = row.get('media')
    if not _is_missing(media) and media:
        return 'image' if 'photo' in str(media).lower() else 'other_media'
    text = row.get('message')
    if not _is_missing(text) and text:
        return 'text'
    return 'other'


# Gather all scraped JSON files
data_dir = '../data/raw/telegram_messages'
all_json = glob.glob(f'{data_dir}/**/*.json', recursive=True)
records = load_message_records(all_json)

if records:
    df = pd.DataFrame(records)
    print(f'Total messages: {len(df)}')

    # Message count per channel/date
    msg_counts = df.groupby(['channel', 'date']).size().unstack(fill_value=0)
    ax = msg_counts.plot(kind='bar', stacked=True, figsize=(10, 4))
    ax.set(title='Message Count per Channel/Date', ylabel='Messages')
    plt.show()

    # Message type distribution
    df['msg_type'] = df.apply(msg_type, axis=1)
    ax = df['msg_type'].value_counts().plot(kind='pie', autopct='%1.1f%%', figsize=(5, 5))
    ax.set(title='Message Type Distribution', ylabel='')
    plt.show()

    # Posting activity over time. Folder names that do not parse as dates
    # become NaT and are dropped by the groupby.
    df['date_dt'] = pd.to_datetime(df['date'], errors='coerce')
    activity = df.groupby(['date_dt', 'channel']).size().unstack(fill_value=0)
    if activity.empty:
        # Plotting an empty frame raises, so report it instead.
        print('No parseable dates found; skipping activity plot.')
    else:
        ax = activity.plot(figsize=(10, 4))
        ax.set(title='Posting Activity Over Time', ylabel='Messages')
        plt.show()
else:
    print('No records to analyze.')


## Display Sample Images

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import glob
# `os` is imported here because this cell's own import lines do not include
# it; without this the cell only works after an earlier cell has run.
import os


def find_sample_images(messages_dir, pattern='*.jpg'):
    """Return the image files found in ``*_images`` folders below ``messages_dir``.

    The directory tree is searched recursively for folders whose name ends in
    ``_images``; files matching ``pattern`` directly inside them are
    collected. Folders and files are sorted so that repeated runs return the
    same order (``glob`` itself gives no ordering guarantee).

    Returns:
        A list of file paths; empty when nothing matches.
    """
    found = []
    for img_dir in sorted(glob.glob(f'{messages_dir}/**/*_images', recursive=True)):
        found.extend(sorted(glob.glob(f'{img_dir}/{pattern}')))
    return found


# Defined here as well so the cell does not depend on an earlier cell's variable.
data_dir = '../data/raw/telegram_messages'
MAX_SAMPLE_IMAGES = 4

# Find a few sample images
sample_imgs = find_sample_images(data_dir)
if not sample_imgs:
    print('No sample images found.')

# Show up to MAX_SAMPLE_IMAGES images, one figure each
for img_path in sample_imgs[:MAX_SAMPLE_IMAGES]:
    img = mpimg.imread(img_path)
    fig, ax = plt.subplots()
    ax.imshow(img)
    ax.set_title(os.path.basename(img_path))
    ax.axis('off')
    plt.show()
