In [1]:
import asyncio
import json
import os
from datetime import datetime, timezone

import nest_asyncio
from dotenv import load_dotenv
from telethon.sync import TelegramClient
from telethon.tl.types import MessageMediaPhoto

# Needed for running async telethon code in a Jupyter Notebook: the final cell
# calls loop.run_until_complete(main()), and this must run once beforehand.
# NOTE(review): presumably this patches asyncio to allow re-entrant event
# loops — confirm against the nest_asyncio documentation.
nest_asyncio.apply()


In [2]:
# --- Credentials ---
# Read the .env file first so the lookups below can see its values.
load_dotenv()

API_ID = os.environ.get('TELEGRAM_API_ID')
API_HASH = os.environ.get('TELEGRAM_API_HASH')
# The session name is optional and falls back to a notebook-specific default.
SESSION_NAME = os.environ.get('TELEGRAM_SESSION_NAME', 'my_notebook_session')

# Both the API id and the API hash must be present and non-empty.
if API_ID and API_HASH:
    print("Credentials loaded successfully.")
else:
    print("ERROR: Could not find TELEGRAM_API_ID or TELEGRAM_API_HASH.")
    print("Please make sure you have a .env file with these values.")


Credentials loaded successfully.


In [3]:
# --- Step 4: Define the scraping function ---

async def scrape_channel_fully(client, channel_username, limit=50):
    """Save the latest messages of one Telegram channel to disk.

    For every message fetched, the full ``message.to_dict()`` payload is
    written to ``<message id>.json``; when the message carries a photo, the
    image is saved next to it as ``<message id>.jpg``. Files go into the
    date-partitioned directory
    ``data/raw/telegram_messages/<YYYY-MM-DD>/<channel_username>`` (relative to
    the current working directory, using today's UTC date).

    Args:
        client: Object exposing an awaitable ``get_messages(entity, limit=...)``;
            ``main`` passes a started ``TelegramClient``.
        channel_username: Channel identifier handed to ``get_messages`` and
            used as the last component of the output directory.
        limit: Maximum number of messages to request (default 50).

    Returns:
        None. The results are the files written and the progress lines printed.

    Raises:
        Anything raised by ``os.makedirs`` or ``client.get_messages``.
        Per-message download/save failures are caught, reported with an
        ``[ERROR]`` line, and do not stop the remaining messages.
    """
    print(f"Scraping messages from {channel_username}...")

    # 1. Build the partitioned output directory, e.g.
    #    data/raw/telegram_messages/2025-07-15/lobelia4cosmetics
    # datetime.now(timezone.utc) replaces the deprecated datetime.utcnow();
    # the formatted date string is the same.
    today_str = datetime.now(timezone.utc).strftime('%Y-%m-%d')
    output_dir = os.path.join("data", "raw", "telegram_messages", today_str, channel_username)
    os.makedirs(output_dir, exist_ok=True)

    messages = await client.get_messages(channel_username, limit=limit)

    print(f"Found {len(messages)} messages. Processing and saving...")

    for message in messages:
        # 2. Capture every field of the message as a plain dictionary.
        message_data = message.to_dict()

        # 3. Download the photo, if any, next to the metadata file.
        if message.photo:
            image_save_path = os.path.join(output_dir, f"{message.id}.jpg")
            print(f"  - Downloading image for message {message.id} to {image_save_path}")
            try:
                await message.download_media(file=image_save_path)
            except Exception as e:
                # Best effort: a failed download must not lose the metadata.
                print(f"  - [ERROR] Could not download image: {e}")

        # 4. Persist the message metadata as JSON.
        json_file_path = os.path.join(output_dir, f"{message.id}.json")
        print(f"  - Saving metadata for message {message.id} to {json_file_path}")
        try:
            # Serialise before opening the file so a serialisation failure
            # cannot leave a truncated/empty JSON file behind.
            # 'default=str' stringifies values json cannot encode natively.
            payload = json.dumps(message_data, ensure_ascii=False, indent=4, default=str)
            with open(json_file_path, 'w', encoding='utf-8') as f:
                f.write(payload)
        except Exception as e:
            print(f"  - [ERROR] Could not save JSON file: {e}")

    print(f"Finished scraping {channel_username}.\n")

In [4]:
async def main():
    """Connect to Telegram, scrape each configured channel, then disconnect.

    Builds a ``TelegramClient`` from the module-level ``SESSION_NAME``,
    ``API_ID`` and ``API_HASH``, starts it, and calls
    ``scrape_channel_fully`` for every channel in the list below with a limit
    of 50 messages each.

    Returns:
        None.

    Raises:
        Anything raised by ``client.start()`` or ``scrape_channel_fully``. Once
        the client has started, it is disconnected before such an exception
        propagates.
    """
    client = TelegramClient(SESSION_NAME, API_ID, API_HASH)

    await client.start()
    print("Client Connected.")

    # Scrape all desired channels
    channels_to_scrape = ["lobelia4cosmetics", "tikvahpharma", "ChemedET"]
    try:
        for channel in channels_to_scrape:
            await scrape_channel_fully(client, channel, limit=50)
    finally:
        # Release the connection even when a channel scrape raises, so a
        # failed run does not leave the client connected.
        await client.disconnect()
        print("Client Disconnected.")

In [5]:
# --- Run the main function ---
# Executes the entire process: fetches the current event loop and blocks this
# cell until main() has finished. nest_asyncio.apply() is called in the
# imports cell before this runs.

loop = asyncio.get_event_loop()
loop.run_until_complete(main())

KeyboardInterrupt: Interrupted by user