# Import independecies

In [4]:
# notebooks/01_data_ingestion_preprocessing.ipynb

import os
import sys
import asyncio
import json
from datetime import datetime
import pandas as pd
from telethon import TelegramClient, events
from telethon.tl.types import MessageMediaPhoto, DocumentAttributeFilename
from telethon.tl.functions.channels import GetParticipantsRequest
from telethon.tl.types import ChannelParticipantsSearch
from tqdm.notebook import tqdm


# Project root

In [5]:

# --- Project Setup: Ensure src module is discoverable ---
# This block is crucial for importing from src.config
def find_project_root(current_path):
    path = current_path
    while path != os.path.dirname(path):
        if (os.path.isdir(os.path.join(path, 'src')) and
            os.path.isdir(os.path.join(path, 'data')) and
            os.path.isdir(os.path.join(path, 'notebooks'))):
            return path
        path = os.path.dirname(path)
    return current_path

current_working_dir = os.getcwd()
project_root = find_project_root(current_working_dir)

if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added '{project_root}' to sys.path for module imports.")
else:
    print(f"'{project_root}' already in sys.path.")

# Import configuration variables
from src.config import (
    TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_CHANNELS,
    RAW_MESSAGES_JSON, CLEAN_MESSAGES_CSV, IMAGE_DOWNLOAD_DIR,
    RAW_DATA_DIR
)

'c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM4\amharic-ecommerce-data-extractor' already in sys.path.


# Data Ingestiona and Preprocessing

*Telegram API client setup*

In [None]:
# Telegram API Client Setup ---
# A session name is used to save login information so you don't have to log in every time
session_name = 'telegram_scraper_session'
client = TelegramClient(session_name, TELEGRAM_API_ID, TELEGRAM_API_HASH)

async def connect_and_authenticate():
    """Connects to Telegram and authenticates the user."""
    print("\n--- Connecting to Telegram... ---")
    await client.start()
    if not await client.is_user_authorized():
        print("Please log in to your Telegram account.")
        # This will prompt for phone number and code in the console if not already logged in
        await client.send_code_request(phone=input('Enter phone number: '))
        await client.sign_in(phone=input('Enter phone number: '), code=input('Enter code: '))
    print("Connected and authenticated with Telegram successfully.")


# -- Data Ingestion System (Scraping Messages) ---
async def scrape_telegram_channels():
    """Scrapes messages (text and media) from configured Telegram channels."""
    all_messages_data = []

    print("\n--- Initiating Telegram Channel Scraping ---")
    
    await connect_and_authenticate()

    for channel_id_or_username in TELEGRAM_CHANNELS:
        print(f"\nProcessing channel: {channel_id_or_username}")
        try:
            entity = await client.get_entity(channel_id_or_username)
            print(f"Resolved channel: {entity.title} (ID: {entity.id})")

            # Iterate over messages (e.g., last 1000 messages)
            # You can adjust the limit or use client.iter_messages() for more granular control
            # Note: For very large channels, consider fetching in chunks or over a specific date range.
            messages_iterator = client.iter_messages(entity, limit=1000) # Fetch up to 2000 messages per channel

            pbar_desc = f"Scraping {entity.title[:20]}..." # Truncate for display
            async for message in tqdm(messages_iterator, desc=pbar_desc, unit="msg"):
                msg_data = {
                    'message_id': message.id,
                    'channel_id': entity.id,
                    'channel_name': entity.title,
                    'sender_id': message.sender_id,
                    'date': message.date.isoformat(),
                    'text_content': message.message,
                    'has_media': False,
                    'media_type': None,
                    'media_file_name': None,
                    'media_file_path': None,
                    'views': message.views # Engagement metric
                }

                if message.media:
                    msg_data['has_media'] = True
                    if isinstance(message.media, MessageMediaPhoto):
                        msg_data['media_type'] = 'photo'
                        # Download photo
                        photo_filename = f"channel_{entity.id}_msg_{message.id}_photo.jpg"
                        photo_filepath = os.path.join(IMAGE_DOWNLOAD_DIR, photo_filename)
                        try:
                            await client.download_media(message.media, file=photo_filepath)
                            msg_data['media_file_name'] = photo_filename
                            msg_data['media_file_path'] = photo_filepath
                        except Exception as e:
                            print(f"  Warning: Could not download photo for message {message.id} in {entity.title}: {e}")
                            msg_data['media_file_name'] = 'download_failed'
                    elif message.document: # Handle other documents like files
                        for attr in message.document.attributes:
                            if isinstance(attr, DocumentAttributeFilename):
                                msg_data['media_type'] = 'document'
                                doc_filename = f"channel_{entity.id}_msg_{message.id}_{attr.file_name}"
                                doc_filepath = os.path.join(IMAGE_DOWNLOAD_DIR, doc_filename)
                                try:
                                    await client.download_media(message.media, file=doc_filepath)
                                    msg_data['media_file_name'] = doc_filename
                                    msg_data['media_file_path'] = doc_filepath
                                except Exception as e:
                                    print(f"  Warning: Could not download document for message {message.id} in {entity.title}: {e}")
                                    msg_data['media_file_name'] = 'download_failed'
                                break
                all_messages_data.append(msg_data)

        except ValueError as e:
            print(f"ERROR: Could not find channel/entity '{channel_id_or_username}'. Please check its exact username or ID. Error: {e}")
        except Exception as e:
            print(f"ERROR: An unexpected error occurred while processing '{channel_id_or_username}': {e}")
    
    await client.disconnect()
    return all_messages_data


# Run the scraping process
# Use asyncio.run() to run the async function
messages_list = await scrape_telegram_channels() # In a Jupyter notebook, 'await' works directly at top-level

if not messages_list:
    print("\nCRITICAL WARNING: No messages were collected from any channel. Please check channel configurations and API keys.")
    raw_df = pd.DataFrame()
else:
    print(f"\n--- Raw Message Collection Summary ---")
    raw_df = pd.DataFrame(messages_list)
    print(f"Total raw messages collected: {len(raw_df)}")
    print("Raw DataFrame Info:")
    raw_df.info()
    print("\nFirst 5 rows of Raw Messages:")
    print(raw_df.head())
    print(f"Saving raw messages to: {RAW_MESSAGES_JSON}")
    # Save as JSON as it's easier to store complex dicts, especially if media info gets complicated
    with open(RAW_MESSAGES_JSON, 'w', encoding='utf-8') as f:
        json.dump(messages_list, f, ensure_ascii=False, indent=4)
    print("Raw messages saved successfully.")




--- Initiating Telegram Channel Scraping ---

--- Connecting to Telegram... ---
Connected and authenticated with Telegram successfully.

Processing channel: @nevacomputer
Resolved channel: NEVA COMPUTER® (ID: 1195361398)
ERROR: An unexpected error occurred while processing '@nevacomputer': IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


Exception ignored in: <function tqdm.__del__ at 0x000001CE82243F60>
Traceback (most recent call last):
  File "c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM4\amharic-ecommerce-data-extractor\.venv\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM4\amharic-ecommerce-data-extractor\.venv\Lib\site-packages\tqdm\notebook.py", line 282, in close
    self.disp(bar_style='success', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'



Processing channel: @marakibrand


Exception ignored in: <function tqdm.__del__ at 0x000001CE82243F60>
Traceback (most recent call last):
  File "c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM4\amharic-ecommerce-data-extractor\.venv\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM4\amharic-ecommerce-data-extractor\.venv\Lib\site-packages\tqdm\notebook.py", line 282, in close
    self.disp(bar_style='success', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


Resolved channel: ማራኪ ცЯﾑŋの™ (ID: 1320403852)
ERROR: An unexpected error occurred while processing '@marakibrand': IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

Processing channel: @Fashiontera


Exception ignored in: <function tqdm.__del__ at 0x000001CE82243F60>
Traceback (most recent call last):
  File "c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM4\amharic-ecommerce-data-extractor\.venv\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM4\amharic-ecommerce-data-extractor\.venv\Lib\site-packages\tqdm\notebook.py", line 282, in close
    self.disp(bar_style='success', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


Resolved channel: Fashion tera (ID: 1175527648)
ERROR: An unexpected error occurred while processing '@Fashiontera': IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

Processing channel: @Shewabrand


Exception ignored in: <function tqdm.__del__ at 0x000001CE82243F60>
Traceback (most recent call last):
  File "c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM4\amharic-ecommerce-data-extractor\.venv\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM4\amharic-ecommerce-data-extractor\.venv\Lib\site-packages\tqdm\notebook.py", line 282, in close
    self.disp(bar_style='success', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


Resolved channel: Shewa Brand (ID: 1237900032)
ERROR: An unexpected error occurred while processing '@Shewabrand': IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

Processing channel: @ethio_brand_collection


Exception ignored in: <function tqdm.__del__ at 0x000001CE82243F60>
Traceback (most recent call last):
  File "c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM4\amharic-ecommerce-data-extractor\.venv\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM4\amharic-ecommerce-data-extractor\.venv\Lib\site-packages\tqdm\notebook.py", line 282, in close
    self.disp(bar_style='success', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


Resolved channel: EthioBrand® (ID: 1149977975)
ERROR: An unexpected error occurred while processing '@ethio_brand_collection': IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

