# Import dependencies

In [None]:
# notebooks/01_data_ingestion_preprocessing.ipynb

import os
import sys
import asyncio
import json
from datetime import datetime
import pandas as pd
from telethon import TelegramClient, events
from telethon.tl.types import MessageMediaPhoto, DocumentAttributeFilename
from telethon.tl.functions.channels import GetParticipantsRequest
from telethon.tl.types import ChannelParticipantsSearch
from tqdm.notebook import tqdm
import re 


# Project root

In [2]:

# --- Project Setup: Ensure src module is discoverable ---
# This block is crucial for importing from src.config
def find_project_root(current_path):
    """Walk upward from ``current_path`` looking for the project root.

    A directory counts as the root when it contains ``src``, ``data`` and
    ``notebooks`` sub-directories. The filesystem root itself is never
    inspected; if no ancestor qualifies, ``current_path`` is returned as-is.

    Args:
        current_path: Directory from which the upward search starts.

    Returns:
        The first matching ancestor (or ``current_path`` itself when it
        matches), otherwise the unchanged ``current_path``.
    """
    marker_dirs = ('src', 'data', 'notebooks')
    candidate = current_path
    while True:
        parent = os.path.dirname(candidate)
        if parent == candidate:
            # Reached the top of the tree without a match.
            return current_path
        if all(os.path.isdir(os.path.join(candidate, name)) for name in marker_dirs):
            return candidate
        candidate = parent

current_working_dir = os.getcwd()
project_root = find_project_root(current_working_dir)

# Register the project root on the import path (at most once) so that the
# `src` package can be imported from this notebook.
if project_root in sys.path:
    print(f"'{project_root}' already in sys.path.")
else:
    sys.path.append(project_root)
    print(f"Added '{project_root}' to sys.path for module imports.")

# Configuration values used by the cells below.
from src.config import (
    TELEGRAM_API_ID,
    TELEGRAM_API_HASH,
    TELEGRAM_CHANNELS,
    RAW_MESSAGES_JSON,
    CLEAN_MESSAGES_CSV,
    IMAGE_DOWNLOAD_DIR,
    RAW_DATA_DIR,
)

Added 'c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM4\amharic-ecommerce-data-extractor' to sys.path for module imports.
Project configuration loaded. Data directories created/verified.
Telegram Channels configured: ['@nevacomputer', '@marakibrand', '@Fashiontera', '@Shewabrand', '@ethio_brand_collection']


# Data Ingestion and Preprocessing

*Telegram API client setup*

In [3]:

print("\n-Data Ingestion and Preprocessing ---")

# --- Step 1: Telegram API Client Setup ---
# The session name identifies the saved login, so a successful sign-in does
# not have to be repeated on every run.
session_name = 'telegram_scraper_session'
client = TelegramClient(
    session=session_name,
    api_id=TELEGRAM_API_ID,
    api_hash=TELEGRAM_API_HASH,
)

async def connect_and_authenticate():
    """Connect the module-level ``client`` to Telegram and make sure it is logged in.

    ``client.start()`` is awaited first. If the client still reports that the
    user is not authorized, a manual fallback asks for the phone number ONCE,
    requests a login code for it, and signs in with that same number plus the
    code typed by the user.

    NOTE(review): Telethon's ``start()`` is documented as interactive by
    default, so the fallback branch is presumably rarely reached — confirm
    before removing it.
    """
    print("\n--- Connecting to Telegram... ---")
    await client.start()
    if not await client.is_user_authorized():
        print("Please log in to your Telegram account.")
        # Read the number a single time: the code must be requested for, and
        # redeemed with, the very same phone number.
        phone = input('Enter phone number: ')
        await client.send_code_request(phone=phone)
        await client.sign_in(phone=phone, code=input('Enter code: '))
    print("Connected and authenticated with Telegram successfully.")

# --- Step 2: Data Ingestion System (Scraping Messages) ---
async def scrape_telegram_channels(limit=500):
    """Scrape recent messages (text and media) from the configured Telegram channels.

    Authenticates through ``connect_and_authenticate()``, then for every entry
    in ``TELEGRAM_CHANNELS`` resolves the channel, fetches up to ``limit``
    messages and builds one record per message. Photos and documents carrying
    a file-name attribute are downloaded into ``IMAGE_DOWNLOAD_DIR``. A channel
    that cannot be resolved or processed is reported and skipped; records
    gathered before the failure are kept. The client is disconnected on every
    exit path, including when authentication raises.

    Args:
        limit: Maximum number of messages requested per channel (default 500,
            the value that used to be hard-coded).

    Returns:
        list[dict]: one dict per message with the keys ``message_id``,
        ``channel_id``, ``channel_name``, ``sender_id``, ``date`` (ISO string),
        ``text_content``, ``has_media``, ``media_type``, ``media_file_name``,
        ``media_file_path`` and ``views``.
    """
    all_messages_data = []

    print("\n--- Initiating Telegram Channel Scraping ---")

    try:
        await connect_and_authenticate()

        for channel_id_or_username in TELEGRAM_CHANNELS:
            print(f"\nProcessing channel: {channel_id_or_username}")
            try:
                entity = await client.get_entity(channel_id_or_username)
                print(f"Resolved channel: {entity.title} (ID: {entity.id})")

                # Fetch into a list first: the notebook tqdm bar wraps a plain
                # (synchronous) iterable, not an async iterator.
                messages_fetched = await client.get_messages(entity, limit=limit)

                pbar_desc = f"Scraping {entity.title[:20]}..."  # Truncate for display
                for message in tqdm(messages_fetched, desc=pbar_desc, unit="msg"):
                    msg_data = {
                        'message_id': message.id,
                        'channel_id': entity.id,
                        'channel_name': entity.title,
                        'sender_id': message.sender_id,
                        'date': message.date.isoformat(),
                        'text_content': message.message,
                        'has_media': False,
                        'media_type': None,
                        'media_file_name': None,
                        'media_file_path': None,
                        'views': message.views  # Engagement metric
                    }

                    if message.media:
                        msg_data['has_media'] = True
                        await _download_message_media(message, entity, msg_data)
                    all_messages_data.append(msg_data)

            except ValueError as e:
                print(f"ERROR: Could not find channel/entity '{channel_id_or_username}'. Please check its exact username or ID. Error: {e}")
            except Exception as e:
                print(f"ERROR: An unexpected error occurred while processing '{channel_id_or_username}': {e}")
    finally:
        # Always release the connection, even if authentication or the loop raised.
        await client.disconnect()

    return all_messages_data


async def _download_message_media(message, entity, msg_data):
    """Download a photo, or the first named document, attached to ``message``."""
    if isinstance(message.media, MessageMediaPhoto):
        photo_filename = f"channel_{entity.id}_msg_{message.id}_photo.jpg"
        await _save_media(message, entity, msg_data, 'photo', photo_filename)
    elif message.document:  # Other documents such as files
        for attr in message.document.attributes:
            if isinstance(attr, DocumentAttributeFilename):
                # The name comes from channel content: neutralise path
                # separators so the file always lands directly in the
                # download directory.
                safe_name = attr.file_name.replace('/', '_').replace('\\', '_')
                doc_filename = f"channel_{entity.id}_msg_{message.id}_{safe_name}"
                await _save_media(message, entity, msg_data, 'document', doc_filename)
                break


async def _save_media(message, entity, msg_data, media_type, file_name):
    """Save ``message.media`` as ``file_name`` and record the outcome in ``msg_data``."""
    msg_data['media_type'] = media_type
    file_path = os.path.join(IMAGE_DOWNLOAD_DIR, file_name)
    try:
        await client.download_media(message.media, file=file_path)
        msg_data['media_file_name'] = file_name
        msg_data['media_file_path'] = file_path
    except Exception as e:
        # Best effort: a failed download must not abort the scrape.
        print(f"  Warning: Could not download {media_type} for message {message.id} in {entity.title}: {e}")
        msg_data['media_file_name'] = 'download_failed'

# Run the scraping process
# Use asyncio.run() to run the async function
messages_list = await scrape_telegram_channels() # In a Jupyter notebook, 'await' works directly at top-level

if not messages_list:
    print("\nCRITICAL WARNING: No messages were collected from any channel. Please check channel configurations and API keys.")
    raw_df = pd.DataFrame()
else:
    print(f"\n--- Raw Message Collection Summary ---")
    raw_df = pd.DataFrame(messages_list)
    print(f"Total raw messages collected: {len(raw_df)}")
    print("Raw DataFrame Info:")
    raw_df.info()
    print("\nFirst 5 rows of Raw Messages:")
    print(raw_df.head())
    print(f"Saving raw messages to: {RAW_MESSAGES_JSON}")
    # Save as JSON as it's easier to store complex dicts, especially if media info gets complicated
    with open(RAW_MESSAGES_JSON, 'w', encoding='utf-8') as f:
        json.dump(messages_list, f, ensure_ascii=False, indent=4)
    print("Raw messages saved successfully.")



-Data Ingestion and Preprocessing ---

--- Initiating Telegram Channel Scraping ---

--- Connecting to Telegram... ---
Connected and authenticated with Telegram successfully.

Processing channel: @nevacomputer
Resolved channel: NEVA COMPUTER® (ID: 1195361398)


Scraping NEVA COMPUTER®...:   0%|          | 0/500 [00:00<?, ?msg/s]


Processing channel: @marakibrand
Resolved channel: ማራኪ ცЯﾑŋの™ (ID: 1320403852)


Scraping ማራኪ ცЯﾑŋの™...:   0%|          | 0/500 [00:00<?, ?msg/s]


Processing channel: @Fashiontera
Resolved channel: Fashion tera (ID: 1175527648)


Scraping Fashion tera...:   0%|          | 0/500 [00:00<?, ?msg/s]


Processing channel: @Shewabrand
Resolved channel: Shewa Brand (ID: 1237900032)


Scraping Shewa Brand...:   0%|          | 0/500 [00:00<?, ?msg/s]


Processing channel: @ethio_brand_collection
Resolved channel: EthioBrand® (ID: 1149977975)


Scraping EthioBrand®...:   0%|          | 0/500 [00:00<?, ?msg/s]


--- Raw Message Collection Summary ---
Total raw messages collected: 2500
Raw DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       2500 non-null   int64  
 1   channel_id       2500 non-null   int64  
 2   channel_name     2500 non-null   object 
 3   sender_id        2500 non-null   int64  
 4   date             2500 non-null   object 
 5   text_content     2498 non-null   object 
 6   has_media        2500 non-null   bool   
 7   media_type       2482 non-null   object 
 8   media_file_name  2482 non-null   object 
 9   media_file_path  2482 non-null   object 
 10  views            2498 non-null   float64
dtypes: bool(1), float64(1), int64(3), object(6)
memory usage: 197.9+ KB

First 5 rows of Raw Messages:
   message_id  channel_id    channel_name      sender_id  \
0        8779  1195361398  NEVA COMP

# Preprocessing and Structuring data

In [4]:
                                 
# --- Step 3: Preprocessing and Structuring the Data ---
print("\n--- Initiating Data Preprocessing and Structuring ---")

if raw_df.empty:
    print("Raw DataFrame is empty. Skipping preprocessing.")
    # Empty placeholder with the same columns, in the same order, as the
    # final column selection made for the non-empty case further down, so
    # both branches leave `clean_df` with one consistent schema.
    clean_df = pd.DataFrame(columns=[
        'message_id', 'channel_id', 'channel_name', 'date',
        'text_content', 'processed_text', 'has_media', 'media_type',
        'media_file_name', 'views'
    ])
else:
    # Work on a copy so `raw_df` stays untouched and this cell can be re-run.
    clean_df = raw_df.copy()

    # Handle missing text content: fill None with an empty string for NLP
    clean_df['text_content'] = clean_df['text_content'].fillna('')

    # Parse the ISO date strings and keep only the calendar date
    clean_df['date'] = pd.to_datetime(clean_df['date']).dt.date

    # --- Amharic-specific Text Preprocessing (Basic) ---
    # For more advanced Amharic NLP, you'd integrate libraries like Ethiopic, AmharicNLP, or custom tokenizers/normalizers.
    # For now, we'll do a general cleaning.
    def preprocess_amharic_text(text):
        """Strip links, Telegram noise and non-Ethiopic characters from a message.

        Steps, in order:
          1. Convert to ``str`` and lowercase, so the link patterns below also
             match upper-case input.
          2. Drop tokens starting with ``http`` or ``www``.
          3. Drop ``@mentions``, ``#hashtags`` and bare ``t.me/...`` links.
          4. Keep only Ethiopic characters (U+1200-U+137F), whitespace, digits
             and the punctuation ``. , : ; ! ?``.
          5. Collapse runs of whitespace and trim both ends.

        Args:
            text: Raw message text; any value is accepted and passed through
                ``str()``.

        Returns:
            str: The cleaned text, possibly empty.
        """
        text = str(text).lower()
        # Remove URLs (common in Telegram posts); `http\S+` also covers https.
        text = re.sub(r'http\S+|www\S+', '', text)
        # The dot in `t\.me` is escaped: unescaped it matched any character,
        # so ordinary tokens such as "time/500" were deleted along with
        # their digits.
        text = re.sub(r'@\w+|#\w+|t\.me/\S+', '', text)
        # \u1200-\u137F covers the main Ethiopic script range.
        text = re.sub(r'[^\u1200-\u137F\s\d.,:;!?]+', '', text)
        return re.sub(r'\s+', ' ', text).strip()

    print("\nApplying Amharic-specific text preprocessing...")
    clean_df['processed_text'] = clean_df['text_content'].apply(preprocess_amharic_text)

    # Check for empty processed texts
    empty_processed_texts = clean_df[clean_df['processed_text'].str.strip() == '']
    if not empty_processed_texts.empty:
        print(f"WARNING: {len(empty_processed_texts)} messages became empty after Amharic text preprocessing.")
        # Decide whether to drop these or keep them for their metadata/media. For now, we keep them.

    # Drop messages with no meaningful text and no media, as they won't contribute to NER
    initial_count = len(clean_df)
    clean_df = clean_df[~((clean_df['processed_text'] == '') & (clean_df['has_media'] == False))]
    if len(clean_df) < initial_count:
        print(f"Dropped {initial_count - len(clean_df)} rows that had no text content and no media.")

    # Select and reorder columns for the final cleaned CSV
    clean_df = clean_df[[
        'message_id', 'channel_id', 'channel_name', 'date',
        'text_content', 'processed_text', 'has_media', 'media_type', 'media_file_name', 'views'
    ]]

    print(f"\n--- Preprocessing Summary ---")
    print(f"Total clean messages after preprocessing: {len(clean_df)}")
    print("Clean DataFrame Info:")
    clean_df.info()
    print("\nFirst 5 rows of Clean Messages:")
    print(clean_df.head())

    print(f"Saving clean messages to: {CLEAN_MESSAGES_CSV}")
    clean_df.to_csv(CLEAN_MESSAGES_CSV, index=False, encoding='utf-8')

print("\n--- Task 1: Data Ingestion and Preprocessing Complete ---")
print("Remember to commit your work to your 'task-1' branch!")
                                                           


--- Initiating Data Preprocessing and Structuring ---

Applying Amharic-specific text preprocessing...


NameError: name 're' is not defined