# Task 1: Data Ingestion and Data Preprocessing

In [19]:
import asyncio
import nest_asyncio
import pandas as pd
from telethon import TelegramClient
from datetime import datetime, timedelta
import re
import os
import sys
import matplotlib.pyplot as plt 
import seaborn as sns   
# Make modules under ../scripts importable. The relative path is resolved
# against the working directory at the moment this cell runs, so it must be
# executed before any cell that changes the working directory.
sys.path.append(os.path.abspath("../scripts"))
import warnings
# Suppress every warning for the rest of the session. Note that this also
# hides deprecation notices emitted by the libraries imported above.
warnings.filterwarnings('ignore')

In [4]:
# Move the working directory one level up from where the kernel started, so
# that relative paths used later (e.g. "Data/...") resolve from there.
# The starting directory is recorded the first time this cell runs; on later
# runs the same target is reused, so re-executing the cell is idempotent
# instead of climbing one more level each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.dirname(_NOTEBOOK_START_DIR))

In [15]:
from config import api_id , api_hash, session_name, cutoff_date, channels

In [16]:
# Drop any session file left behind by an earlier run before the client is
# created. NOTE(review): presumably this forces a fresh login each time;
# confirm that is intended.
session_file = session_name + '.session'
stale_session_found = os.path.exists(session_file)
if stale_session_found:
    os.remove(session_file)
    print(f"Deleted old session file {session_file}")

In [20]:
# 7. Amharic text cleaning function
def clean_amharic(text):
    """Reduce ``text`` to Ethiopic-script characters separated by single spaces.

    Every character outside the Unicode range U+1200-U+137F that is not
    whitespace is removed (this includes Latin letters, digits and ASCII
    punctuation). Runs of whitespace are then collapsed to one space and the
    result is trimmed at both ends.

    Args:
        text: The raw message text. Any falsy value (``None``, ``""``) is
            accepted.

    Returns:
        The cleaned string, or ``""`` when ``text`` is falsy or contains no
        characters from the kept range.
    """
    if text:
        # Step 1: drop everything that is neither Ethiopic nor whitespace.
        ethiopic_only = re.sub(r'[^\u1200-\u137F\s]', '', text)
        # Step 2: squeeze whitespace runs (including the gaps left by step 1).
        single_spaced = re.sub(r'\s+', ' ', ethiopic_only)
        return single_spaced.strip()
    return ""

# 8. Initialize Telegram client
# Build the shared client from the values imported from config; it is not
# connected until `start()` is awaited.
client = TelegramClient(
    session=session_name,
    api_id=api_id,
    api_hash=api_hash,
)

# 9. Async function to fetch and save data
async def fetch_and_save_messages(limit=100, csv_path="Data/telegram_amharic_messages.csv"):
    """Fetch messages from every configured channel and save them to a CSV file.

    Starts the module-level ``client``, iterates over ``channels``, cleans each
    message's text with ``clean_amharic`` and collects one record per message.
    The records are written to ``csv_path`` and the first rows are printed.
    The client is left connected when the function returns.

    Args:
        limit: Maximum number of messages requested per channel.
        csv_path: Destination of the CSV file. Its parent directory is created
            if it does not exist yet.

    Returns:
        None. Results are written to ``csv_path`` and printed.

    Notes:
        An exception raised while reading a channel is printed and the
        remaining messages of that channel are skipped; records collected
        before the failure are kept and the other channels are still processed.
    """
    columns = ['channel', 'timestamp', 'sender_id', 'text']

    await client.start()
    print("✅ Telegram client started.")

    all_records = []

    for channel in channels:
        print(f" Fetching from channel: {channel}")
        try:
            # NOTE(review): Telethon documents `offset_date` as an upper bound
            # (only messages *older* than this date are returned). Confirm that
            # `cutoff_date` is meant to be a "before" cutoff and not a "since".
            async for msg in client.iter_messages(channel, offset_date=cutoff_date, limit=limit):
                all_records.append({
                    'channel': channel,
                    'timestamp': msg.date.strftime('%Y-%m-%d %H:%M:%S'),
                    'sender_id': msg.sender_id,
                    # Messages without text (e.g. media only) become "".
                    'text': clean_amharic(msg.message or ""),
                })
        except Exception as e:
            # Best effort: report the failure and move on to the next channel.
            print(f" Error fetching {channel}: {e}")

    # Pin the column order so that an empty result still yields a CSV with a
    # header row instead of an empty file.
    df = pd.DataFrame(all_records, columns=columns)

    # Make sure the output directory exists; to_csv does not create it.
    output_dir = os.path.dirname(csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # utf-8-sig writes a BOM at the start of the file.
    df.to_csv(csv_path, index=False, encoding='utf-8-sig')
    print(f"Saved {len(df)} messages to {csv_path}")

    # Show Output data
    print(" Output data:")
    print(df.head(10))

# 10. Run the async function in notebook environment
# A bare top-level `await` is used here, which relies on the notebook kernel
# executing cells with async support. In a plain .py script this call would
# have to be driven by an event loop runner (e.g. asyncio.run) instead.
await fetch_and_save_messages()

✅ Telegram client started.
 Fetching from channel: EthioMart
 Fetching from channel: forfreemarket
 Fetching from channel: helloomarketethiopia
 Fetching from channel: classybrands
 Fetching from channel: kuruwear
Saved 500 messages to Data/telegram_amharic_messages.csv
 Output data:
     channel            timestamp      sender_id text
0  EthioMart  2022-02-07 21:05:18 -1001204279885     
1  EthioMart  2022-02-07 21:05:18 -1001204279885     
2  EthioMart  2022-01-22 19:24:00 -1001204279885     
3  EthioMart  2022-01-22 19:24:00 -1001204279885     
4  EthioMart  2021-07-13 13:29:27 -1001204279885     
5  EthioMart  2021-07-04 21:58:12 -1001204279885     
6  EthioMart  2021-07-04 21:53:26 -1001204279885     
7  EthioMart  2021-07-03 19:02:06 -1001204279885     
8  EthioMart  2021-07-03 19:02:06 -1001204279885     
9  EthioMart  2021-07-03 19:01:59 -1001204279885     
