# 01 – Telegram Signal Extraction

This notebook is responsible for collecting, parsing, cleaning, and updating cryptocurrency trading signal messages from a Telegram group.

It serves as the **data ingestion and preprocessing entry point** for the entire project. Its purpose, key features, and output files are described below.

## Purpose
- Retrieve new messages from a Telegram signals group using the **Telethon** client.
- **Parse** each message to extract relevant trading data (symbol, direction, entry price, take-profit targets, profit updates).
- **Clean** the parsed data by removing malformed or incomplete entries.
- **Append** the newly extracted data to the existing cleaned CSV files.
- Maintain a centralized and **up-to-date historical record** of both full trade signals and hit price updates.

## Key Features
- Loads API credentials from a `.env` file.
- Automatically detects and processes only new messages using `offset_date` logic.
- Classifies messages into two categories:
  - `signal_data`: messages that contain full trade entries with take-profit targets.
  - `update_data`: profit updates that report a `hit_price` but no new entry.
- Cleans symbols and timestamps.
- Exports updated datasets to:
  - `data/telegram_signals_clean.csv`
  - `data/telegram_updates_clean.csv`
- Saves the raw text and timestamp of every message fetched in the current run to a local JSON file (the file is overwritten on each run).

## Output Files
- `data/raw/raw_messages.json`: Raw text and timestamps of the messages fetched from Telegram in the latest run.
- `telegram_signals_clean.csv`: Parsed, deduplicated, and clean trading signals.
- `telegram_updates_clean.csv`: Cleaned profit target updates.

In [None]:
# Import required libraries
import os
import json
import pandas as pd
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv
import re
from telethon import TelegramClient

# Load credentials from .env
load_dotenv()
_api_id_raw = os.getenv("API_ID")
if _api_id_raw is None:
    # Fail with an actionable message instead of the opaque TypeError from int(None).
    raise RuntimeError("API_ID is not set; define it in your .env file or environment.")
API_ID = int(_api_id_raw)
API_HASH = os.getenv("API_HASH")
GROUP_ID = -1001717037581  # Telegram group ID

# Define paths (relative to the notebook's working directory)
signals_csv = Path("../data/telegram_signals_clean.csv")
updates_csv = Path("../data/telegram_updates_clean.csv")
raw_path = Path("../data/raw/raw_messages.json")

# Load existing CSVs (an empty frame stands in for a file that does not exist yet)
existing_signals = pd.read_csv(signals_csv) if signals_csv.exists() else pd.DataFrame()
existing_updates = pd.read_csv(updates_csv) if updates_csv.exists() else pd.DataFrame()

# Fallback starting point used when there is no usable history.
# It is timezone-aware on purpose: stored timestamps may carry a UTC offset,
# and Python refuses to compare naive and aware datetimes (the two
# "last time" values below are later combined with min()).
_DEFAULT_START = pd.Timestamp("2024-01-01", tz="UTC")


def _latest_timestamp(frame):
    """Return the newest value of ``frame["timestamp"]`` as an aware UTC Timestamp.

    Falls back to ``_DEFAULT_START`` when *frame* is empty or when none of its
    timestamps can be interpreted (the maximum is NaT).
    """
    if frame.empty:
        return _DEFAULT_START
    latest = pd.to_datetime(frame["timestamp"], utc=True).max()
    return _DEFAULT_START if pd.isna(latest) else latest


# Get timestamp of most recent signal/update
last_signal_time = _latest_timestamp(existing_signals)
last_update_time = _latest_timestamp(existing_updates)

# Extract the trading fields that can be recognised in a message
def parse_signal(text):
    """Pull trading fields out of the raw text of one Telegram message.

    Only the fields that are actually found are present in the returned dict,
    always in this order:

    - ``symbol``: ``"<TICKER>/USDT"`` taken from a ``#TICKER.../USDT`` hashtag.
    - ``direction``: ``"Long"`` or ``"Short"`` (matched case-insensitively).
    - ``entry``: decimal number following the word ``Entry``.
    - ``tp_<N>``: one key per ``<price> (<N>% of profit)`` occurrence; a
      repeated percentage keeps the last price.
    - ``hit_price``: decimal number following the word ``Price``.

    Prices are only recognised when written with a decimal point.
    """
    fields = {}

    ticker = re.search(r"#([A-Z0-9]+)[^\s/]*/USDT", text)
    if ticker is not None:
        fields["symbol"] = f"{ticker.group(1)}/USDT"

    side = re.search(r"\b(Long|Short)\b", text, re.IGNORECASE)
    if side is not None:
        fields["direction"] = side.group(1).capitalize()

    entry_price = re.search(r"Entry[^0-9]{0,10}(\d+\.\d+)", text)
    if entry_price is not None:
        fields["entry"] = float(entry_price.group(1))

    # Each hit is a (price, percent) pair; later duplicates overwrite earlier ones.
    targets = re.findall(r"(\d+\.\d+)\s*\((\d+)% of profit\)", text)
    fields.update((f"tp_{share}", float(level)) for level, share in targets)

    reached = re.search(r"Price[^0-9]{0,10}(\d+\.\d+)", text)
    if reached is not None:
        fields["hit_price"] = float(reached.group(1))

    return fields

# Main function and its private helpers
def _as_utc(moment):
    """Return *moment* as a timezone-aware UTC ``pd.Timestamp``.

    Naive values are interpreted as UTC so that naive and aware inputs can be
    compared with each other.
    """
    stamp = pd.Timestamp(moment)
    if stamp.tzinfo is None:
        return stamp.tz_localize("UTC")
    return stamp.tz_convert("UTC")


def _clean_symbol(s):
    """Reduce a symbol to the characters A-Z and 0-9 (``BTC/USDT`` -> ``BTCUSDT``)."""
    if pd.isna(s):
        return None
    return re.sub(r"[^A-Z0-9/]", "", s).replace("/", "")


def _build_clean_frame(rows, required):
    """Turn parsed message dicts into a cleaned DataFrame.

    Rows lacking any of the *required* columns are dropped. Required columns
    that never occurred in *rows* (including the case of no rows at all) are
    created first, so the cleaning steps cannot fail with ``KeyError``.
    """
    frame = pd.DataFrame(rows)
    missing = [col for col in [*required, "timestamp"] if col not in frame.columns]
    frame = frame.reindex(columns=[*frame.columns, *missing])

    frame.dropna(subset=required, inplace=True)
    frame["symbol"] = frame["symbol"].apply(_clean_symbol)
    frame["timestamp"] = pd.to_datetime(frame["timestamp"], utc=True)
    return frame


def _merge_with_history(history, fresh):
    """Append *fresh* rows to *history* and drop exact duplicate rows.

    Timestamps loaded from CSV are strings; they are converted to UTC
    datetimes first, otherwise a re-fetched row would never compare equal to
    its stored copy and duplicates would accumulate.
    """
    history = history.copy()
    if "timestamp" in history.columns:
        history["timestamp"] = pd.to_datetime(history["timestamp"], utc=True)

    parts = [part for part in (history, fresh) if not part.empty]
    if not parts:
        # Nothing stored and nothing new: keep the column layout of `fresh`.
        return fresh
    return pd.concat(parts, ignore_index=True).drop_duplicates()


async def run_all():
    """Fetch new group messages, parse them and update the cleaned CSV files.

    Steps:
      1. Print the latest message of the group (if there is one).
      2. Collect every text message newer than the older of
         ``last_signal_time`` / ``last_update_time``.
      3. Overwrite ``raw_path`` with the collected messages as JSON.
      4. Parse the messages and split them into signals (entry plus at least
         one take-profit target) and updates (symbol and hit price, no entry).
      5. Clean both sets, merge them with ``existing_signals`` /
         ``existing_updates`` and write ``signals_csv`` / ``updates_csv``.

    Reads the module-level configuration (``API_ID``, ``API_HASH``,
    ``GROUP_ID``, the paths and the previously loaded history) and returns
    nothing.
    """
    # Normalise both bounds to aware UTC so min() cannot fail on a
    # naive/aware mix (e.g. when only one of the two CSV files exists).
    since = min(_as_utc(last_signal_time), _as_utc(last_update_time)).to_pydatetime()

    messages = []
    async with TelegramClient("session_felix", API_ID, API_HASH) as client:

        # Show the latest message from the group
        latest = await client.get_messages(GROUP_ID, limit=1)
        print("Latest message from the group:\n")
        if latest:
            print(latest[0].text)

        # Fetch new messages since last update (oldest first)
        async for msg in client.iter_messages(GROUP_ID, offset_date=since, reverse=True):
            if msg.text:
                messages.append({"text": msg.text, "timestamp": msg.date.isoformat()})

    # Save messages to JSON (the file holds only the messages of this run)
    raw_path.parent.mkdir(parents=True, exist_ok=True)
    with open(raw_path, "w", encoding="utf-8") as f:
        json.dump(messages, f, ensure_ascii=False, indent=2)

    # Parse messages; the timestamp is added last, after the parsed fields
    parsed_messages = [
        {**parse_signal(item["text"]), "timestamp": item["timestamp"]}
        for item in messages
    ]

    # Classify parsed messages
    signal_data = [m for m in parsed_messages if "entry" in m and any(k.startswith("tp_") for k in m)]
    update_data = [m for m in parsed_messages if "symbol" in m and "entry" not in m and "hit_price" in m]

    # Remove incomplete rows, clean symbols and format timestamps
    df_signals = _build_clean_frame(signal_data, ["symbol", "entry", "tp_40", "tp_60", "tp_80", "tp_100"])
    df_updates = _build_clean_frame(update_data, ["symbol", "hit_price"])

    # Append and deduplicate with existing data
    df_signals = _merge_with_history(existing_signals, df_signals)
    df_updates = _merge_with_history(existing_updates, df_updates)

    # Export updated CSVs
    signals_csv.parent.mkdir(parents=True, exist_ok=True)
    updates_csv.parent.mkdir(parents=True, exist_ok=True)
    df_signals.to_csv(signals_csv, index=False)
    df_updates.to_csv(updates_csv, index=False)

    print(f"\nUpdated: {df_signals.shape[0]} signals, {df_updates.shape[0]} updates.")

# Run the function.
# NOTE: a bare top-level `await` is only valid where an event loop is already
# running (such as a Jupyter/IPython cell); a plain Python script would need
# asyncio.run(run_all()) instead.
await run_all()




In [None]:
import json
import pandas as pd
from pathlib import Path


# Load messages from the JSON file
raw_path = Path("../data/raw/raw_messages.json")

with open(raw_path, "r", encoding="utf-8") as f:
    raw_messages = json.load(f)

print(f"Messages loaded: {len(raw_messages)}")
# The file contains an empty list when the last run fetched no messages,
# so only index into it when there is something to show.
if raw_messages:
    print("Example:", raw_messages[0])
else:
    print("Example: none (the file contains no messages)")