In [1]:
# Access the telegram profiles and parses all context and responses in the profile.

In [44]:
from telethon import TelegramClient
from telethon.tl.functions.messages import GetHistoryRequest
from telethon.tl.types import User
from telethon.errors import FloodWaitError
import asyncio
import openai
from dotenv import load_dotenv
import json
import os
import sys

In [73]:
dotenv_path = ".env"
load_dotenv(dotenv_path=dotenv_path)

api_id = os.getenv('TELEGRAM_API_ID')
api_hash = os.getenv('TELEGRAM_HASH_ID')
phone_number = os.getenv('PHONE_NUMBER')
my_telegram_id = os.getenv('MY_TELEGRAM_ID')
session_name = "telegram_parser"
client = TelegramClient(session_name, api_id, api_hash)


In [4]:
openai.api_key = os.getenv('OPENAI_API_KEY')

def categorize_message(message):
    prompt = f'Classify the following message as either "context" or "response":\n\n"{message}"\n\nAnswer with one word only.'
    
    response = openai.Completion.create(
        engine="text-davinci-003",  # Choose a suitable engine
        prompt=prompt,
        max_tokens=1,
        n=1,
        stop=None,
        temperature=0.5,
    )
    
    return response.choices[0].text.strip()

In [5]:
def optimize_messages(messages):
      """
      Function which uses a set of tuning algorithms to meet the criteria of optimized data for future models.
      """

      # TODO: Include only messages in ukrainian language


      # TODO: Put todos below in order of priority 
      # For each of the points below, if true: add one, if false: minus one
      # TODO: Add detection system for context and response:

      # TODO: If the message contains question mark in the end of the message, it is a context
      # TODO: The first message of the new day is probably a context.
      # TODO: If there are few messages in a row from user, concatenate them into one message.
      # TODO: If there is a significant time gap (e.g., several hours) between messages, the first message after the gap might be a context.
      # TODO: Look for specific keywords or phrases that typically indicate a context (e.g., "What do you think about...", "Can you explain...", "Why is...").
      # TODO: If a message is a direct reply to a previous context message, it is likely a response.
      # TODO: Short messages that directly follow a context are likely responses.
      # TODO: If the same user repeatedly sends messages ending with question marks or messages at the start of the day, those are likely contexts.
       

In [92]:
async def extract_message_info(messages):
      extracted_dialog = []
      last_message=None

      for message in messages:
            try: 
                  text = message.message.strip() if message.message else ""
                  sender = message.from_id if message.from_id else (await client.get_entity(message.peer_id)).id
                  
                  #sent_by_me = my_telegram_id == sender   
                  date = message.date 
            except FloodWaitError as e:
                  print(f"FloodWaitError: sleeping for {e.seconds} seconds.")
                  await asyncio.sleep(e.seconds)
                  continue 
                  
            if text:
                  if last_message and sender == last_message[1]:
                        last_message[0] = " ".join([last_message[0], text])
                  else:
                        if last_message:
                              extracted_dialog.append(last_message)
                        last_message = [text, sender, sent_by_me, date]

      if last_message:
            extracted_dialog.append(last_message)
      
      return extracted_dialog


In [168]:
async def get_total_messages(session_name, api_id, api_hash, phone_number, only_personal=False):
      """
      Shows a total amount of messages that your account has. 
      """
      total_messages = 0

      async with TelegramClient(session_name, api_id, api_hash) as client:
            client.start(phone_number)
            dialogs = await client.get_dialogs()

            if only_personal:
                  dialogs = [dialog for dialog in dialogs if isinstance(dialog.entity, User)]
                  print(f"Total dialogs: {len(dialogs)}")
            for dialog in dialogs:
                  async for message in client.iter_messages(dialog.entity, limit=None):
                        total_messages += 1
            print(f"Total messages: {total_messages}")
            client.disconnect()
            return total_messages

In [169]:
# Takes some time to run
%time
total_messages = await get_total_messages(session_name, api_id, api_hash, phone_number, only_personal=True)

CPU times: user 12 µs, sys: 8 µs, total: 20 µs
Wall time: 37.9 µs


CancelledError: 

In [97]:
import pandas as pd

async def parse_data(threshold: int =50, message_limit=None, dialogs_limit: int = 100, verbose=1, top_chats_first: bool =False):
    """
    Parses all the messages in the profile.
    
    Args:
        threshold: int
            The minimum amount of messages in a dialog to be processed.
        message_limit: int
            The maximum amount of messages to be processed in a dialog.
        dialogs_limit: int
            The maximum amount of dialogs to be processed.
        verbose: int
            The amount of output to be printed.
        top_chats_first: bool
            Whether to process chats with most messages first.

    Returns:
        pd.DataFrame
            The parsed data.
    """
    async with client:
        dialogs = await client.get_dialogs()
        dialogs = [dialog for dialog in dialogs if isinstance(dialog.entity, User)]
        dialogs = [dialog for dialog in dialogs if not dialog.entity.bot]
        dialogs = dialogs[:dialogs_limit]
        if verbose: 
            total = 0 
            print(f"Total dialogs: {len(dialogs)}")
        filtered_dialogs = pd.DataFrame(columns=["Message", "Sender", "Date"])
        for dialog in dialogs[:dialogs_limit]:
            messages = await client.get_messages(dialog.entity, limit=message_limit)

            total_messages = len(messages)
            print(f"Processing dialog. Total messages: {total_messages}")
            if total_messages > threshold:
                extracted_dialog = await extract_message_info(messages)
                filtered_dialogs = pd.concat([filtered_dialogs, pd.DataFrame(extracted_dialog, columns=["Message", "Sender", "Sent_by_me","Date"])])
                if verbose: 
                    total += 1
                    print(f"Dialogs processed: {total}, left: {len(dialogs) - total}") 
        return filtered_dialogs

In [99]:
%time
async def main():
    if os.path.exists(f"parsers\{session_name}.session-journal"):
        print(f"Session {session_name} exists. Please delete it and restart the script. Or change the session name in the script.")
        sys.exit()
    else:
        await client.start(phone_number)
        print(f"Connecting with {client.session}")
        data = await parse_data(message_limit=100, dialogs_limit=None, verbose=1)
        data = pd.DataFrame(data, columns=["Message", "Sender", "Sent_by_me", "Date"])
        return data
        client.disconnect()
        print("DONE")

data = await main()
#client.disconnect

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 5.01 µs
Connecting with <telethon.sessions.sqlite.SQLiteSession object at 0x1311623a0>
Total dialogs: 161
Processing dialog. Total messages: 100


  filtered_dialogs = pd.concat([filtered_dialogs, pd.DataFrame(extracted_dialog, columns=["Message", "Sender", "Sent_by_me","Date"])])


Dialogs processed: 1, left: 160
Processing dialog. Total messages: 53
Dialogs processed: 2, left: 159
Processing dialog. Total messages: 100
Dialogs processed: 3, left: 158
Processing dialog. Total messages: 100
Dialogs processed: 4, left: 157
Processing dialog. Total messages: 100
Dialogs processed: 5, left: 156
Processing dialog. Total messages: 100
Dialogs processed: 6, left: 155
Processing dialog. Total messages: 67
Dialogs processed: 7, left: 154
Processing dialog. Total messages: 100
Dialogs processed: 8, left: 153
Processing dialog. Total messages: 100
Dialogs processed: 9, left: 152
Processing dialog. Total messages: 8
Processing dialog. Total messages: 100
Dialogs processed: 10, left: 151
Processing dialog. Total messages: 2
Processing dialog. Total messages: 63
Dialogs processed: 11, left: 150
Processing dialog. Total messages: 1
Processing dialog. Total messages: 100
Dialogs processed: 12, left: 149
Processing dialog. Total messages: 1
Processing dialog. Total messages: 4
Pr

Error executing high-level request after reconnect: <class 'telethon.errors.rpcerrorlist.FloodWaitError'>: A wait of 79 seconds is required (caused by GetUsersRequest)
  filtered_dialogs = pd.concat([filtered_dialogs, pd.DataFrame(extracted_dialog, columns=["Message", "Sender", "Sent_by_me","Date"])])


Dialogs processed: 20, left: 141
Processing dialog. Total messages: 100
Dialogs processed: 21, left: 140
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 100
Dialogs processed: 22, left: 139
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 90
Dialogs processed: 23, left: 138
Processing dialog. Total messages: 1
Processing dialog. Total messages: 12
Processing dialog. Total messages: 1
Processing dialog. Total messages: 26
Processing dialog. Total messages: 1
Processing dialog. Total messages: 4
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 100
Dialogs processed: 24, left: 137
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing d

  filtered_dialogs = pd.concat([filtered_dialogs, pd.DataFrame(extracted_dialog, columns=["Message", "Sender", "Sent_by_me","Date"])])


Dialogs processed: 35, left: 126
Processing dialog. Total messages: 4
Processing dialog. Total messages: 62
Dialogs processed: 36, left: 125
Processing dialog. Total messages: 2
Processing dialog. Total messages: 15
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 26
Processing dialog. Total messages: 21
Processing dialog. Total messages: 6
Processing dialog. Total messages: 2
Processing dialog. Total messages: 24
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 1
Processing dialog. Total messages: 2
Processing dialog. Total messages: 1
Proc

In [116]:
data["Sent_by_me"] = data["Sent_by_me"].astype(str)
data

SyntaxError: invalid syntax (740105273.py, line 1)

In [113]:
text_to_match = ""

mask = data["Sender"] == text_to_match 
mask

0     False
1     False
2     False
3     False
4     False
      ...  
35    False
36    False
37    False
38    False
39    False
Name: Sender, Length: 1283, dtype: bool

In [None]:
client = TelegramClient('telegram_parse', api_id, api_hash)
dialogs = client.get_dialogs()
print(dialogs[:2])
client.disconnect()