In [5]:
# Parses and organizes all the messages in telegram account.

In [None]:
#openai.api_key = os.getenv('OPENAI_API_KEY')

def categorize_message(message):
    prompt = f'Classify the following message as either "context" or "response":\n\n"{message}"\n\nAnswer with one word only.'
    
    response = openai.Completion.create(
        engine="text-davinci-003",  # Choose a suitable engine
        prompt=prompt,
        max_tokens=1,
        n=1,
        stop=None,
        temperature=0.5,
    )
    
    return response.choices[0].text.strip()

In [1]:
from telethon import TelegramClient
from telethon.tl.functions.messages import GetHistoryRequest
from telethon.tl.types import User, PeerUser
from telethon.errors import FloodWaitError
import asyncio
import time 
import openai
from dotenv import load_dotenv
import json
import os
import sys

In [2]:
dotenv_path = ".env"
load_dotenv(dotenv_path=dotenv_path)

api_id = os.getenv('TELEGRAM_API_ID')
api_hash = os.getenv('TELEGRAM_HASH_ID')
phone_number = os.getenv('PHONE_NUMBER')
my_telegram_id = os.getenv('my_telegram_id')
session_name = "telegram_parser"
client = TelegramClient(session_name, api_id, api_hash)


In [14]:
async def get_total_messages(session_name, api_id, api_hash, phone_number, only_personal=True):
      """
      Shows a total amount of messages that your account has. 
      """
      total_messages = 0

      async with TelegramClient(session_name, api_id, api_hash) as client:
            client.start(phone_number)
            dialogs = await client.get_dialogs()

            if only_personal:
                  dialogs = [dialog for dialog in dialogs if isinstance(dialog.entity, User)]
                  print(f"Total dialogs: {len(dialogs)}")
            for dialog in dialogs:
                  async for message in client.iter_messages(dialog.entity, limit=None):
                        total_messages += 1
            print(f"Total messages: {total_messages}")
            client.disconnect()
            return total_messages

In [7]:
# Takes some time to run
#%time
#total_messages = await get_total_messages(session_name, api_id, api_hash, phone_number, only_personal=True)

In [15]:
async def extract_message_info(messages):
      extracted_dialog = []
      last_message=None

      for message in messages:
            try: 
                  text = message.message.strip() if message.message else ""
                  sender = message.from_id if message.from_id else (await client.get_entity(message.peer_id)).id
                  sender = sender.user_id if isinstance(sender, PeerUser) else sender # Deletes PeerUser classes and keeps only int id

                  #sent_by_me = my_telegram_id == sender   
                  date = message.date 
            except FloodWaitError as e:
                  print(f"FloodWaitError: sleeping for {e.seconds} seconds.")
                  await asyncio.sleep(e.seconds)
                  continue 
                  
            if text:
                  if last_message and sender == last_message[1]:
                        last_message[0] = " ".join([last_message[0], text])
                  else:
                        if last_message:
                              extracted_dialog.append(last_message)
                        last_message = [text, sender, date]

      if last_message:
            extracted_dialog.append(last_message)
      
      return extracted_dialog

In [None]:
def exclude_single_dialogs(dialogs):
     """
    Eliminates chats with no response from second user.

     Accepts a list of dialogs [client.get_dialogs()]
     Returns chats where both participants took participation in dialog.
     """

     dialogs = [dialog for dialog in dialogs if len(dialog.participants) == 2]


In [46]:
import pandas as pd

async def parse_data(threshold: int =50, 
                     message_limit=None,
                      dialogs_limit: int = 100,
                      verbose=1,
                      checkpoints: bool = True):
    """
    Parses all the messages in the profile.
    
    Args:
        threshold: int
            The minimum amount of messages in a dialog to be processed.
        message_limit: int
            The maximum amount of messages to be processed in a dialog.
        dialogs_limit: int
            The maximum amount of dialogs to be processed.
        verbose: int
            The amount of output to be printed.
        top_chats_first: bool
            Whether to process chats with most messages first.

    Returns:
        pd.DataFrame
            The parsed data.
    """
    async with client:

        dialogs = await client.get_dialogs()
        dialogs = [dialog for dialog in dialogs if isinstance(dialog.entity, User)]
        dialogs = [dialog for dialog in dialogs if not dialog.entity.bot]
        my_telegram_id = int((await client.get_me()).id)
        dialogs = [dialog for dialog in dialogs if dialog.entity.id != my_telegram_id]
        dialogs = dialogs[:dialogs_limit]
        filtered_dialogs = pd.DataFrame(columns=["Message", "Sender","Date"])

        if verbose: 
            total = 0
            print(f"Total dialogs: {len(dialogs)}")

        # Check for checkpoint
        if checkpoints:
            if os.path.exists("checkpoint.pkt"):
                checkpoint = pd.read_pickle("checkpoint.pkt")
                filtered_dialogs = checkpoint["data"]
                total = checkpoint["last_iter"]
                dialogs = dialogs[total-1:]
                print(f"Resuming from checkpoint. Dialogs left: {len(dialogs)}.")

        # Main loop
        for dialog in dialogs[:dialogs_limit]:
            start_time = time.time() if verbose else None
            messages_info = []
            async for message in client.iter_messages(dialog.entity, limit=message_limit, wait_time=10):
                messages_info.append(message)

            total_messages = len(messages_info)
            if total_messages > threshold:
                extracted_dialog = await extract_message_info(messages_info)
                filtered_dialogs = pd.concat([filtered_dialogs, pd.DataFrame(extracted_dialog, columns=["Message", "Sender", "Date"])])
                if verbose: 
                    total += 1
                    run_time = time.time() - start_time
                    print(f"Dialogs processed: {total}, left: {len(dialogs) - total}. Run time: {run_time:.2f} seconds") 
            if checkpoints:
                checkpoint = {"data": filtered_dialogs,
                               "last_iter": total}
                pd.to_pickle(checkpoint, "checkpoint.pkt")
        if os.path.exists("checkpoint.pkt"):
            os.remove("checkpoint.pkt")
        
        return filtered_dialogs

## **If you have >10k messages, it will take a long time to run. Hope you are patient.**

In [47]:
%time
async def main():
    if os.path.exists(f"parsers\{session_name}.session-journal"):
        print(f"Session {session_name} exists. Please delete it and restart the script. Or change the session name in the script.")
        sys.exit()
    else:
        await client.start(phone_number)
        print(f"Connecting with {client.session}")
        data = await parse_data(message_limit=100, dialogs_limit=10, verbose=1, checkpoints=True)
        data = pd.DataFrame(data, columns=["Message", "Sender", "Date"])
        data["Sent_by_me"] = int(my_telegram_id) == data["Sender"]
        return data
        client.disconnect()
        print("DONE")

data = await main()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs
Connecting with <telethon.sessions.sqlite.SQLiteSession object at 0x109808c10>
Total dialogs: 10
Resuming from checkpoint. Dialogs left: 10.


CancelledError: 

In [3]:
if os.path.exists("parsers/full_telegram_data.csv"):
      print("File with the same name already exists. Do you want to overwrite it? (y/n)")
      if input() == "y":
            data.to_csv(r'full_telegram_data.csv', index=False)
      else:
            print("File not overwritten.")
            sys.close()