In [None]:
#from config import loading_parameters
from processing_data import main as process_data

from telethon.tl.functions.messages import GetHistoryRequest
from telethon.tl.types import User, PeerUser
from telethon.errors import FloodWaitError
from telethon import TelegramClient
from dotenv import load_dotenv 

import parsers.telegram.telegram_parse as telegram_parse
import parsers.discord.discord_parse as discord_parse
import parsers.instagram.instagram_parse as instagram_parse
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np 
import yaml
import asyncio
import os 
import re
import json 
import time 


In [None]:
# Load variables from a local .env file (if one exists) into the process
# environment so the os.getenv() lookup below can see them. By default
# load_dotenv() does not override variables that are already set.
load_dotenv()

with open('config.yaml', 'r', encoding='utf-8') as f:
    # An empty config.yaml parses to None; fall back to an empty mapping.
    full_config = yaml.safe_load(f) or {}

# A present-but-empty `loading_parameters:` key also yields None.
loading_parameters = full_config.get('loading_parameters') or {}

ROOT_PATH                = os.path.abspath(os.getcwd())


def _resolve_config_path(key):
    """Return the absolute path configured under ``key``, or None when unset.

    Relative values are resolved against ROOT_PATH. Returning None for an
    unset key lets a source that is switched off omit its paths from
    config.yaml instead of failing inside os.path.join.
    """
    configured = loading_parameters.get(key)
    if configured is None:
        return None
    return os.path.abspath(os.path.join(ROOT_PATH, configured))


T_LOCAL_JSON_PATH        = _resolve_config_path("t_local_json_path")
T_GLOBAL_SAVE_PATH       = _resolve_config_path("t_global_save_path")
DISCORD_PACKAGE_FOLDER   = _resolve_config_path("discord_package_folder")
INBOX_PATH               = _resolve_config_path("inbox_path")
SAVE_PATH                = os.path.abspath(os.path.join(ROOT_PATH, "Datasets/"))
# TODO: When finished, integrate .env with config.yaml
INSTAGRAM_USERNAME       = os.getenv('INSTAGRAM_USERNAME')
T_PARSE_TYPE             = loading_parameters.get("t_parse_type")
# Switches selecting which parsers main() runs.
TELEGRAM                 = loading_parameters.get("telegram")
INSTAGRAM                = loading_parameters.get("instagram")
DISCORD                  = loading_parameters.get("discord")
# Options forwarded as keyword arguments to the parsers.
CHECKPOINTS              = loading_parameters.get("checkpoints")
SAVE_CSV                 = loading_parameters.get("save_csv")
MESSAGE_LIMIT            = loading_parameters.get("message_limit")
DIALOGS_LIMIT            = loading_parameters.get("dialogs_limit")
VERBOSE                  = loading_parameters.get("verbose")
THRESHOLD                = loading_parameters.get("threshold")

In [None]:
async def main(telegram = TELEGRAM,
               instagram = INSTAGRAM,
               discord = DISCORD,
               discord_path = DISCORD_PACKAGE_FOLDER,
               **kwargs) -> dict:
    """
    Run every enabled parser and collect what each one returns.

    Args:
        telegram: When truthy, await the Telegram parser.
        instagram: When truthy, run the Instagram parser.
        discord: When truthy, run the Discord parser.
        discord_path: Passed to the Discord parser as ``path``.
        **kwargs: Forwarded unchanged to every parser that runs.

    Returns:
        dict: Parser results keyed by 'telegram_dataset',
        'instagram_dataset' and 'discord_dataset'. A source that is
        switched off has no entry.
    """
    collected = {}

    if telegram:
        # The Telegram parser is a coroutine; the other two are plain calls.
        collected['telegram_dataset'] = await telegram_parse.main(
            parse_type=T_PARSE_TYPE,
            json_path=T_LOCAL_JSON_PATH,
            save_path=T_GLOBAL_SAVE_PATH,
            **kwargs,
        )
    if instagram:
        collected['instagram_dataset'] = instagram_parse.main(
            inbox_path=INBOX_PATH,
            instagram_username=INSTAGRAM_USERNAME,
            **kwargs,
        )
    if discord:
        collected['discord_dataset'] = discord_parse.main(path=discord_path, **kwargs)

    return collected

In [None]:
kwargs = {
      "save_csv": SAVE_CSV,
      "message_limit": MESSAGE_LIMIT,
      "dialogs_limit": DIALOGS_LIMIT,
      "verbose": VERBOSE,
      "checkpoints": CHECKPOINTS,
      "threshold": THRESHOLD
}

datasets = await main(**kwargs)
      
# # Iterating over the dictionary key and assigning proper name to it. telegram_dataset = telegram_df
# for key, value in datasets.items():
#     locals()[key] = value

total_messages = sum([len(row_len) for row_len in datasets.values()])
print(f"Collected total of {total_messages} messages")  

# Concatenating dataframes
dataset = pd.DataFrame(columns=['Message', 'Sender', 'Date'])
for key, value in datasets.items():
      dataset = pd.concat([dataset, value])
      print(f"Concatenated {key}")

concatenated_path = os.path.abspath(os.path.join(SAVE_PATH, 'concatenated.csv'))
if not os.path.exists(concatenated_path):
      print(f"Saving to {concatenated_path}")
      dataset.to_csv(concatenated_path, index=False)

# Data Visualization 

In [None]:
# Reload the concatenated dataset from disk and preview its first 100 rows.
# (The former bare `dataset["Message"].to_string` only referenced the method
# without calling it, so it had no effect and has been dropped.)
concatenated_path = os.path.join(SAVE_PATH, 'concatenated.csv')
dataset = pd.read_csv(concatenated_path)
dataset.head(100)

In [None]:
def simplify_date(date): # For graphs
    """Reduce a date-like value to a 'YYYY-MM-DD' string.

    Args:
        date: Anything pd.to_datetime accepts (string, Timestamp, ...).

    Returns:
        The formatted date, or None when ``date`` is missing (None, NaN, NaT).
    """
    parsed = pd.to_datetime(date)
    # pd.to_datetime passes None through and maps NaN to NaT; neither can be
    # formatted, so report the value as missing instead of raising.
    if parsed is None or parsed is pd.NaT:
        return None
    return parsed.strftime("%Y-%m-%d")

In [None]:
# Collapse every timestamp to its calendar day, then preview the result.
simplified_dates = dataset['Date'].apply(simplify_date)
dataset['Date'] = simplified_dates
dataset['Date'].head(100)

In [None]:
# Missing values? Count them per column first, then add the columns up.
missing_per_column = dataset.isna().sum()
missing_per_column.sum()

In [None]:
# Order the rows by their 'Date' column, earliest first.
dataset = dataset.sort_values(by='Date', ascending=True)

In [None]:
def plot_message_count_over_time(data, bar_width=0.35, font_size=12):
    """Show three bar charts: messages per year, per month, and per day.

    The month chart covers the year with the most messages; the day chart
    covers the busiest month of that year.

    Args:
        data: DataFrame with a 'Date' column that pd.to_datetime can parse.
            The frame is not modified.
        bar_width: Not applied to the bars; kept so existing calls that pass
            it keep working.
        font_size: Font size for titles and axis labels.

    Raises:
        ValueError: If ``data`` contains no non-missing dates.
    """
    # Parse into a separate Series so the caller's frame is left untouched,
    # and drop missing dates, which cannot be assigned to a year/month/day.
    dates = pd.to_datetime(data['Date']).dropna()
    if dates.empty:
        raise ValueError("plot_message_count_over_time: no valid dates to plot")

    fig, axs = plt.subplots(1, 3, figsize=(10, 5))

    # Messages per year, computed from the argument (not a notebook global).
    years_messages = dates.dt.year.value_counts()
    top_year = years_messages.idxmax()

    # Messages per month within the busiest year.
    top_year_dates = dates[dates.dt.year == top_year]
    months_messages = top_year_dates.dt.month.value_counts()
    top_month = months_messages.idxmax()
    months_messages = months_messages.sort_index(ascending=True)

    # Messages per day within the busiest month of that year.
    top_month_dates = top_year_dates[top_year_dates.dt.month == top_month]
    day_messages = top_month_dates.dt.day.value_counts().sort_index(ascending=True)

    # Plot Years
    axs[0].bar(years_messages.index, years_messages.values, color='red')
    axs[0].set_title(top_year, fontsize=font_size)
    axs[0].set_xlabel('Years', fontsize=font_size)
    axs[0].set_ylabel('Number of Messages', fontsize=font_size)

    # Plot Months; the title names the last month present in the busiest year.
    # It gets its own variable so it no longer clobbers the busiest month.
    last_month = months_messages.index[-1]
    axs[1].bar(months_messages.index, months_messages.values, color='blue')
    axs[1].set_title(f"{last_month}/{top_year}", fontsize=font_size)
    axs[1].set_xlabel('Months', fontsize=font_size)
    axs[1].set_ylabel('Number of Messages', fontsize=font_size)

    # Plot days; title reads month/day-range/year for the month actually drawn.
    last_day = day_messages.index[-1]
    axs[2].bar(day_messages.index, day_messages.values, color='g')
    axs[2].set_title(f"{top_month}/1-{last_day}/{top_year}", fontsize=font_size)
    axs[2].set_xlabel('Days', fontsize=font_size)
    axs[2].set_ylabel('Number of Messages', fontsize=font_size)

    plt.tight_layout()
    plt.show()

In [None]:
def plot_heatmap(dataset):
    """Draw a heatmap of message counts: day of month (rows) by year-month (columns).

    Args:
        dataset: DataFrame with a 'Date' column that pd.to_datetime can parse.
            The frame is not modified.

    Raises:
        ValueError: If ``dataset`` contains no non-missing dates.
    """
    # Parse into a separate Series so the caller's frame is left untouched.
    # Missing dates (NaT) cannot be placed on the grid and are dropped.
    dates = pd.to_datetime(dataset["Date"]).dropna()
    if dates.empty:
        raise ValueError("plot_heatmap: no valid dates to plot")

    # Zero-padded 'YYYY-MM' labels sort chronologically as plain strings, so
    # the pivot's sorted columns run in calendar order ('2023-02' < '2023-10').
    calendar = pd.DataFrame({
        'Year-Month': dates.dt.strftime('%Y-%m'),
        'Day': dates.dt.day,
    })

    # Count messages for every (year-month, day) pair.
    grouped_data = calendar.groupby(['Year-Month', 'Day']).size().reset_index(name='Message Count')

    # Pivot table for heatmap (rows: 'Day', columns: 'Year-Month', values: 'Message Count')
    heatmap_data = grouped_data.pivot(index='Day', columns='Year-Month', values='Message Count')

    # Create heatmap
    plt.figure(figsize=(12, 6))
    sns.heatmap(heatmap_data, cmap="coolwarm", cbar_kws={'label': 'Number of Messages'}, linewidths=0.1, linecolor='gray')

    plt.title('Messages Amount per Day')
    plt.xlabel('Year-Month')
    plt.ylabel('Days')

    plt.show()


In [None]:
# Draw both visualisations as separate statements; written as one tuple
# expression the cell would also echo a meaningless `(None, None)` result.
plot_message_count_over_time(data=dataset)
plot_heatmap(dataset)

## Data Processing

In [None]:
# Rebind `dataset` to whatever processing_data.main returns, then preview
# its first five rows.
dataset = process_data()
dataset.head(n=5)

# Model Training 