In [None]:
from datetime import datetime

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from read_telega_dump import telega_dump_to_pandas


# Telegram group id; it can be taken from "Copy Message Link" in Telegram Desktop.
tlg_group_id = -1001688539638 

### We can use either of the cells below to load the data: from a raw export dump or from a saved parquet file

In [None]:
# Load the chat from a JSON export dump; the path is machine-specific, adjust it as needed.
dump_path = r"D:\test_data\ChatExport_2024-03-14\result.json"
df = telega_dump_to_pandas(dump_path=dump_path)
# Note: data taken from "export chat history" does not include the number of reactions.


In [None]:
# Load the chat from a previously saved parquet file.
# `tlg_group_id` comes from the first (imports/config) cell; it is not
# re-declared here so the group id has a single source of truth.
# A forward slash is used as the separator because it resolves on both
# Windows and POSIX systems, whereas a backslash only works on Windows.
file_path = f'data/chat{tlg_group_id}.parquet.gzip'
df = pd.read_parquet(file_path)
# Displayed as the cell output: (rows, columns) of the loaded frame.
df.shape

In [None]:
# Show the dtype of every column of the loaded frame.
df.dtypes

### Overall statistics

In [None]:
# Overall statistics of the loaded chat.
# Each metric is computed directly on its column, so `df` is never mutated
# (no temporary helper column has to be added and deleted afterwards).
msgs_cnt = df['msg_id'].count()          # number of non-null message ids
min_date = df['msg_date'].min()
max_date = df['msg_date'].max()
unique_users = df['user_id'].nunique()

# Whole days between the first and the last message. A span shorter than
# 24 hours is counted as one day so that the per-day averages (here and in
# the hourly chart further down) never divide by zero.
number_of_days = max((max_date - min_date).days, 1)

cols = ["Metric name", "Value"]
dt_format = "%Y-%m-%d"
date_interval = f'from {min_date.strftime(dt_format)} to {max_date.strftime(dt_format)}'
d_report = [
    {cols[0]: "Number of messages", cols[1]: msgs_cnt},
    {cols[0]: "Date interval", cols[1]: date_interval},
    {cols[0]: "Users count", cols[1]: unique_users},
    {cols[0]: "Avg messages per day", cols[1]: round(msgs_cnt / number_of_days, 1)},
]
# Last expression of the cell: rendered as a two-column report table.
pd.DataFrame(d_report, columns=cols)






### Assign a date range to restrict the dataset to a particular period for further analysis, or simply skip the cell below

In [None]:
# Optional filter: keep only the messages sent inside `dt_range`.
tm_zn = 'Europe/Istanbul'
# The upper bound is the current moment *in the target time zone*.
# `pd.Timestamp.now(tz=...)` is used instead of wrapping `datetime.now()`:
# the latter yields the machine's naive local time, which would merely be
# labelled with `tm_zn` and therefore be off whenever the machine runs in a
# different time zone.
dt_range = (
    pd.Timestamp(year=2022, month=2, day=23, tz=tm_zn),
    pd.Timestamp.now(tz=tm_zn),
)
# Length of the selected period in whole days; used for per-day averages below.
# NOTE(review): this is the length of the requested range, not of the data
# actually present in it — confirm that is the intended denominator.
number_of_days = (dt_range[1] - dt_range[0]).days
# `between` includes both bounds by default.
# NOTE(review): presumably `msg_date` is tz-aware, since it is compared with
# tz-aware bounds — confirm against the loader.
df = df[df.msg_date.between(*dt_range)]


### aggregations by date

In [None]:
# Messages per calendar day, busiest days first; chart the ten busiest days.
df_aggr = (
    df.groupby(df.msg_date.dt.date)
    .size()
    .sort_values(ascending=False)
    .reset_index(name='messages_count')
)
df_aggr.head(10).plot.bar(x='msg_date', y='messages_count')
plt.show()

In [None]:
# Histogram of the per-day message counts computed in the previous cell
df_aggr['messages_count'].hist(bins=50)

In [None]:
# Average number of messages for each hour of the day.
# The hourly counts are re-indexed onto the full 0..23 range so that hours
# without any message show up as zero-height bars instead of silently
# disappearing from the chart (which would distort the x axis).
df_aggr = (
    df.groupby(df.msg_date.dt.hour)
    .size()
    .reindex(range(24), fill_value=0)
    .div(number_of_days)      # total per hour -> average per day of the period
    .round(2)
    .rename_axis('Hour of day')
    .reset_index(name='avg messages per hour')
)
df_aggr.plot(x='Hour of day', y='avg messages per hour', kind='bar')
plt.show()