#  Scale of [group]/[channel] Accounts

In [6]:
import json
import csv
import os
import pandas as pd
import re

# Root directory holding one sub-directory per crawled account; each account
# directory is read for info.json, messages_2022.json and (groups only)
# members.json.
data_dir = 'telegram_measurement'
accounts = os.listdir(data_dir)
print(f"The number of [group]/[channel] accounts crawled: {len(accounts)}")

# Compiled once instead of being handed to re.findall for every message.
# Written as a raw string: the previous plain literal relied on invalid escape
# sequences (\/, \w, \-, ...) that Python warns about; the resulting pattern
# text is unchanged.
URL_PATTERN = re.compile(
    r'(?:http|ftp|https):\/\/[\w\-_]+(?:\.[\w\-_]+)+(?:[\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?'
)


def _iter_json_lines(path):
    """Yield one decoded JSON value per non-blank line of the UTF-8 file at `path`.

    The file is streamed line by line rather than loaded with readlines(), and
    blank lines (e.g. a trailing empty line) are skipped instead of being
    passed to json.loads, which would raise on them.
    """
    with open(path, "r", encoding="utf-8") as fd:
        for line in fd:
            if line.strip():
                yield json.loads(line)


# Running totals over all accounts; `members` and `urls` are kept as full
# lists because later cells aggregate them.
group_num = 0
channel_num = 0
message_num = 0
subscriber_num = 0
member_num = 0
urls = []
members = []
with open("statistics.csv", "w", encoding="utf-8", newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['account', 'type', 'messages', 'users', 'urls'])

    for account in accounts:
        account_dir = os.path.join(data_dir, account)
        with open(os.path.join(account_dir, "info.json"), "r", encoding="utf-8") as fd:
            info = json.load(fd)

        # user_num stays 0 for any account type other than group/channel.
        user_num = 0
        if info['type'] == 'group':
            group_num += 1
            user_num = info['members']
            member_num += user_num
            members.extend(_iter_json_lines(os.path.join(account_dir, "members.json")))
        elif info['type'] == 'channel':
            channel_num += 1
            user_num = info['subscribers']
            subscriber_num += user_num

        # Messages are streamed: only the count and the URLs are retained.
        account_message_num = 0
        url_num = 0
        for message in _iter_json_lines(os.path.join(account_dir, "messages_2022.json")):
            account_message_num += 1

            # A URL can be attached as media and/or appear in the message
            # text; both sources are counted. The isinstance guard keeps a
            # missing (None) media entry from raising.
            media = message['media']
            if isinstance(media, dict) and 'url' in media:
                url_num += 1
                urls.append(media['url'])

            # `or ''` tolerates a message whose text is None.
            urls_re = URL_PATTERN.findall(message['message'] or '')
            url_num += len(urls_re)
            urls.extend(urls_re)
        message_num += account_message_num

        csvwriter.writerow([info['username'], info['type'], account_message_num, user_num, url_num])

print(f"The number of [group] account crawled: {group_num}")
print(f"The number of [channel] account crawled: {channel_num}")
print(f"The number of messages in 2022 crawled: {message_num}")

# Last expression of the cell: display the per-account table just written.
pd.read_csv("statistics.csv", encoding="utf-8")


The number of [group]/[channel] accounts crawled: 2612
The number of [group] account crawled: 578
The number of [channel] account crawled: 2034
The number of messages in 2022 crawled: 12088801


Unnamed: 0,account,type,messages,users,urls
0,AA8203AA15,channel,0,0,0
1,AA821EE,group,24011,479,4
2,Aa82269958,group,11,12,7
3,aa8245,channel,7,3,0
4,aa8247,channel,10,6,9
...,...,...,...,...,...
2607,ZSB558,channel,0,1,0
2608,zs_10086,channel,258,58271,0
2609,zyhsx,channel,3,40,0
2610,zzan1,channel,9,314,7


In [7]:
# Totals accumulated while the account directories were scanned.
print(f"The number of subscribers in [channel]: {subscriber_num}")
print(f"The number of members in [group]: {member_num}")

# A member is identified by the full tuple of these fields, so two records
# that share an id but differ in any other field are counted separately.
member_fields = ('id', 'username', 'first_name', 'last_name', 'is_bot')

# Count how many times each distinct member tuple occurs in `members`.
members_stat = {}
for record in members:
    identity = tuple(record[field] for field in member_fields)
    if identity in members_stat:
        members_stat[identity] += 1
    else:
        members_stat[identity] = 1

# Most frequent first; ties keep first-seen order because the sort is stable.
members_stat_sorted = sorted(
    members_stat.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
# Every dict key is distinct, so the number of keys is the unique count.
print(f"The number of unique members in [group]: {len(members_stat)}")

with open("members.csv", "w", encoding="utf-8", newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow([*member_fields, 'occurrence'])
    csvwriter.writerows(
        [*identity, occurrence] for identity, occurrence in members_stat_sorted
    )

# Last expression of the cell: display the table just written.
pd.read_csv("members.csv")


The number of subscribers in [channel]: 24188805
The number of members in [group]: 1350522
The number of unique members in [group]: 529241


Unnamed: 0,id,username,first_name,last_name,is_bot,occurrence
0,162726413,GroupHelpBot,Group Help,,True,60
1,609517172,MissRose_bot,Rose,,True,35
2,208056682,GHSecurityBot,🛠 Security ¹ 🛠,,True,13
3,210944655,combot,Combot,,True,13
4,546292062,joinhider_bot,joinhider_bot,,True,13
...,...,...,...,...,...,...
529236,1496272369,Ficydtis,𝑍𝑂𝐸,,False,1
529237,1680240423,avga_99,عہۣۗمہۣۗرۣۗوشۣۜۖۦ,⁽♔₎,False,1
529238,1447808317,,,,False,1
529239,5254313458,xiu76253,Donna,Farner,False,1


In [8]:
# Tally how often each URL string occurs in `urls`.
urls_stat = {}
for link in urls:
    if link not in urls_stat:
        urls_stat[link] = 0
    urls_stat[link] += 1

# Most frequent first; ties keep first-seen order because the sort is stable.
urls_stat_sorted = sorted(
    urls_stat.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

with open("urls.csv", "w", encoding="utf-8", newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['url', 'occurrence'])
    for link, occurrence in urls_stat_sorted:
        csvwriter.writerow((link, occurrence))

# The tally has exactly one key per distinct URL, so its size is the unique count.
total_urls = len(urls)
unique_urls = len(urls_stat)
print(f"The number of urls embedded in telegram messages: {total_urls}")
print(f"The number of unique urls embedded in telegram messages: {unique_urls}")

# Last expression of the cell: display the table just written.
pd.read_csv("urls.csv")

The number of urls embedded in telegram messages: 6486370
The number of unique urls embedded in telegram messages: 189710


Unnamed: 0,url,occurrence
0,https://launchpad.enjinstarter.com/projectlive...,183951
1,https://t.me/+lFfiuTJ5h1Y5NzBh,171508
2,https://t.me/+ymtV3POaXI9jNDcx,171148
3,https://t.me/+HqE7vKQPXX9iYjBh,77940
4,https://t.me/+d7Oa1JiMGwIzNTMx,76820
...,...,...
189705,http://180.215.213.194:8080/点击安装纸飞机简体中文语言包.rar,1
189706,https://www.instagram.com/sewingmachine_iran,1
189707,https://t.me/+LzTtwxQZl6g0M2Qy,1
189708,https://t.me/+NQgeLzk9mthmZjJi,1
