#  Scale of [group]/[channel] Accounts

In [1]:
import json
import csv
import os
import pandas as pd
import re

# Root folder with one sub-directory per crawled account. Each sub-directory is
# read for info.json (account metadata) and messages_2022.json (one JSON
# message per line).
data_dir = 'telegram_measurement' # [pfs-na-large] /datadrive/sangyiwu/RBSEO_Cybercrime_TimeMachine/telegram_measurement
accounts = os.listdir(data_dir)
print(f"The number of [group]/[channel] accounts crawled: {len(accounts)}")

# Raw string: the regex escapes (\/, \w, \-, ...) must reach `re` untouched.
# In a normal string literal they are invalid string escapes and trigger a
# DeprecationWarning / SyntaxWarning. Compiled once instead of per message.
URL_PATTERN = re.compile(
    r'(?:http|ftp|https):\/\/[\w\-_]+(?:\.[\w\-_]+)+(?:[\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?'
)

group_num = 0
channel_num = 0
urls = []  # every URL occurrence, duplicates kept; reused by the URL-frequency cell
with open("statistics.csv", "w", encoding="utf-8", newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['account', 'type', 'messages', 'users', 'urls'])

    for account in accounts:
        account_dir = os.path.join(data_dir, account)
        with open(os.path.join(account_dir, "info.json"), "r", encoding="utf-8") as fd:
            info = json.load(fd)

        # The audience size is stored under a type-specific key; any other
        # account type keeps user_num at 0 and is not counted in either total.
        user_num = 0
        if info['type'] == 'group':
            group_num += 1
            user_num = info['members']
        elif info['type'] == 'channel':
            channel_num += 1
            user_num = info['subscribers']

        message_num = 0
        url_num = 0
        # Stream the JSON-lines file instead of materialising every message:
        # only the per-account counters and the extracted URLs are needed.
        with open(os.path.join(account_dir, "messages_2022.json"), "r", encoding="utf-8") as fd:
            for line in fd:
                if not line.strip():
                    continue  # tolerate blank/trailing lines instead of failing in json.loads
                message = json.loads(line)
                message_num += 1

                # NOTE(review): assumes `media` is a dict when present; a null or
                # missing value is treated as "no media" rather than raising.
                media = message.get('media') or {}
                if 'url' in media:
                    url_num += 1
                    urls.append(media['url'])

                # A null or missing message text is scanned as an empty string.
                # A link that appears both as media and in the text is counted twice.
                found_urls = URL_PATTERN.findall(message.get('message') or '')
                url_num += len(found_urls)
                urls.extend(found_urls)

        csvwriter.writerow([info['username'], info['type'], message_num, user_num, url_num])

print(f"The number of [group] account crawled: {group_num}")
print(f"The number of [channel] account crawled: {channel_num}")

# Last expression of the cell: display the per-account table just written.
pd.read_csv("statistics.csv", encoding="utf-8")


The number of [group]/[channel] accounts crawled: 2449
The number of [group] account crawled: 551
The number of [channel] account crawled: 1898


Unnamed: 0,account,type,messages,users,urls
0,AA8203AA15,channel,0,0,0
1,AA821EE,group,24011,479,4
2,Aa82269958,group,11,12,7
3,aa8245,channel,7,3,0
4,aa8247,channel,10,6,9
...,...,...,...,...,...
2444,ZSB551,channel,3,52,3
2445,ZSB558,channel,0,1,0
2446,zs_10086,channel,258,58271,0
2447,zzan1,channel,9,314,7


In [2]:
# Tally the occurrences of each distinct URL. Keys are seeded in first-seen
# order, so ties in the ranking below keep that order.
urls_stat = dict.fromkeys(urls, 0)
for link in urls:
    urls_stat[link] += 1

# Most frequent first; negating the count gives a stable descending sort.
urls_stat_sorted = sorted(urls_stat.items(), key=lambda pair: -pair[1])

# Persist the ranking as a two-column CSV: header row, then (url, count) rows.
with open("urls.csv", "w", encoding="utf-8", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerows([('url', 'occurrence'), *urls_stat_sorted])

total_urls = len(urls)
unique_urls = len(urls_stat)  # one key per distinct URL
print(f"The number of urls embedded in telegram messages: {total_urls}")
print(f"The number of unique urls embedded in telegram messages: {unique_urls}")

# Last expression of the cell: display the ranking that was just written.
pd.read_csv("urls.csv")

The number of urls embedded in telegram messages: 6471331
The number of unique urls embedded in telegram messages: 186453


Unnamed: 0,url,occurrence
0,https://launchpad.enjinstarter.com/projectlive...,183951
1,https://t.me/+lFfiuTJ5h1Y5NzBh,171508
2,https://t.me/+ymtV3POaXI9jNDcx,171148
3,https://t.me/+HqE7vKQPXX9iYjBh,77940
4,https://t.me/+d7Oa1JiMGwIzNTMx,76820
...,...,...
186448,http://180.215.213.194:8080/点击安装纸飞机简体中文语言包.rar,1
186449,https://www.instagram.com/sewingmachine_iran,1
186450,https://t.me/+LzTtwxQZl6g0M2Qy,1
186451,https://t.me/+NQgeLzk9mthmZjJi,1
