# Overview of Telegram Crawler

In [2]:
import pandas as pd

# Per-round summary table of the crawler; displayed as the cell's rich output.
crawler_stats = pd.read_csv("crawler_statistics.csv")
crawler_stats

Unnamed: 0,Round,Contacts,Success,Fuzzy,Failure,Accounts,Users,Bots,Channels,Groups,Messages
0,20230221,1808,1078,328,730,2978,1315,191,1175,297,451225
1,20230228,1812,1083,330,729,2992,1317,190,1184,301,451380


# Weekly Statistics of Telegram Crawler

In [None]:
# Statistics of the search for Telegram contacts extracted from SEO terms
#
# Reads the per-round result file (one JSON object per line) and reports how
# many contact searches succeeded, how many of those were fuzzy matches, and
# how many failed. `results`, `data_dir` and `date_to_analyze` are reused by
# the account-statistics cell below.

import json

data_dir = '/data2/sangyiwu/RBSEO_Cybercrime_TimeMachine/output_telegram'
date_to_analyze = '20230228'


def _ratio(part, whole):
    """Return part / whole, or 0.0 when `whole` is zero (empty round)."""
    return part / whole if whole else 0.0


result_stats_path = f'{data_dir}/result_stats_{date_to_analyze}.json'
# `with` closes the file deterministically; utf-8 matches the encoding used
# for every other data file read in this notebook. Blank lines are skipped so
# a stray trailing newline does not abort the parse.
with open(result_stats_path, 'r', encoding='utf-8') as result_stats_file:
    results = [json.loads(line) for line in result_stats_file if line.strip()]

contact_num = len(results)
success_num = 0
failure_num = 0
fuzzy_num = 0

# NOTE(review): assumes 'is_success' / 'is_fuzzy' are JSON booleans.
for result in results:
    if result['is_success']:
        success_num += 1
        # 'is_fuzzy' is only consulted for successful searches.
        if result['is_fuzzy']:
            fuzzy_num += 1
    else:
        failure_num += 1

print(f"[Round#{date_to_analyze}]")
print(f"Total Contacts: {contact_num}")
print(f"Successful Search: {success_num}\tRatio: {_ratio(success_num, contact_num)}")
print(f"and of which Fuzzy Search: {fuzzy_num}\tRatio: {_ratio(fuzzy_num, success_num)}")
print(f"Failed Search: {failure_num}\tRatio: {_ratio(failure_num, contact_num)}")

[Round#20230228]
Total Contacts: 1812
Successful Search: 1083	Ratio: 0.597682119205298
and of which Fuzzy Search: 330	Ratio: 0.3047091412742382
Failed Search: 729	Ratio: 0.402317880794702


In [None]:
# Statistics of crawled accounts (account type & messages)
#
# Relies on `results`, `data_dir` and `date_to_analyze` from the search
# statistics cell above. An exact-match contact has its account files directly
# under <data_dir>/<provider>/<username>/; a fuzzy-match contact has one
# sub-directory per matched account under that same path.

import json
import os


def _tally_account(account_dir, type_counts):
    """Record one crawled account and return its message-line count.

    Reads `<account_dir>/info.json`, increments `type_counts[info['type']]`,
    and, for 'channel' and 'group' accounts only, returns the number of lines
    in `<account_dir>/messages.json`. Other account types return 0.
    """
    with open(f"{account_dir}/info.json", "r", encoding="utf-8") as info_file:
        info = json.load(info_file)

    account_type = info['type']
    type_counts[account_type] = type_counts.get(account_type, 0) + 1

    if account_type not in ('channel', 'group'):
        return 0
    # Stream the file instead of materialising every line just to count them.
    with open(f"{account_dir}/messages.json", "r", encoding="utf-8") as message_file:
        return sum(1 for _ in message_file)


type_counts = {'user': 0, 'bot': 0, 'channel': 0, 'group': 0}
account_num = 0
message_num = 0

# NOTE(review): assumes 'is_success' / 'is_fuzzy' are JSON booleans.
for result in results:
    if not result['is_success']:
        continue

    contact_dir = f"{data_dir}/{result['provider']}/{result['username']}"
    if result['is_fuzzy']:
        account_dirs = [f"{contact_dir}/{sub_username}" for sub_username in os.listdir(contact_dir)]
    else:
        account_dirs = [contact_dir]

    for account_dir in account_dirs:
        message_num += _tally_account(account_dir, type_counts)
        # Every account is counted, including types outside the four reported below.
        account_num += 1

user_num = type_counts['user']
bot_num = type_counts['bot']
channel_num = type_counts['channel']
group_num = type_counts['group']

print(f"[Round#{date_to_analyze}]")
print(f"The number of Accounts crawled: {account_num}")
print(f"The number of [user] accounts crawled: {user_num}")
print(f"The number of [bot] accounts crawled: {bot_num}")
print(f"The number of [channel] accounts crawled: {channel_num}")
print(f"The number of [group] accounts crawled: {group_num}")
print(f"The number of messages in the recent week: {message_num}")

[Round#20230228]
The number of Accounts crawled: 2992
The number of [user] accounts crawled: 1317
The number of [bot] accounts crawled: 190
The number of [channel] accounts crawled: 1184
The number of [group] accounts crawled: 301
The number of messages in the recent week: 451380


#  Statistics of [group]/[channel] Accounts

In [1]:
import json
import csv
import os
import pandas as pd
import re
from datetime import date

# Per-account statistics for the [group]/[channel] measurement dataset.
#
# For every account directory under `data_dir` this cell reads info.json,
# collects group members (members.json), counts the lines of
# messages_2022.json and gathers every URL found in those messages. It writes
# statistics.csv and leaves `members`, `urls`, `subscriber_num` and
# `member_num` populated for the cells that follow.
# NOTE: this rebinds `data_dir`, which the weekly-statistics cells also use.
data_dir = 'telegram_measurement'
accounts = os.listdir(data_dir)
print(f"The number of [group]/[channel] accounts crawled: {len(accounts)}")

# Raw string: the pattern is byte-for-byte the same, but no longer relies on
# Python passing unknown escapes such as "\/" or "\w" through unchanged (a
# SyntaxWarning on recent Python versions). Compiled once, outside the loops.
url_pattern = re.compile(
    r'(?:http|ftp|https):\/\/[\w\-_]+(?:\.[\w\-_]+)+(?:[\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?'
)

group_num = 0
channel_num = 0
message_num = 0
subscriber_num = 0
member_num = 0
urls = []
members = []

statistics = []
for account in accounts:
    account_dir = f"{data_dir}/{account}"
    with open(f"{account_dir}/info.json", "r", encoding="utf-8") as fd:
        info = json.load(fd)

    # Audience size: members for a group, subscribers for a channel, else 0.
    user_num = 0
    if info['type'] == 'group':
        group_num += 1
        user_num = info['members']
        member_num += user_num
        # members.json holds one JSON object per line.
        with open(f"{account_dir}/members.json", "r", encoding="utf-8") as fd:
            members.extend(json.loads(line) for line in fd)
    elif info['type'] == 'channel':
        channel_num += 1
        user_num = info['subscribers']
        subscriber_num += user_num

    # Stream messages one line at a time instead of holding the whole file.
    account_message_num = 0
    url_num = 0
    with open(f"{account_dir}/messages_2022.json", "r", encoding="utf-8") as fd:
        for line in fd:
            message = json.loads(line)
            account_message_num += 1
            # presumably 'media' is a dict that may carry a 'url' entry - TODO confirm schema
            if 'url' in message['media']:
                url_num += 1
                urls.append(message['media']['url'])
            # URLs written inline in the message text.
            urls_re = url_pattern.findall(message['message'])
            url_num += len(urls_re)
            urls.extend(urls_re)
    message_num += account_message_num

    statistics.append([info['username'], info['type'], account_message_num, user_num, url_num])

# Busiest accounts (by message count) first.
statistics_sorted = sorted(statistics, key=lambda x: x[2], reverse=True)
with open("statistics.csv", "w", encoding="utf-8", newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['account', 'type', 'messages', 'users', 'urls'])
    csvwriter.writerows(statistics_sorted)

print(f"The number of [group] accounts crawled: {group_num}")
print(f"The number of [channel] accounts crawled: {channel_num}")
print(f"The number of messages in 2022 crawled: {message_num}")

pd.read_csv("statistics.csv", encoding="utf-8").head(20)

The number of [group]/[channel] accounts crawled: 2652
The number of [group] accounts crawled: 586
The number of [channel] accounts crawled: 2066
The number of messages in 2022 crawled: 12096168


Unnamed: 0,account,type,messages,users,urls
0,Coin919,group,988688,343,1781487
1,bcapp,group,988127,263,1869796
2,Nanyangdanbao_888,group,916506,8651,262902
3,MiFeng01,group,772357,11826,47966
4,usdc168,group,697905,3669,1170213
5,nanyangdb,group,598473,4341,115684
6,mood_a3,group,542303,2043,650
7,sultanategosdonate,group,409688,607,512
8,YVQQL,group,408338,1939,989
9,olddriver_cdq,group,348944,30424,243


In [2]:
# Member statistics: uses `subscriber_num`, `member_num` and `members`
# produced by the group/channel statistics cell above.
print(f"The number of subscribers in [channel]: {subscriber_num}")
print(f"The number of members in [group]: {member_num}")

# Count how many times each distinct member identity occurs in `members`.
member_fields = ('id', 'username', 'first_name', 'last_name', 'is_bot')
members_stat = {}
for member in members:
    identity = tuple(member[field] for field in member_fields)
    if identity in members_stat:
        members_stat[identity] += 1
    else:
        members_stat[identity] = 1

# Most frequently seen identities first.
members_stat_sorted = sorted(members_stat.items(), key=lambda item: item[1], reverse=True)
# Each identity is a dict key, so the dict size is the unique-member count.
print(f"The number of unique members in [group]: {len(members_stat)}")

with open("members.csv", "w", encoding="utf-8", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(['id', 'username', 'first_name', 'last_name', 'is_bot', 'occurrence'])
    writer.writerows([*identity, occurrence] for identity, occurrence in members_stat_sorted)

pd.read_csv("members.csv").head(20)


The number of subscribers in [channel]: 24517959
The number of members in [group]: 1400755
The number of unique members in [group]: 552638


Unnamed: 0,id,username,first_name,last_name,is_bot,occurrence
0,162726413,GroupHelpBot,Group Help,,True,61
1,609517172,MissRose_bot,Rose,,True,36
2,208056682,GHSecurityBot,🛠 Security ¹ 🛠,,True,13
3,210944655,combot,Combot,,True,13
4,546292062,joinhider_bot,joinhider_bot,,True,13
5,734284134,deljoinbot,Auto Delete Join Message,,True,12
6,1987639632,joinhide20_bot,@joinhide20_bot,,True,11
7,1728012791,AntiJoinMessageBot,Auto Delete Join Message,,True,8
8,696267355,ProtectronBot,Protectron,,True,8
9,5930212334,GroupHelp0Bot,GroupHelp,,True,8


In [3]:
# URL statistics: uses `urls` collected by the group/channel statistics cell.
# Count occurrences of each distinct URL.
urls_stat = {}
for url in urls:
    if url in urls_stat:
        urls_stat[url] += 1
    else:
        urls_stat[url] = 1

# Most frequently shared URLs first.
urls_stat_sorted = sorted(urls_stat.items(), key=lambda item: item[1], reverse=True)

with open("urls.csv", "w", encoding="utf-8", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(['url', 'occurrence'])
    writer.writerows(urls_stat_sorted)

print(f"The number of urls embedded in telegram messages: {len(urls)}")
# One dict key per distinct URL, so the dict size is the unique-URL count.
print(f"The number of unique urls embedded in telegram messages: {len(urls_stat)}")
pd.read_csv("urls.csv").head(20)

The number of urls embedded in telegram messages: 6487209
The number of unique urls embedded in telegram messages: 190024


Unnamed: 0,url,occurrence
0,https://launchpad.enjinstarter.com/projectlive...,183951
1,https://t.me/+lFfiuTJ5h1Y5NzBh,171508
2,https://t.me/+ymtV3POaXI9jNDcx,171148
3,https://t.me/+HqE7vKQPXX9iYjBh,77940
4,https://t.me/+d7Oa1JiMGwIzNTMx,76820
5,https://t.me/+mwN9pn52YAdmYjZh,76284
6,https://t.me/+yd6CAEQ3yNJlZTU5,70732
7,https://t.me/+Z2ZzibdwAA9jMjcx,64895
8,https://t.me/+ofWerI9UuP1hZmUx,60128
9,https://t.me/+viNjG8iEY3cwNTYx,55830
