In [None]:
import pandas as pd

# Location of the exported URL stream.
CSV_FILE = 'url_stream.csv'

# Read the raw export, then normalise the timestamp column to timezone-aware
# UTC. Values that cannot be parsed as ISO 8601 become NaT and their rows are
# discarded, so every remaining row has a usable timestamp.
df = pd.read_csv(CSV_FILE, parse_dates=['timestamp'], low_memory=False)
df = (
    df.assign(
        timestamp=pd.to_datetime(
            df['timestamp'], errors='coerce', format="ISO8601", utc=True
        )
    )
    .dropna(subset=['timestamp'])
)
df.head()

Unnamed: 0,timestamp,author,url,text
0,2025-11-21 00:43:36+00:00,did:plc:uld74vzf773y7ovqqm2jfaft,https://kripta.biz/posts/4061939A-4564-4D84-A5...,Идеальный способ играть в классический Солитёр...
1,2025-11-21 00:53:19.926000+00:00,did:plc:hacoy4ddxz2wyeagydupyzzo,www.twitch.tv/twilightking...,I feel like playing Spongebob Squarepants: Rev...
2,2025-11-21 00:53:20+00:00,did:plc:kpohhpkfdzymywdtyjz37z5x,https://l.medisite.fr/hUd,"Delphine, atteinte du syndrome de Gougerot Sjö..."
3,2025-11-21 00:53:22.373000+00:00,did:plc:p6hvsoitychm67ci6jodjbus,https://www.reddit.com/r/ProgressiveHQ/s/DPi4C...,www.reddit.com/r/Progressiv...
4,2025-11-21 00:53:23.473000+00:00,did:plc:gcjury2g5lleigul7so24sa6,https://www.sacpeace.org/,#StopWars – Weekly vigil – At corner of 16th &...


In [4]:
from urllib.parse import urlparse

def get_domain(url):
    """Return the lower-cased host of ``url`` without a leading ``www.``.

    URLs that do not start with ``http://`` or ``https://`` (compared
    case-insensitively) are treated as scheme-less and get ``http://``
    prepended so that ``urlparse`` recognises the host part. Surrounding
    whitespace is ignored, and any port or ``user:password@`` prefix is
    dropped from the result.

    Parameters
    ----------
    url : object
        The raw URL value; anything that is not a ``str`` (e.g. NaN) is
        rejected.

    Returns
    -------
    str
        The normalised host, ``''`` when no host is present,
        ``'invalid'`` for non-string input, or ``'error'`` when
        ``urlparse`` rejects the URL (e.g. unbalanced IPv6 brackets).
    """
    if not isinstance(url, str):
        return 'invalid'

    candidate = url.strip()
    # Case-insensitive scheme check: 'HTTPS://x' must not get a second scheme.
    if not candidate.lower().startswith(('http://', 'https://')):
        candidate = 'http://' + candidate

    try:
        # .hostname is already lower-cased and excludes port and credentials.
        host = urlparse(candidate).hostname or ''
    except ValueError:
        return 'error'

    # Strip 'www.' after lower-casing so 'WWW.' prefixes are removed as well.
    if host.startswith('www.'):
        host = host[4:]
    return host

# Derive a normalised domain for every URL, store it as a new column and
# preview the first few url/domain pairs.
df['domain'] = df['url'].map(get_domain)
print("Domains extracted.")
display(df.loc[:, ['url', 'domain']].head())


Domains extracted.


Unnamed: 0,url,domain
0,https://kripta.biz/posts/4061939A-4564-4D84-A5...,kripta.biz
1,www.twitch.tv/twilightking...,twitch.tv
2,https://l.medisite.fr/hUd,l.medisite.fr
3,https://www.reddit.com/r/ProgressiveHQ/s/DPi4C...,reddit.com
4,https://www.sacpeace.org/,sacpeace.org


In [9]:
# Cell 3: Suspicious Users (High Frequency on Same Domain)
#
# Flags authors who post at least MIN_POSTS links with at least
# MIN_DOMAIN_SHARE of them pointing at one domain, then ranks those authors
# by posting rate (posts per minute between their first and last post).

# Thresholds for the "suspicious" filter.
MIN_POSTS = 5            # minimum number of posts by an author
MIN_DOMAIN_SHARE = 0.6   # minimum fraction of posts going to the top domain

# 1. Calculate stats per author
# 'size' counts every row, matching the per-domain .size() used below.
# 'count' would skip rows with a missing URL while those rows are still
# counted under their domain, which could push domain_share above 1.
author_stats = df.groupby('author').agg(
    total_posts=('url', 'size'),
    unique_domains=('domain', 'nunique')
).reset_index()

# 2. Find top domain per author
# Sort so each author's most frequent domain comes first, then keep that row.
top_domains = df.groupby(['author', 'domain']).size().reset_index(name='domain_count')
top_domains = top_domains.sort_values(['author', 'domain_count'], ascending=[True, False])
top_domains = top_domains.groupby('author').first().reset_index()

# 3. Merge stats
author_analysis = pd.merge(author_stats, top_domains, on='author')
author_analysis['domain_share'] = author_analysis['domain_count'] / author_analysis['total_posts']

# 4. Filter for suspicious authors
# Criteria: at least MIN_POSTS posts, and >= MIN_DOMAIN_SHARE of posts are to the same domain
suspicious_authors = author_analysis[
    (author_analysis['total_posts'] >= MIN_POSTS) &
    (author_analysis['domain_share'] >= MIN_DOMAIN_SHARE)
].copy()

# 5. Calculate time period and frequency
# We need to go back to the original df to get timestamps for these authors
suspicious_posts = df[df['author'].isin(suspicious_authors['author'])]

# Group by author to get time range
time_stats = suspicious_posts.groupby('author')['timestamp'].agg(['min', 'max']).reset_index()

time_stats['duration'] = time_stats['max'] - time_stats['min']
time_stats['duration_seconds'] = time_stats['duration'].dt.total_seconds()

# Merge time stats back
suspicious_authors = pd.merge(suspicious_authors, time_stats, on='author')

# Calculate frequency (posts per minute)
# Add a small epsilon to duration to avoid division by zero if all posts are at the exact same second
suspicious_authors['posts_per_minute'] = suspicious_authors['total_posts'] / ((suspicious_authors['duration_seconds'] / 60) + 0.001)

# Sort by frequency
suspicious_authors = suspicious_authors.sort_values('posts_per_minute', ascending=False)

print(f"Found {len(suspicious_authors)} suspicious authors.")
display(suspicious_authors[['author', 'domain', 'total_posts', 'domain_share', 'posts_per_minute']].head(20))

Found 429 suspicious authors.


Unnamed: 0,author,domain,total_posts,domain_share,posts_per_minute
424,did:plc:znzzh3aeupdkat6tfmwom4vb,zisshit.com,7,1.0,6176.470588
288,did:plc:phkhtc3tuxi2a5j26g6hq3gg,thetuitioncenter.com,5,1.0,4918.032787
212,did:plc:jibe7uwcon4ubbdvxm6cgiqp,vogue.co.jp,11,1.0,2610.893756
77,did:plc:7v5maqne6yqdmc75gmldctp7,aws.amazon.com,5,1.0,317.124736
269,did:plc:o5l3trj5pxm7otnqoevdwprj,github.com,10,1.0,261.437908
111,did:plc:cktf4dr6f6miivgifw7kzccb,revillution.net,6,1.0,71.146245
210,did:plc:jecoumyhzsfzs575gw7poz6s,shimotsuke.co.jp,6,1.0,27.408706
116,did:plc:cu5jlrl6jjkcj5hsks7cik5d,europesays.com,6,1.0,19.933555
360,did:plc:uoe6dg7fgmzxuuoka3mcudyj,energy-charts.info,8,1.0,18.700767
224,did:plc:kkdpuqm6tvalztn4bwxp5vau,yayafa.com,5,1.0,17.584994


In [None]:
# Cell 4: Link Burst Analysis
#
# For every URL shared at least MIN_URL_POSTS times, measures the time gaps
# between consecutive posts of that URL and reports the URLs whose smallest
# gap is below BURST_GAP_SECONDS.

MIN_URL_POSTS = 5        # a URL must appear at least this often to be analysed
BURST_GAP_SECONDS = 10   # "bursty" = some pair of consecutive posts closer than this

# 1. Count posts per URL
url_counts = df['url'].value_counts().reset_index()
url_counts.columns = ['url', 'count']

# 2. Filter for frequent URLs (>= MIN_URL_POSTS posts)
frequent_urls = url_counts[url_counts['count'] >= MIN_URL_POSTS]['url'].tolist()
burst_df = df[df['url'].isin(frequent_urls)].copy()

# 3. Calculate time gaps
# Sorting by (url, timestamp) makes shift(1) return the previous post of the
# same URL; the first post of each URL has no predecessor and gets NaN.
burst_df = burst_df.sort_values(['url', 'timestamp'])
burst_df['prev_timestamp'] = burst_df.groupby('url')['timestamp'].shift(1)
burst_df['time_gap'] = (burst_df['timestamp'] - burst_df['prev_timestamp']).dt.total_seconds()

# 4. Aggregate stats per URL
url_stats = burst_df.groupby('url').agg(
    count=('timestamp', 'count'),
    min_gap=('time_gap', 'min'),
    avg_gap=('time_gap', 'mean'),
    std_gap=('time_gap', 'std')
).reset_index()

# 5. Identify "Burst" URLs (e.g., very short min gap)
# Sort by count descending, then by avg_gap ascending
url_stats = url_stats.sort_values(['count', 'avg_gap'], ascending=[False, True])

print("Top Frequent URLs and their Burstiness Stats:")
display(url_stats.head(20))

# The filter is on the *minimum* gap, so the heading says so; rows are then
# ordered by average gap for display.
print(f"\nURLs with potential bursts (min gap < {BURST_GAP_SECONDS} seconds):")
bursty_urls = url_stats[url_stats['min_gap'] < BURST_GAP_SECONDS].sort_values('avg_gap')
display(bursty_urls.head(20))

Top Frequent URLs and their Burstiness Stats:


Unnamed: 0,url,count,min_gap,avg_gap,std_gap
101,https://www.radiofrance.fr/fip,91,0.001,95.970022,237.802537
46,https://vydeo.space/FreeCams,74,8.223864,117.923766,230.526327
47,https://vydeo.space/models,72,12.416133,121.128399,211.048844
70,https://www.kbradio.online,63,0.0,141.387097,285.110928
11,https://bvf.wtf,44,0.380592,440.936097,1555.93223
27,https://oakgroveradio.com/player,33,118.620045,247.511486,356.668581
37,https://radiotempete.com/,33,61.577,266.250406,383.366525
39,https://streaming.shoutcast.com/tiorr3,29,175.0,308.571429,376.065639
36,https://radiofonico.it,29,146.042402,314.128246,363.396408
48,https://vydeo.space/stripchat,28,190.010264,322.046121,408.354044



URLs with potential bursts (avg gap < 10 seconds):


Unnamed: 0,url,count,min_gap,avg_gap,std_gap
137,www.zisshit.com/artshit/p/sa...,6,0.001,0.0016,0.000894
127,www.energy-charts.info,8,3.154144,3.6582,0.431804
129,www.goodrobe.fr/robe-de-mari...,9,8.88,11.002125,1.10282
101,https://www.radiofrance.fr/fip,91,0.001,95.970022,237.802537
46,https://vydeo.space/FreeCams,74,8.223864,117.923766,230.526327
70,https://www.kbradio.online,63,0.0,141.387097,285.110928
86,https://www.nytimes.com/2025/11/20/us/politics...,16,3.373,150.397533,156.691758
106,https://www.rawstory.com/trump-gop-rifts-spurn...,8,5.019,219.555857,224.707727
12,https://elopuck.pages.dev/,5,5.915177,278.893318,542.093786
10,https://bsky.pkmntcg.deals,25,2.711716,333.624892,836.811161
