In [2]:
import pandas as pd
import glob
import requests
from zipfile import ZipFile
import os



In [3]:
# Download ALL the data 
BASE_URL = "https://ton.twimg.com/birdwatch-public-data/2025/03/23"

def download_file(url, file_path):
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"✅ Downloaded: {file_path}")
    else:
        print(f"❌ Failed to download: {url}")

# Download our Community Notes files
download_file(f"{BASE_URL}/noteStatusHistory/noteStatusHistory-00000.zip", 'data/noteStatusHistory-00000.zip')
download_file(f"{BASE_URL}/notes/notes-00000.zip", 'data/notes-00000.zip')
for i in range(16):
    download_file(f"{BASE_URL}/noteRatings/ratings-{str(i).zfill(5)}.zip", f'data/ratings-{str(i).zfill(5)}.zip')

✅ Downloaded: data/noteStatusHistory-00000.zip
✅ Downloaded: data/notes-00000.zip
✅ Downloaded: data/ratings-00000.zip
✅ Downloaded: data/ratings-00001.zip
✅ Downloaded: data/ratings-00002.zip
✅ Downloaded: data/ratings-00003.zip
✅ Downloaded: data/ratings-00004.zip
✅ Downloaded: data/ratings-00005.zip
✅ Downloaded: data/ratings-00006.zip
✅ Downloaded: data/ratings-00007.zip
✅ Downloaded: data/ratings-00008.zip
✅ Downloaded: data/ratings-00009.zip
✅ Downloaded: data/ratings-00010.zip
✅ Downloaded: data/ratings-00011.zip
✅ Downloaded: data/ratings-00012.zip
✅ Downloaded: data/ratings-00013.zip
✅ Downloaded: data/ratings-00014.zip
✅ Downloaded: data/ratings-00015.zip


In [4]:
# Unzip everything (compression is new in Community Notes land!)
for file in os.listdir('data/'):
    if file.endswith(".zip"):
        with ZipFile(f'data/{file}', 'r') as zip_ref:
            zip_ref.extractall('data/unzipped')

In [34]:
# Combine, then filter rating data for the week we're looking at 
# (why do they not put ratings sequentially? who knows)

ratings_files = glob.glob('data/unzipped/ratings-*.tsv')

ratings_chunks = []
chunk_size = 500000
start_date = pd.to_datetime('2025-03-09')
end_date = pd.to_datetime('2025-03-16')

for idx, file in enumerate(ratings_files):
    print(f"Loading file {idx + 1} of {len(ratings_files)}: {file}")
    for chunk in pd.read_csv(file, sep='\t', chunksize=chunk_size):
        chunk['createdAt'] = pd.to_datetime(chunk['createdAtMillis'], unit='ms')
        filtered_chunk = chunk[(chunk['createdAt'] >= start_date) & (chunk['createdAt'] < end_date)]
        ratings_chunks.append(filtered_chunk[['noteId', 'helpfulnessLevel', 'createdAt']])

all_ratings = pd.concat(ratings_chunks, ignore_index=True)

Loading file 1 of 16: data/unzipped/ratings-00003.tsv
Loading file 2 of 16: data/unzipped/ratings-00002.tsv
Loading file 3 of 16: data/unzipped/ratings-00014.tsv
Loading file 4 of 16: data/unzipped/ratings-00000.tsv
Loading file 5 of 16: data/unzipped/ratings-00001.tsv
Loading file 6 of 16: data/unzipped/ratings-00015.tsv
Loading file 7 of 16: data/unzipped/ratings-00011.tsv
Loading file 8 of 16: data/unzipped/ratings-00005.tsv
Loading file 9 of 16: data/unzipped/ratings-00004.tsv
Loading file 10 of 16: data/unzipped/ratings-00010.tsv
Loading file 11 of 16: data/unzipped/ratings-00006.tsv
Loading file 12 of 16: data/unzipped/ratings-00012.tsv
Loading file 13 of 16: data/unzipped/ratings-00013.tsv
Loading file 14 of 16: data/unzipped/ratings-00007.tsv
Loading file 15 of 16: data/unzipped/ratings-00009.tsv
Loading file 16 of 16: data/unzipped/ratings-00008.tsv


In [35]:
# Check it out
all_ratings.head()

Unnamed: 0,noteId,helpfulnessLevel,createdAt
0,1900895952786886701,HELPFUL,2025-03-15 20:55:22.664
1,1899954815591211432,NOT_HELPFUL,2025-03-13 00:44:37.491
2,1899957698583789971,NOT_HELPFUL,2025-03-13 00:44:09.902
3,1898547012452090169,HELPFUL,2025-03-09 09:40:35.059
4,1898578866417057827,HELPFUL,2025-03-09 09:40:23.357


In [19]:
# Group by NoteID so we can track how many votes each note got
ratings_summary = all_ratings.groupby('noteId')['helpfulnessLevel'].value_counts().unstack(fill_value=0).reset_index()
ratings_summary['total_helpful'] = ratings_summary.get('HELPFUL', 0)
ratings_summary['total_unhelpful'] = ratings_summary.get('NOT_HELPFUL', 0)

ratings_summary['helpfulness_ratio'] = ratings_summary['total_helpful'] / (
    ratings_summary['total_helpful'] + ratings_summary['total_unhelpful']
)
ratings_summary['helpfulness_ratio'] = ratings_summary['helpfulness_ratio'].fillna(0)

In [23]:
# Check it out
ratings_summary.head()

helpfulnessLevel,noteId,HELPFUL,NOT_HELPFUL,SOMEWHAT_HELPFUL,total_helpful,total_unhelpful,helpfulness_ratio
0,1361895220124798976,1,0,0,1,0,1.0
1,1397916575831711746,0,1,0,0,1,0.0
2,1398647731468521484,2,0,0,2,0,1.0
3,1399832502055981063,0,12,0,0,12,0.0
4,1399833622090240007,1,7,0,1,7,0.125


In [25]:
# Now read in our notes and status data (you may get a warning)
notes = pd.read_csv('data/unzipped/notes-00000.tsv', sep='\t')
notes_status = pd.read_csv('data/unzipped/noteStatusHistory-00000.tsv', sep='\t')

  notes = pd.read_csv('data/unzipped/notes-00000.tsv', sep='\t')
  notes_status = pd.read_csv('data/unzipped/noteStatusHistory-00000.tsv', sep='\t')


In [36]:
# Put into human time
notes['createdAt'] = pd.to_datetime(notes['createdAtMillis'], unit='ms')

# Filter notes for the week we're looking at
filtered_notes = notes[(notes['createdAt'] >= start_date) & (notes['createdAt'] < end_date)]

In [40]:
# Convert timestamps for status history
notes_status['currentStatusDate'] = pd.to_datetime(notes_status['timestampMillisOfCurrentStatus'], unit='ms')

# Filter noteStatusHistory for the desired timeframe
filtered_note_status = notes_status[
    (notes_status['currentStatusDate'] >= start_date) &
    (notes_status['currentStatusDate'] < end_date)
]

In [41]:
# Merge everything
merged_data = pd.merge(ratings_summary, notes, on='noteId', how='inner')
merged_data = pd.merge(merged_data, notes_status, on='noteId', how='inner')

In [42]:
# Check it out
print(merged_data.head())

                noteId  HELPFUL  NOT_HELPFUL  SOMEWHAT_HELPFUL  total_helpful  \
0  1361895220124798976        1            0                 0              1   
1  1397916575831711746        0            1                 0              0   
2  1398647731468521484        2            0                 0              2   
3  1399832502055981063        0           12                 0              0   
4  1399833622090240007        1            7                 0              1   
5  1399864843994337287        3            9                 0              3   
6  1400120391591358464        2            9                 0              2   
7  1415004640479961091        2            0                 0              2   
8  1415149802099249153        1            0                 1              1   
9  1415160673697505281        0            3                 0              0   

   total_unhelpful  helpfulness_ratio  \
0                0           1.000000   
1                1        

In [44]:
#Check it out
all_ratings.head()

Unnamed: 0,noteId,helpfulnessLevel,createdAt
0,1900895952786886701,HELPFUL,2025-03-15 20:55:22.664
1,1899954815591211432,NOT_HELPFUL,2025-03-13 00:44:37.491
2,1899957698583789971,NOT_HELPFUL,2025-03-13 00:44:09.902
3,1898547012452090169,HELPFUL,2025-03-09 09:40:35.059
4,1898578866417057827,HELPFUL,2025-03-09 09:40:23.357


In [53]:
# Calculate the % of helpful notes, and also check out the ratio of helpful votes on them
helpful_notes = merged_data[merged_data['currentStatus'] == 'CURRENTLY_RATED_HELPFUL']

percent_helpful_notes = (merged_data['currentStatus'] == 'CURRENTLY_RATED_HELPFUL').mean() * 100
print(f"Percentage of Helpful Notes: {percent_helpful_notes:.2f}%")

average_helpfulness_ratio = helpful_notes['helpfulness_ratio'].mean()
print(f"Average Helpfulness Ratio for Helpful Notes: {average_helpfulness_ratio:.2f}")


Percentage of Helpful Notes: 9.09%
Average Helpfulness Ratio for Helpful Notes: 0.85


In [52]:
# Now let's add in tweets from Trump and Musk
tracked_tweet_ids = [1899636898533867969]  # Replace with your list of tracked tweet IDs

# Filter dataset for notes on tracked tweets
tracked_notes = merged_data[merged_data['tweetId'].isin(tracked_tweet_ids)]

# Calculate helpfulness ratio for tracked notes
tracked_helpfulness_ratio = tracked_notes['helpfulness_ratio'].mean()
print(f"Helpfulness Ratio for Tracked Notes: {tracked_helpfulness_ratio:.2f}")

# Calculate % of helpful notes on tracked tweets (lol probably 0!)
percent_helpful_tracked_notes = (tracked_notes['currentStatus'] == 'CURRENTLY_RATED_HELPFUL').mean() * 100
print(f"Percentage of Helpful Notes on Tracked Tweets: {percent_helpful_tracked_notes:.2f}%")
# Now print the same numbers for overall dataset from that week
print(f"Average Helpfulness Ratio for Helpful Notes: {average_helpfulness_ratio:.2f}")
percent_helpful_notes = (merged_data['currentStatus'] == 'CURRENTLY_RATED_HELPFUL').mean() * 100
print(f"Percentage of Helpful Notes overall: {percent_helpful_notes:.2f}%")

Helpfulness Ratio for Tracked Notes: 0.39
Percentage of Helpful Notes on Tracked Tweets: 0.00%
Average Helpfulness Ratio for Helpful Notes: 0.85
Percentage of Helpful Notes overall: 9.09%
