In [3]:
import pandas as pd
import os
import glob
from zipfile import ZipFile
import requests

# Show every column when a wide DataFrame is rendered (no truncation).
pd.options.display.max_columns = None



In [4]:
# Project root: the parent of the directory the notebook is run from.
base_dir = os.path.dirname(os.getcwd())

# data_dir holds the ratings-*.tsv files read by load_filtered_ratings.
data_dir, weekly_dir = (
    os.path.join(base_dir, rel_path)
    for rel_path in ('data/unzipped', 'notebooks/weekly_data')
)

# Time windows for comparison: label -> (start date, end date)
periods = dict(
    recent=('2025-03-11', '2025-03-25'),
    earlier=('2025-02-03', '2025-02-14'),
)

In [14]:
def load_filtered_ratings(start_date, end_date):
    """Load the ratings created inside ``[start_date, end_date)``.

    Every file matching ``ratings-*.tsv`` under the notebook-level ``data_dir``
    is streamed in chunks, and only the rows whose creation time falls inside
    the window are kept.

    Parameters
    ----------
    start_date, end_date : str or datetime-like
        Window bounds, parsed with ``pd.to_datetime``. The start is inclusive
        and the end is exclusive.

    Returns
    -------
    pandas.DataFrame
        Columns ``noteId``, ``helpfulnessLevel`` and ``createdAt``, where
        ``createdAt`` is ``createdAtMillis`` converted from epoch milliseconds.
        When no file or no row matches, the frame is empty but still carries
        these three columns, so callers can group on ``noteId`` safely.
    """
    columns = ['noteId', 'helpfulnessLevel', 'createdAt']
    chunk_size = 500000

    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)

    # Sorted so the row order does not depend on the filesystem listing order.
    ratings_files = sorted(glob.glob(os.path.join(data_dir, 'ratings-*.tsv')))

    chunks = []
    for file in ratings_files:
        # The context manager closes the file handle even if parsing fails midway.
        with pd.read_csv(file, sep='\t', chunksize=chunk_size, on_bad_lines='skip') as reader:
            for chunk in reader:
                created_at = pd.to_datetime(chunk['createdAtMillis'], unit='ms')
                in_window = (created_at >= start) & (created_at < end)
                # Skip chunks with no matching rows instead of concatenating empties.
                if not in_window.any():
                    continue
                filtered = chunk.loc[in_window, ['noteId', 'helpfulnessLevel']].copy()
                filtered['createdAt'] = created_at[in_window]
                chunks.append(filtered[columns])

    if not chunks:
        # Keep the schema so downstream groupby('noteId') does not raise KeyError.
        return pd.DataFrame(columns=columns)
    return pd.concat(chunks, ignore_index=True)

In [15]:
# Preview the 'recent' window; the dates come from `periods` so they are defined in one place.
ratings = load_filtered_ratings(*periods['recent'])
ratings.head()

Unnamed: 0,noteId,helpfulnessLevel,createdAt
0,1902560045990343054,HELPFUL,2025-03-20 22:49:04.594
1,1902625408547250205,HELPFUL,2025-03-20 22:48:53.415
2,1902626097939780059,NOT_HELPFUL,2025-03-20 22:50:18.137
3,1902691773211668936,HELPFUL,2025-03-20 22:49:33.602
4,1902693723705954772,HELPFUL,2025-03-20 22:49:25.459


In [16]:
def process_period(label, start, end):
    """Summarise ratings per note for one time window and attach note metadata.

    Relies on the notebook-level DataFrames ``notes`` and ``note_status`` (both
    must have a ``noteId`` column) and on ``load_filtered_ratings``; these have
    to be defined before this function is called.

    Parameters
    ----------
    label : str
        Name of the period, used only in the progress message.
    start, end : str or datetime-like
        Window bounds forwarded to ``load_filtered_ratings``.

    Returns
    -------
    pandas.DataFrame
        One row per rated note with a count column for each helpfulness level
        seen in the window, plus ``total_helpful``, ``total_unhelpful`` and
        ``helpfulness_ratio``, left-joined with ``notes`` and ``note_status``.
    """
    print(f"📅 Processing: {label} — {start} to {end}")

    ratings = load_filtered_ratings(start, end)

    # One row per note, one column per helpfulness level observed in the window.
    summary = (
        ratings.groupby('noteId')['helpfulnessLevel']
        .value_counts()
        .unstack(fill_value=0)
        .reset_index()
    )

    # .get() falls back to 0 when a level never occurs in the window.
    summary['total_helpful'] = summary.get('HELPFUL', 0)
    summary['total_unhelpful'] = summary.get('NOT_HELPFUL', 0)

    # HELPFUL / (HELPFUL + NOT_HELPFUL); notes with neither rating (0/0) get 0.
    decided = summary['total_helpful'] + summary['total_unhelpful']
    summary['helpfulness_ratio'] = (summary['total_helpful'] / decided).fillna(0)

    merged = pd.merge(summary, notes, on='noteId', how='left')
    merged = pd.merge(merged, note_status, on='noteId', how='left')

    return merged

In [18]:
# Build the per-note summary for each window ('recent' is processed first).
recent_df, earlier_df = (
    process_period(period_name, *periods[period_name])
    for period_name in ('recent', 'earlier')
)

recent_df.head()

📅 Processing: recent — 2025-03-11 to 2025-03-25
📅 Processing: earlier — 2025-02-03 to 2025-02-14


Unnamed: 0,noteId,HELPFUL,NOT_HELPFUL,SOMEWHAT_HELPFUL,total_helpful,total_unhelpful,helpfulness_ratio,noteAuthorParticipantId_x,createdAtMillis_x,tweetId,classification,believable,harmful,validationDifficulty,misleadingOther,misleadingFactualError,misleadingManipulatedMedia,misleadingOutdatedInformation,misleadingMissingImportantContext,misleadingUnverifiedClaimAsFact,misleadingSatire,notMisleadingOther,notMisleadingFactuallyCorrect,notMisleadingOutdatedButNotWhenWritten,notMisleadingClearlySatire,notMisleadingPersonalOpinion,trustworthySources,summary,isMediaNote,createdAt,noteAuthorParticipantId_y,createdAtMillis_y,timestampMillisOfFirstNonNMRStatus,firstNonNMRStatus,timestampMillisOfCurrentStatus,currentStatus,timestampMillisOfLatestNonNMRStatus,mostRecentNonNMRStatus,timestampMillisOfStatusLock,lockedStatus,timestampMillisOfRetroLock,currentCoreStatus,currentExpansionStatus,currentGroupStatus,currentDecidedBy,currentModelingGroup,timestampMillisOfMostRecentStatusChange,timestampMillisOfNmrDueToMinStableCrhTime,currentMultiGroupStatus,currentModelingMultiGroup,timestampMinuteOfFinalScoringOutput,timestampMillisOfFirstNmrDueToMinStableCrhTime,currentStatusDate
0,1360423506052341764,0,1,0,0,1,0.0,B665A1E04CEE3FB159A361D249E3AA38FFE1485D720894...,1613185000000.0,1360007501928890377,NOT_MISLEADING,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,"As seen in this tweet, @jschlatt does like men...",0.0,2021-02-13 03:00:13.947,B665A1E04CEE3FB159A361D249E3AA38FFE1485D720894...,1613185000000.0,,,1742508000000.0,NEEDS_MORE_RATINGS,,,1674003000000.0,NEEDS_MORE_RATINGS,,,,,ExpansionPlusModel (v1.1),,-1.0,-1.0,,,29046470.0,,2025-03-20 21:54:06.912
1,1361895220124798976,1,0,0,1,0,1.0,27DE7623F57E8902EEE6C756D8570F69980C0C55CF3713...,1613536000000.0,1361299181621702664,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_FEW,LITTLE_HARM,EASY,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,There is no green capybara. This is duckweed o...,0.0,2021-02-17 04:28:17.925,27DE7623F57E8902EEE6C756D8570F69980C0C55CF3713...,1613536000000.0,,,1742548000000.0,NEEDS_MORE_RATINGS,,,1674003000000.0,NEEDS_MORE_RATINGS,,,,NEEDS_MORE_RATINGS,CoreModel (v1.1),13.0,-1.0,-1.0,,,29046470.0,,2025-03-21 09:09:25.948
2,1392149020177481735,0,2,0,0,2,0.0,35A394BD89ECB36EC7A7505A404E1E32F67DE14B122C29...,1620749000000.0,1391867466951835650,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,This is classic antisemitism/Jew hatred dresse...,0.0,2021-05-11 16:06:05.946,35A394BD89ECB36EC7A7505A404E1E32F67DE14B122C29...,1620749000000.0,,,1742785000000.0,NEEDS_MORE_RATINGS,,,1674003000000.0,NEEDS_MORE_RATINGS,,CURRENTLY_RATED_NOT_HELPFUL,CURRENTLY_RATED_NOT_HELPFUL,,ScoringDriftGuard (v1.0),,-1.0,-1.0,,,29046470.0,,2025-03-24 02:54:09.250
3,1398647731468521484,1,0,0,1,0,1.0,26A9AD8FD5FDED46CBD2919A85F4AEAB96C4ABD08551A0...,1622299000000.0,1378893370421043204,NOT_MISLEADING,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,"The fact check is all about vaccines, but the ...",0.0,2021-05-29 14:29:39.459,26A9AD8FD5FDED46CBD2919A85F4AEAB96C4ABD08551A0...,1622299000000.0,,,1741947000000.0,NEEDS_MORE_RATINGS,,,1674003000000.0,NEEDS_MORE_RATINGS,,,,,ExpansionPlusModel (v1.1),,-1.0,-1.0,,,29046470.0,,2025-03-14 10:09:29.512
4,1399832502055981063,0,13,0,0,13,0.0,81AA462BF30D87B3D719A81BA6492EF148F89D2F1E939E...,1622581000000.0,1399695342635991042,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antifa did not do this. This is not their MO. ...,0.0,2021-06-01 20:57:30.780,81AA462BF30D87B3D719A81BA6492EF148F89D2F1E939E...,1622581000000.0,,,1742089000000.0,NEEDS_MORE_RATINGS,,,1674003000000.0,NEEDS_MORE_RATINGS,,CURRENTLY_RATED_NOT_HELPFUL,CURRENTLY_RATED_NOT_HELPFUL,CURRENTLY_RATED_NOT_HELPFUL,ScoringDriftGuard (v1.0),13.0,-1.0,-1.0,,,29046470.0,,2025-03-16 01:41:32.425


In [28]:
# Tweet IDs (as strings) whose notes are reported separately in the "Tracked" metrics.
tracked_tweet_ids = ["1901441626116870156"]

In [34]:
def calc_metrics(df, label, tracked_ids):
    """Compute a one-row table of helpfulness metrics for a period.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-note summary with at least the columns ``tweetId``,
        ``helpfulness_ratio`` and ``currentStatus``.
    label : str
        Value placed in the ``Period`` column.
    tracked_ids : iterable
        Tweet IDs to report separately. IDs are compared as strings, so the
        match works whether ``tweetId`` was loaded as integers or as strings.

    Returns
    -------
    pandas.DataFrame
        A single row with the mean helpfulness ratio and mean notes per tweet,
        for all notes and for notes on tracked tweets, each also restricted to
        notes whose ``currentStatus`` is ``CURRENTLY_RATED_HELPFUL``. All values
        are rounded to two decimals. Tracked metrics are 0 when no note
        matches.
    """
    helpful_status = 'CURRENTLY_RATED_HELPFUL'

    # --- All notes ---
    all_ratio = df['helpfulness_ratio'].mean()
    notes_per_tweet = df.groupby('tweetId').size().mean()

    # --- Helpful notes only ---
    helpful_notes = df[df['currentStatus'] == helpful_status]
    helpful_only_ratio = helpful_notes['helpfulness_ratio'].mean()

    # --- Tracked tweets: All notes ---
    # Normalise both sides to str: an integer tweetId column never matches
    # string IDs in isin(), which would silently zero every tracked metric.
    tracked_keys = [str(tweet_id) for tweet_id in tracked_ids]
    tracked = df[df['tweetId'].astype(str).isin(tracked_keys)]
    tracked_ratio_all = tracked['helpfulness_ratio'].mean() if not tracked.empty else 0
    tracked_notes_per_tweet = tracked.groupby('tweetId').size().mean() if not tracked.empty else 0

    # --- Tracked tweets: Helpful notes only ---
    tracked_helpful = tracked[tracked['currentStatus'] == helpful_status]
    tracked_ratio_helpful = tracked_helpful['helpfulness_ratio'].mean() if not tracked_helpful.empty else 0

    return pd.DataFrame({
        'Period': [label],
        'Helpfulness Ratio (All Notes)': [round(all_ratio, 2)],
        'Helpfulness Ratio (Helpful Notes Only)': [round(helpful_only_ratio, 2)],
        'Notes per Tweet (All)': [round(notes_per_tweet, 2)],
        'Tracked Helpfulness Ratio (All Notes)': [round(tracked_ratio_all, 2)],
        'Tracked Helpfulness Ratio (Helpful Notes Only)': [round(tracked_ratio_helpful, 2)],
        'Notes per Tweet (Tracked)': [round(tracked_notes_per_tweet, 2)]
    })


In [35]:
# One metrics row per period; the table lists the earlier window first.
earlier_metrics, recent_metrics = (
    calc_metrics(period_df, period_label, tracked_tweet_ids)
    for period_df, period_label in ((earlier_df, "Earlier"), (recent_df, "Recent"))
)

metric_rows = [earlier_metrics, recent_metrics]
comparison_df = pd.concat(metric_rows, ignore_index=True)

In [37]:
# Side-by-side metrics for the two windows (one row per period).
comparison_df.head(n=5)

Unnamed: 0,Period,Helpfulness Ratio (All Notes),Helpfulness Ratio (Helpful Notes Only),Notes per Tweet (All),Tracked Helpfulness Ratio (All Notes),Tracked Helpfulness Ratio (Helpful Notes Only),Notes per Tweet (Tracked)
0,Earlier,0.6,0.85,1.65,0.0,0,0.0
1,Recent,0.6,0.86,1.64,0.53,0,4.0
