In [198]:
import pandas as pd
import numpy as np
import igraph as ig
from sklearn.metrics import ndcg_score

In [199]:
# Load the raw retweet records from disk.
file_path = 'COVID19_sample.csv'
print(f"Loading dataset from {file_path}...")
df = pd.read_csv(file_path)

# Drop self-reposts, i.e. rows where the retweeting user is also the retweeted user.
# `.copy()` turns the filtered result into an independent DataFrame, so the column
# assignments made on `df` afterwards write to a real frame rather than to a
# slice of the one returned by read_csv (which triggers SettingWithCopyWarning).
initial_count = len(df)
df = df[df['user_id'] != df['retweeted_user_id']].copy()
removed_self_reposts = initial_count - len(df)

if removed_self_reposts > 0:
    print(f"Removed {removed_self_reposts} self-reposts")

# Parse timestamps once so min()/max() and time differences work on datetimes.
df['created_at'] = pd.to_datetime(df['created_at'])

# Data quality checks
print("\nDATASET SUMMARY:")
print(f" • Total records: {len(df):,}")
print(f" • Unique users: {df['user_id'].nunique():,}")
print(f" • Unique tweets: {df['tweet_id'].nunique():,}")
print(f" • Date range: {df['created_at'].min()} to {df['created_at'].max()}")
print(f" • Credibility score range: {df['credibility_score'].min():.1f} - {df['credibility_score'].max():.1f}")
print(f" • Average credibility: {df['credibility_score'].mean():.2f}")

Loading dataset from COVID19_sample.csv...
Removed 240 self-reposts

DATASET SUMMARY:
 • Total records: 19,760
 • Unique users: 8,319
 • Unique tweets: 19,760
 • Date range: 2020-12-20 01:25:21+00:00 to 2021-01-04 13:25:39+00:00
 • Credibility score range: 0.0 - 100.0
 • Average credibility: 62.42


In [200]:
# Metric 1: Basic retweet count
# Number of rows in which each user appears as the retweeted account.
retweet_counts = df['retweeted_user_id'].value_counts().reset_index()
retweet_counts.columns = ['user_id', 'retweet_count']

# Metric 2: Credibility-weighted influence
# Sum of credibility_score over the rows in which the user is the retweeted account.
cred_weighted = df.groupby('retweeted_user_id')['credibility_score'].sum().reset_index()
cred_weighted.columns = ['user_id', 'cred_weighted_influence']

# Metric 3: Temporal influence (weight recent retweets more)
# Each retweet is weighted by exp(-age_hours / 24), where age is measured back
# from the newest record in the dataset. 24 h is the e-folding time constant
# (weight drops to 1/e after 24 h); the corresponding half-life is
# 24 * ln(2) ≈ 16.6 h, not 24 h.
max_time = df['created_at'].max()
time_diff = (max_time - df['created_at']).dt.total_seconds() / 3600  # hours since retweet
time_weight = np.exp(-time_diff / 24)  # exponential decay, 24-hour time constant
# `assign` returns a new frame instead of writing a column into a filtered
# frame, so no SettingWithCopyWarning is raised and re-running the cell is
# idempotent. `df` still carries the `time_weight` column afterwards.
df = df.assign(time_weight=time_weight)

temporal_influence = df.groupby('retweeted_user_id')['time_weight'].sum().reset_index()
temporal_influence.columns = ['user_id', 'temporal_influence']

# Metric 4: Network influence (using PageRank)
# Build retweet network: one directed edge per row, retweeting user -> retweeted user.
# NOTE: `edges` is a one-shot zip iterator and is exhausted once the graph is built.
edges = zip(df['user_id'], df['retweeted_user_id'])
g = ig.Graph.TupleList(edges, directed=True)

# Calculate PageRank (unweighted edges)
pagerank = g.pagerank(directed=True, weights=None)
network_influence = pd.DataFrame({
    'user_id': g.vs['name'],
    'network_influence': pagerank
})


# Combine all metrics
# Outer joins keep every user that appears in any metric; a metric that is
# missing for a user is treated as 0.
influence_df = retweet_counts.merge(
    cred_weighted, on='user_id', how='outer'
).merge(
    temporal_influence, on='user_id', how='outer'
).merge(
    network_influence, on='user_id', how='outer'
).fillna(0)

# Normalize metrics
# Divide each metric by its column maximum so the top user scores 1.0 on it.
metrics = ['retweet_count', 'cred_weighted_influence', 'temporal_influence', 'network_influence']
influence_df[metrics] = influence_df[metrics] / influence_df[metrics].max()

# Combined influence score (weighted sum)
weights = {
    'retweet_count': 0.3,
    'cred_weighted_influence': 0.4,
    'temporal_influence': 0.2,
    'network_influence': 0.1
}

influence_df['combined_influence'] = sum(
    influence_df[metric] * weight for metric, weight in weights.items()
)

# Get top influential users
top_influential = influence_df.sort_values('combined_influence', ascending=False)
top_influential.head(10)

Unnamed: 0,user_id,retweet_count,cred_weighted_influence,temporal_influence,network_influence,combined_influence
1517,331617619,0.975024,1.0,0.248317,0.637716,0.805942
186,25676606,0.829011,0.772951,0.109434,0.8715,0.66692
815,133790890,0.436119,0.501508,0.650929,0.800316,0.541656
7153,1063806444380798976,0.513929,0.369787,1.0,0.346822,0.536775
4101,1683455144,1.0,0.062159,0.511794,0.712653,0.498488
4409,2308107619,0.464938,0.414241,0.025379,0.580436,0.368297
3369,1032615842,0.416907,0.169912,0.124491,1.0,0.317935
63,13514762,0.289145,0.215298,0.166254,0.298736,0.235987
5596,4572998716,0.349664,0.231891,0.009463,0.293512,0.2289
5573,4394960301,0.292988,0.163618,0.233442,0.238269,0.223859


In [201]:
# Baseline: users ordered by raw retweet count (highest first), with the count
# rescaled by its maximum so the most-retweeted user scores 1.0.
baseline_ranking = (
    retweet_counts
    .sort_values('retweet_count', ascending=False)
    .assign(retweet_count=lambda frame: frame['retweet_count'] / frame['retweet_count'].max())
)

In [202]:
# Restrict the comparison to users present in both rankings
eval_users = list(set(top_influential['user_id']) & set(baseline_ranking['user_id']))

# Filter each ranking to those users, keeping its existing row order, and
# attach a 1-based position. `assign` works on a copy, so the source frames
# are left untouched.
influence_rank = top_influential.loc[top_influential['user_id'].isin(eval_users)].assign(
    rank=lambda frame: range(1, len(frame) + 1)
)
baseline_rank = baseline_ranking.loc[baseline_ranking['user_id'].isin(eval_users)].assign(
    rank=lambda frame: range(1, len(frame) + 1)
)

# Align both score sets on user_id; the retweet_count column that exists on
# both sides is disambiguated with the _influence / _baseline suffixes.
combined = pd.merge(
    influence_rank,
    baseline_rank[['user_id', 'retweet_count']],
    how='outer',
    on='user_id',
    suffixes=('_influence', '_baseline'),
)

combined

Unnamed: 0,user_id,retweet_count_influence,cred_weighted_influence,temporal_influence,network_influence,combined_influence,rank,retweet_count_baseline
0,607983,0.000961,0.000896,1.343851e-08,0.006227,0.001269,766,0.000961
1,636143,0.000961,0.001045,3.388725e-04,0.002764,0.001050,877,0.000961
2,647043,0.000961,0.000806,7.506308e-07,0.003589,0.000970,1011,0.000961
3,1679201,0.000961,0.001194,7.059772e-07,0.002490,0.001015,948,0.000961
4,5378612,0.000961,0.000985,6.059056e-07,0.003589,0.001041,900,0.000961
...,...,...,...,...,...,...,...,...
1230,1339668963631001602,0.005764,0.006807,1.099446e-05,0.009773,0.005431,319,0.005764
1231,1340737374666563584,0.001921,0.001970,1.313089e-05,0.003919,0.001759,652,0.001921
1232,1354657898840907777,0.026897,0.000000,7.868700e-04,0.040559,0.012282,177,0.026897
1233,1380965912241709056,0.097983,0.069431,7.575638e-04,0.085053,0.065824,54,0.097983


In [210]:
# Ground truth relevance: the baseline's normalized retweet_count.
# Scores being evaluated: our combined_influence.
# Both are shaped (1, n_users), i.e. a single query row for ndcg_score.
true_relevance = combined['retweet_count_baseline'].to_numpy()[np.newaxis, :]
predicted_relevance = combined['combined_influence'].to_numpy()[np.newaxis, :]

# Only the top-k positions of the predicted ordering are scored
k = 10

# Calculate Evaluation metrics
ndcg = ndcg_score(true_relevance, predicted_relevance, k=k)

# Print evaluation results
print("EVALUATION RESULTS:", f" • NDCG@{k}: {ndcg:.4f}", sep="\n")

EVALUATION RESULTS:
 • NDCG@10: 0.9677


In [204]:
# Analyze differences: show the two top-10 lists side by side
print("\nTop users by our system:")
display(top_influential[['user_id', 'combined_influence']].head(10))

print("\nTop users by baseline:")
display(baseline_ranking[['user_id', 'retweet_count']].head(10))

# Examine why certain users rank differently
# Select the column first, then the row. `top_influential.iloc[0]['user_id']`
# would first build a row Series across the integer id and the float metric
# columns, upcasting the id to float64; ids above 2**53 cannot be represented
# exactly as floats, so both the printed id and the equality filter below
# could be wrong.
sample_user = top_influential['user_id'].iloc[0]
# All retweet rows in which the top-ranked user is the retweeted account
user_data = df[df['retweeted_user_id'] == sample_user]

print(f"\nAnalysis for user {sample_user}:")
print(f"- Number of retweets: {len(user_data)}")
print(f"- Average credibility: {user_data['credibility_score'].mean():.1f}")


Top users by our system:


Unnamed: 0,user_id,combined_influence
1517,331617619,0.805942
186,25676606,0.66692
815,133790890,0.541656
7153,1063806444380798976,0.536775
4101,1683455144,0.498488
4409,2308107619,0.368297
3369,1032615842,0.317935
63,13514762,0.235987
5596,4572998716,0.2289
5573,4394960301,0.223859



Top users by baseline:


Unnamed: 0,user_id,retweet_count
0,1683455144,1.0
1,331617619,0.975024
2,25676606,0.829011
3,1063806444380798976,0.513929
4,2308107619,0.464938
5,133790890,0.436119
6,1032615842,0.416907
7,4572998716,0.349664
8,4394960301,0.292988
9,13514762,0.289145



Analysis for user 331617619.0:
- Number of retweets: 1015
- Average credibility: 82.5
