In [1]:
# Load data
# Remove evaluation
# Get ground truth from evaluation
# Fit model on training
# Write rank comparison cost function
# Optimize weights with some method (naive: hill climbing)

In [2]:
import pandas as pd
from model import HModel

In [3]:
# Load retweets (observation period)
dtype = {'tweet_id': str,
         'user_id': str,
         'retweeted_user_id': str,
         'retweeted_status_id': str}

observation_df = pd.read_csv("data/observation_retweets.csv", parse_dates=[1], dtype=dtype)
observation_df

Unnamed: 0,tweet_id,created_at,user_id,retweeted_user_id,retweeted_status_id,retweet_count,likes_count,retweeted_text,root_domains,newsguard_rating
0,1340468299025551360,2020-12-20 01:25:21+00:00,497188910,1017807360075665408,1340325850378592257,4,0,@GagliardoneS @amnesia96225614 https://t.co/L3...,affaritaliani.it,64.5
1,1340468728534884354,2020-12-20 01:27:04+00:00,924336025387913221,,,0,0,,lastampa.it,95.0
2,1340473042129080320,2020-12-20 01:44:12+00:00,47148805,,,0,0,,nytimes.com,100.0
3,1340474125656190978,2020-12-20 01:48:31+00:00,1022891525242593280,,,1,3,,imolaoggi.it,5.0
4,1340477947627581440,2020-12-20 02:03:42+00:00,908206586,,,0,0,,repubblica.it tweetedtimes.com,95.0
...,...,...,...,...,...,...,...,...,...,...
164525,1366174375318274048,2021-02-28 23:52:08+00:00,2647427506,454423746,1366062410277208065,0,0,"Il ""percorso preferenziale"" per i giornalisti ...",gonews.it,95.0
164526,1366174768962162691,2021-02-28 23:53:41+00:00,268452474,,,0,0,,ilmessaggero.it,95.0
164527,1366175279174086659,2021-02-28 23:55:43+00:00,1262358617803587585,,,0,0,,informazione.it,70.0
164528,1366175279635390466,2021-02-28 23:55:43+00:00,1262358617803587585,,,0,0,,informazione.it,70.0


In [4]:
model_input_df = observation_df[['tweet_id', 'user_id', 'retweeted_status_id', 'created_at', 'newsguard_rating']].copy()
model_input_df.newsguard_rating = (model_input_df.newsguard_rating <= 45.0).astype(int)
model_input_df.retweeted_status_id.fillna('ORIGIN', inplace=True)
model_input_df

Unnamed: 0,tweet_id,user_id,retweeted_status_id,created_at,newsguard_rating
0,1340468299025551360,497188910,1340325850378592257,2020-12-20 01:25:21+00:00,0
1,1340468728534884354,924336025387913221,ORIGIN,2020-12-20 01:27:04+00:00,0
2,1340473042129080320,47148805,ORIGIN,2020-12-20 01:44:12+00:00,0
3,1340474125656190978,1022891525242593280,ORIGIN,2020-12-20 01:48:31+00:00,1
4,1340477947627581440,908206586,ORIGIN,2020-12-20 02:03:42+00:00,0
...,...,...,...,...,...
164525,1366174375318274048,2647427506,1366062410277208065,2021-02-28 23:52:08+00:00,0
164526,1366174768962162691,268452474,ORIGIN,2021-02-28 23:53:41+00:00,0
164527,1366175279174086659,1262358617803587585,ORIGIN,2021-02-28 23:55:43+00:00,0
164528,1366175279635390466,1262358617803587585,ORIGIN,2021-02-28 23:55:43+00:00,0


In [5]:
# Data splitting
cut_point = int(len(model_input_df)*0.8)
training_set = model_input_df[:cut_point]
evaluation_set = observation_df[cut_point:]

In [6]:
# Get the retweet network for ground truth
def get_retweet_network(retweets_df, low_cred_thr=None):
    
    features = ["retweeted_user_id", "user_id", "newsguard_rating"]
    edge_list_df = retweets_df[features].copy()
    
    if low_cred_thr:
        # Remove rows with no low credibility
        low_credibility_index = edge_list_df[edge_list_df.newsguard_rating > low_cred_thr].index
        edge_list_df.drop(low_credibility_index, inplace=True)
    
    edge_list_df = edge_list_df.groupby(["retweeted_user_id", "user_id"]).count()
    edge_list_df.sort_values(by="newsguard_rating", ascending=False, inplace=True)
    edge_list_df.reset_index(inplace=True)
    
    edge_list_df.rename(columns = {"retweeted_user_id": "source",
                                   "user_id": "target",
                                   "newsguard_rating": "weight"}, inplace=True)
    
    return edge_list_df

evaluation_net_df = get_retweet_network(evaluation_set, low_cred_thr=45.0)

In [7]:
# Get the optimal rank (ground truth)
optimal_rank_df = evaluation_net_df[['source', 'weight']].groupby('source').sum()
optimal_rank_df = optimal_rank_df.sort_values(by='weight', ascending=False)
optimal_rank_df.rename(columns={'weight': 'Optimal'}, inplace=True)
optimal_rank_df.index.rename('author_id', inplace=True)
optimal_rank_df

Unnamed: 0_level_0,Optimal
author_id,Unnamed: 1_level_1
1683455144,1042
1029307293928771584,281
1322629376421355521,275
1032081918,194
910827588,186
...,...
1342094226893500418,1
422412954,1
1343969883109916675,1
1348395776867762182,1


In [8]:
optimal_rank_dict = optimal_rank_df.to_dict()['Optimal']

In [9]:
optimal_rank_dict

{'1683455144': 1042,
 '1029307293928771584': 281,
 '1322629376421355521': 275,
 '1032081918': 194,
 '910827588': 186,
 '245969509': 117,
 '1032615842': 108,
 '117701249': 102,
 '4758512368': 100,
 '1248216384577953792': 85,
 '326150500': 75,
 '500882938': 50,
 '2391603191': 49,
 '1333884982268473345': 46,
 '1006071437168390144': 41,
 '1940288287': 37,
 '726721856749842436': 37,
 '1245225642003660800': 36,
 '2439937320': 35,
 '775343030421291008': 30,
 '1024702799215509504': 30,
 '1388854795': 29,
 '4257129537': 28,
 '1159853921227198464': 27,
 '1935534786': 25,
 '274093178': 24,
 '1302370010728460299': 24,
 '250082635': 24,
 '51759517': 24,
 '1673695388': 23,
 '398117986': 23,
 '2286319188': 23,
 '1620487452': 22,
 '986811270': 21,
 '1262507047582564352': 18,
 '221681749': 17,
 '503223395': 17,
 '1343955493212065794': 16,
 '896048840947765249': 16,
 '107517248': 15,
 '332885533': 11,
 '3250047701': 10,
 '454365633': 9,
 '48484178': 9,
 '423474251': 9,
 '1325851962756263936': 8,
 '38967

### Model training

In [10]:
model = HModel()

model.fit(list(training_set.itertuples(index=False)),
          content_key=lambda x: x[0],
          author_key=lambda x: x[1],
          root_content_key=lambda x: x[2],
          timestamp_key=lambda x: x[3],
          misinf_key=lambda x: x[4])

### Rank fitness function

In [11]:
import rbo

def rbo_fit(test_rank, true_rank, sampling=0.5):

    # Rank intersection
    true_known = {k: v for k, v in true_sampled if k in test_rank}
    test_known = {k: v for k, v in test_sampled if k in true_known}

    # Rank sampling
    #true_sampled = [x for x in true_known]
    
    # 'A similarity measure for indefinite rankings' (Rank-Biased Overlap)
    return rbo.RankingSimilarity(list(test_known.keys()), list(true_known.keys())).rbo()

In [12]:
# Rank cost test
dtype = {'author_id': str}
popularity_rank_df = pd.read_csv('data/popularity_rank.csv', index_col='author_id', dtype=dtype)
popularity_rank_dict = popularity_rank_df.to_dict()['Popularity']
influence_rank_df = pd.read_csv('data/influence_rank.csv', index_col='author_id', dtype=dtype)
influence_rank_dict = influence_rank_df.to_dict()['Influence']
fib_rank_df = pd.read_csv('data/fib_rank.csv', index_col='author_id', dtype=dtype)
fib_rank_dict = fib_rank_df.to_dict()['FIB-i']
retweets_rank_df = pd.read_csv('data/retweets_rank.csv', index_col='author_id', dtype=dtype)
fraction_rank_df = pd.read_csv('data/fraction_rank.csv', index_col='author_id', dtype=dtype)
compound_rank_df = pd.read_csv('data/compound_rank.csv', index_col='author_id', dtype=dtype)

In [13]:
def normalize_rank(rank):
    
    norm_rank = dict()
    prev = None
    n = 0
    for k, v in rank.items():
        
        norm_rank[k] = n
        
        if n == 0 or v != prev:
            n += 1

        prev = v
    return norm_rank


def rn_cost(test_rank, true_rank):

    # Rank intersection
    true_known = normalize_rank({k: v for k, v in true_rank.items() if k in test_rank})
    test_known = normalize_rank({k: v for k, v in test_rank.items() if k in true_known})

    cost = 0

    for k, v in test_known.items():
        cost += abs(true_known[k] - v)

    return cost
    
        

#normalize_rank(popularity_rank_dict)
print(rn_cost(popularity_rank_dict, optimal_rank_dict))
print(rn_cost(influence_rank_dict, optimal_rank_dict))
print(rn_cost(fib_rank_dict, optimal_rank_dict))
print(rn_cost(fib_rank_dict, fib_rank_dict))

12770
4377
4550
0


In [14]:
#popularity_rank_dict

In [15]:
#optimal_rank_dict

In [16]:
#rbo_fit(fib_rank_dict, optimal_rank_dict)

In [17]:
#rbo_fit(influence_rank_dict, optimal_rank_dict)

In [18]:
#rbo_fit(popularity_rank_dict, optimal_rank_dict)

In [19]:
#rbo_fit(optimal_rank_dict, optimal_rank_dict)

In [20]:
#import random as rnd
#random_rank_dict = {k: rnd.randint(0, 1000) for k, _ in influence_rank_dict.items()}
#rbo_fit(random_rank_dict, optimal_rank_dict)

### Naive optimization

In [21]:
import numpy as np
import random as rnd

model.weights = np.random.rand(len(model.weights))
#model.weights /= np.linalg.norm(model.weights

for n in range(1000):
    
    current_weights = model.weights.copy()
    current_fit = rn_cost(model.get_rank(), optimal_rank_dict) # rbo_fit(model.get_rank(), optimal_rank_dict)

    for _ in range(3):
        #model.weights /= np.linalg.norm(model.weights)
        
        r_index = rnd.randint(0, len(current_weights) - 1)
        r_shift = (rnd.random() * 2.0) - 1.0
        
        model.weights[r_index] = model.weights[r_index] + r_shift #((model.weights[r_index] + r_shift) % 2) - 1
        #model.weights /= np.linalg.norm(model.weights)
        #model.weights *= np.linalg.norm(model.weights)
    
    new_fit = rn_cost(model.get_rank(), optimal_rank_dict) #rbo_fit(model.get_rank(), optimal_rank_dict)
    
    # if new_fit < current_fit:
    if new_fit >= current_fit:
        model.weights = current_weights
    else:
        print('Improvement:', current_fit, '->', new_fit)


Improvement: 7904 -> 7893
Improvement: 7893 -> 7884
Improvement: 7884 -> 7854
Improvement: 7854 -> 7836
Improvement: 7836 -> 7390
Improvement: 7390 -> 7085
Improvement: 7085 -> 7053
Improvement: 7053 -> 6940
Improvement: 6940 -> 6872
Improvement: 6872 -> 6829
Improvement: 6829 -> 6820
Improvement: 6820 -> 6813
Improvement: 6813 -> 6792
Improvement: 6792 -> 6771
Improvement: 6771 -> 6763
Improvement: 6763 -> 6749
Improvement: 6749 -> 6744
Improvement: 6744 -> 6652
Improvement: 6652 -> 6646
Improvement: 6646 -> 6645
Improvement: 6645 -> 6628
Improvement: 6628 -> 6621
Improvement: 6621 -> 6620
Improvement: 6620 -> 6604
Improvement: 6604 -> 6585
Improvement: 6585 -> 6581
Improvement: 6581 -> 6540
Improvement: 6540 -> 6536
Improvement: 6536 -> 6528
Improvement: 6528 -> 6502
Improvement: 6502 -> 6501
Improvement: 6501 -> 6497
Improvement: 6497 -> 6482
Improvement: 6482 -> 6477
Improvement: 6477 -> 6461
Improvement: 6461 -> 6453
Improvement: 6453 -> 6445
Improvement: 6445 -> 6440
Improvement:

In [22]:
model.weights

array([-2.6113076 , -5.63461515, -0.03953497, -0.79363539, -0.87632238,
       -0.43171447, -2.16989018,  1.26369219,  1.99328671])

In [26]:
# Saving the optimized rank
#lin_rank_df = pd.DataFrame(list(model.get_rank().items()), columns=['author_id', 'Linear'])
#lin_rank_df.set_index('author_id', inplace=True)
#lin_rank_df = lin_rank_df.sort_values(by='Linear', ascending=False)

In [27]:
#lin_rank_df

In [28]:
#lin_rank_df.to_csv('data/linear_rank.csv')

### Genetic

In [None]:
# Create a population
# Select
# Crossover
# Evaluate