# Ranking accounts

In [1]:
import numpy as np
import pandas as pd

from model import HModel

## Observation data (Training Set)

In [2]:
# Read the observation-period retweets from CSV.
# Every identifier column is loaded as a string rather than as a number.
dtype = dict.fromkeys(
    ['tweet_id', 'user_id', 'retweeted_user_id', 'retweeted_status_id'],
    str,
)

# The column at position 1 is parsed as a date (presumably `created_at` — verify against the CSV).
observation_df = pd.read_csv(
    "data/observation_retweets.csv",
    parse_dates=[1],
    dtype=dtype,
)

In [3]:
# Display the observation-period retweets (pandas truncates the view of large frames)
observation_df

Unnamed: 0,tweet_id,created_at,user_id,retweeted_user_id,retweeted_status_id,retweet_count,likes_count,retweeted_text,root_domains,newsguard_rating
0,1340468299025551360,2020-12-20 01:25:21+00:00,497188910,1017807360075665408,1340325850378592257,4,0,@GagliardoneS @amnesia96225614 https://t.co/L3...,affaritaliani.it,64.5
1,1340468728534884354,2020-12-20 01:27:04+00:00,924336025387913221,,,0,0,,lastampa.it,95.0
2,1340473042129080320,2020-12-20 01:44:12+00:00,47148805,,,0,0,,nytimes.com,100.0
3,1340474125656190978,2020-12-20 01:48:31+00:00,1022891525242593280,,,1,3,,imolaoggi.it,5.0
4,1340477947627581440,2020-12-20 02:03:42+00:00,908206586,,,0,0,,repubblica.it tweetedtimes.com,95.0
...,...,...,...,...,...,...,...,...,...,...
164525,1366174375318274048,2021-02-28 23:52:08+00:00,2647427506,454423746,1366062410277208065,0,0,"Il ""percorso preferenziale"" per i giornalisti ...",gonews.it,95.0
164526,1366174768962162691,2021-02-28 23:53:41+00:00,268452474,,,0,0,,ilmessaggero.it,95.0
164527,1366175279174086659,2021-02-28 23:55:43+00:00,1262358617803587585,,,0,0,,informazione.it,70.0
164528,1366175279635390466,2021-02-28 23:55:43+00:00,1262358617803587585,,,0,0,,informazione.it,70.0


## Users metadata

In [4]:
# Read the user metadata from CSV (used for the Popularity metric).
# NOTE: the dtypes are given explicitly because pandas otherwise warns about
# columns with mixed data types.
dtype = dict.fromkeys(
    ['user_id', 'user_screen_name', 'name', 'verified', 'protected', 'location', 'description'],
    str,
)

# The column at position 1 is parsed as a date (presumably `created_at` — verify against the CSV).
user_data_df = pd.read_csv(
    "data/final_users_new.csv",
    parse_dates=[1],
    dtype=dtype,
)

In [5]:
# Display the user metadata (pandas truncates the view of large frames)
user_data_df

Unnamed: 0,user_id,created_at,user_screen_name,name,nbr_followers,nbr_followings,nbr_liked_tweets,nbr_lists,nbr_tweets,verified,protected,location,description,nbr_accounts,nbr_usernames
0,1338507058551943171,2020-12-14 15:32:19+00:00,natAfrica1,Nature Africa,325.0,0.0,0.0,1.0,0.0,False,False,,"Bringing you the latest science news, features...",1.0,1.0
1,380749300,2011-09-27 05:02:03+00:00,Apple,Apple,5821968.0,0.0,0.0,10925.0,0.0,True,False,"Cupertino, CA",http://Apple.com,1.0,1.0
2,1179147907188043776,2019-10-01 21:35:50+00:00,david25342,david2534,0.0,11.0,2.0,0.0,1.0,False,False,,,1.0,1.0
3,1340056522597019648,2020-12-18 22:09:17+00:00,AnnaLauraMasci1,Anna Laura Masciopinto,1.0,10.0,0.0,0.0,1.0,False,False,,,1.0,1.0
4,1408886954566037506,2021-06-26 20:37:08+00:00,NicolaG86861424,Nicola Gentile,0.0,5.0,0.0,0.0,1.0,False,False,,,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1166141,139103913,2010-05-01 15:11:38+00:00,HernanPorrasM,Hernán Porras Molina,50736.0,9548.0,3746.0,496.0,3771762.0,False,False,"Miami, FL",Director de http://Entornointeligente.com y CE...,1.0,1.0
1166142,120421476,2010-03-06 11:22:06+00:00,kakusan_RT,拡散RT,20825.0,1.0,23.0,514.0,3899871.0,False,False,日本,「拡散」が含まれるツイートをRTするbotです。お問い合わせは本アカウントの固定ツイートまた...,1.0,1.0
1166143,109260511,2010-01-28 13:32:03+00:00,ja8yum,Kinn-san,7318.0,5096.0,871.0,234.0,3941873.0,False,False,北海道北見市北見,"重度の化学物質(Na,Cl)アレルギー症発症で、2006年4月30日に29年間のサラリーマン...",1.0,1.0
1166144,124172948,2010-03-18 14:02:00+00:00,la_patilla,La Patilla,7063943.0,173.0,596.0,15393.0,5747076.0,True,False,Venezuela,Información e Investigación. \nDESCARGA NUESTR...,1.0,1.0


## Auxiliary data and Settings

In [6]:
# Credibility threshold: NewsGuard ratings at or below this value are
# flagged as 1 when the model input is built.
CRED_THR = 45.0

In [7]:
# Collect every user that posted or reposted within the observation period.
# NOTE: only users that performed an action are included; to also include
# accounts that were only retweeted, add observation_df.retweeted_user_id.dropna()
# to the concatenated list.
known_user = pd.concat([observation_df['user_id']]).unique()
# One-column frame holding all known users of the observation period
known_user_df = pd.DataFrame({'author_id': known_user})

In [8]:
# Display the first rows of the known-users frame
known_user_df.head()

Unnamed: 0,author_id
0,497188910
1,924336025387913221
2,47148805
3,1022891525242593280
4,908206586


## FIB-index

In [9]:
# Format the data to feed the model: keep only the columns the model reads.
# The column order matters because the model's key accessors address fields by position.
model_input_df = observation_df[['tweet_id', 'user_id', 'retweeted_status_id', 'created_at', 'newsguard_rating']].copy()
# Binary flag: 1 when the NewsGuard rating is at or below CRED_THR, 0 otherwise.
# NOTE(review): a missing rating compares as False and therefore becomes 0 — confirm this is intended.
model_input_df['newsguard_rating'] = (model_input_df['newsguard_rating'] <= CRED_THR).astype(int)
# Rows without a retweeted status are labelled 'ORIGIN'.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column accessor: that chained in-place call acts on an intermediate object
# and no longer updates the frame under pandas Copy-on-Write.
model_input_df['retweeted_status_id'] = model_input_df['retweeted_status_id'].fillna('ORIGIN')

In [10]:
# Display the model input (pandas truncates the view of large frames)
model_input_df

Unnamed: 0,tweet_id,user_id,retweeted_status_id,created_at,newsguard_rating
0,1340468299025551360,497188910,1340325850378592257,2020-12-20 01:25:21+00:00,0
1,1340468728534884354,924336025387913221,ORIGIN,2020-12-20 01:27:04+00:00,0
2,1340473042129080320,47148805,ORIGIN,2020-12-20 01:44:12+00:00,0
3,1340474125656190978,1022891525242593280,ORIGIN,2020-12-20 01:48:31+00:00,1
4,1340477947627581440,908206586,ORIGIN,2020-12-20 02:03:42+00:00,0
...,...,...,...,...,...
164525,1366174375318274048,2647427506,1366062410277208065,2021-02-28 23:52:08+00:00,0
164526,1366174768962162691,268452474,ORIGIN,2021-02-28 23:53:41+00:00,0
164527,1366175279174086659,1262358617803587585,ORIGIN,2021-02-28 23:55:43+00:00,0
164528,1366175279635390466,1262358617803587585,ORIGIN,2021-02-28 23:55:43+00:00,0


In [11]:
# Data for the model: one tuple per row of model_input_df (index excluded),
# with fields in column order so that they can be addressed by position.
data = list(model_input_df.itertuples(index=False))

In [12]:
# Create the model
model = HModel()
# Fit it on the observation data; each accessor picks one field of a record by position
model.fit(
    data,
    content_key=lambda record: record[0],
    author_key=lambda record: record[1],
    root_content_key=lambda record: record[2],
    timestamp_key=lambda record: record[3],
    misinf_key=lambda record: record[4],
)

In [13]:
# Select the FIB rank by hand: weight 1 on the first entry, 0 on the other eight
model.set_weights([1] + [0] * 8)
# Retrieve the ranked values
fib_rank = model.get_rank()

In [14]:
# Build a frame indexed by author_id, ordered from the highest to the lowest
# FIB-i value, with the values cast to integers
fib_rank_df = (
    pd.DataFrame(list(fib_rank.items()), columns=['author_id', 'FIB-i'])
    .set_index('author_id')
    .sort_values(by='FIB-i', ascending=False)
    .astype(int)
)

In [15]:
# Display the top of the FIB rank
fib_rank_df.head()

Unnamed: 0_level_0,FIB-i
author_id,Unnamed: 1_level_1
1683455144,34
910827588,14
1032615842,12
4758512368,12
245969509,11


## TeFIB (Time-aware FIB) index

In [16]:
# Create a fresh model (replacing the previous `model`) and train it with time_fit
model = HModel()

# Each accessor picks one field of a record by position
model.time_fit(
    data,
    content_key=lambda item: item[0],
    author_key=lambda item: item[1],
    root_content_key=lambda item: item[2],
    timestamp_key=lambda item: item[3],
    misinf_key=lambda item: item[4],
)

In [17]:
# Select the TeFIB rank by hand: weight 1 on the first entry, 0 on the other eight
model.set_weights([1] + [0] * 8)
# Retrieve the time-aware ranked values
tefib_rank = model.get_time_rank()

In [18]:
# Build a frame indexed by author_id, ordered from the highest to the lowest
# TeFIB-i value, with the values cast to integers.
# NOTE(review): astype(int) discards any fractional part of the scores —
# confirm that TeFIB-i values are meant to be stored as integers.
tefib_rank_df = (
    pd.DataFrame(list(tefib_rank.items()), columns=['author_id', 'TeFIB-i'])
    .set_index('author_id')
    .sort_values(by='TeFIB-i', ascending=False)
    .astype(int)
)

In [19]:
# Display the top of the TeFIB rank
tefib_rank_df.head()

Unnamed: 0_level_0,TeFIB-i
author_id,Unnamed: 1_level_1
1683455144,60
4758512368,21
1248216384577953792,19
245969509,19
1032615842,14


## Compound rank

In [20]:
# Compound rank: the nine model weights below were optimized via hill-climbing.
# NOTE(review): `model` is the instance trained with `time_fit` in the TeFIB
# section, yet `get_rank()` (not `get_time_rank()`) is called here, and the
# Compound scores displayed below have the same leading authors and the same
# integer parts as the TeFIB scores — confirm that this is the intended
# model/method pair and that the weights are actually applied.
COMPOUND_WEIGHTS = [-2.6113076, -5.63461515, -0.03953497,
                    -0.79363539, -0.87632238, -0.43171447,
                    -2.16989018, 1.26369219, 1.99328671]
model.set_weights(COMPOUND_WEIGHTS)
comp_rank = model.get_rank()

In [21]:
# Build a frame indexed by author_id, ordered from the highest to the lowest
# Compound value (values are kept as returned by the model, without casting)
comp_rank_df = (
    pd.DataFrame(list(comp_rank.items()), columns=['author_id', 'Compound'])
    .set_index('author_id')
    .sort_values(by='Compound', ascending=False)
)

In [22]:
# Display the Compound rank (pandas truncates the view of large frames)
comp_rank_df

Unnamed: 0_level_0,Compound
author_id,Unnamed: 1_level_1
1683455144,60.594628
4758512368,21.863102
1248216384577953792,19.890819
245969509,19.743902
1032615842,14.972991
...,...
1049949019429134336,0.000000
1088677627,0.000000
2343234367,0.000000
9437162,0.000000


## Popularity rank

In [23]:
# Number of followers for each user_id.
# NOTE: the user metadata contains duplicated users. De-duplicate on `user_id`
# alone, keeping the row with the highest follower count, so that every author
# appears exactly once in the rank even when the duplicated rows report
# different follower counts (de-duplicating on the (user_id, nbr_followers)
# pair would keep one row per distinct count).
nbr_followers_df = (
    user_data_df[['user_id', 'nbr_followers']]
    .sort_values(by='nbr_followers', ascending=False)
    .drop_duplicates(subset='user_id')
    .set_index('user_id')
)
# Look up every known user by user_id, sort by followers and rename to the
# common rank format (index `author_id`, one value column).
# NOTE: `.loc` raises a KeyError when a label is missing, so this relies on the
# known users being a subset of the users in the metadata set.
# NOTE(review): the integer cast assumes no missing follower counts among the
# known users — TODO confirm.
popularity_rank_df = (
    nbr_followers_df.loc[known_user]
    .sort_values(by='nbr_followers', ascending=False)
    .rename(columns={'nbr_followers': 'Popularity'})
    .rename_axis('author_id')
    .astype({'Popularity': int})
)

In [24]:
# NOTE(review): leftover debug cell; it takes no part in the analysis and is a candidate for removal.
print("ciao")


ciao


In [25]:
# NOTE(review): leftover debug cell; it takes no part in the analysis and is a candidate for removal.
print("ciao!")

ciao!


In [26]:
# Display the Popularity rank (pandas truncates the view of large frames)
popularity_rank_df

Unnamed: 0_level_0,Popularity
author_id,Unnamed: 1_level_1
24744541,9407677
5893702,3303533
18935802,3191161
395218906,2422418
19067940,2414899
...,...
1341391235068661763,0
1052556445143707651,0
1119319448144502786,0
1248325470078275584,0


## Influence rank

In [27]:
# Influence = number of rows in which an account appears as retweeted_user_id.
influence_rank_df = (
    observation_df
    # Count the tweet_id entries of each retweeted account
    .groupby('retweeted_user_id')[['tweet_id']].count()
    # Match the common rank format: index `author_id`, one value column
    .rename(columns={'tweet_id': 'Influence'})
    .rename_axis('author_id')
    # The right merge also keeps known users that have never been retweeted (count 0)
    .merge(known_user_df, how='right', on='author_id')
    .fillna(0)
    # Final ordering, then integer counts
    .sort_values(by='Influence', ascending=False)
    .set_index('author_id')
    .astype(int)
)

In [28]:
# Display the Influence rank (pandas truncates the view of large frames)
influence_rank_df

Unnamed: 0_level_0,Influence
author_id,Unnamed: 1_level_1
1683455144,3633
331617619,3219
1063806444380798976,2080
25676606,1900
910827588,1408
...,...
311926149,0
514552306,0
1069275696,0
396448393,0


## Retweeter rank

In [29]:
# Retweets = number of retweets each user made in the observation period.
retweeter_rank_df = (
    observation_df
    # Keep only retweets, i.e. rows that reference a retweeted status. The
    # subset is explicit because a bare dropna() would also discard retweets
    # with a missing value in any unrelated column, undercounting them.
    .dropna(subset=['retweeted_status_id'])
    # Count the tweet_id entries of each retweeting user
    .groupby('user_id')[['tweet_id']].count()
    # Match the common rank format: index `author_id`, one value column
    .rename(columns={'tweet_id': 'Retweets'})
    .rename_axis('author_id')
    # The right merge also keeps known users that never retweeted anyone (count 0)
    .merge(known_user_df, how='right', on='author_id')
    .fillna(0)
    # Order by retweets made, then integer counts
    .sort_values(by='Retweets', ascending=False)
    .set_index('author_id')
    .astype({'Retweets': int})
)

In [30]:
# Display the Retweeter rank (pandas truncates the view of large frames)
retweeter_rank_df

Unnamed: 0_level_0,Retweets
author_id,Unnamed: 1_level_1
1084450777748459520,568
555300363,275
1249068072360697862,258
3308422191,222
424084271,184
...,...
129316368,0
704094586528870400,0
459335145,0
277609784,0


## Fraction rank

In [31]:
# Read the observation retweet network from CSV:
# node identifiers as strings, edge weights as integers.
dtype = dict(source=str, target=str, weight=int)

observation_net_df = pd.read_csv(
    'data/observation_retweet_network.csv',
    dtype=dtype,
)

In [32]:
# Display the retweet network edges (pandas truncates the view of large frames)
observation_net_df

Unnamed: 0,source,target,weight
0,3318549610,1249068072360697862,99
1,1683455144,1299087773601861632,79
2,1683455144,424084271,69
3,1683455144,1341063070983008261,60
4,1683455144,555300363,48
...,...,...,...
13477,1322629376421355521,1219944530,1
13478,1322629376421355521,1220637330,1
13479,1322629376421355521,1223007450179215360,1
13480,1322629376421355521,1223200116670115850,1


In [34]:
# Fraction = share of the total network weight that originates from each source node.
fraction_rank_df = (
    observation_net_df
    # Sum the edge weights per source node (per the original note: total retweeted misinformation)
    .groupby('source')[['weight']].sum()
    # Turn each sum into a fraction of the weight of the whole network
    .assign(Fraction=lambda per_source: per_source['weight'] / observation_net_df['weight'].sum())
    # Match the common rank format: index `author_id`
    .rename_axis('author_id')
    # The right merge also keeps known users that never appear as a source (fraction 0)
    .merge(known_user_df, how='right', on='author_id')
    .fillna(0)
    .sort_values(by='Fraction', ascending=False)
    # The raw weight sum is not part of the saved rank
    .drop(columns='weight')
    .set_index('author_id')
)

In [35]:
# Display the Fraction rank (pandas truncates the view of large frames)
fraction_rank_df

Unnamed: 0_level_0,Fraction
author_id,Unnamed: 1_level_1
1683455144,0.196921
910827588,0.076318
1248216384577953792,0.054854
1032615842,0.027806
1322629376421355521,0.027806
...,...
926526080357470219,0.000000
1062348207525711872,0.000000
595750802,0.000000
478591778,0.000000


## Saving the ranks

In [36]:
# Write every rank to its own CSV file (the index, author_id, is written as well)
ranks_to_save = {
    'data/popularity_rank.csv': popularity_rank_df,
    'data/influence_rank.csv': influence_rank_df,
    'data/fib_rank.csv': fib_rank_df,
    'data/tefib_rank.csv': tefib_rank_df,
    'data/compound_rank.csv': comp_rank_df,
    'data/retweets_rank.csv': retweeter_rank_df,
    'data/fraction_rank.csv': fraction_rank_df,
}
for csv_path, rank_df in ranks_to_save.items():
    rank_df.to_csv(csv_path)

In [37]:
# Reading test: load one of the saved ranks back, using author_id as the index.
# NOTE(review): no dtype is passed here, so pandas infers the type of author_id
# instead of keeping it as str like the loaders above — confirm that consumers
# of these files read the ids with an explicit dtype.
pd.read_csv('data/popularity_rank.csv', index_col='author_id')

Unnamed: 0_level_0,Popularity
author_id,Unnamed: 1_level_1
24744541,9407677
5893702,3303533
18935802,3191161
395218906,2422418
19067940,2414899
...,...
1341391235068661763,0
1052556445143707651,0
1119319448144502786,0
1248325470078275584,0
