# Accounts ranking

In [1]:
import pandas as pd
import numpy as np
from model import HModel
import datetime
from datetime import timedelta

## Observation data (Training Set)

In [2]:
# Read the tweets/retweets collected during the observation period.
# Identifier columns are forced to `str` so the long numeric IDs are kept
# verbatim; CSV column 1 (`created_at`) is parsed as a timestamp.
dtype = dict.fromkeys(
    ['tweet_id', 'user_id', 'retweeted_user_id', 'retweeted_status_id'],
    str,
)

observation_df = pd.read_csv(
    "data/observation_retweets.csv",
    dtype=dtype,
    parse_dates=[1],
)

In [3]:
# Show preview
observation_df

Unnamed: 0,tweet_id,created_at,user_id,retweeted_user_id,retweeted_status_id,retweet_count,likes_count,retweeted_text,root_domains,newsguard_rating
0,1340468299025551360,2020-12-20 01:25:21+00:00,497188910,1017807360075665408,1340325850378592257,4,0,@GagliardoneS @amnesia96225614 https://t.co/L3...,affaritaliani.it,64.5
1,1340468728534884354,2020-12-20 01:27:04+00:00,924336025387913221,,,0,0,,lastampa.it,95.0
2,1340473042129080320,2020-12-20 01:44:12+00:00,47148805,,,0,0,,nytimes.com,100.0
3,1340474125656190978,2020-12-20 01:48:31+00:00,1022891525242593280,,,1,3,,imolaoggi.it,5.0
4,1340477947627581440,2020-12-20 02:03:42+00:00,908206586,,,0,0,,repubblica.it tweetedtimes.com,95.0
...,...,...,...,...,...,...,...,...,...,...
164525,1366174375318274048,2021-02-28 23:52:08+00:00,2647427506,454423746,1366062410277208065,0,0,"Il ""percorso preferenziale"" per i giornalisti ...",gonews.it,95.0
164526,1366174768962162691,2021-02-28 23:53:41+00:00,268452474,,,0,0,,ilmessaggero.it,95.0
164527,1366175279174086659,2021-02-28 23:55:43+00:00,1262358617803587585,,,0,0,,informazione.it,70.0
164528,1366175279635390466,2021-02-28 23:55:43+00:00,1262358617803587585,,,0,0,,informazione.it,70.0


## Users metadata

In [4]:
# Read the per-account metadata (the follower counts feed the Popularity rank).
# NOTE: the dtypes are given explicitly because pandas otherwise warns about
# columns holding mixed data types.
dtype = dict.fromkeys(
    [
        'user_id',
        'user_screen_name',
        'name',
        'verified',
        'protected',
        'location',
        'description',
    ],
    str,
)

user_data_df = pd.read_csv(
    "data/final_users_new.csv",
    dtype=dtype,
    parse_dates=[1],
)

In [5]:
# Show preview
user_data_df

Unnamed: 0,user_id,created_at,user_screen_name,name,nbr_followers,nbr_followings,nbr_liked_tweets,nbr_lists,nbr_tweets,verified,protected,location,description,nbr_accounts,nbr_usernames
0,1338507058551943171,2020-12-14 15:32:19+00:00,natAfrica1,Nature Africa,325.0,0.0,0.0,1.0,0.0,False,False,,"Bringing you the latest science news, features...",1.0,1.0
1,380749300,2011-09-27 05:02:03+00:00,Apple,Apple,5821968.0,0.0,0.0,10925.0,0.0,True,False,"Cupertino, CA",http://Apple.com,1.0,1.0
2,1179147907188043776,2019-10-01 21:35:50+00:00,david25342,david2534,0.0,11.0,2.0,0.0,1.0,False,False,,,1.0,1.0
3,1340056522597019648,2020-12-18 22:09:17+00:00,AnnaLauraMasci1,Anna Laura Masciopinto,1.0,10.0,0.0,0.0,1.0,False,False,,,1.0,1.0
4,1408886954566037506,2021-06-26 20:37:08+00:00,NicolaG86861424,Nicola Gentile,0.0,5.0,0.0,0.0,1.0,False,False,,,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1166141,139103913,2010-05-01 15:11:38+00:00,HernanPorrasM,Hernán Porras Molina,50736.0,9548.0,3746.0,496.0,3771762.0,False,False,"Miami, FL",Director de http://Entornointeligente.com y CE...,1.0,1.0
1166142,120421476,2010-03-06 11:22:06+00:00,kakusan_RT,拡散RT,20825.0,1.0,23.0,514.0,3899871.0,False,False,日本,「拡散」が含まれるツイートをRTするbotです。お問い合わせは本アカウントの固定ツイートまた...,1.0,1.0
1166143,109260511,2010-01-28 13:32:03+00:00,ja8yum,Kinn-san,7318.0,5096.0,871.0,234.0,3941873.0,False,False,北海道北見市北見,"重度の化学物質(Na,Cl)アレルギー症発症で、2006年4月30日に29年間のサラリーマン...",1.0,1.0
1166144,124172948,2010-03-18 14:02:00+00:00,la_patilla,La Patilla,7063943.0,173.0,596.0,15393.0,5747076.0,True,False,Venezuela,Información e Investigación. \nDESCARGA NUESTR...,1.0,1.0


## Auxiliary data and Settings

In [6]:
# Credibility threshold applied to `newsguard_rating`: rows whose rating is
# less than or equal to this value are treated as low-credibility content.
CRED_THR = 45.0

In [7]:
# Keep only the rows whose source rating is at or below the credibility
# threshold, i.e. the low-credibility content (rows without a rating are dropped).
misinf_observation_df = observation_df.loc[observation_df['newsguard_rating'].le(CRED_THR)]

In [8]:
# Collect every account seen during the observation period: accounts that
# posted or reposted (`user_id`) plus accounts that were retweeted
# (`retweeted_user_id`, missing values removed).
# NOTE: to restrict the set to accounts that performed an action themselves,
# remove the `retweeted_user_id` series from the concatenation.
known_users = pd.concat(
    [
        observation_df['user_id'],
        observation_df['retweeted_user_id'].dropna(),
    ]
).unique()

# One-column frame listing the known accounts
known_users_df = pd.DataFrame({'author_id': known_users})

In [9]:
known_users_df

Unnamed: 0,author_id
0,497188910
1,924336025387913221
2,47148805
3,1022891525242593280
4,908206586
...,...
29969,1139470271331676160
29970,774249195456831488
29971,255931504
29972,887706333587402752


In [10]:
# Read the retweet network of the evaluation period; it is the basis of the
# optimal (ground-truth) ranking. Node ids stay strings, edge weights are integers.
dtype = dict(source=str, target=str, weight=int)

evaluation_net_df = pd.read_csv(
    'data/evaluation_retweet_network.csv',
    dtype=dtype,
)

### FIB-index

In [11]:
# Build the model input: one row per tweet, with the columns in the order the
# model's positional key functions expect
# (content id, author id, root content id, timestamp, flag).
model_input_df = observation_df[['tweet_id', 'user_id', 'retweeted_status_id', 'created_at', 'newsguard_rating']].copy()
# Binary flag: 1 when the rating is at or below CRED_THR, 0 otherwise
# (a missing rating compares False and therefore becomes 0).
model_input_df['newsguard_rating'] = (model_input_df['newsguard_rating'] <= CRED_THR).astype(int)
# Rows without a root status are marked 'ORIGIN'.
# The result is assigned back explicitly: calling fillna(inplace=True) on a
# column obtained through attribute access is a chained assignment, which
# leaves the frame unchanged when pandas Copy-on-Write is active.
model_input_df['retweeted_status_id'] = model_input_df['retweeted_status_id'].fillna('ORIGIN')

In [12]:
# Show preview
model_input_df

Unnamed: 0,tweet_id,user_id,retweeted_status_id,created_at,newsguard_rating
0,1340468299025551360,497188910,1340325850378592257,2020-12-20 01:25:21+00:00,0
1,1340468728534884354,924336025387913221,ORIGIN,2020-12-20 01:27:04+00:00,0
2,1340473042129080320,47148805,ORIGIN,2020-12-20 01:44:12+00:00,0
3,1340474125656190978,1022891525242593280,ORIGIN,2020-12-20 01:48:31+00:00,1
4,1340477947627581440,908206586,ORIGIN,2020-12-20 02:03:42+00:00,0
...,...,...,...,...,...
164525,1366174375318274048,2647427506,1366062410277208065,2021-02-28 23:52:08+00:00,0
164526,1366174768962162691,268452474,ORIGIN,2021-02-28 23:53:41+00:00,0
164527,1366175279174086659,1262358617803587585,ORIGIN,2021-02-28 23:55:43+00:00,0
164528,1366175279635390466,1262358617803587585,ORIGIN,2021-02-28 23:55:43+00:00,0


In [13]:
model_input_df.created_at

0        2020-12-20 01:25:21+00:00
1        2020-12-20 01:27:04+00:00
2        2020-12-20 01:44:12+00:00
3        2020-12-20 01:48:31+00:00
4        2020-12-20 02:03:42+00:00
                    ...           
164525   2021-02-28 23:52:08+00:00
164526   2021-02-28 23:53:41+00:00
164527   2021-02-28 23:55:43+00:00
164528   2021-02-28 23:55:43+00:00
164529   2021-02-28 23:55:43+00:00
Name: created_at, Length: 164530, dtype: datetime64[ns, UTC]

In [14]:
# Materialise the model input as a list of row tuples (index excluded);
# the field order follows the column order of model_input_df.
data = [record for record in model_input_df.itertuples(index=False)]

In [15]:
# Positional accessors: each one picks a field out of a record of `data`
# (same order as the columns of model_input_df).
fib_keys = dict(
    content_key=lambda rec: rec[0],
    author_key=lambda rec: rec[1],
    root_content_key=lambda rec: rec[2],
    timestamp_key=lambda rec: rec[3],
    flag_key=lambda rec: rec[4],
)

# Build the model, then train it on the observation data
fib_model = HModel(**fib_keys)
fib_model.fit(data)

In [16]:
# Inspect the fitted model's `author2coauthors` mapping
# (author id -> set of author ids, as the output shows)
fib_model.author2coauthors

{'54226483': {'125131319',
  '131847468',
  '1346120746733629440',
  '2427311741',
  '2870399433',
  '778946778',
  '829254408760389633',
  '999645834'},
 '245969509': {'1006057344042766336',
  '1013117294',
  '1037692622',
  '1073592788040777730',
  '1088465527721345029',
  '1092667291383332867',
  '1098259454024433664',
  '1106834588377198592',
  '1113538197793980419',
  '1140668272867708929',
  '1150751883566493696',
  '1163185103784337413',
  '1180638921946025985',
  '1200387273453457408',
  '1222215704042254336',
  '1231559957419560961',
  '1243299289062932482',
  '1248521058778583041',
  '1249068072360697862',
  '1250443323728429057',
  '1258828655591075845',
  '1273298206357585923',
  '1277322830',
  '1299087773601861632',
  '1319516004662247429',
  '1329544378243747840',
  '1339525654820339714',
  '1342057381363068929',
  '1344031413742538753',
  '1349068942132830212',
  '1350529232754122753',
  '137643354',
  '1529919752',
  '1560154380',
  '2149665647',
  '2195968707',
  '222

In [17]:
# Get the ranked values from the fitted FIB model
# (a mapping; it is consumed through `.items()` when the frame is built)
fib_rank = fib_model.get_rank()

In [18]:
# Turn the rank mapping into a frame indexed by author, ordered from the
# highest to the lowest FIB index, with the scores stored as integers.
fib_rank_df = (
    pd.DataFrame(list(fib_rank.items()), columns=['author_id', 'FIB-i'])
    .set_index('author_id')
    .sort_values(by='FIB-i', ascending=False)
    .astype(int)
)

In [19]:
# List every author's feature dict, ordered by descending 'FIB-index'
sorted(fib_model.author2features.items(), key=lambda x: x[1]['FIB-index'], reverse=True)

[('1683455144',
  {'FIB-index': 34,
   'anti-FIB-index': 0,
   'flagged-influence': 3589,
   'non-flagged-influence': 0,
   'flagged-count': 309,
   'non-flagged-count': 2,
   'self-resharing': 38,
   'fall-index': 67150,
   'co-authors': 105}),
 ('910827588',
  {'FIB-index': 14,
   'anti-FIB-index': 0,
   'flagged-influence': 1408,
   'non-flagged-influence': 0,
   'flagged-count': 25,
   'non-flagged-count': 0,
   'self-resharing': 0,
   'fall-index': 19474,
   'co-authors': 13}),
 ('1032615842',
  {'FIB-index': 12,
   'anti-FIB-index': 11,
   'flagged-influence': 498,
   'non-flagged-influence': 692,
   'flagged-count': 34,
   'non-flagged-count': 26,
   'self-resharing': 22,
   'fall-index': 5040,
   'co-authors': 29}),
 ('4758512368',
  {'FIB-index': 12,
   'anti-FIB-index': 0,
   'flagged-influence': 430,
   'non-flagged-influence': 0,
   'flagged-count': 59,
   'non-flagged-count': 0,
   'self-resharing': 0,
   'fall-index': 2484,
   'co-authors': 46}),
 ('245969509',
  {'FIB-in

In [20]:
# Show preview
fib_rank_df

Unnamed: 0_level_0,FIB-i
author_id,Unnamed: 1_level_1
1683455144,34
910827588,14
1032615842,12
4758512368,12
245969509,11
...,...
1049949019429134336,0
1088677627,0
2343234367,0
9437162,0


### Resizing Known Users

In [21]:
# The FIB model generates a variable number of nodes depending on the settings,
# while every ranking must have the same length to be plotted correctly.
# From here on `known_users` is therefore rebound to the authors ranked by the
# FIB model (this replaces the value computed from the raw observation data).
known_users = fib_rank_df.index.values # Overwrite

In [22]:
# Rebuild the known-users frame from the current `known_users` array; the
# rankings computed later are right-merged against it on `author_id`.
known_users_df = pd.DataFrame({'author_id': known_users})

### TFIB (Time-Aware FIB) index

In [23]:
# Positional accessors: each one picks a field out of a record of `data`
# (same order as the columns of model_input_df).
tfib_keys = dict(
    content_key=lambda rec: rec[0],
    author_key=lambda rec: rec[1],
    root_content_key=lambda rec: rec[2],
    timestamp_key=lambda rec: rec[3],
    flag_key=lambda rec: rec[4],
)

# Build a second model and train it with the time-aware fit
tfib_model = HModel(**tfib_keys)
tfib_model.time_fit(data)

In [24]:
# Get the ranked values from the time-aware model
# (a mapping; it is consumed through `.items()` when the frame is built)
tfib_rank = tfib_model.get_rank()

In [25]:
# Format and sort the rank: frame indexed by author, ordered from the highest
# to the lowest score, like the FIB and Compound rankings.
# The sort is explicit so the order does not depend on how get_rank() happens
# to order its mapping; `kind='stable'` keeps the incoming order among ties,
# so a mapping that is already sorted is left exactly as it is.
tfib_rank_df = (
    pd.DataFrame(list(tfib_rank.items()), columns=['author_id', 'TeFIB-i'])
    .set_index('author_id')
    .sort_values(by='TeFIB-i', ascending=False, kind='stable')
)

In [26]:
# List every author's feature dict from the time-aware model,
# ordered by descending 'FIB-index'
sorted(tfib_model.author2features.items(), key=lambda x: x[1]['FIB-index'], reverse=True)

[('1683455144',
  {'FIB-index': 6.9023933150442645,
   'anti-FIB-index': 0.0,
   'flagged-influence': 221.6874748303867,
   'non-flagged-influence': 0.0,
   'flagged-count': 209.59013882163754,
   'non-flagged-count': 1.4979249606694793,
   'self-resharing': 2.3024748247951266,
   'fall-index': 1517.8102063550293,
   'co-authors': 72.06856378419252}),
 ('4758512368',
  {'FIB-index': 2.5407469788383423,
   'anti-FIB-index': 0.0,
   'flagged-influence': 24.098901446678838,
   'non-flagged-influence': 0.0,
   'flagged-count': 37.528613333576004,
   'non-flagged-count': 0.0,
   'self-resharing': 0.0,
   'fall-index': 58.195057421479305,
   'co-authors': 30.38782909450174}),
 ('1248216384577953792',
  {'FIB-index': 2.5208910304063465,
   'anti-FIB-index': 0.40836969265865264,
   'flagged-influence': 41.60862291532999,
   'non-flagged-influence': 2.3925295571214065,
   'flagged-count': 336.1063191717726,
   'non-flagged-count': 90.44879024748775,
   'self-resharing': 0.0,
   'fall-index': 11

In [27]:
# Show preview
tfib_rank_df

Unnamed: 0_level_0,TeFIB-i
author_id,Unnamed: 1_level_1
1683455144,6.902393
4758512368,2.540747
1248216384577953792,2.520891
245969509,2.148832
500882938,1.830908
...,...
48053946,0.000000
2238886936,0.000000
6495812,0.000000
492087627,0.000000


In [28]:
#list(tfib_model.author2contents.values())[:10]

In [29]:
tfib_rank_df.index.values

array(['1683455144', '4758512368', '1248216384577953792', ..., '6495812',
       '492087627', '1353290654084980736'], dtype=object)

In [30]:
fib_rank_df.index.values

array(['1683455144', '910827588', '1032615842', ..., '2343234367',
       '9437162', '1353290654084980736'], dtype=object)

In [31]:
# Sanity check: author ids that appear in only one of the two rankings
# (symmetric difference). An empty array means both rankings cover the same authors.
set_xor = np.setxor1d(tfib_rank_df.index.values, fib_rank_df.index.values)

In [32]:
set_xor

array([], dtype=object)

In [33]:
len(set_xor)

0

In [34]:
# For every author found in only one ranking, print that author's input rows
for author_id in set_xor:
    print(model_input_df[model_input_df['user_id'] == author_id])

## Compound rank

In [35]:
# Weights of the compound score, optimized via hill-climbing.
# NOTE(review): ten weights are supplied while the feature dicts displayed
# for the models list nine features - confirm the expected length and order
# against HModel.set_weights.
compound_weights = [
    4.22553889550027,
    -4.418534776720375,
    -7.087107655563367,
    3.5781761997485666,
    -0.3023988995924071,
    1.1364196973328102,
    -4.910813702015649,
    6.332629858057944,
    -1.477019018312982,
    3.5980313856386137,
]
tfib_model.set_weights(compound_weights)

#tfib_model.set_weights([0,0,0,0,0,0,0,0,1,0])

# Rank with the weights above, asking the model not to normalize
comp_rank = tfib_model.get_rank(normalize=False)

In [36]:
# Turn the compound rank mapping into a frame indexed by author, ordered from
# the highest to the lowest compound score.
comp_rank_df = (
    pd.DataFrame(list(comp_rank.items()), columns=['author_id', 'Compound'])
    .set_index('author_id')
    .sort_values(by='Compound', ascending=False)
)

In [37]:
comp_rank_df

Unnamed: 0_level_0,Compound
author_id,Unnamed: 1_level_1
1683455144,7893.940249
317742806,6595.386236
317671787,5395.293218
317685710,1227.203603
1262358617803587585,1053.878240
...,...
275754256,-160.576584
1029307293928771584,-160.793781
1108513736250310657,-184.503816
790817979344621568,-333.995804


## Popularity rank

In [38]:
# Number of followers per user_id.
# NOTE: the metadata contains repeated users. Duplicates are removed on
# `user_id` alone (first occurrence kept): dropping only fully identical rows
# would leave several rows for a user whose follower count differs between
# copies, and the lookup below would then return more than one row for that
# user, making this ranking longer than the others.
nbr_followers_df = (
    user_data_df[['user_id', 'nbr_followers']]
    .drop_duplicates(subset='user_id')
    .set_index('user_id')
)

# Look up every known user by id. This relies on the known users being a
# subset of the users in the metadata set (an unknown id raises KeyError).
popularity_rank_df = (
    nbr_followers_df.loc[known_users]
    # Rename column and index
    .rename(columns={'nbr_followers': 'Popularity'})
    .rename_axis('author_id')
    # A missing follower count is ranked as 0 (and keeps the integer cast valid)
    .fillna(0)
    # Final sorting, then cast the counts to integer
    .sort_values(by='Popularity', ascending=False)
    .astype({'Popularity': int})
)

In [39]:
popularity_rank_df

Unnamed: 0_level_0,Popularity
author_id,Unnamed: 1_level_1
24744541,9407677
5893702,3303533
18935802,3191161
395218906,2422418
19067940,2414899
...,...
1259879509974794242,0
1343971007594426369,0
1319739235252011009,0
1343983392937959427,0


## Influence rank

In [40]:
# Influence = number of low-credibility rows in which an account is the
# retweeted user (non-null `tweet_id` values counted per `retweeted_user_id`).
influence_rank_df = (
    misinf_observation_df
    .groupby('retweeted_user_id')['tweet_id'].count()
    .to_frame('Influence')
    .rename_axis('author_id')
    # The right merge keeps the known users that were never retweeted (count 0)
    .merge(known_users_df, how='right', on='author_id')
    .fillna(0)
    # Highest influence first, indexed by author, counts as integers
    .sort_values(by='Influence', ascending=False)
    .set_index('author_id')
    .astype(int)
)

In [41]:
influence_rank_df

Unnamed: 0_level_0,Influence
author_id,Unnamed: 1_level_1
1683455144,3633
910827588,1408
1248216384577953792,1012
1032615842,513
1322629376421355521,513
...,...
1229048607459958787,0
999261989645049856,0
1344646726293053440,0
1309835255399034882,0


## Retweeter rank

In [42]:
# Retweets = number of low-credibility retweets made by each account.
# Only rows that are retweets are kept, i.e. rows with a `retweeted_status_id`.
# The filter is restricted to that column on purpose: a bare dropna() would
# also discard retweets that merely lack a value in an unrelated column
# (text, domains, ...), undercounting those accounts.
retweeter_rank_df = (
    misinf_observation_df
    .dropna(subset=['retweeted_status_id'])
    .groupby('user_id')['tweet_id'].count()
    .to_frame('Retweets')
    .rename_axis('author_id')
    # The right merge keeps the known users that never retweeted anyone (count 0)
    .merge(known_users_df, how='right', on='author_id')
    .fillna(0)
    # Most active retweeters first, indexed by author, counts as integers
    .sort_values(by='Retweets', ascending=False)
    .set_index('author_id')
    .astype({'Retweets': int})
)

In [43]:
retweeter_rank_df

Unnamed: 0_level_0,Retweets
author_id,Unnamed: 1_level_1
555300363,122
1341063070983008261,112
424084271,105
1158758131448590336,73
97948686,72
...,...
461278706,0
708587838,0
505705375,0
313813519,0


## Ground truth

In [44]:
# Ground truth: total edge weight per `source` node of the evaluation network,
# aligned on the known users (users absent from the network get 0).
optimal_rank_df = (
    evaluation_net_df[['source', 'weight']]
    .groupby('source').sum()
    .rename(columns={'weight': 'Optimal'})
    .rename_axis('author_id')
    .merge(known_users_df, how='right', on='author_id')
    .fillna(0)
    # Cast only the score column: a frame-wide astype(int) would also turn the
    # `author_id` strings into integers, which then had to be converted back.
    .astype({'Optimal': int})
    .sort_values(by='Optimal', ascending=False)
    .set_index('author_id')
)

In [45]:
optimal_rank_df

Unnamed: 0_level_0,Optimal
author_id,Unnamed: 1_level_1
1683455144,25704
4758512368,13638
1248216384577953792,10135
245969509,6918
1032615842,3633
...,...
256897312,0
756728713710927873,0
3037822119,0
981903244640313345,0


## Saving the ranks

In [46]:
# Write every ranking to its CSV file (the `author_id` index is written too)
rank_outputs = [
    ('data/popularity_rank.csv', popularity_rank_df),
    ('data/influence_rank.csv', influence_rank_df),
    ('data/fib_rank.csv', fib_rank_df),
    ('data/tfib_rank.csv', tfib_rank_df),
    ('data/retweets_rank.csv', retweeter_rank_df),
    ('data/compound_rank.csv', comp_rank_df),
    ('data/optimal_rank.csv', optimal_rank_df),
]
for csv_path, rank_df in rank_outputs:
    rank_df.to_csv(csv_path)

In [47]:
# Reading test: load a saved ranking back.
# `author_id` is read as a string, as everywhere else in this notebook, so the
# ids are not turned into integers by type inference.
pd.read_csv('data/compound_rank.csv', dtype={'author_id': str}).set_index('author_id')

Unnamed: 0_level_0,Compound
author_id,Unnamed: 1_level_1
1683455144,7893.940249
317742806,6595.386236
317671787,5395.293218
317685710,1227.203603
1262358617803587585,1053.878240
...,...
275754256,-160.576584
1029307293928771584,-160.793781
1108513736250310657,-184.503816
790817979344621568,-333.995804
