#### Network construction

In [None]:
import numpy as np
import igraph as ig
import seaborn as sns
import pandas as pd
import matplotlib.pylab as plt
import scipy.stats as st
import pickle

sns.set()

In [None]:
# Load the raw tweet table from CSV into `data`.
data = pd.read_csv('data/csvs/table_retweets.csv', low_memory = False)

In [None]:
# Keep only retweets: rows whose 'retweeted_status.user.id' is not NaN.
# .copy() detaches the result from `data`, so the columns added later are
# written to an independent frame.
data_retweet = data[data['retweeted_status.user.id'].notnull()].copy()

In [None]:
# Add helper columns: a date key, a unit weight and a day index.
# Characters 5..9 of 'created_at' are used as the date key
# (presumably 'MM-DD' of an ISO-like timestamp — TODO confirm the format).
data_retweet['date'] = data_retweet['created_at'].apply(lambda x: x[5:10])
# Every row counts as one retweet.
data_retweet['weight'] = 1
# Integer number of the date group each row belongs to (groupby().ngroup()).
data_retweet['day_count'] = data_retweet.groupby('date').ngroup().values

#####  retweets distribution

In [None]:
# Per-date counts: with as_index=False 'date' stays a column and every other
# column holds its number of non-null entries for that date.
count_per_day = data_retweet.groupby('date',as_index=False).count()

plt.xlabel('date')
plt.ylabel('number of tweets')
plt.title('Tweets per Day')
# One tick per date, labelled with the date string.
plt.xticks(count_per_day.index,count_per_day.date, rotation= 'vertical')
# Bar height = non-null 'id_str' count of that date.
plt.bar(count_per_day.index,count_per_day.id_str)
plt.savefig('measures/pics/retweet_per_day.png')
plt.show()

In [None]:
# function to generate the graph of a selected period
def build_graph(data_retweet, fini_day, init_day=0):
    """Build the directed, weighted retweet network of a day window.

    Parameters
    ----------
    data_retweet : pandas.DataFrame
        Table with the columns 'day_count', 'weight',
        'retweeted_status.user.id' and 'user.id'.
    fini_day : int
        Exclusive upper bound on 'day_count'.
    init_day : int, optional
        Inclusive lower bound on 'day_count' (default 0).

    Returns
    -------
    igraph.Graph
        Directed graph whose edges go from 'retweeted_status.user.id'
        (source) to 'user.id' (target). The edge attribute 'weight' is the
        number of rows of the window with that pair; pairs that occur only
        once are dropped.
    """
    in_window = (data_retweet.day_count >= init_day) & (data_retweet.day_count < fini_day)
    sub_set = data_retweet[in_window]
    # count() turns every remaining column (including 'weight') into the
    # number of non-null rows of each (retweeted user, user) pair.
    db_for_network = sub_set.groupby(['retweeted_status.user.id', 'user.id'], as_index=False).count()
    # keep only pairs with more than one retweet
    db_for_network = db_for_network[db_for_network.weight > 1]
    edges = db_for_network[['retweeted_status.user.id', 'user.id', 'weight']]
    return ig.Graph.TupleList(edges.itertuples(index=False), directed=True, weights=True)


# The analysis cells of this notebook call the builder as
# ``build_graph_by_day``; bind that name as well so those cells run.
build_graph_by_day = build_graph

####  Centrality measures analysis

In [None]:
# function to compute centrality measures
def centrality_measures(graph, df, weights = None):
    """Fill `df` with per-vertex measures of `graph` and return it.

    graph   : igraph.Graph object
    df      : pandas.DataFrame object, modified in place
    weights : forwarded to the weighted measures (betweenness, pagerank,
              local transitivity, knn)

    Columns written, in this order: 'name', 'betweenness', 'pagerank',
    'degree', 'outdegree', 'local_transitivity' and, only when
    graph.is_simple() is true, 'knn' (first element of graph.knn()).
    """
    # (column, lazy computation) pairs: each measure is evaluated right
    # before it is stored, in the listed order.
    measures = (
        ('name', lambda: graph.vs['name']),
        ('betweenness', lambda: graph.betweenness(weights=weights)),
        ('pagerank', lambda: graph.pagerank(weights=weights)),
        ('degree', lambda: graph.degree()),
        ('outdegree', lambda: graph.outdegree()),
        ('local_transitivity',
         lambda: graph.transitivity_local_undirected(mode = 'zero', weights=weights)),
    )
    for column, compute in measures:
        df[column] = compute()

    if not graph.is_simple():
        return df
    df['knn'] = graph.knn(weights=weights)[0]
    return df

#####  operations for merging the csv dataset with a new pickle file containing metadata about the tweets

In [None]:
# Load the raw tweet metadata from a pickle file.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('data/pickles/df_metadata.pickle','rb') as f:
    df_meta=pickle.load(f)

In [None]:
# Clean the metadata: drop rows without 'retweeted_status.id' and reduce
# 'text' to the retweeted username.
# The frame is prepared *before* the output file is opened, so a failure while
# parsing the text cannot leave a truncated pickle behind; .copy() makes the
# column assignments below act on an independent frame.
df_meta = df_meta[df_meta['retweeted_status.id'].notnull()].copy()
# Second space-separated token of the text
# (presumably the '@user:' part of 'RT @user: ...' — confirm the text layout).
df_meta['text'] = df_meta['text'].astype(str).apply(lambda x: x.split(' ')[1])
# Keep what lies between '@' and the following ':'.
df_meta['text'] = df_meta['text'].apply(lambda x: x.split('@')[1].split(':')[0])
df_meta = df_meta.rename(columns = {'text':'retweeted_username'})
with open('data/pickles/short_df.pickle','wb') as file:
    pickle.dump(df_meta[['id_str','retweeted_username','retweeted_status.id','user.screen_name','user.followers_count']],file)

In [None]:
# Merge the retweet table with the cleaned metadata on 'id_str'
# (DataFrame.merge default: inner join).
# NOTE(review): the metadata columns are cast to str here; confirm that
# data_retweet['id_str'] is str as well, otherwise the join keys differ in type.
new_df = data_retweet.merge(df_meta[['id_str','retweeted_username','user.screen_name']].astype(str), on = 'id_str')

In [None]:
# Save the merged table.
# It is written to 'data/pickles/' because that is the path the reload cell
# of this notebook reads 'complete_df.pickle' from.
with open('data/pickles/complete_df.pickle','wb') as f:
    pickle.dump(new_df,f)

#####  now I shut down the kernel to free RAM, then reload the final dataset

In [None]:
# Reload the merged table from its pickle file into `df`.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('data/pickles/complete_df.pickle','rb') as f:
    df = pickle.load(f)

##### 3 days 

In [None]:
# Graph for the window init_day=1, fini_day=4, then its centrality table
# computed with the 'weight' edge attribute as weights.
g3 = build_graph_by_day(df, 4, init_day=1)
df3 = centrality_measures(g3, pd.DataFrame(), weights = 'weight')

#####  7 days

In [None]:
# Graph for the window init_day=1, fini_day=8, then its centrality table
# computed with the 'weight' edge attribute as weights.
g7= build_graph_by_day(df, 8, init_day=1)
df7 = centrality_measures(g7, pd.DataFrame(), weights = 'weight')

#####  15 days

In [None]:
# Graph for the window init_day=1, fini_day=16, then its centrality table
# computed with the 'weight' edge attribute as weights.
g15= build_graph_by_day(df, 16, init_day=1)
df15 = centrality_measures(g15, pd.DataFrame(), weights = 'weight')

#####  30 days

In [None]:
# Graph for the window init_day=1, fini_day=31, then its centrality table
# computed with the 'weight' edge attribute as weights.
g30= build_graph_by_day(df, 31, init_day=1)
df30 = centrality_measures(g30, pd.DataFrame(), weights = 'weight')

In [None]:
# Plotting results: for every window, regress betweenness and knn on
# outdegree over the 1000 rows with the highest outdegree.
name = ['df3','df7','df15','df30']
for label, dataframe in zip(name, [df3,df7,df15,df30]):
    # sort once and reuse the same top-1000 slice for both panels
    top_rows = dataframe.sort_values('outdegree',ascending=False)[:1000]
    fig, axs = plt.subplots(2, 1, sharex=True)
    sns.regplot(data = top_rows, x = 'outdegree', y = 'betweenness', ax = axs[0])
    # 'knn' only exists when centrality_measures found the graph to be simple
    sns.regplot(data = top_rows, x = 'outdegree', y = 'knn', ax = axs[1])
    fig.savefig('measures/pics/measurestrends%s.png'%label)

In [None]:
#pagerank vs outdegree
# NOTE(review): `prova` is not defined in any visible cell — confirm which
# centrality table (with 'outdegree' and 'pagerank' columns) is meant.
# Fixed y-range and scientific notation on the y axis.
plt.ylim((-0.001,0.003))
plt.ticklabel_format(axis='y', style='sci', scilimits=(0,0))
sns.scatterplot(data = prova, x = 'outdegree', y = 'pagerank')
plt.savefig('measures/pics/pagerank_vs_degree.png')

In [None]:
#clustering coefficient vs outdegree
# NOTE(review): `prova` is not defined in any visible cell — confirm the table.
# NOTE(review): this figure is saved to the working directory, whereas the
# other figures of the notebook go to 'measures/pics/' — confirm the path.
sns.scatterplot(data = prova, x = 'outdegree', y = 'local_transitivity')
plt.savefig('cc_vs_degree.png')

#####  outdegree distribution

In [None]:
# Graph built with fini_day=31 (default init_day) and its out-degree sequence.
g_tot = build_graph_by_day(df, 31)
outk = np.array(g_tot.outdegree())
# Boolean mask instead of index lookup: drop the zero out-degrees so the
# values can be shown on a log scale.
outk = outk[outk > 0]

In [None]:
# Histogram of the positive out-degrees with both axes on a log scale.
sns.histplot(data = outk, log_scale=(True,True))
plt.title('Distribution of Out-degree')
plt.xlabel('k out')
plt.savefig('measures/pics/out_distr.png')

#####  fitting the distribution

In [None]:
from scipy import optimize

In [None]:
def powerlaw(x, m, c):
    """Evaluate the straight line with slope `m` and intercept `c` at `x`.

    The notebook fits this line to log10-transformed histogram data, so `m`
    plays the role of the power-law exponent.
    """
    scaled = m * x
    return scaled + c

In [None]:
# Histogram of the out-degrees over 50 equal-width bins, computed once and
# reused for both axes.
bin_counts, bin_edges = np.histogram(outk, bins=50)
# x: log10 of the right edge of every bin.
xdata = np.log10(bin_edges[1:])
# y: log10 of the bin counts; empty bins are replaced by 1 so that log10
# yields 0 instead of -inf.
ydata = np.log10(np.where(bin_counts != 0, bin_counts, 1))

In [None]:
# Fit powerlaw(x, m, c) to the log-log histogram; p0 is the initial guess for (m, c).
popt, pcov = optimize.curve_fit(powerlaw,xdata,ydata, p0 = [-3, 30])

In [None]:
# Log-log histogram points together with the fitted line.
plt.scatter(xdata,ydata, alpha=0.8, label = 'k distribution')
# popt[0] is the fitted slope m, popt[1] the fitted intercept c.
plt.plot(xdata, popt[0]*xdata + popt[1], c='r', label = 'fitting curve')
plt.xlabel('log(k)')
plt.ylabel('log(counts)')
plt.suptitle('K ditribution fitting')
# The fitted slope is reported as alpha in the title.
plt.title('\u03B1 = %.2f'%popt[0])
plt.legend()
plt.savefig('measures/pics/fitting_distr.png')

###  Leader detection

In [None]:
# Names of the 100 highest-outdegree vertices of each window
# (ascending sort, last 100 entries).
# The centrality tables computed in this notebook are df30 / df15 / df7 / df3;
# they are used here for the matching window labels.
# NOTE(review): confirm this mapping if other `measures_*` tables were intended.
top_100_k = pd.DataFrame({'30_days' : df30.sort_values('outdegree')['name'].values[-100:],
                            '15_days' : df15.sort_values('outdegree')['name'].values[-100:],
                            '7_days' : df7.sort_values('outdegree')['name'].values[-100:],
                            '3_days' : df3.sort_values('outdegree')['name'].values[-100:]})

In [None]:
# Count in how many cells of the top-100 table each name occurs.
from collections import Counter
# ravel() flattens the table row by row, the same order as concatenating its rows.
counter_k = Counter(top_100_k.values.ravel())

In [None]:
#results
# Histogram of the appearance counts: four unit-wide bins centred on 1, 2, 3 and 4.
plt.hist(list(counter_k.values()),  bins= np.linspace(0.5,4.5,5))
plt.xticks([1,2,3,4])
plt.suptitle('Number of appearances in different sets', fontsize = 18, y =1.01)
plt.title('top 100 leaders in outdegree')
plt.xlabel('num of appearances')
plt.savefig('measures/pics/outdegree_counter.png')

In [None]:
# function to build a network from an arbitrary selection of days
def build_graph_3rand(data_retweet,days_choice=None):
    """Build the directed, weighted retweet network of the selected days.

    Parameters
    ----------
    data_retweet : pandas.DataFrame
        Table with the columns 'day_count', 'weight',
        'retweeted_status.user.id' and 'user.id'.
    days_choice : array-like of int, optional
        'day_count' values to keep. With the default ``None`` every row is
        used.

    Returns
    -------
    igraph.Graph
        Directed graph whose edges go from 'retweeted_status.user.id'
        (source) to 'user.id' (target), with the edge attribute 'weight'
        equal to the number of rows of the selection with that pair; pairs
        that occur only once are dropped.
    """
    # Identity test against None: the previous `days_choice.any() != None`
    # raised AttributeError for the default and was always true otherwise.
    if days_choice is not None:
        sub_set = data_retweet[data_retweet.day_count.isin(days_choice)]
    else:
        sub_set = data_retweet
    # count() turns 'weight' into the number of rows of each pair
    db_for_network = sub_set.groupby(['retweeted_status.user.id','user.id'], as_index=False).count()
    db_for_network = db_for_network[db_for_network.weight > 1]
    edges = db_for_network[['retweeted_status.user.id','user.id','weight']]
    return ig.Graph.TupleList(edges.itertuples(index=False), directed=True, weights=True)

In [None]:
days_count = df['day_count'].unique() # array with the distinct day numbers

In [None]:
# Build 20 networks, each from 3 distinct days drawn at random from days_count.
random_graphs = [
    build_graph_3rand(df, np.random.choice(days_count, 3, replace=False))
    for _ in range(20)
]

In [None]:
# One table per random network, holding vertex name and outdegree.
graphs_df = [
    pd.DataFrame({'name' : graph.vs['name'], 'outdegree' : graph.outdegree()})
    for graph in random_graphs
]

In [None]:
# Count how often each entry occurs across the arrays in `graphs_leader`.
# NOTE(review): `graphs_leader` is not defined in any visible cell —
# presumably the leader names extracted from each table in graphs_df; confirm.
counter = Counter(np.concatenate(graphs_leader))


# Histogram of those counts: twenty unit-wide bins centred on 1..20.
plt.hist(list(counter.values()), bins= np.linspace(0.5,20.5,21))
plt.xticks(range(1,21))
plt.suptitle('number of random network in which the user appears as leaders', fontsize = 12)
plt.savefig('measures/pics/number_leaders_rand_graph.png')
plt.show()#results


###  Network evolution

In [None]:
# first half of the month: growing windows with init_day=1 and fini_day = 2..15
graphs_first_15 = [build_graph_by_day(df, last_day, init_day=1) for last_day in range(2,16)]

In [None]:
giant_connected_component_dimension = []  # absolute size of the largest component
relative_gcc_dimension = []               # its share of all vertices
for g in graphs_first_15:
    # WEAK components (STRONG would require every pair of vertices to be
    # reachable from each other)
    cl = g.components(mode='WEAK')
    # build the giant-component subgraph only once per graph
    gcc_size = cl.giant().vcount()
    giant_connected_component_dimension.append(gcc_size)
    relative_gcc_dimension.append(gcc_size/g.vcount())

In [None]:
#second half: growing windows with init_day=16 and fini_day = 17..30
graphs_last_15  = []
for day in range(17,31):
    graphs_last_15.append(build_graph_by_day(df, day, init_day=16))

giant_connected_component_dimension_bis = []  # absolute size of the largest component
relative_gcc_dimension_bis = []               # its share of all vertices
for g in graphs_last_15:
    cl = g.components(mode='WEAK')
    # build the giant-component subgraph only once per graph
    gcc_size = cl.giant().vcount()
    giant_connected_component_dimension_bis.append(gcc_size)
    relative_gcc_dimension_bis.append(gcc_size/g.vcount())

In [None]:
#results: relative size of the largest weak component for both periods
fig,ax2 = plt.subplots(1,1,figsize = (8,6))
ax2.plot(relative_gcc_dimension,label='first period')
ax2.plot(relative_gcc_dimension_bis, label='last period')
ax2.set_title('relative dimension of the biggest connecetd component')
ax2.set_ylabel('n_cgg/n_tot')
ax2.set_xlabel('days of the network')
ax2.set_xticks(range(14))
# 14 tick positions need exactly 14 labels (2..15); one label too many makes
# Matplotlib raise a ValueError.
ax2.set_xticklabels(range(2,16))
ax2.legend()
fig.savefig('measures/pics/size_change_gcc(days).png')

###  Cluster Analysis

In [None]:
# Make the total graph undirected; edge attributes of merged edges are summed.
g_tot.to_undirected(combine_edges='sum')  #making total graph as undirected

# Community detection with community_multilevel, using 'weight' as edge weights;
# return_levels=False asks for a single clustering rather than one per level.
cl_tot = g_tot.community_multilevel(weights='weight', return_levels=False)  

In [None]:
clusters = cl_tot.subgraphs()   # list with the subgraph of every community
clusters_size = [cl.vcount() for cl in clusters]  # number of vertices of every community
# one row per community: its subgraph and its size
components_df = pd.DataFrame({'cluster' : clusters, 'size' : clusters_size})

In [None]:
# Series of community subgraphs, largest community first.
sorted_component_df = components_df.sort_values('size',ascending=False)['cluster'] 
# sorted by community size

In [None]:
#same algorithm, now done for each week
# four consecutive windows: init_day = 1, 8, 15, 22 and fini_day = 8, 15, 22, 29
four_graphs = [build_graph_by_day(df, init_day=1+7*i, fini_day= 7*(i+1)+1) for i in range(4)]

In [None]:
as_cluster = [] # list with one clustering object per graph
for g in four_graphs:
    # make the graph undirected first; attributes of merged edges are summed
    g.to_undirected(combine_edges='sum')
    as_cluster.append(g.community_multilevel(weights='weight', return_levels=False))

In [None]:
graphs_components= [] # for each week: table with every cluster subgraph and its size
for cl in as_cluster:
    clusters = cl.subgraphs()
    clusters_lenght = [c.vcount() for c in clusters]  # vertices per cluster
    graphs_components.append(pd.DataFrame({'cluster' : clusters, 'size' : clusters_lenght}))

In [None]:
# sort each of the four weekly tables by cluster size, largest first
df_sorted = [weekly.sort_values('size', ascending = False) for weekly in graphs_components]

In [None]:
# ids of the users of `leader_list` that belong to the "notiziari" (news) community.
# NOTE(review): `leader_list` is not defined in any visible cell — confirm its source.
# values[0] is the largest community of the total network; its vertex names
# are fetched once instead of once per leader.
notiziari_names = sorted_component_df.values[0].vs['name']
notiziari_id = [leader for leader in leader_list if np.isin(leader, notiziari_names)]

In [None]:
notiziari_size = [] # size of the "notiziari" cluster in each of the 4 periods
for period_df in df_sorted:
    # Exactly one value per period keeps the series aligned with the period
    # axis of the plot: first match among the largest clusters, NaN if none.
    matched_size = np.nan
    # look among the 5 largest communities (fewer if the period has fewer)
    for k in range(min(5, len(period_df))):
        shared = np.intersect1d(notiziari_id, period_df.iloc[k,0].vs['name'], assume_unique=True)
        # more than 5 users of the list are taken as enough to recognise the community
        if len(shared) > 5:
            matched_size = period_df.iloc[k,1]
            break
    notiziari_size.append(matched_size)

In [None]:
#same for virologists
# NOTE(review): `leader_list` is not defined in any visible cell — confirm its source.
# values[1] is the second largest community of the total network; its vertex
# names are fetched once instead of once per leader.
virologi_names = sorted_component_df.values[1].vs['name']
virologi_id = [leader for leader in leader_list if np.isin(leader, virologi_names)]

virologi_size = []
for period_df in df_sorted:
    # Exactly one value per period keeps the series aligned with the period
    # axis of the plot: first match among the largest clusters, NaN if none.
    matched_size = np.nan
    for k in range(min(5, len(period_df))):
        shared = np.intersect1d(virologi_id, period_df.iloc[k,0].vs['name'], assume_unique=True)
        # == 2 because the search is for Burioni and Cartabellotta
        if len(shared) == 2:
            matched_size = period_df.iloc[k,1]
            break
    virologi_size.append(matched_size)

In [None]:
#politicians
# NOTE(review): `leader_list` is not defined in any visible cell — confirm its source.
# third largest community of the total network; its vertex names are fetched
# once instead of re-sorting the table for every leader.
destra_names = components_df.sort_values('size',ascending=False)['cluster'].values[2].vs['name']
destra_id = [leader for leader in leader_list if np.isin(leader, destra_names)]

destra_size = []
for period_df in df_sorted:
    # Exactly one value per period keeps the series aligned with the period
    # axis of the plot: first match among the largest clusters, NaN if none.
    matched_size = np.nan
    for k in range(min(5, len(period_df))):
        shared = np.intersect1d(destra_id, period_df.iloc[k,0].vs['name'], assume_unique=True)
        if len(shared) > 5:
            matched_size = period_df.iloc[k,1]
            break
    destra_size.append(matched_size)

In [None]:
#result
# One line per community: matched cluster sizes over the weekly periods.
plt.plot(notiziari_size, label = 'news companies')
plt.plot(virologi_size, label = 'virologists')
plt.plot(destra_size, label = 'right wing')
plt.ylabel('number of verteces')
# four ticks labelled '1° period' .. '4° period'
plt.xticks(range(4), ['%d° period'%i for i in range(1,5)])
plt.legend()
plt.savefig('measures/pics/community_for_each_week.png')
plt.show()

#####  the same is done but for a cumulative analysis

In [None]:
# cumulative windows: init_day is always 1, fini_day = 8, 15, 22, 29
four_graphs = [build_graph_by_day(df, init_day=1, fini_day=7*(1+i)+1) for i in range(4)]

as_cluster = [] # one clustering object per graph
for g in four_graphs:
    # make the graph undirected first; attributes of merged edges are summed
    g.to_undirected(combine_edges='sum')
    as_cluster.append(g.community_multilevel(weights='weight', return_levels=False))
    
graphs_components= [] # for each period: table with every cluster subgraph and its size
for cl in as_cluster:
    clusters = cl.subgraphs()
    clusters_lenght = [c.vcount() for c in clusters]
    graphs_components.append(pd.DataFrame({'cluster' : clusters, 'size' : clusters_lenght}))

# sort each period's table by cluster size, largest first
df_sorted = [graphs_components[i].sort_values('size', ascending = False) for i in range(4)] 

In [None]:
def _first_matching_size(period_df, leader_ids, is_match, top=5):
    """Size of the first of the `top` largest clusters that matches, else NaN.

    `period_df` holds the cluster subgraphs in column 0 and their sizes in
    column 1, largest first. A cluster matches when `is_match` accepts the
    number of `leader_ids` found among its vertex names. Returning exactly
    one value per period keeps every series aligned with the period axis of
    the plot, and min() copes with periods that have fewer than `top`
    clusters.
    """
    for k in range(min(top, len(period_df))):
        shared = np.intersect1d(leader_ids, period_df.iloc[k,0].vs['name'], assume_unique=True)
        if is_match(len(shared)):
            return period_df.iloc[k,1]
    return np.nan


# size of the "notiziari" cluster in the 4 periods; more than 5 users of the
# list are taken as enough to recognise the community
notiziari_size = [_first_matching_size(p, notiziari_id, lambda n: n > 5) for p in df_sorted]
# == 2 because the search is for Burioni and Cartabellotta
virologi_size = [_first_matching_size(p, virologi_id, lambda n: n == 2) for p in df_sorted]
destra_size = [_first_matching_size(p, destra_id, lambda n: n > 5) for p in df_sorted]

In [None]:
# One line per community: matched cluster sizes over the cumulative periods.
fig,ax = plt.subplots(1,1,figsize = (7,5))
ax.plot(notiziari_size, label = 'news companies')
ax.plot(virologi_size, label = 'virologists')
ax.plot(destra_size, label = 'right wing')
ax.set_ylabel('number of verteces')
# Ticks and labels are set in two calls: passing the labels to set_xticks is
# only accepted by newer Matplotlib releases.
ax.set_xticks(range(4))
ax.set_xticklabels(['%d° period'%i for i in range(1,5)])
ax.legend()
fig.savefig('measures/pics/community_change.png')
plt.show()