# Polarization on Twitter: cross-topic analysis

In this notebook, we study behavioral classes of users interacting with both debates (COVID-19 vaccine and Ukraine conflict). 
The sample's size is 170. Associated users ids are in the *id_users_bi_thematic.csv* file. 

In [39]:
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from tqdm import tnrange, tqdm_notebook
import os
from datetime import datetime, date, timedelta
from scipy.stats import entropy
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import davies_bouldin_score
import matplotlib.cm as cm
import itertools
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from pydlc import dense_lines
import scipy.stats as stats

In [3]:
users = pd.read_csv('../data/id_users_bi_thematic.csv', index_col=0)

for u in range(len(users)):
    users.loc[u, 'id'] = 'B'+str(u)
    
dict_id_vacc = dict(zip(users['vacc'], users['id']))
dict_id_ukr = dict(zip(users['ukr'], users['id']))
users = users.set_index('id')

In [5]:
users.head()

Unnamed: 0_level_0,vacc,ukr
id,Unnamed: 1_level_1,Unnamed: 2_level_1
B0,S522V,S26U
B1,S563V,S197U
B2,S749V,S122U
B3,S754V,S355U
B4,S996V,S73U


# Aggregate analysis

In [7]:
agg_vacc = pd.read_csv('../results/aggregate_analysis/vaccine_debate_final_data.csv', index_col=0)
agg_ukr = pd.read_csv('../results/aggregate_analysis/ukraine_conflict_debate_final_data.csv', index_col=0)

In [30]:
bi_thematic_vacc_agg = agg_vacc[agg_vacc.index.isin(users['vacc'].tolist())]
bi_thematic_ukr_agg = agg_ukr[agg_ukr.index.isin(users['ukr'].tolist())]

In [31]:
bi_thematic_vacc_agg.index = bi_thematic_vacc_agg.index.map(dict_id_vacc)
bi_thematic_ukr_agg.index = bi_thematic_ukr_agg.index.map(dict_id_ukr)

In [32]:
users_list = users.index.tolist()

In [33]:
bi_thematic_vacc_agg['cluster_name'] = None
bi_thematic_vacc_agg['community'] = None
for u in tqdm(users_list):
    if bi_thematic_vacc_agg.loc[u, 'cluster'] == 0:
        bi_thematic_vacc_agg.loc[u, 'cluster_name'] = 'pol_pro'
        bi_thematic_vacc_agg.loc[u, 'community'] = 'provax'
    elif bi_thematic_vacc_agg.loc[u, 'cluster'] == 1:
        bi_thematic_vacc_agg.loc[u, 'cluster_name'] = 'pol_anti'
        bi_thematic_vacc_agg.loc[u, 'community']= 'antivax'
    elif bi_thematic_vacc_agg.loc[u, 'cluster'] == 2:
        bi_thematic_vacc_agg.loc[u, 'cluster_name'] = 'inter_anti'
        bi_thematic_vacc_agg.loc[u, 'community'] =  'antivax'
    elif bi_thematic_vacc_agg.loc[u, 'cluster'] == 3:
        bi_thematic_vacc_agg.loc[u, 'cluster_name'] = 'inter_pro'
        bi_thematic_vacc_agg.loc[u, 'community']= 'provax'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bi_thematic_vacc_agg['cluster_name'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bi_thematic_vacc_agg['community'] = None
100%|██████████| 170/170 [00:00<00:00, 6882.41it/s]


In [34]:
bi_thematic_vacc_agg

Unnamed: 0,x_term,x_term_trans,y_term_provax,y_term_antivax,x_final,y_pro_final,y_anti_final,cluster,cluster_name,community
B40,0.963725,0.981862,0.180960,1.000000,0.880348,0.319748,1.000000,3,inter_pro,provax
B28,1.000000,1.000000,0.413996,0.000000,1.000000,0.456675,0.000000,0,pol_pro,provax
B108,1.000000,1.000000,0.196095,0.000000,1.000000,0.330607,0.000000,0,pol_pro,provax
B115,1.000000,1.000000,0.236364,0.000000,1.000000,0.357470,0.000000,0,pol_pro,provax
B52,1.000000,1.000000,0.202700,0.000000,1.000000,0.335201,0.000000,0,pol_pro,provax
...,...,...,...,...,...,...,...,...,...,...
B41,-0.976017,0.011992,1.000000,0.382314,0.099236,1.000000,0.440319,2,inter_anti,antivax
B16,-1.000000,0.000000,0.000000,0.275365,0.000000,0.000000,0.381359,1,pol_anti,antivax
B12,-1.000000,0.000000,0.000000,0.377250,0.000000,0.000000,0.437671,1,pol_anti,antivax
B36,-0.973949,0.013025,1.000000,0.837561,0.103042,1.000000,0.694257,2,inter_anti,antivax


In [35]:
bi_thematic_ukr_agg['cluster_name'] = None
bi_thematic_ukr_agg['community'] = None
for u in tqdm(users_list):
    if bi_thematic_ukr_agg.loc[u, 'cluster'] == 0:
        bi_thematic_ukr_agg.loc[u, 'cluster_name'] = 'pol_proU'
        bi_thematic_ukr_agg.loc[u, 'community'] = 'proUkraine'
    elif bi_thematic_ukr_agg.loc[u, 'cluster'] == 1:
        bi_thematic_ukr_agg.loc[u, 'cluster_name'] = 'pol_proR'
        bi_thematic_ukr_agg.loc[u, 'community']= 'proRussia'
    elif bi_thematic_ukr_agg.loc[u, 'cluster'] == 2:
        bi_thematic_ukr_agg.loc[u, 'cluster_name'] = 'inter_proR'
        bi_thematic_ukr_agg.loc[u, 'community'] =  'proRussia'
    elif bi_thematic_ukr_agg.loc[u, 'cluster'] == 3:
        bi_thematic_ukr_agg.loc[u, 'cluster_name'] = 'inter_proU'
        bi_thematic_ukr_agg.loc[u, 'community']= 'proUkraine'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bi_thematic_ukr_agg['cluster_name'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bi_thematic_ukr_agg['community'] = None
100%|██████████| 170/170 [00:00<00:00, 6115.19it/s]


In [36]:
for u in tqdm(users_list):
    users.loc[u,'cluster_vacc'] = bi_thematic_vacc_agg.loc[u, 'cluster_name']
    users.loc[u, 'community_vacc'] = bi_thematic_vacc_agg.loc[u, 'community']
    users.loc[u,'cluster_ukr'] = bi_thematic_ukr_agg.loc[u, 'cluster_name']
    users.loc[u, 'community_ukr'] = bi_thematic_ukr_agg.loc[u, 'community']  

100%|██████████| 170/170 [00:00<00:00, 4442.15it/s]


In [37]:
pd.DataFrame(users.groupby(['community_vacc','community_ukr'])['vacc'].count()).rename(columns={'vacc':'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,count
community_vacc,community_ukr,Unnamed: 2_level_1
antivax,proRussia,133
provax,proRussia,1
provax,proUkraine,36


In [38]:
pd.DataFrame(users.groupby(['cluster_vacc','cluster_ukr'])['vacc'].count()).rename(columns={'vacc':'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,count
cluster_vacc,cluster_ukr,Unnamed: 2_level_1
inter_anti,inter_proR,29
inter_anti,pol_proR,29
inter_pro,inter_proR,1
inter_pro,inter_proU,2
inter_pro,pol_proU,2
pol_anti,inter_proR,21
pol_anti,pol_proR,54
pol_pro,inter_proU,3
pol_pro,pol_proU,29


In [40]:
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x,y)
    chi2 = stats.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1))
    rcorr = r-((r-1)**2)/(n-1)
    kcorr = k-((k-1)**2)/(n-1)
    return np.sqrt(phi2corr/min((kcorr-1),(rcorr-1)))

In [42]:
print('Cramer\'s V - Community :', cramers_v(users['community_vacc'], users['community_ukr']))
print('Cramer\'s V - Cluster :', cramers_v(users['cluster_vacc'], users['cluster_ukr']))

Cramer's V - Community : 0.9650502240356451
Cramer's V - Cluster : 0.6063065650572218


# Temporal analysis

In [43]:
clusters_temp_vacc = pd.read_csv('../results/temporal_analysis/vaccine_clusters_labels.csv', index_col=0)
clusters_temp_ukr = pd.read_csv('../results/temporal_analysis/ukraine_conflict_clusters_labels.csv', index_col=0)

In [44]:
clusters_temp_vacc = clusters_temp_vacc[users['vacc'].tolist()]
clusters_temp_ukr = clusters_temp_ukr[users['ukr'].tolist()]
clusters_temp_vacc.columns = clusters_temp_vacc.columns.map(dict_id_vacc)
clusters_temp_ukr.columns = clusters_temp_ukr.columns.map(dict_id_ukr)

In [45]:
users_varr_vacc = clusters_temp_vacc[clusters_temp_vacc.nunique()[clusters_temp_vacc.nunique() >  1].index].columns.tolist()
users_varr_ukr = clusters_temp_ukr[clusters_temp_ukr.nunique()[clusters_temp_ukr.nunique() >  1].index].columns.tolist()

In [46]:
clusters_temp_vacc[users_varr_vacc]

Unnamed: 0,B1,B6,B7,B10,B15,B22,B31,B38,B41,B44,...,B142,B147,B148,B150,B151,B153,B155,B157,B162,B165
0,interAnti,interAnti,interAnti,interAnti,anti,interAnti,anti,interPro,anti,interAnti,...,interAnti,interAnti,interAnti,interAnti,interAnti,interAnti,interAnti,interAnti,anti,anti
1,interAnti,interAnti,interAnti,interAnti,interAnti,interAnti,interAnti,interPro,interAnti,interAnti,...,interAnti,interAnti,interAnti,interAnti,interAnti,interAnti,interAnti,interAnti,anti,anti
2,interAnti,interAnti,interAnti,interAnti,interAnti,interAnti,interAnti,interPro,interAnti,interAnti,...,interAnti,interAnti,interAnti,interAnti,interAnti,interAnti,interAnti,interAnti,interAnti,interAnti
3,anti,anti,anti,anti,anti,anti,anti,anti,anti,anti,...,anti,anti,anti,anti,anti,anti,anti,anti,anti,anti
4,anti,anti,anti,anti,anti,anti,anti,pro,anti,anti,...,anti,anti,anti,anti,anti,anti,anti,anti,anti,anti
5,anti,anti,anti,anti,anti,anti,anti,anti,anti,anti,...,anti,anti,anti,anti,anti,anti,anti,anti,anti,anti
6,anti,anti,anti,anti,anti,anti,anti,anti,anti,anti,...,anti,anti,anti,anti,anti,anti,anti,anti,anti,anti
7,anti,anti,anti,anti,anti,anti,anti,anti,anti,anti,...,anti,anti,anti,anti,anti,anti,anti,anti,anti,anti
8,anti,anti,anti,anti,anti,anti,anti,anti,anti,anti,...,anti,anti,anti,anti,anti,anti,anti,anti,anti,anti
9,anti,anti,anti,anti,anti,anti,anti,anti,anti,anti,...,anti,anti,anti,anti,anti,anti,anti,anti,anti,anti


In [47]:
clusters_temp_ukr[users_varr_ukr]

Unnamed: 0,B1,B5,B7,B9,B10,B11,B12,B18,B19,B22,...,B132,B133,B134,B138,B153,B162,B163,B164,B166,B169
0,proR,proU,proR,proR,proR,proR,proR,proR,proR,proR,...,proR,proR,proR,proR,proR,proR,proR,proR,proR,proR
1,proR,proU,proR,proR,proR,proR,proR,proR,proR,proR,...,proR,proR,proR,proR,proR,proR,proR,proR,proR,proR
2,proR,inter,proR,proR,inter,proR,inter,proR,proR,proR,...,proR,inter,proR,proR,proR,proR,proR,proR,proR,proR
3,proR,inter,proR,inter,inter,proR,inter,proR,proR,proR,...,proR,inter,proR,inter,proR,proR,proR,proR,proR,proR
4,proR,interU,proR,interR,interR,proR,interR,interR,proR,proR,...,proR,interR,proR,interR,proR,proR,proR,proR,proR,proR
5,interR,interU,proR,interR,interR,interR,interR,interR,interR,proR,...,interR,interR,interR,interR,interR,interR,interR,interR,proR,interR
6,interR,interU,interR,interR,interR,interR,interR,interR,interR,interR,...,interR,interR,interR,interR,interR,interR,interR,interR,proR,interR
7,interR,interU,interR,interR,interR,interR,interR,interR,interR,interR,...,interR,interR,interR,interR,interR,interR,interR,interR,proR,interR
8,interR,interU,interR,interR,interR,interR,interR,interR,interR,interR,...,interR,interR,interR,interR,interR,interR,interR,interR,proR,interR
9,interR,interU,interR,interR,interR,interR,interR,interR,interR,interR,...,interR,interR,interR,interR,interR,interR,interR,interR,proR,interR
