# Community detection

In [51]:
!pip install python-louvain



In [30]:
from community import community_louvain
import networkx as nx
from networkx.algorithms import community
import pandas as pd
import re

In [31]:
def network_from_edges(edges):
  dict_edges = {}
  for ind in range(len(edges)):
    if edges.iloc[ind,2] > 10:
      dict_edges[(edges.iloc[ind,0], edges.iloc[ind,1])] = edges.iloc[ind,2]
  net = pd.DataFrame.from_dict(dict_edges,orient='index')
  net.columns = ["weight"]
  net.sort_values(by="weight",inplace=True, ascending=False)
  return dict_edges, net


def get_graph(network):
  up_weighted = []
  for edge in network:
      #we can filter edges by weight by uncommenting the next line and setting desired weight threshold
      up_weighted.append((edge[0],edge[1],network[edge]))
  
  G = nx.Graph()
  G.add_weighted_edges_from(up_weighted)
  return G


def extract_text(df):       #extract the text from the tweets and RT
                            #works ONLY on .csv file
  list_strings = []
  for index in range(len(df)):
    if index % 1000 == 0:
      print(str(index)+' / '+str(len(df)))
    text = df.loc[index]['text']                          #if it is nor trucated nor a RT  i take "text"
    string = -1
    if (df.loc[index,"truncated"] == True):                 #if it is trucated I take "extended_tweet"
        string = df.loc[index,"extended_tweet"]
    if type(df.loc[index,"retweeted_status"]) != float:     #if it is a RT I take retweeted_status
        string = df.loc[index,"retweeted_status"]
    if type(string) == str :
        if(re.search('full_text\':(.+?)https',string) != None):     #if I find "full_text"
          s = re.search('full_text\':(.+?)https',string).group(1)
        if(re.search('text\':(.+?)https',string)!= None):
          s = re.search('text\':(.+?)https',string).group(1)
        else: 
          continue
        list_strings.append(s)
    else:
      list_strings.append(text)

  return list_strings


def frequency_dictionary(df):
  unique_words = {}

  for row in df:
    for word in row.split():
      #if the word is encountered for the first time add to dict as key and set its value to 0
      unique_words.setdefault(word,0)
      #increase the value (i.e the count) of the word by 1 every time it is encountered
      unique_words[word] += 1

  return unique_words

## JanFeb2020

In [32]:
edges = pd.read_csv('/content/edgelist_China_USA_JanFeb2020.csv')
edges_China = pd.read_csv('/content/edgelist_China_JanFeb2020.csv')
edges_USA = pd.read_csv('/content/edgelist_USA_JanFeb2020.csv')

In [33]:
net, net_df = network_from_edges(edges)
net_China, net_df_China = network_from_edges(edges_China)
net_USA, net_df_USA = network_from_edges(edges_USA)

In [34]:
G = get_graph(net)
G_China = get_graph(net_China)
G_USA = get_graph(net_USA)

In [35]:
print('Nodes: ',len(G.nodes()))
print('Edges: ',len(G.edges()))
print('Is connected: ',nx.is_connected(G))
print()
print('China:')
print('Nodes: ',len(G_China.nodes()))
print('Edges: ',len(G_China.edges()))
print('Is connected: ',nx.is_connected(G_China))
print()
print('USA:')
print('Nodes: ',len(G_USA.nodes()))
print('Edges: ',len(G_USA.edges()))
print('Is connected: ',nx.is_connected(G_USA))
print()

Nodes:  656
Edges:  3575
Is connected:  False

China:
Nodes:  306
Edges:  1398
Is connected:  False

USA:
Nodes:  476
Edges:  1797
Is connected:  True



### Girvan_Newman

In [11]:
communities_generator_gn = community.girvan_newman(G)
top_level_communities_gn = next(communities_generator_gn)
next_level_communities_gn = next(communities_generator_gn)
c_next_gn = sorted(map(sorted, next_level_communities_gn))
c_top_gn = sorted(map(sorted, top_level_communities_gn))

In [12]:
print(c_next_gn)



### Bipartition
Partition a graph into two blocks using the Kernighan–Lin algorithm.

In [13]:
communities_b = community.kernighan_lin_bisection(G)

In [14]:
sum_China_0 = 0
sum_USA_0 = 0
for key in communities_b[0]:
  if key in list(G_China.nodes()):
    sum_China_0 += 1
  if key in list(G_USA.nodes()):
    sum_USA_0 += 1
  
sum_China_1 = 0
sum_USA_1 = 0
for key in communities_b[1]:
  if key in list(G_China.nodes()):
    sum_China_1 += 1
  if key in list(G_USA.nodes()):
    sum_USA_1 += 1


In [15]:
print(sum_China_0 /len(communities_b[0]))
print(sum_USA_0/len(communities_b[0]))
print(sum_China_1/len(communities_b[1]))
print(sum_USA_1/len(communities_b[1]))

0.22865853658536586
0.551829268292683
0.7042682926829268
0.899390243902439


In [17]:
period = '_JanFeb2020'
China = pd.read_csv('/content/China'+period+'.csv')
USA = pd.read_csv('/content/USA'+period+'.csv')

In [21]:
text_China = extract_text(China)
text_USA = extract_text(USA)

0 / 2459
1000 / 2459
2000 / 2459
0 / 3185
1000 / 3185
2000 / 3185
3000 / 3185


In [24]:
freq_dict_China = frequency_dictionary(text_China)
freq_dict_China = dict(sorted(freq_dict_China.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_USA = frequency_dictionary(text_USA)
freq_dict_USA = dict(sorted(freq_dict_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

In [25]:
def community_check(community, dict_1, dict_2, G_1, G_2, thr):
  sum_1 = 0
  sum_2 = 0
  sum_12 = 0
  for key in community:
    if (key in list(dict_1.keys())) and (key in list(dict_2.keys())):
      if dict_1[key] - dict_2[key] > thr:
        sum_1 += 1
      elif dict_1[key] - dict_2[key] < -thr:
        sum_2 += 1  
      else:
        sum_12 += 1 
    elif key in list(G_1.nodes()):
      sum_1 += 1
    elif key in list(G_2.nodes()):
      sum_2 += 1
  return sum_1, sum_2, sum_12

In [28]:
threshold = 10
sum_China_0, sum_USA_0, sum_China_USA_0 = community_check(communities_b[0], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)
sum_China_1, sum_USA_1, sum_China_USA_1 = community_check(communities_b[1], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)

In [29]:
print(sum_China_0 /len(communities_b[0]))
print(sum_USA_0/len(communities_b[0]))
print(sum_China_USA_0/len(communities_b[0]))

print(sum_China_1/len(communities_b[1]))
print(sum_USA_1/len(communities_b[1]))
print(sum_China_USA_1/len(communities_b[1]))

0.125
0.25
0.551829268292683
0.24695121951219512
0.35060975609756095
0.4024390243902439


### Modularity-based communities
Find communities in G using greedy modularity maximization.

In [46]:
communities_mod = community.greedy_modularity_communities(G)
# communities_naive_mod = community.naive_greedy_modularity_communities(G)

In [41]:
for comm in communities_mod:
 print(comm)

frozenset({'recently', 'inside', 'top', 'air', 'change', 'system', 'plane', 'food', 'hard', 'california', 'bad', 'amid', 'talk', 'affect', 'impact', 'dow', 'baby', 'sign', 'arrive', 'major', 'outbreak', 'early', 'best', 'speak', 'diagnose', 'globally', 'infected', 'long', 'within', 'identify', 'social', 'department', 'together', 'announce', 'asia', 'back', 'birth', 'become', 'service', 'make', 'read', 'china', 'hurt', 'respiratory', 'carry', 'autonomous', 'shanghai', 'industry', 'term', 'jump', 'market', 'wall', 'way', 'effective', 'apple', 'order', 'life', 'wang', 'nation', 'extend', 'isolation', 'fund', 'large', 'meeting', 'possible', 'growth', 'front', 'must', 'employee', 'may', 'figure', 'experience', 'much', 'revenue', 'expect', 'global', 'east', 'line', 'point', 'flu', 'leader', 'since', 'discover', 'meet', 'ambassador', 'information', 'drone', 'delivery', 'issue', 'animal', 'challenge', 'provincial', 'price', 'low', 'reporter', 'big', 'far', 'investor', 'australia', 'implement',

In [None]:
for comm in communities_naive_mod:
 print(communities_naive_mod)

### Louvain Community Detection
Find the best partition of a graph using the Louvain Community Detection Algorithm.

In [55]:
# communities_louvain = community.louvain.louvain_communities(G)
# partitions_louvain = community.louvain.louvain_partitions(G)

communities_louvain = community_louvain.best_partition(G)

# modularity_louvain = community.modularity(communities_louvain, G) # ERROR: not a partition
# print("The modularity Q based on networkx is {}".format(modularity_louvain))

NotAPartition: ignored

In [53]:
communities

{'aboard': 6,
 'accord': 3,
 'acid': 6,
 'across': 0,
 'additional': 1,
 'administration': 1,
 'admit': 4,
 'advance': 0,
 'advise': 0,
 'advisory': 0,
 'affect': 1,
 'agency': 1,
 'aid': 0,
 'air': 1,
 'airport': 1,
 'airway': 1,
 'alarm': 1,
 'allow': 6,
 'almost': 1,
 'alone': 0,
 'already': 0,
 'also': 6,
 'ambassador': 0,
 'amid': 1,
 'among': 1,
 'anger': 1,
 'animal': 0,
 'announce': 0,
 'another': 7,
 'answer': 1,
 'anti': 1,
 'anyone': 1,
 'appear': 1,
 'apple': 1,
 'area': 1,
 'around': 2,
 'arrive': 0,
 'asia': 1,
 'asian': 1,
 'ask': 0,
 'australia': 1,
 'australian': 1,
 'authority': 7,
 'autonomous': 0,
 'avoid': 1,
 'away': 6,
 'baby': 4,
 'back': 0,
 'bad': 1,
 'ban': 0,
 'bank': 1,
 'barcelona': 1,
 'base': 1,
 'basis': 0,
 'batch': 7,
 'battle': 0,
 'become': 1,
 'bed': 4,
 'begin': 0,
 'beijing': 1,
 'believe': 0,
 'best': 0,
 'beyond': 2,
 'big': 1,
 'bill': 1,
 'billion': 1,
 'birth': 2,
 'blow': 1,
 'board': 6,
 'border': 0,
 'break': 0,
 'brief': 2,
 'bring': 7,
