# Libraries 

In [18]:
# !pip install transformers
!pip install networkx
# !pip install "tensorflow>=2.0.0"
# !pip install --upgrade tensorflow-hub
!pip install -U sentence-transformers
!pip install python-louvain

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
import matplotlib.pyplot as plt
import json
import re
import pandas as pd
import networkx as nx
from collections import Counter
import numpy as np
from sentence_transformers import SentenceTransformer, util
from community import community_louvain
from networkx.algorithms import community
import math
import random
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')



# Initial Graph Creation 

In [20]:
#df = pd.read_csv("./hashtag_donaldtrump_cleaned.csv", lineterminator='\n')
#df = pd.read_csv("./trump_stance_train_public.csv")

df = pd.read_csv("./trump_stance_dataset.csv")
df['text'] = df['text'].apply(lambda x: x.replace('@USER',''))
df = df.loc[(df['label'] == "FAVOR") | ( df['label'] == "AGAINST")]
cleaned_tweets = list(df['text'])
cleaned_labels = list(df['label'])

def find_hashtags(sents):
    p = re.compile(r'(\#\w+)')
    return list(p.findall(sents))

hashtweets = list(map(find_hashtags, cleaned_tweets))

i = 0
hastags_unique = set()
for tweets in hashtweets:
    for tweet in tweets:
        hastags_unique.add(tweet)
        

In [21]:
print(len(hastags_unique))

1267


In [22]:
G = nx.Graph()
weights = nx.get_edge_attributes(G,'weight')
for hashstag in hastags_unique:
    G.add_node(hashstag)
print(G.number_of_nodes())

for ht_set in hashtweets:
    for ht1 in ht_set:
        for ht2 in ht_set:
            if ht1 != ht2:
                if G.has_edge(ht1,ht2):
                    w = G[ht1][ht2]['weight'] + 1
                    ##print(ht1,ht2)
                    nx.set_edge_attributes(G, {(ht1, ht2): {"weight": w}}) 
                else:
                    G.add_edge(ht1,ht2,weight=1)    

count_no_neigh =  0 
for n1 in list(G.nodes(data=True)):
    if len(list(G.neighbors(n1[0]))) <= 3:
        G.remove_node(n1[0])
        count_no_neigh += 1

print("number removed {count_no:.2f}".format(count_no= count_no_neigh))


1267
number removed 402.00


In [23]:
# unpack remove
removed_total_edge = 0
for node_details in list(G.edges(data=True)):
  try:
    if node_details[2]['weight'] <= 2:
      e = (node_details[0], node_details[1])
      removed_total_edge+= 1
      G.remove_edge(*e)
  except:
    print(node_details[0],node_details[1])
   

print("removal of {removed:.2f} edges".format(removed=removed_total_edge))
print(G.number_of_nodes())

# retrieve new nodes
New_nodes = []
for node in G.nodes(data=True):
  New_nodes.append(node[0])

removal of 4751.00 edges
865


# Building Sbert


In [24]:
sentences_dict = {}
factor_dict ={}
factor_raw_dict = {}

def calculate_ratio_support(labels_count):
  alpha = 0.5
  F = labels_count.count('FAVOR')
  A = labels_count.count('AGAINST')
  C = ""
  factor = 0
  if F == 0 or A == 0:
    return 1 
  if F > A:
    factor =  1 - alpha * (A/F)
  elif A > F:
    factor = 1 - alpha * (F/A)
  else:
    factor = 0.5 
  return factor
  

for hashtags in range(0,len(list(New_nodes))):
  cumu_sents = []
  labels_sents = []
  for ctweets in range(0, len(cleaned_tweets)):
    if list(New_nodes)[hashtags] in cleaned_tweets[ctweets]:
      cumu_sents.append(cleaned_tweets[ctweets].replace(list(New_nodes)[hashtags], ''))
      labels_sents.append(cleaned_labels[ctweets])
    factor_sents = calculate_ratio_support(labels_sents)
    sentences_dict[list(New_nodes)[hashtags]] = cumu_sents
    factor_dict[list(New_nodes)[hashtags]] = factor_sents
    factor_raw_dict[list(New_nodes)[hashtags]] = Counter(labels_sents)

def calculate_pooled_avg(embeddings):
    embed_converted = np.array([np.array(xi) for xi in embeddings])
    pooled_average = np.zeros(len(embed_converted[0]))
    for i in range(0, len(embed_converted[0])):
      for j in range(0,len(embed_converted)):
        pooled_average[i] += embed_converted[j][i]
      pooled_average[i] = pooled_average[i]/len(embed_converted)
    return pooled_average

def embeddings_churn(sentences_dict):
  embed_converted = {}
  for hashtag, sentences in sentences_dict.items():
      #print("Sentence:", sentences)
      embeddings = model.encode(sentences)
      #print(embeddings)
      embed_converted[hashtag] = calculate_pooled_avg(embeddings)
  return embed_converted

embeddings_calculated = embeddings_churn(sentences_dict)
print(len(embeddings_calculated))

# apply factor 
for hashtag, embedding in embeddings_calculated.items():
  embeddings_calculated[hashtag] = embedding * factor_dict[hashtag]


865


In [25]:
# # can save if you need to 
# import json
# with open('sentences_dict.txt', 'w') as file:
#      file.write(json.dumps(sentences_dict)) # use `json.loads` to do the reverse

# with open('factor_dict.txt', 'w') as file:
#      file.write(json.dumps(factor_dict)) # use `json.loads` to do the reverse

# Final Contextual Hashtag Selection

In [26]:
# embeddings_calculated

G = nx.petersen_graph()
weights = nx.get_edge_attributes(G,'weight')
for hashstag in New_nodes:
    G.add_node(hashstag)

print(G.number_of_nodes())


def find_hashtags(sents):
    p = re.compile(r'(\#\w+)')
    return list(set(New_nodes) & set(list(p.findall(sents))))
    
hashtweets = list(map(find_hashtags, cleaned_tweets))

for ht_set in hashtweets:
    for ht1 in ht_set:
        for ht2 in ht_set:
            if ht1 != ht2:
                if G.has_edge(ht1,ht2):
                    w = G[ht1][ht2]['weight'] * 2
                    ##print(ht1,ht2)
                    nx.set_edge_attributes(G, {(ht1, ht2): {"weight": w}}) 
                else:
                    G.add_edge(ht1,ht2,weight=util.cos_sim(embeddings_calculated[ht1], embeddings_calculated[ht2]))


for node_details in list(G.edges(data=True)):
  if "weight" in node_details[2]:
      nx.set_edge_attributes(G, {(node_details[0], node_details[1]): {"weight": np.log(node_details[2]['weight'].numpy()[0][0])}})

875


In [27]:
# Opposites

opp_hash = []
highly_similar = []
high_value = []
low_value = []

for node_details in list(G.edges(data=True)):
  if "weight" in node_details[2]:
      if node_details[2]['weight'] == float("-inf"):
        opp_hash.append(node_details)
      
      if node_details[2]['weight'] >= 3  and node_details[2]['weight'] <= 3.5:
        highly_similar.append(node_details)
      
      if node_details[2]['weight'] >= 10:
        high_value.append(node_details)

      if node_details[2]['weight'] <= 0.05:
        low_value.append(node_details)



## View Values

In [28]:
random.sample(low_value, 10)

[('#moscowmitchmcconnell',
  '#votebidenharris',
  {'weight': -0.30461969326740246}),
 ('#rncbullshit', '#rnc2020', {'weight': -0.15930961813750266}),
 ('#bluenomatterwho', '#bluewave2020', {'weight': -0.2840932418138804}),
 ('#resisters', '#usps', {'weight': -0.02235696172120139}),
 ('#trump2020', '#stimuluschecks', {'weight': -0.3154900929005452}),
 ('#palpatine', '#trump', {'weight': 0.004730048643476034}),
 ('#vadems', '#bidenharris2020', {'weight': -0.030286274701338246}),
 ('#trumphatesourtroops', '#voteblue', {'weight': 0.007737564630753352}),
 ('#joebiden', '#nicetry', {'weight': 0.019561933844617335}),
 ('#bidenharris2020', '#mvpharris', {'weight': -0.11476568458392017})]

In [29]:
random.sample(high_value, 10)

[('#maga', '#maga2020', {'weight': 31.14295328389213}),
 ('#trump2020', '#kag2020', {'weight': 14.436090408271937}),
 ('#americafirst', '#trump2020', {'weight': 14.423378353260086}),
 ('#trump2020', '#debates2020', {'weight': 15.706115489721649}),
 ('#maga2020', '#kag2020', {'weight': 10.320140906684134}),
 ('#maga', '#trump2020', {'weight': 94.90430131032821}),
 ('#trump2020landslide', '#trump2020', {'weight': 21.37371655362981}),
 ('#trump2020landslide', '#maga', {'weight': 11.718115241374608}),
 ('#votebluetosaveamerica',
  '#bidenharris2020',
  {'weight': 15.864809380997025}),
 ('#maga', '#trump', {'weight': 24.143961855505452})]

In [30]:
random.sample(highly_similar, 10)

[('#biden2020', '#votebluetosaveamerica2020', {'weight': 3.329124256417508}),
 ('#usa', '#trump', {'weight': 3.199428300629539}),
 ('#votebluetosaveamerica',
  '#bidenharris2020landslide',
  {'weight': 3.1895369254100934}),
 ('#votebluetosaveamerica',
  '#voteoutcorruptgop',
  {'weight': 3.0639912976384034}),
 ('#biden2020', '#voteblue', {'weight': 3.3849370108485073}),
 ('#americafirst', '#kag', {'weight': 3.368441753117243}),
 ('#votebidenharris2020', '#bidenharris2020', {'weight': 3.33817283053185}),
 ('#trump2020', '#election2020', {'weight': 3.253073332569817}),
 ('#bidenharris', '#bidenwonthedebate', {'weight': 3.189086663320908}),
 ('#america', '#americafirst', {'weight': 3.4294012394533686})]

In [None]:
random.sample(opp_hash, 10)

In [32]:
# Ratio problem 
factor_raw_dict['#trump'], factor_raw_dict['#trump2020']

(Counter({'AGAINST': 143, 'FAVOR': 490}),
 Counter({'AGAINST': 29, 'FAVOR': 465}))

## Reduce and Group Detection

In [33]:
count_no_neigh =  0 
for n1 in list(G.nodes(data=True)):
    if len(list(G.neighbors(n1[0]))) <= 3:
        G.remove_node(n1[0])
        count_no_neigh += 1
print("number removed {count_no:.2f}".format(count_no= count_no_neigh))

removed_total_edge = 0
for node_details in list(G.edges(data=True)):
  try:
    if node_details[2]['weight'] < 2:
      e = (node_details[0], node_details[1])
      removed_total_edge+= 1
      G.remove_edge(*e)
  except:
    print(node_details[0],node_details[1])
   
print("removal of {removed:.2f} edges".format(removed=removed_total_edge))

communities_generator = community.girvan_newman(G)
#first compute the best partition
partition = community_louvain.best_partition(G,  weight='weight')
data_partition = pd.DataFrame(partition.items(), columns = ['hash','value'] )
data_partition['value'] = data_partition['value'].apply(lambda x:str(x))
data_partition.head()

number removed 12.00
removal of 5543.00 edges


Unnamed: 0,hash,value
0,#thegreatawaking,0
1,#miga,1
2,#republican,2
3,#nfl,3
4,#northcarolina,4


## Group Selection (Final)
Filter for the relevant group 

In [34]:
value_df = pd.DataFrame(data_partition.groupby('value').agg('count').reset_index())
value_df = value_df.sort_values(by='hash',ascending=False)
value_df.head(10)

Unnamed: 0,value,hash
13,11,55
98,187,49
457,51,11
73,164,6
82,172,5
415,472,5
45,139,5
96,185,5
672,73,5
157,24,4


In [40]:
# Use this to filter for a specific partition 
# value is group number and hash is the number of nodes 
data_partition.loc[data_partition['value'] == '187']

Unnamed: 0,hash,value
29,#antifa,187
78,#blacksfortrump,187
83,#trump2020landslide,187
164,#godwins,187
189,#blmterrorists,187
193,#america,187
196,#maga,187
210,#democratsaredestroyingamerica,187
217,#trumptrain,187
230,#4moreyears,187


In [43]:
sorted_values = sorted(partition.values()) # Sort the values
sorted_dict = {}
for i in sorted_values:
    for k in partition.keys():
        if partition[k] == i:
            sorted_dict[k] = partition[k]
            break

In [45]:
# use to select a parition that suit the problem in this case a cluster that shows strong support for trump 
trump_df = data_partition.loc[data_partition['value'] == '187']
trump_df_hash = list(trump_df['hash'])
final_trump_hash = []
for i in trump_df_hash:
  if(factor_raw_dict[i]['FAVOR'] > factor_raw_dict[i]['AGAINST'] * 6 )and (factor_raw_dict[i]['FAVOR'] > 6): 
    print(factor_raw_dict[i], i )
    final_trump_hash.append(i)


Counter({'FAVOR': 7, 'AGAINST': 1}) #antifa
Counter({'FAVOR': 8}) #blacksfortrump
Counter({'FAVOR': 11}) #democratsaredestroyingamerica
Counter({'FAVOR': 9, 'AGAINST': 1}) #trumptrain
Counter({'FAVOR': 8}) #4moreyears
Counter({'FAVOR': 31, 'AGAINST': 2}) #votered
Counter({'FAVOR': 20, 'AGAINST': 3}) #americafirst
Counter({'FAVOR': 465, 'AGAINST': 29}) #trump2020
Counter({'FAVOR': 10}) #wwg1wga
Counter({'FAVOR': 10}) #votered2020
Counter({'FAVOR': 12}) #draintheswamp
Counter({'FAVOR': 47, 'AGAINST': 4}) #maga2020
Counter({'FAVOR': 73, 'AGAINST': 2}) #kag
Counter({'FAVOR': 7}) #walkawayfromdemocrats
Counter({'FAVOR': 16, 'AGAINST': 1}) #voteredtosaveamerica
Counter({'FAVOR': 7}) #fourmoreyears
Counter({'FAVOR': 24}) #kag2020
Counter({'FAVOR': 13}) #walkaway
Counter({'FAVOR': 8}) #patriots


### Final Selected Hashtags for Scaping 

In [46]:
print(final_trump_hash)

['#antifa', '#blacksfortrump', '#democratsaredestroyingamerica', '#trumptrain', '#4moreyears', '#votered', '#americafirst', '#trump2020', '#wwg1wga', '#votered2020', '#draintheswamp', '#maga2020', '#kag', '#walkawayfromdemocrats', '#voteredtosaveamerica', '#fourmoreyears', '#kag2020', '#walkaway', '#patriots']


In [42]:
# # Save groups if needed
# with open('sorted_dict_community.txt', 'w') as file:
#      file.write(json.dumps(sorted_dict)) # use `json.loads` to do the reverse