# Init: Load Libraries and Functions

In [1]:
from collections import defaultdict 
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import string
from textblob import TextBlob  
import os
import re


#random seed for reproducibility
np.random.seed(67)

In [2]:
def polarity_scorer(input_text):
    """This function operates on a column in a data frame using apply().
    Takes a column as an input and returns a tuple of the polarity score and subjectivity score
    use .tolist() to split into separate columns, like here: https://stackoverflow.com/questions/29550414/how-to-split-column-of-tuples-in-pandas-dataframe"""
    text = TextBlob(input_text)
    polarity_score = text.sentiment.polarity
    subjectivity_score = text.sentiment.subjectivity
    
    return polarity_score, subjectivity_score

In [3]:
nlp = spacy.load('en_core_web_lg')
punctuations = string.punctuation #this is a python module which contains all the punctuations characters in English (and probably other languages too)
stopwords = list(STOP_WORDS)

def spacy_tokenizer(input_text):
    """removes stop words and punctuation from a document, converts all tokens to lower case
    and combines all tokens into one string.
    used in this example it appends a new column to a dataframe through apply()"""
    processed_text = re.sub(r"http\S+", '', input_text) # remove URLS, https://stackoverflow.com/questions/24399820/expression-to-remove-url-links-from-twitter-tweet
    mytokens = nlp(processed_text)
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [word for word in mytokens if word not in stopwords and word not in punctuations ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

In [4]:
# removing stopwords, it's this easy: https://medium.com/@makcedward/nlp-pipeline-stop-words-part-5-d6770df8a936

def token_parser(input_text):
    """This function creates a table with the text token and parts of a speech for a piece of text
    Builds one to many relationships by expanding all the tokens within a single document"""
    table = pd.DataFrame()
    doc = nlp(input_text)
    text = [token.text for token in doc if not token.is_stop]
    pos = [token.pos_ for token in doc if not token.is_stop]
    table['text'] = text
    table['pos'] = pos
    
    return table

In [5]:
def wordmatrix_to_dataframe(wordmatrix, feature_names):

    """This function takes a word matrix from SK-Learn and turns into a dataframe"""
    # create an index for each row
    doc_names = ['Doc{:d}'.format(idx) for idx, _ in enumerate(wordmatrix)]
    df = pd.DataFrame(data=wordmatrix.toarray(), #index = doc_names,
                     columns = feature_names)
    
    return df

In [6]:
def create_adjacency_list(adjacency_matrix):
    
    """This function takes a dataframe of  a dataframe"""
    table_out = pd.DataFrame(columns = ['id', 'target'])
    
    for i in range(len(adjacency_matrix)):

        filtered_table = pd.DataFrame(adjacency_matrix.iloc[i])
        filtered_table = filtered_table.transpose()
        filtered_table = filtered_table.loc[:, ~(filtered_table==0).any(axis=0)]
        
        
        #for column in filtered_table.columns:
        temp_table = pd.DataFrame(columns = ['id', 'target'])
        source = [filtered_table.index.values[0] for column in filtered_table.columns]
        target = [column for column in filtered_table.columns]
        
        temp_table['id'] = source
        temp_table['target'] = target
            
        table_out = table_out.append(temp_table)
 
    return table_out
    

# Load files and tokenize

In [7]:
path = 'exports/'
files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path,f))]


In [8]:
files

['@GlobalGiving_tweets.csv',
 '@RESCUEorg_tweets.csv',
 '@RedCross_tweets.csv',
 '@Refugees_tweets.csv',
 '@UNHumanRights_tweets.csv',
 '@ICRC_tweets.csv',
 '@SavetheChildren_tweets.csv',
 '@UNICEF_tweets.csv',
 '@MSF_tweets.csv',
 '@WCKitchen_tweets.csv']

In [9]:
text_to_drop = '_tweets.csv'
all_files = pd.DataFrame(columns = ['id', 'created_at', 'text', 'brand'])

for file in files:
    each_file = pd.read_csv(path+file)
    each_file['brand'] = file.replace(text_to_drop, '') #https://www.journaldev.com/23674/python-remove-character-from-string
    all_files = all_files.append(each_file)
    
all_files.shape

(23687, 4)

In [10]:
all_files.created_at = pd.to_datetime(all_files.created_at)

In [11]:
all_files = all_files.loc[all_files.created_at > "2022-02-15"]

In [12]:
all_files.to_csv('data_backup.csv', index = False)

In [13]:
data = all_files.copy()

In [14]:
data.head()

Unnamed: 0,id,created_at,text,brand
0,1500215577817886725,2022-03-05 21:04:01+00:00,UPDATE: Tabletochki Charity Foundation is one ...,@GlobalGiving
1,1499909560030662661,2022-03-05 00:48:00+00:00,The link between #war and #hunger is clear—yet...,@GlobalGiving
2,1499834444349386758,2022-03-04 19:49:31+00:00,@ItsFangs Thank you so much for showing your s...,@GlobalGiving
3,1499833815874818052,2022-03-04 19:47:01+00:00,@Kellyrei007 Thank you so much for showing you...,@GlobalGiving
4,1499833431584350216,2022-03-04 19:45:30+00:00,@jerryg125 Thank you for sharing! 🙌 🙌 #StandWi...,@GlobalGiving


In [15]:
data.brand.value_counts()

@WCKitchen          367
@Refugees           205
@UNICEF             158
@ICRC               134
@GlobalGiving       105
@RedCross           101
@SavetheChildren     85
@MSF                 62
@UNHumanRights       62
@RESCUEorg           59
Name: brand, dtype: int64

## Sentiment Analysis 

In [16]:
# calls functions through apply(), returns a tuple then splits the results into 2 columns
data.text = data.text.astype('str') 

data[['polarity_score', 'subjectivity_score']] = pd.DataFrame(data.text.apply(polarity_scorer).tolist(), index = data.index)

data.head()

Unnamed: 0,id,created_at,text,brand,polarity_score,subjectivity_score
0,1500215577817886725,2022-03-05 21:04:01+00:00,UPDATE: Tabletochki Charity Foundation is one ...,@GlobalGiving,0.34375,0.5125
1,1499909560030662661,2022-03-05 00:48:00+00:00,The link between #war and #hunger is clear—yet...,@GlobalGiving,0.0625,0.5
2,1499834444349386758,2022-03-04 19:49:31+00:00,@ItsFangs Thank you so much for showing your s...,@GlobalGiving,0.075,0.325
3,1499833815874818052,2022-03-04 19:47:01+00:00,@Kellyrei007 Thank you so much for showing you...,@GlobalGiving,0.25,0.2
4,1499833431584350216,2022-03-04 19:45:30+00:00,@jerryg125 Thank you for sharing! 🙌 🙌 #StandWi...,@GlobalGiving,0.0,0.0


## Tokenizer

In [17]:
data['processed_text'] = data.text.apply(spacy_tokenizer)

In [18]:
data.head()

Unnamed: 0,id,created_at,text,brand,polarity_score,subjectivity_score,processed_text
0,1500215577817886725,2022-03-05 21:04:01+00:00,UPDATE: Tabletochki Charity Foundation is one ...,@GlobalGiving,0.34375,0.5125,update tabletochki charity foundation 20 nonpr...
1,1499909560030662661,2022-03-05 00:48:00+00:00,The link between #war and #hunger is clear—yet...,@GlobalGiving,0.0625,0.5,link war hunger clear — overlook ukraine russi...
2,1499834444349386758,2022-03-04 19:49:31+00:00,@ItsFangs Thank you so much for showing your s...,@GlobalGiving,0.075,0.325,@itsfang thank support help community ukraine ...
3,1499833815874818052,2022-03-04 19:47:01+00:00,@Kellyrei007 Thank you so much for showing you...,@GlobalGiving,0.25,0.2,@kellyrei007 thank support standwithukraine
4,1499833431584350216,2022-03-04 19:45:30+00:00,@jerryg125 Thank you for sharing! 🙌 🙌 #StandWi...,@GlobalGiving,0.0,0.0,@jerryg125 thank share 🙌 🙌 standwithukraine


In [19]:
data.to_csv('data_backup.csv', index = False)

## creating entities out of tokenized text

In [20]:
entities_table = pd.DataFrame(columns=['text', 'pos', 'id'])

for i in range(len(data)):
    tokens_table = token_parser(data.iloc[i].processed_text)  
    tokens_table['id'] = data.iloc[i].id  
    entities_table = entities_table.append(tokens_table)

entities_table = entities_table[-entities_table['pos'].str.contains('SPACE')] # removing spaces, punctuation
entities_table = entities_table[-entities_table['pos'].str.contains('PUNCT')]
entities_table.reset_index(drop = True, inplace = True)

entities_table.shape

(17949, 3)

In [21]:
entities_table.head()

Unnamed: 0,text,pos,id
0,update,VERB,1500215577817886725
1,tabletochki,PROPN,1500215577817886725
2,charity,PROPN,1500215577817886725
3,foundation,PROPN,1500215577817886725
4,20,NUM,1500215577817886725


In [22]:
entities_table.to_csv('data_entities.csv', index = False)

## Vectorizer to build matrix

In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score 


In [24]:
#processed_tweets = pd.read_csv('analysis output v2.csv')
#processed_tweets.processed_text = processed_tweets.processed_text.astype('str')

In [25]:
# Creating a vectorizer
vectorizer = CountVectorizer(min_df=0.003, max_df=0.9, stop_words='english', 
                             lowercase=True, token_pattern='[a-z0-9]{2,}')  

# reference to using min and max df arguments
# https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer

# token pattern captures any non-whitespace character, e.g. hashtags starting with numbers

data_vectorized = vectorizer.fit_transform(data["processed_text"])

In [26]:
#https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#
    
n_components = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

for i in n_components:
    clusterer = KMeans(n_clusters=i, random_state = 37)
    cluster_labels = clusterer.fit_predict(data_vectorized)

    silhouette_avg = silhouette_score(data_vectorized, cluster_labels)
    print("For number of topics: ", i,
          "the average silhouette score is: ", silhouette_avg)
    


For number of topics:  2 the average silhouette score is:  0.1368332241956945
For number of topics:  3 the average silhouette score is:  0.13662546109927054
For number of topics:  4 the average silhouette score is:  0.017949205646379657
For number of topics:  5 the average silhouette score is:  0.026059512392076645
For number of topics:  6 the average silhouette score is:  0.007152305203992406
For number of topics:  7 the average silhouette score is:  0.013990016253839494
For number of topics:  8 the average silhouette score is:  0.022446033868902888
For number of topics:  9 the average silhouette score is:  -0.05167175488566481
For number of topics:  10 the average silhouette score is:  0.03481756436550594
For number of topics:  11 the average silhouette score is:  0.0014216143414394642


# Export matrix to adjacency list

In [27]:
# calling word matrix to dataframe function
features = vectorizer.get_feature_names()
len(features)

610

In [28]:
adjacency_matrix = wordmatrix_to_dataframe(data_vectorized, features)
documents_ids = data['id']
adjacency_matrix.index = documents_ids

In [29]:
adjacency_matrix

Unnamed: 0_level_0,000,10,100,11,12,15,16,18,19,1james,...,work,worker,world,worried,wound,year,yemen,yesterday,young,zone
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1500215577817886725,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1499909560030662661,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1499834444349386758,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1499833815874818052,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1499833431584350216,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1494324123043913737,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1494060952815243264,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1493982341722689538,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1493681712630403077,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
adjacency_matrix.to_csv('adjacency_matrix.csv', index=True)

In [31]:
adjacency_list = create_adjacency_list(adjacency_matrix)
adjacency_list

Unnamed: 0,id,target
0,1500215577817886725,20
1,1500215577817886725,critical
2,1500215577817886725,donate
3,1500215577817886725,emergency
4,1500215577817886725,hungary
...,...,...
13,1493639941867200514,relief
14,1493639941867200514,team
15,1493639941867200514,travel
16,1493639941867200514,wck


In [32]:
adjacency_list.rename(columns={'id':'Source','target':'Target'}, inplace = True)

In [33]:
adjacency_list.to_csv('adjacency_list_v1.csv', index = False)

In [34]:
adjacency_list_for_merging = adjacency_list.rename(columns={'Source':'id','target':'Target'}).merge(data[['id','brand']], on = 'id')

In [35]:
adjacency_list_for_merging

Unnamed: 0,id,Target,brand
0,1500215577817886725,20,@GlobalGiving
1,1500215577817886725,critical,@GlobalGiving
2,1500215577817886725,donate,@GlobalGiving
3,1500215577817886725,emergency,@GlobalGiving
4,1500215577817886725,hungary,@GlobalGiving
...,...,...,...
11310,1493639941867200514,relief,@WCKitchen
11311,1493639941867200514,team,@WCKitchen
11312,1493639941867200514,travel,@WCKitchen
11313,1493639941867200514,wck,@WCKitchen


In [36]:
adjacency_list_for_merging.to_csv('adjacency_list_for_merging.csv', index = False)