# Preparation

## Library installation/import

Install and import libraries that are used in multiple sections of the pipeline.

In [2]:
import pandas as pd

In [3]:
!pip install spacy==3.0
!pip install tweepy
!pip install wordcloud



## Configuring Twitter API keys

Please note that the API keys below are the course leader's own API keys. You are allowed to use them for some small tests, but please be careful: all students in the class now have a copy of them, so the rate limits can easily be exceeded.

If your group has decided to use Twitter data, you can [apply for your own keys](https://developer.twitter.com/en/apply-for-access).

In [None]:
import os

import tweepy

# NOTE(review): credentials stored in a notebook are readable by anyone who has a copy
# of the file. Each value is therefore looked up in an environment variable first, so a
# group can supply its own keys without editing this cell. The shared course keys are
# kept only as a fallback and should be rotated/removed once they are no longer needed.
TWITTER_CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', 'fjkruboMzTLE4BLE7FmEpkWpw')
TWITTER_CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', 'jDobYz45Ksc3uMHoD2QnyZK60NwfRZnWIDVmyPtUGLkiOUqfGl')
TWITTER_ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN', '1374773661861830658-lPZKU2qeuepRxVfWs5OxRoZd6XGzrH')
TWITTER_ACCESS_TOKEN_SECRET = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '84k6xIDIMrt5mzPFLCoesD0WM9bpk8d3bAKaNonbcuT0s')

auth = tweepy.OAuthHandler(TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET)
auth.set_access_token(TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_TOKEN_SECRET)

# Client used by the download cell; configured to wait (and say so) when rate-limited
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

## Downloading of new data set

**IMPORTANT NOTE**: Please do not run the cell below unless intending to download a new data set.

Make sure that you change the parameters.

Also, check the [Tweepy API reference](https://docs.tweepy.org/en/latest/api.html) to find out about other ways through which you can retrieve tweets, e.g., by specifying usernames.

In [None]:
# Search parameters: the hashtag to look for (retweets filtered out) and the
# date that is passed to the search as `since`
query = "#notoracism" + " -filter:retweets"
cutoff_date = "2021-01-01"

# Iterate over at most 1000 English-language search results
tweets = tweepy.Cursor(
    api.search,
    q=query,
    lang="en",
    since=cutoff_date,
).items(1000)

# One row per tweet: creation time, author handle, author's profile location, tweet text
tweets_list = [
    [tweet.created_at, tweet.user.screen_name, tweet.user.location, tweet.text]
    for tweet in tweets
]

tweets_df = pd.DataFrame(tweets_list, columns=['date', 'user', 'location', 'text'])

# Save the download as CSV so that it can be reloaded by the cleaning cell
tweets_df.to_csv('current_set.csv', index=False, header=True, encoding='utf8', quotechar='"')

# Data pre-processing

## Cleaning

Below we provide some code for text cleaning. However, we encourage you to think of other ways to clean your data, e.g., removing hashtags, usernames, or duplicate tweets.

In [38]:
#installing tweet-preprocessor
!pip install tweet-preprocessor
!pip install wordsegment
!pip install autocorrect
import preprocessor as p
from wordsegment import load, segment
import re
from autocorrect import Speller



In [58]:
# comment if not using pre saved data set 
tweets_df = pd.read_csv('current_set.csv', quotechar='"', encoding='utf8')

# Use the tweet-preprocessor library to strip URLs, emojis and mentions.
# Hashtags are deliberately left in place here; they are extracted and segmented below.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION)
cleaned_tweets = [p.clean(tweet) for tweet in tweets_df['text']]

# adding back to data frame
tweets_df['text_processed'] = cleaned_tweets

# Remove basic punctuation (, . ! ?)
tweets_df['text_processed'] = tweets_df['text_processed'].map(lambda x: re.sub('[,\\.!?]', '', x))

# Replace line breaks with a space (not with nothing) so that the last word of one line
# is not glued to the first word of the next; surplus spaces are collapsed further down.
tweets_df['text_processed'] = tweets_df['text_processed'].map(lambda x: re.sub(r"\n", ' ', x))

# Load the word-segmentation data needed by segment().
# A hashtag is usually several words joined together, so each hashtag is split into
# its words and those words are appended to the tweet.
load()

# Store the hashtags of each tweet in their own column
tweets_df['hashtag'] = tweets_df['text_processed'].apply(lambda x: re.findall(r'\B#\w*[a-zA-Z]+\w*', x))

# Remove the hashtags from the tweet text
tweets_df['text_processed'] = tweets_df['text_processed'].map(lambda x: re.sub(r"#(\w+)", '', x))

# ---- segmentation of hashtags ----
# Every hashtag is segmented on its own, so the letters of neighbouring hashtags are
# never merged into one string before being split into words.
segmented_hashtags = [
    ' '.join(word for tag in tags for word in segment(tag))
    for tags in tweets_df['hashtag']
]

# Append the segmented words to the tweet. The whole column is assigned at once:
# element-wise chained assignment (df[col][i] = ...) triggers SettingWithCopyWarning
# and is not guaranteed to write through to the data frame.
tweets_df['text_processed'] = [
    text + " " + segmented
    for text, segmented in zip(tweets_df['text_processed'], segmented_hashtags)
]
# ------

# Convert the tweets to lowercase
tweets_df['text_processed'] = tweets_df['text_processed'].map(lambda x: x.lower())

# Collapse runs of whitespace and trim leading/trailing whitespace
tweets_df['text_processed'] = tweets_df['text_processed'].map(lambda x: ' '.join(x.split()))

# Cleaned tweets with duplicates removed (first occurrence kept, original order preserved).
# The word cloud, sentiment analysis, NER and entity-linking cells iterate over this list.
unique_tweets = list(dict.fromkeys(tweets_df['text_processed']))


In [57]:
# Preview the first rows, including the text_processed and hashtag columns added by the cleaning cell
display(tweets_df.head())

Unnamed: 0,date,user,location,text,text_processed,hashtag
0,2021-03-06 10:35:34,RWDMolenbeek,"Sint-Jans-Molenbeek, België",🏳️‍🌈Matchday: R.W.D.M - @KMSKDeinze 🏟E. Machte...,matchday: rwdm - e machtens stadion 20:45 no t...,[#NOtoRacism]
1,2021-03-06 10:24:42,motsetse_sello,,I've never seen I a white Human ever since I w...,i've never seen i a white human ever since i w...,[]
2,2021-03-06 07:24:48,amirjon4628,,"Hi army,This is Iranian armys\nWe just found m...",hi armythis is iranian armys we just found meh...,[]
3,2021-03-06 07:12:20,future_nostalgi,,"Hi army,This is Iranian armys\nWe just found m...",hi armythis is iranian armys we just found meh...,[]
4,2021-03-06 07:09:39,NJ7twt,,@Nili10724948 @MehradHidden Whenever the hater...,whenever the hater starts to hate bts does not...,[]


## Exploration using a word cloud

Generating a word cloud is one way by which you can check whether your data needs any further cleaning.

In [None]:
from wordcloud import WordCloud

# Concatenate all tweets into one string, then keep a single copy of every
# space-separated token
long_string = ' '.join(unique_tweets)
new_long_string = ' '.join(set(long_string.split(" ")))

# Configure the word cloud and feed it the de-duplicated tokens
# (generate() returns the WordCloud object itself)
wordcloud = WordCloud(
    background_color="white",
    max_words=1000,
    contour_width=3,
    contour_color='steelblue',
).generate(new_long_string)

# Render the cloud as an image (shown as the cell output)
wordcloud.to_image()

# Topic Modelling

In [None]:
!pip install -U gensim

In [None]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

In [None]:
# NLTK stop words
import nltk
# Download the NLTK 'stopwords' resource that is read just below
nltk.download('stopwords')

from nltk.corpus import stopwords
# English stop-word list; it is extended with extra words in the next cell
stop_words = stopwords.words('english')
print(stop_words)

In [None]:
# Do you want to modify this by adding more stop words?
extra_stop_words = ['from', 'subject', 're', 'edu', 'use']

# Only append words that are not in the list yet, so that re-running this cell
# does not keep growing `stop_words` with duplicates.
stop_words.extend([word for word in extra_stop_words if word not in stop_words])

In [None]:
# Tokenize words and clean up text
def sent_to_words(sentences):
    """Yield one list of tokens per sentence, produced by gensim's simple_preprocess.

    Each sentence is converted with str() first; deacc=True is passed so that
    accent marks are removed from the tokens.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )

# `data` is the plain list of cleaned tweet strings; `data_words` holds their tokens
data = tweets_df['text_processed'].tolist()
data_words = [tokens for tokens in sent_to_words(data)]

# Show the tokens of the first tweet as a quick sanity check
print(data_words[:1])

Remove Stopwords and Lemmatize

The advantage of lemmatization is that it reduces the total number of unique words in the dictionary. The ultimate goal of lemmatization is to help the LDA model produce better topics in the end.

In [None]:
# Load the small English spaCy model. The returned pipeline is not assigned to a name
# here (lemmatization() below loads its own copy), so this cell mainly confirms that
# the model can be loaded.
spacy.load("en_core_web_sm")

In [None]:
# Define functions for stopwords
def remove_stopwords(texts):
    """Drop stop words from every document in `texts`.

    Each document is converted with str() and re-tokenized with simple_preprocess
    before filtering, so a document may be a raw string or an already tokenized list.

    Returns a list containing one list of remaining tokens per document.
    """
    # Build a set once: membership tests are then constant-time instead of scanning
    # the module-level `stop_words` list for every single token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

# Define functions for lemmatization
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    Parameters:
        texts: iterable of documents, each an iterable of token strings.
        allowed_postags: coarse-grained part-of-speech tags (spaCy `token.pos_`)
            to keep; any iterable of strings is accepted.

    Returns:
        A list with one list of lemmas per input document.
    """
    # The parser and NER components are disabled because only POS tags and lemmas are read
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    allowed = set(allowed_postags)

    # nlp.pipe processes the documents as a stream in batches, which is faster than
    # calling nlp() once per document
    docs = nlp.pipe(" ".join(sent) for sent in texts)
    return [[token.lemma_ for token in doc if token.pos_ in allowed] for doc in docs]

# Remove stop words from the tokenized tweets
data_words_nostops = remove_stopwords(data_words)

# Lemmatize the remaining tokens, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

Create the Dictionary and Corpus needed for Topic Modeling

In [None]:
# Build the gensim dictionary from the lemmatized documents
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are also kept under the name `texts`, which is what the
# coherence computation later receives
texts = data_lemmatized

# Bag-of-words corpus: one doc2bow() result per document
corpus = list(map(id2word.doc2bow, texts))

In [None]:
import gensim

In [None]:
# Use model perplexity and topic coherence to decide the number of topics.
model_list = []
perplexity = []
coherence_values = []

for num_topics in range(2,21,1):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                               id2word=id2word,
                               random_state=1,
                               num_topics=num_topics)
    model_list.append(lda_model)

    # log_perplexity() returns the per-word likelihood bound (a negative number), not the
    # perplexity itself; perplexity = 2 ** (-bound), so a bound closer to zero is better.
    perplexity_values = lda_model.log_perplexity(corpus)
    # Report the actual number of topics of this model
    print('Perplexity of %d topics is: ' % num_topics, perplexity_values)
    perplexity.append(perplexity_values)

    # Coherence is costly to compute, so it is evaluated once and the value is reused
    coherencemodel = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
    coherence_score = round(coherencemodel.get_coherence(), 3)
    coherence_values.append(coherence_score)
    print('The Coherence of %d topics is: ' % num_topics, coherence_score)


In [None]:
import matplotlib.pyplot as plt
# Perplexity and coherence against the number of topics, side by side
plt.figure(figsize=(16,5),dpi=200)
x = range(2,21,1)

# The figure is divided into 1 row and 2 columns; the left panel shows perplexity
ax1 = plt.subplot(1,2,1)
ax1.plot(x, perplexity)
ax1.set_xlabel("Num Topics")
ax1.set_ylabel("Perplexity score")
ax1.set_xticks(range(1,21,2))  # tick positions
ax1.set_title('Perplexity')
ax1.grid(True, alpha=0.5)

# The right panel shows coherence
ax2 = plt.subplot(1,2,2)
ax2.plot(x, coherence_values)
ax2.set_xlabel("Num Topics")
ax2.set_ylabel("Coherence score")
ax2.set_xticks(range(1,21,2))  # tick positions
ax2.set_title('Coherence')
ax2.grid(True, alpha=0.5)

plt.show()



In [None]:
from pprint import pprint

# set number of topics
num_topics = 12

# build an LDA model
# random_state is fixed to the same seed as in the model-selection loop so that the
# topics do not start from a different random initialisation on every run
# (NOTE(review): multi-core training may still not be bit-for-bit repeatable).
lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num_topics, random_state=1)

# print keywords in each topic
pprint(lda_model.print_topics())

# Apply the trained model to the corpus
doc_lda = lda_model[corpus]

In [None]:
# visualise the topics
!pip install pyldavis

In [None]:
import os
import pickle

import pyLDAvis

# pyLDAvis 3.3 renamed the `pyLDAvis.gensim` module to `pyLDAvis.gensim_models`.
# The package is installed without a version pin, so try the new name first and
# fall back to the old one for earlier releases.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()

# The prepared data is stored in a file named after the number of topics
LDAvis_data_filepath = os.path.join('./'+str(num_topics))

LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
with open(LDAvis_data_filepath, 'wb') as f:
    pickle.dump(LDAvis_prepared, f)

# NOTE(review): pickle.load can execute arbitrary code; only load files written by this notebook
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

# Also export the visualisation as a stand-alone HTML page
pyLDAvis.save_html(LDAvis_prepared, './'+ str(num_topics) +'.html')

# Last expression of the cell: displayed as the cell output
LDAvis_prepared

# Sentiment Analysis

This implementation is based on the lexicon- and rule-based [VADER](https://github.com/cjhutto/vaderSentiment) sentiment analysis tool.

In [None]:
!pip install vaderSentiment

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Print every tweet followed by a tab and its polarity scores
for tweet_text in unique_tweets:
    vs = analyzer.polarity_scores(tweet_text)
    print(f"{tweet_text}\t{vs}")

# Named Entity Recognition


This implementation is based on [spaCy's model](https://spacy.io/models/en#en_core_web_trf) using contextualised embeddings.

In [None]:
!pip install spacy-transformers
!python -m spacy download en_core_web_trf
import spacy
import en_core_web_trf

# Pipeline used by the NER cell below; the entity linker is later added to this same object
nlp = spacy.load('en_core_web_trf')


In [None]:
# Run the pipeline on every tweet and list each entity with its character
# offsets, its label and the matching slice of the tweet text
for tweet_text in unique_tweets:
    doc = nlp(tweet_text)
    print(tweet_text)
    for ne in doc.ents:
        print(
            '\tNE found: ',
            ne.start_char,
            ne.end_char,
            ne.label_,
            tweet_text[ne.start_char:ne.end_char],
        )


# Named Entity Linking

This implementation is based on [spaCy Entity Linker](https://github.com/egerber/spacy-entity-linker).

In [None]:
!pip install spacy-entity-linker
!python -m spacyEntityLinker "download_knowledge_base"

In [None]:
from spacyEntityLinker import EntityLinker
from spacy.language import Language

@Language.factory("entityLinker")
def create_linker(nlp, name):
    """Factory registered under the name "entityLinker"; returns a new EntityLinker.

    The `nlp` and `name` arguments are accepted but not used.
    """
    return EntityLinker()

# Add the linker to the pipeline. The check makes the cell safe to re-run:
# adding a component whose name is already in the pipeline raises an error.
if 'entityLinker' not in nlp.pipe_names:
    nlp.add_pipe('entityLinker')

In [None]:
# Run the pipeline (now including the entity linker) on every tweet and print
# the entities that were linked for it
for tweet_text in unique_tweets:
    doc = nlp(tweet_text)
    print(tweet_text)
    all_linked_entities = doc._.linkedEntities
    all_linked_entities.pretty_print()
