# Sentiment analysis

**lexicon based**

In [43]:
#We should do careful preprocessing because we use a lexicon-based approach!
def sentimentPreprocessing(text):
    """Normalise a raw tweet so its words can be matched against a sentiment lexicon.

    The text goes through contraction expansion, cleaning with ``clean`` (the
    cleantext module), tokenisation, POS-aware WordNet lemmatisation and removal
    of the scraped search term.

    Stopwords are deliberately kept (dropping them would turn 'not good' into
    'good'), and one-character tokens are kept because removing them would also
    remove emojis.

    Args:
        text: Raw tweet text.

    Returns:
        str: The remaining lemmas joined by single spaces.

    Note:
        Reads the module-level name ``query`` (the term the tweets were scraped
        on), which is defined outside this function.
    """
    # Bring contractions back to their full form.
    expanded = contractions.fix(text)

    # Strip URLs, e-mails, numbers, currency symbols and punctuation; every
    # replacement is the empty string so no placeholder tokens are introduced.
    cleaned = clean(
        expanded,
        fix_unicode=True,
        to_ascii=True,
        lower=True,
        normalize_whitespace=True,
        no_line_breaks=True,
        strip_lines=True,
        keep_two_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_numbers=True,
        no_digits=True,
        no_currency_symbols=True,
        no_punct=True,
        no_emoji=False,  # emojis can be an indication for sentiment
        replace_with_url='',
        replace_with_email='',
        replace_with_phone_number='',
        replace_with_number='',
        replace_with_digit='',
        replace_with_currency_symbol='',
        replace_with_punct='',
    )

    # Lemmatise rather than stem: lemmas are valid words, which is what the
    # lexicon lookup needs. The first letter of the Penn-Treebank tag picks the
    # WordNet POS; any other tag falls back to noun.
    wordnet_pos_by_initial = {"N": "n", "V": "v", "J": "a", "R": "r"}
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, penn_tag in pos_tag(word_tokenize(cleaned)):
        wordnet_pos = wordnet_pos_by_initial.get(penn_tag[0], "n")
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))

    # Only the scraped search term is dropped.
    kept = [lemma for lemma in lemmas if lemma != query]
    return ' '.join(kept)

In [44]:
# Preprocess every tweet for the lexicon-based sentiment step and store the result in
# 'sentimenttext'. progress_apply presumably comes from tqdm's pandas integration
# registered elsewhere in the notebook - confirm.
tweets_df['sentimenttext'] = tweets_df["text"].progress_apply(sentimentPreprocessing)

100%|█████████████████████████████████████████████████████████████████████████████| 1026/1026 [00:04<00:00, 211.53it/s]


In [66]:
# Add emoji scores to the VADER lexicon (original author's note: VADER does not
# include sentiment scores for emojis). The scores are derived from the Emoji
# Sentiment Ranking, which is based on the analysis of 70000 tweets:
#   P. Kralj Novak, J. Smailovic, B. Sluban, I. Mozetic,
#   Sentiment of Emojis, PLoS ONE 10(12): e0144296, doi:10.1371/journal.pone.0144296, 2015.
#   https://kt.ijs.si/data/Emoji_sentiment_ranking/
sentimentemoji = pd.read_csv("Emoji_Sentiment_Data_v1.0.csv")

# Raw score per emoji: (positive - negative) / occurrences. See the link above
# for the calculation and extra info.
sentimentemoji["Sentiment"] = (
    sentimentemoji["Positive"] - sentimentemoji["Negative"]
) / sentimentemoji["Occurrences"]

# Rescale the raw scores into the range [-3, 3] so they are comparable with the
# VADER lexicon values.
# NOTE(review): min-max scaling maps the smallest observed score to -3 and the
# largest to 3, so a raw score of 0 does not generally stay at 0 - confirm that
# this shift of the neutral point is intended.
column = sentimentemoji['Sentiment']
scaler = MinMaxScaler(feature_range=(-3, 3))
sentimentemoji['Sentiment'] = scaler.fit_transform(column.values.reshape(-1, 1))

# Only the emoji and its scaled score are needed from here on.
sentimentemoji = sentimentemoji[["Emoji", "Sentiment"]]

# Register every emoji in the analyzer's lexicon; an existing entry for the
# same key is overwritten.
emojis = list(sentimentemoji["Emoji"])
sentiments = list(sentimentemoji["Sentiment"])
analyzer = SentimentIntensityAnalyzer()
analyzer.lexicon.update(zip(emojis, sentiments))

In [67]:
def get_sentiment(text):
    """Return the 'compound' polarity score of ``text``.

    Uses the module-level ``analyzer``, whose lexicon has been extended with
    emoji scores elsewhere in the notebook.

    Args:
        text: Preprocessed tweet text.

    Returns:
        The value stored under the 'compound' key of ``analyzer.polarity_scores(text)``.
    """
    return analyzer.polarity_scores(text)['compound']

In [68]:
# Score every preprocessed tweet with get_sentiment and store the compound score
# in the 'lsentiment' column.
tweets_df['lsentiment'] = tweets_df['sentimenttext'].progress_apply(get_sentiment)

100%|████████████████████████████████████████████████████████████████████████████| 1026/1026 [00:00<00:00, 6247.27it/s]


# Topic modelling

**latent dirichlet allocation (LDA)**

In [89]:
def ldaPreprocessing(text):
    """Turn a raw tweet into a list of lemmas suitable for LDA topic modelling.

    The text goes through contraction expansion, cleaning with ``clean`` (the
    cleantext module; emojis, URLs, e-mails, numbers, currency symbols and
    punctuation are removed), tokenisation and POS-aware WordNet lemmatisation.
    English stopwords, a fixed list of very common verbs, the scraped search
    term and one-character tokens are then dropped.

    Args:
        text: Raw tweet text.

    Returns:
        list: The remaining lemmas, in their original order.

    Note:
        Reads the module-level name ``query`` (the hashtag the tweets were
        scraped on), which is defined outside this function.
    """
    # Bring contractions back to their full form.
    text = contractions.fix(text)

    # Remove everything that seems irrelevant for LDA. Punctuation is removed as
    # well because it only has little impact on the POS tagger's performance.
    text = clean(
        text,
        no_emoji=True,
        no_urls=True,
        no_emails=True,
        no_numbers=True,
        no_digits=True,
        no_currency_symbols=True,
        no_punct=True,
        fix_unicode=True,
        to_ascii=True,
        lower=True,
        normalize_whitespace=True,
        no_line_breaks=True,
        strip_lines=True,
        keep_two_line_breaks=True,
        replace_with_url='',
        replace_with_email='',
        replace_with_phone_number='',
        replace_with_number='',
        replace_with_digit='',
        replace_with_currency_symbol='',
        replace_with_punct='',
    )

    # Lemmatise (better than stemming because it uses the POS tag): a word is
    # assumed to appear within a certain context regardless of how it is inflected.
    # The first letter of the Penn-Treebank tag picks the WordNet POS; any other
    # tag falls back to noun.
    pos_map = {"N": "n", "V": "v", "J": "a", "R": "r"}
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(token, pos_map.get(pos[0], "n"))
        for token, pos in pos_tag(word_tokenize(text))
    ]

    # Build the exclusion set once per call. The original evaluated
    # stopwords.words("english") again for every token and searched lists.
    commonverbs = ["be", "have", "do", "say", "get", "make", "go", "know", "take", "see",
                   "come", "think", "look", "want", "give", "use", "find", "tell", "ask",
                   "work", "seem", "feel", "try", "leave", "call"]
    excluded = set(stopwords.words("english")).union(commonverbs)

    # Drop stopwords, common verbs, the hashtag we scraped on and one-character words.
    return [
        word for word in lemmas
        if len(word) > 1 and word != query and word not in excluded
    ]

In [None]:
# Preprocess every tweet for topic modelling; each cell of 'ldatext' holds the
# token list returned by ldaPreprocessing.
tweets_df['ldatext'] = tweets_df['text'].progress_apply(ldaPreprocessing)

In [78]:
# Learn frequent word pairs from the tokenised tweets (min_count=10) and append
# every detected pair to its document as an extra token. Only one Phrases pass
# is run, so the added tokens are bigrams.
bigram = Phrases(tweets_df["ldatext"], min_count=10)
for tokens in tweets_df["ldatext"]:
    # A token containing '_' is treated as a merged pair; the list stored in the
    # DataFrame is extended in place.
    tokens.extend([token for token in bigram[tokens] if '_' in token])

In [83]:
# LDA settings
num_topics = 8   # number of topics to fit
no_below = 10    # passed to filter_extremes: drop tokens found in fewer than this many tweets
no_above = 0.4   # passed to filter_extremes: drop tokens found in more than this fraction of tweets
no_iter = 10     # passed to LdaModel as `iterations`
no_passes = 10   # passed to LdaModel as `passes`

# Build the token <-> id dictionary from the preprocessed tweets.
dictionary = corpora.Dictionary(tweets_df['ldatext'])

# Prune tokens that are too rare or too common, using the thresholds above.
dictionary.filter_extremes(no_below=no_below, no_above=no_above)

# Bag-of-words representation of every tweet.
bow_corpus = list(map(dictionary.doc2bow, tweets_df['ldatext']))

# Fit the LDA model on the bag-of-words corpus.
ldamodel = models.LdaModel(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=no_passes,
    iterations=no_iter,
)

In [84]:
# Switch pyLDAvis to notebook mode before preparing the visualisation.
pyLDAvis.enable_notebook()
# Kept as the cell's last bare expression so the notebook displays the prepared
# visualisation of the fitted model.
pyLDAvis.gensim_models.prepare(ldamodel, bow_corpus, dictionary)

  default_term_info = default_term_info.sort_values(


In [82]:
# Print the top 10 words of every fitted topic, numbering topics from 1.
for topic_id in range(ldamodel.num_topics):
    print(f"Topic #{topic_id+1}:")
    # show_topic yields pairs whose first element is the word; only the word is shown.
    top_terms = [term for term, _ in ldamodel.show_topic(topic_id, topn=10)]
    print(top_terms)

Topic #1:
['organic', 'snack', 'glutenfree', 'nut', 'dairyfree', 'superfood', 'nutfree', 'tiger', 'nongmo', 'nonallergenic', 'tiger_nut', 'healthyfood', 'dairyfree_nongmo', 'nutfree_glutenfree', 'healthy', 'superfood_healthyfood', 'healthysnacks', 'healthysnacks_organic', 'tigernutssnacks_snack', 'tigernutssnacks', 'breakfast', 'chicken', 'meal', 'healthydiet', 'keto', 'antioxidant', 'plantbased', 'hydration', 'nonallergenic_healthydiet', 'antioxidant_organic']
Topic #2:
['recipe', 'glutenfree', 'via', 'de', 'whole', 'dairyfree', 'keto', 'vegan', 'healthy', 'easy', 'la', 'aipdiet', 'autoimmunepaleo', 'autoimmunepaleo_paleo', 'dinosaur', 'lowfodmap', 'le', 'aipfood', 'aipdiet_aipfood', 'pin', 'recipe_pin', 'aipprotocol', 'food', 'protein', 'salad', 'lowcarb', 'free', 'aipprotocol_autoimmunehealth', 'autoimmunehealth', 'paleodiet']
Topic #3:
['diet', 'keto', 'eat', 'health', 'lowcarb', 'food', 'nutrition', 'vegan', 'healthyeating', 'healthy', 'protein', 'carnivore', 'new', 'low', 'weight

In [145]:
# Infer a dense topic distribution for each tweet.
# The model returns (topic_id, probability) pairs and may leave out topics with
# a very small probability, so a pair's position in the list is not guaranteed
# to equal its topic id. Each probability is therefore written into the slot of
# its own topic id; topics that are not reported keep 0.0. This keeps every row
# exactly num_topics long and every value in the correct topic_<n> column.
topic_distributions = []
for bow in bow_corpus:
    row = [0.0] * num_topics
    for topic_idx, topic_prob in ldamodel.get_document_topics(bow, minimum_probability=0.0):
        row[topic_idx] = float(topic_prob)
    topic_distributions.append(row)

df_topic_prob = pd.DataFrame(topic_distributions, columns=[f'topic_{i+1}' for i in range(num_topics)])

# Reset to a 0..n-1 index so the column-wise concat lines up row by row with
# df_topic_prob, which has a default index.
tweets_df = tweets_df.reset_index(drop=True)
tweets_df = pd.concat([tweets_df, df_topic_prob], axis=1)