# Sentiment Analysis on Media Topics

In [1]:
import pandas as pd
from tqdm import tqdm


## 1. Importing Data

In [2]:
media = pd.read_csv('data/media_concat.csv')
media

Unnamed: 0,MatchDateTime,Station,Snippet
0,11/15/2017 13:26:03,BBCNEWS,that is thought to be a danger point. that wou...
1,11/4/2017 22:08:32,BBCNEWS,with. excess heat is killing people. so we are...
2,11/15/2017 21:47:38,BBCNEWS,to hold the world's temperature rise as close ...
3,11/4/2017 12:14:30,BBCNEWS,contains is not news in the sense that this is...
4,11/4/2017 23:12:06,BBCNEWS,administration's view on climate change. the s...
...,...,...,...
50063,8/6/2017 5:29:44,BBCNEWS,still meets its climate change targets. italia...
50064,8/6/2017 21:07:34,BBCNEWS,to cap energy prices during june's election ca...
50065,8/8/2017 20:21:34,BBCNEWS,climate change? in some ways it does but in so...
50066,8/10/2017 4:50:27,BBCNEWS,getting concerned and we don't know how it wil...


## 2. Cleaning Data

### 2.1. Changing Column Names

In [3]:
media.rename(columns = {'MatchDateTime': 'date', 'Station': 'station', 'Snippet': 'text'}, inplace = True)
media

Unnamed: 0,date,station,text
0,11/15/2017 13:26:03,BBCNEWS,that is thought to be a danger point. that wou...
1,11/4/2017 22:08:32,BBCNEWS,with. excess heat is killing people. so we are...
2,11/15/2017 21:47:38,BBCNEWS,to hold the world's temperature rise as close ...
3,11/4/2017 12:14:30,BBCNEWS,contains is not news in the sense that this is...
4,11/4/2017 23:12:06,BBCNEWS,administration's view on climate change. the s...
...,...,...,...
50063,8/6/2017 5:29:44,BBCNEWS,still meets its climate change targets. italia...
50064,8/6/2017 21:07:34,BBCNEWS,to cap energy prices during june's election ca...
50065,8/8/2017 20:21:34,BBCNEWS,climate change? in some ways it does but in so...
50066,8/10/2017 4:50:27,BBCNEWS,getting concerned and we don't know how it wil...


### 2.2. Converting Data Type

In [4]:
media['date'] = pd.to_datetime(media['date']).dt.date

In [5]:
media.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50068 entries, 0 to 50067
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     50068 non-null  object
 1   station  50068 non-null  object
 2   text     50068 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB


### 2.3. Cleaning Texts

In [6]:
import nltk
from nltk.corpus import stopwords
import string

# Download NLTK stopwords and punkt
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marcelzhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marcelzhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
import re

def text_light_clean(x):
    x = x.encode('ascii', 'ignore').decode()  # remove unicode characters
    x = re.sub(r'https*\S+', ' ', x) # remove links
    x = re.sub(r'http*\S+', ' ', x) 

    x = re.sub(r'\s{2,}', ' ', x) # Replace any sequence of two or more whitespace characters with a single space.
    x = re.sub(r'\s[^\w\s]\s', '', x) #  Remove any punctuation that is surrounded by whitespace characters.
    
    # deleting stop words    
    stop_words = set(stopwords.words('english'))
    x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    
    return x

In [8]:
media['clean_text'] = media.text.apply(text_light_clean)

In [9]:
media

Unnamed: 0,date,station,text,clean_text
0,2017-11-15,BBCNEWS,that is thought to be a danger point. that wou...,thought danger point. would mean steep cuts gr...
1,2017-11-04,BBCNEWS,with. excess heat is killing people. so we are...,with. excess heat killing people. saying assau...
2,2017-11-15,BBCNEWS,to hold the world's temperature rise as close ...,hold world's temperature rise close possible 1...
3,2017-11-04,BBCNEWS,contains is not news in the sense that this is...,contains news sense consensus view climate sci...
4,2017-11-04,BBCNEWS,administration's view on climate change. the s...,"administration's view climate change. study, c..."
...,...,...,...,...
50063,2017-08-06,BBCNEWS,still meets its climate change targets. italia...,still meets climate change targets. italian po...
50064,2017-08-06,BBCNEWS,to cap energy prices during june's election ca...,"cap energy prices june's election campaign, sh..."
50065,2017-08-08,BBCNEWS,climate change? in some ways it does but in so...,climate change? ways ways redouble efforts pol...
50066,2017-08-10,BBCNEWS,getting concerned and we don't know how it wil...,getting concerned know play impact have. japan...


## 3. Classifying Topic

In [10]:
from bertopic import BERTopic
import time

start_time = time.time()

doc = media.clean_text.tolist()

topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")

topics, probs = topic_model.fit_transform(doc)

end_time = time.time()
time_taken_seconds = end_time - start_time
time_taken_minutes = time_taken_seconds / 60
print(f"Time taken: {time_taken_minutes:.2f} minutes")

topic_model.get_topic_info()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Unnamed: 0,Topic,Count,Name
0,-1,24364,-1_climate_president_change_think
1,0,1222,0_protests_london_rebellion_arrested
2,1,918,1_australia_morrison_bushfires_australian
3,2,443,2_meat_eat_dairy_beef
4,3,406,3_wildfires_california_fires_fire
...,...,...,...
531,530,10,530_publication_rises_drastic_boosters
532,531,10,531_clock_midnight_nuclear_doomsday
533,532,10,532_did_commit_estimating_196
534,533,10,533_pioneering_collected_zaha_overhead


In [11]:
import pickle

# save the model
with open('model/topic_model_media.pkl', 'wb') as f:
    pickle.dump(topic_model, f)

  self._set_arrayXarray(i, j, x)


In [12]:
# topic_model.reduce_topics(doc, nr_topics="auto")

In [13]:
# assign topic labels to every text
media['topic'] = topics

# filter out the outliers
media = media[media['topic'] != -1]

## 4. Analyzing Sentiment

### 4.1. Getting Sentiment Scores for Texts

In [14]:
## Using SentimentIntensityAnalyzer

# import nltk
# nltk.download('vader_lexicon')
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# import pandas as pd

# # Instantiate the sentiment analyzer
# sia = SentimentIntensityAnalyzer()

# # Define a function to apply the sentiment analyzer to a given text
# def get_sentiment_scores(text):
#     return sia.polarity_scores(text)

In [15]:
## Using roberta
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Define a function to apply the sentiment analyzer to a given text
def get_sentiment_scores(text):
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    # Define weights for negative, neutral, and positive sentiments
    weights = np.array([-1, 0, 1])

    # Compute the compound sentiment score
    compound_score = np.dot(scores, weights)
    
    return compound_score

In [17]:
# Test on a given text
text = "Donald Trump better pray there is no global warming, because he's one easily melted precious snowflake."
get_sentiment_scores(text)

-0.859273349866271

In [18]:
start_time = time.time()

# Apply the sentiment analyzer to each text
media["sentiment"] = media["text"].apply(get_sentiment_scores)

end_time = time.time()
time_taken_seconds = end_time - start_time
time_taken_minutes = time_taken_seconds / 60
print(f"Time taken: {time_taken_minutes:.2f} minutes")

Time taken: 50.61 minutes


In [26]:
media

Unnamed: 0,date,station,text,clean_text,topic,sentiment
0,2017-11-15,BBCNEWS,that is thought to be a danger point. that wou...,thought danger point. would mean steep cuts gr...,12,-0.762931
2,2017-11-15,BBCNEWS,to hold the world's temperature rise as close ...,hold world's temperature rise close possible 1...,12,-0.590085
4,2017-11-04,BBCNEWS,administration's view on climate change. the s...,"administration's view climate change. study, c...",264,-0.080544
5,2017-11-04,BBCNEWS,white house has attempted to downplay the find...,white house attempted downplay findings report...,264,-0.554912
6,2017-11-04,BBCNEWS,the white house has attempted to downplay the ...,white house attempted downplay findings report...,264,-0.571178
...,...,...,...,...,...,...
50062,2017-08-06,BBCNEWS,"low, while ensuring the uk meets climate chang...","low, ensuring uk meets climate change targets....",184,0.270815
50063,2017-08-06,BBCNEWS,still meets its climate change targets. italia...,still meets climate change targets. italian po...,0,-0.824522
50064,2017-08-06,BBCNEWS,to cap energy prices during june's election ca...,"cap energy prices june's election campaign, sh...",184,-0.079860
50065,2017-08-08,BBCNEWS,climate change? in some ways it does but in so...,climate change? ways ways redouble efforts pol...,335,-0.451891


### 4.2. Getting Sentiment Scores for Topics

In [19]:
# Calculate the average sentiment per topic
topic_sen = media.groupby('topic')['sentiment'].mean().round(2).reset_index()
topic_sen.columns = ['Topic', 'Avg_Sentiment']

# Merge the topic counts and average sentiment dataframes
topic_info_sen = pd.merge(topic_model.get_topic_info(), topic_sen, on='Topic')

# Display the resulting dataframe
topic_info_sen

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
0,0,1222,0_protests_london_rebellion_arrested,-0.30
1,1,918,1_australia_morrison_bushfires_australian,-0.36
2,2,443,2_meat_eat_dairy_beef,-0.22
3,3,406,3_wildfires_california_fires_fire,-0.53
4,4,378,4_republicans_democrats_republican_party,-0.27
...,...,...,...,...
530,530,10,530_publication_rises_drastic_boosters,-0.67
531,531,10,531_clock_midnight_nuclear_doomsday,-0.45
532,532,10,532_did_commit_estimating_196,-0.12
533,533,10,533_pioneering_collected_zaha_overhead,0.13


In [20]:
topic_info_sen

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
0,0,1222,0_protests_london_rebellion_arrested,-0.30
1,1,918,1_australia_morrison_bushfires_australian,-0.36
2,2,443,2_meat_eat_dairy_beef,-0.22
3,3,406,3_wildfires_california_fires_fire,-0.53
4,4,378,4_republicans_democrats_republican_party,-0.27
...,...,...,...,...
530,530,10,530_publication_rises_drastic_boosters,-0.67
531,531,10,531_clock_midnight_nuclear_doomsday,-0.45
532,532,10,532_did_commit_estimating_196,-0.12
533,533,10,533_pioneering_collected_zaha_overhead,0.13


In [21]:
topic_info_sen.to_csv('data/media_topic_sentiment.csv', index=False)

In [22]:
topic_50 = topic_info_sen.sort_values(by='Count', ascending=False).head(50)

In [23]:
topic_50.describe()

Unnamed: 0,Topic,Count,Avg_Sentiment
count,50.0,50.0,50.0
mean,24.5,237.7,-0.1934
std,14.57738,189.682899,0.181351
min,0.0,121.0,-0.53
25%,12.25,154.75,-0.3275
50%,24.5,188.0,-0.215
75%,36.75,223.5,-0.06
max,49.0,1222.0,0.37


In [24]:
topic_50.sort_values(by='Avg_Sentiment', ascending=False).head(10)

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
27,27,176,27_keeps_potential_growing_algae,0.37
8,8,300,8_zero_2050_net_target,0.17
10,10,259,10_inslee_jay_governor_candidate,0.15
48,48,127,48_macron_emmanuel_french_france,0.05
41,41,136,41_bbc_news_programme_striving,0.04
38,38,150,38_prince_charles_duchess_derided,0.01
13,13,222,13_trees_tree_planting_forests,-0.01
29,29,171,29_pope_francis_vatican_executives,-0.02
9,9,285,9_poland_conference_talks_200,-0.02
25,25,188,25_greta_thunberg_activist_teenage,-0.03


In [25]:
topic_50.sort_values(by='Avg_Sentiment', ascending=True).head(10)

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
3,3,406,3_wildfires_california_fires_fire,-0.53
39,39,146,39_chinese_hoax_perpetrated_created,-0.5
28,28,175,28_hoax_believes_believe_answer,-0.48
15,15,216,15_cold_tweet_coldest_tweeted,-0.41
35,35,158,35_brazil_bolsonaro_amazon_jair,-0.4
21,21,196,21_children_kids_child_grandchildren,-0.39
1,1,918,1_australia_morrison_bushfires_australian,-0.36
26,26,181,26_science_sides_believe_proves,-0.34
49,49,121,49_coal_plants_serbia_mining,-0.33
46,46,129,46_emergency_national_wall_border,-0.33
