# Sentiment Analysis on Media Topics

In [1]:
import pandas as pd
from tqdm import tqdm


## 1. Importing Data

In [2]:
media = pd.read_csv('data/media_concat.csv')
media

Unnamed: 0,MatchDateTime,Station,Snippet
0,11/15/2017 13:26:03,BBCNEWS,that is thought to be a danger point. that wou...
1,11/4/2017 22:08:32,BBCNEWS,with. excess heat is killing people. so we are...
2,11/15/2017 21:47:38,BBCNEWS,to hold the world's temperature rise as close ...
3,11/4/2017 12:14:30,BBCNEWS,contains is not news in the sense that this is...
4,11/4/2017 23:12:06,BBCNEWS,administration's view on climate change. the s...
...,...,...,...
50063,8/6/2017 5:29:44,BBCNEWS,still meets its climate change targets. italia...
50064,8/6/2017 21:07:34,BBCNEWS,to cap energy prices during june's election ca...
50065,8/8/2017 20:21:34,BBCNEWS,climate change? in some ways it does but in so...
50066,8/10/2017 4:50:27,BBCNEWS,getting concerned and we don't know how it wil...


## 2. Cleaning Data

### 2.1. Changing Column Names

In [3]:
media.rename(columns = {'MatchDateTime': 'date', 'Station': 'station', 'Snippet': 'text'}, inplace = True)
media

Unnamed: 0,date,station,text
0,11/15/2017 13:26:03,BBCNEWS,that is thought to be a danger point. that wou...
1,11/4/2017 22:08:32,BBCNEWS,with. excess heat is killing people. so we are...
2,11/15/2017 21:47:38,BBCNEWS,to hold the world's temperature rise as close ...
3,11/4/2017 12:14:30,BBCNEWS,contains is not news in the sense that this is...
4,11/4/2017 23:12:06,BBCNEWS,administration's view on climate change. the s...
...,...,...,...
50063,8/6/2017 5:29:44,BBCNEWS,still meets its climate change targets. italia...
50064,8/6/2017 21:07:34,BBCNEWS,to cap energy prices during june's election ca...
50065,8/8/2017 20:21:34,BBCNEWS,climate change? in some ways it does but in so...
50066,8/10/2017 4:50:27,BBCNEWS,getting concerned and we don't know how it wil...


### 2.2. Converting Data Type

In [4]:
media['date'] = pd.to_datetime(media['date']).dt.date

In [5]:
media.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50068 entries, 0 to 50067
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     50068 non-null  object
 1   station  50068 non-null  object
 2   text     50068 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB


### 2.3. Cleaning Texts

In [6]:
import nltk
from nltk.corpus import stopwords
import string

# Download NLTK stopwords and punkt
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marcelzhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marcelzhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
import re

def text_light_clean(x):
    x = x.encode('ascii', 'ignore').decode()  # remove unicode characters
    x = re.sub(r'https*\S+', ' ', x) # remove links
    x = re.sub(r'http*\S+', ' ', x) 

    x = re.sub(r'\s{2,}', ' ', x) # Replace any sequence of two or more whitespace characters with a single space.
    x = re.sub(r'\s[^\w\s]\s', '', x) #  Remove any punctuation that is surrounded by whitespace characters.
    
    # deleting stop words    
#     stop_words = set(stopwords.words('english'))
#     x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    
    return x

In [8]:
media['clean_text'] = media.text.apply(text_light_clean)

In [9]:
media

Unnamed: 0,date,station,text,clean_text
0,2017-11-15,BBCNEWS,that is thought to be a danger point. that wou...,that is thought to be a danger point. that wou...
1,2017-11-04,BBCNEWS,with. excess heat is killing people. so we are...,with. excess heat is killing people. so we are...
2,2017-11-15,BBCNEWS,to hold the world's temperature rise as close ...,to hold the world's temperature rise as close ...
3,2017-11-04,BBCNEWS,contains is not news in the sense that this is...,contains is not news in the sense that this is...
4,2017-11-04,BBCNEWS,administration's view on climate change. the s...,administration's view on climate change. the s...
...,...,...,...,...
50063,2017-08-06,BBCNEWS,still meets its climate change targets. italia...,still meets its climate change targets. italia...
50064,2017-08-06,BBCNEWS,to cap energy prices during june's election ca...,to cap energy prices during june's election ca...
50065,2017-08-08,BBCNEWS,climate change? in some ways it does but in so...,climate change? in some ways it does but in so...
50066,2017-08-10,BBCNEWS,getting concerned and we don't know how it wil...,getting concerned and we don't know how it wil...


## 3. Classifying Topic

In [10]:
# from bertopic import BERTopic
# import time

# start_time = time.time()

# doc = media.clean_text.tolist()

# topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")

# topics, probs = topic_model.fit_transform(doc)

# end_time = time.time()
# time_taken_seconds = end_time - start_time
# time_taken_minutes = time_taken_seconds / 60
# print(f"Time taken: {time_taken_minutes:.2f} minutes")

In [11]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import time

start_time = time.time()

# we add this to remove stopwords, for lower volumes of data stopwords can cause issues
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

doc = media.clean_text.tolist()

topic_model = BERTopic(
    min_topic_size=30,
    vectorizer_model=vectorizer_model,
    language='english', calculate_probabilities=True,
    low_memory=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(doc)

end_time = time.time()
time_taken_seconds = end_time - start_time
time_taken_minutes = time_taken_seconds / 60
print(f"Time taken: {time_taken_minutes:.2f} minutes")

Batches:   0%|          | 0/1565 [00:00<?, ?it/s]

2023-03-07 23:16:49,714 - BERTopic - Transformed documents to Embeddings
2023-03-07 23:17:33,308 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2023-03-07 23:18:46,287 - BERTopic - Clustered reduced embeddings


Time taken: 12.59 minutes


In [30]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,26393,-1_climate_change_climate change_global
1,0,877,0_shes_nancy_pelosi_nancy pelosi
2,1,856,1_australia_morrison_scott morrison_australias
3,2,772,2_children_school_students_kids
4,3,689,3_fires_wildfires_california_forest
...,...,...,...
166,165,31,165_protect_emissions_protect world_liability
167,166,30,166_theresa_right characterised_characterised_...
168,167,30,167_agreement implement_poland_reached agreeme...
169,168,30,168_reporters new_times melting_reporters_york...


In [12]:
# min_topic_size=30
topic_model.get_topic_info().head(30)

Unnamed: 0,Topic,Count,Name
0,-1,26393,-1_climate_change_climate change_global
1,0,877,0_shes_nancy_pelosi_nancy pelosi
2,1,856,1_australia_morrison_scott morrison_australias
3,2,772,2_children_school_students_kids
4,3,689,3_fires_wildfires_california_forest
5,4,664,4_paris_agreement_paris climate_accord
6,5,590,5_hes_doesnt_think_said
7,6,561,6_greta_thunberg_greta thunberg_activist
8,7,538,7_weather_record_temperatures_heatwave
9,8,531,8_hurricanes_storms_hurricane_storm


In [31]:
import pickle

# save the model
with open('model/topic_model_media.pkl', 'wb') as f:
    pickle.dump(topic_model, f)

In [14]:
# topic_model.reduce_topics(doc, nr_topics="auto")

In [15]:
# assign topic labels to every text
media['topic'] = topics

# filter out the outliers
media = media[media['topic'] != -1]

## 4. Analyzing Sentiment

### 4.1. Getting Sentiment Scores for Texts

In [16]:
## Using SentimentIntensityAnalyzer

# import nltk
# nltk.download('vader_lexicon')
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# import pandas as pd

# # Instantiate the sentiment analyzer
# sia = SentimentIntensityAnalyzer()

# # Define a function to apply the sentiment analyzer to a given text
# def get_sentiment_scores(text):
#     return sia.polarity_scores(text)

In [17]:
## Using roberta
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Define a function to apply the sentiment analyzer to a given text
def get_sentiment_scores(text):
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    # Define weights for negative, neutral, and positive sentiments
    weights = np.array([-1, 0, 1])

    # Compute the compound sentiment score
    compound_score = np.dot(scores, weights)
    
    return compound_score

In [19]:
# Test on a given text
text = "Donald Trump better pray there is no global warming, because he's one easily melted precious snowflake."
get_sentiment_scores(text)

-0.859273349866271

In [20]:
start_time = time.time()

# Apply the sentiment analyzer to each text
media["sentiment"] = media["text"].apply(get_sentiment_scores)

end_time = time.time()
time_taken_seconds = end_time - start_time
time_taken_minutes = time_taken_seconds / 60
print(f"Time taken: {time_taken_minutes:.2f} minutes")

Time taken: 45.66 minutes


In [21]:
media

Unnamed: 0,date,station,text,clean_text,topic,sentiment
8,2017-11-14,BBCNEWS,for countries in the path of devastating hurri...,for countries in the path of devastating hurri...,8,-0.877475
10,2017-11-04,BBCNEWS,"a pretty hefty report, what stands out in it t...","a pretty hefty report, what stands out in it t...",101,0.530395
11,2017-11-13,BBCNEWS,we need to minimise the effect of climate chan...,we need to minimise the effect of climate chan...,89,-0.276567
17,2017-11-07,BBCNEWS,the late hugh van cutsem. he was one of the pr...,the late hugh van cutsem. he was one of the pr...,48,0.121501
20,2017-11-07,BBCNEWS,as we can in that carbon or greenhouse gas emi...,as we can in that carbon or greenhouse gas emi...,26,-0.578437
...,...,...,...,...,...,...
50058,2017-08-20,BBCNEWS,"and i'm thinking, are people going to pay what...","and i'm thinking, are people going to pay what...",45,-0.233798
50059,2017-08-20,BBCNEWS,"the cause itself needed a much betterfilm, and...","the cause itself needed a much betterfilm, and...",45,-0.811317
50060,2017-08-19,BBCNEWS,change was happening. it seemed unnecessary to...,change was happening. it seemed unnecessary to...,55,-0.653779
50061,2017-08-20,BBCNEWS,"i mean, if they did, it's supposed to be a cur...","i mean, if they did, it's supposed to be a cur...",45,-0.132332


In [22]:
media.to_csv('data/media_text_sentiment.csv', index=False)

### 4.2. Getting Sentiment Scores for Topics

In [23]:
# Calculate the average sentiment per topic
topic_sen = media.groupby('topic')['sentiment'].mean().round(2).reset_index()
topic_sen.columns = ['Topic', 'Avg_Sentiment']

# Merge the topic counts and average sentiment dataframes
topic_info_sen = pd.merge(topic_model.get_topic_info(), topic_sen, on='Topic')

# Display the resulting dataframe
topic_info_sen

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
0,0,877,0_shes_nancy_pelosi_nancy pelosi,-0.16
1,1,856,1_australia_morrison_scott morrison_australias,-0.36
2,2,772,2_children_school_students_kids,-0.21
3,3,689,3_fires_wildfires_california_forest,-0.52
4,4,664,4_paris_agreement_paris climate_accord,-0.23
...,...,...,...,...
165,165,31,165_protect_emissions_protect world_liability,-0.21
166,166,30,166_theresa_right characterised_characterised_...,-0.28
167,167,30,167_agreement implement_poland_reached agreeme...,-0.16
168,168,30,168_reporters new_times melting_reporters_york...,-0.24


In [24]:
topic_info_sen

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
0,0,877,0_shes_nancy_pelosi_nancy pelosi,-0.16
1,1,856,1_australia_morrison_scott morrison_australias,-0.36
2,2,772,2_children_school_students_kids,-0.21
3,3,689,3_fires_wildfires_california_forest,-0.52
4,4,664,4_paris_agreement_paris climate_accord,-0.23
...,...,...,...,...
165,165,31,165_protect_emissions_protect world_liability,-0.21
166,166,30,166_theresa_right characterised_characterised_...,-0.28
167,167,30,167_agreement implement_poland_reached agreeme...,-0.16
168,168,30,168_reporters new_times melting_reporters_york...,-0.24


In [25]:
topic_info_sen.to_csv('data/media_topic_sentiment.csv', index=False)

In [26]:
topic_50 = topic_info_sen.sort_values(by='Count', ascending=False).head(50)

In [27]:
topic_50.describe()

Unnamed: 0,Topic,Count,Avg_Sentiment
count,50.0,50.0,50.0
mean,24.5,324.0,-0.196
std,14.57738,190.182082,0.15792
min,0.0,148.0,-0.53
25%,12.25,189.25,-0.3
50%,24.5,259.0,-0.205
75%,36.75,384.25,-0.1025
max,49.0,877.0,0.16


In [28]:
topic_50.sort_values(by='Avg_Sentiment', ascending=False).head(10)

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
27,27,233,27_inslee_jay_jay inslee_governor,0.16
22,22,267,22_zero_2050_uk_zero 2050,0.14
43,43,173,43_bbc_bbc news_news_programme,0.11
6,6,561,6_greta_thunberg_greta thunberg_activist,0.05
48,48,152,48_prince_prince charles_charles_duchess,0.01
23,23,266,23_macron_emmanuel_emmanuel macron_french,0.01
28,28,227,28_trees_tree_forest_planting,-0.0
39,39,182,39_pope_francis_pope francis_vatican,-0.01
30,30,223,30_china_chinese_chinas_said china,-0.02
25,25,258,25_biden_joe_joe biden_vice,-0.04


In [29]:
topic_50.sort_values(by='Avg_Sentiment', ascending=True).head(10)

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
18,18,288,18_london_arrested_police_rebellion,-0.53
3,3,689,3_fires_wildfires_california_forest,-0.52
13,13,379,13_hoax_change hoax_does president_believe,-0.43
42,42,177,42_cold_tweet_happened global_global warming,-0.41
10,10,435,10_existential_existential threat_threat_crisis,-0.38
35,35,191,35_melting pot_species going_pot impacted_impa...,-0.37
1,1,856,1_australia_morrison_scott morrison_australias,-0.36
21,21,283,21_amazon_brazil_bolsonaro_brazils,-0.36
8,8,531,8_hurricanes_storms_hurricane_storm,-0.35
38,38,185,38_sea_sea levels_rise_rising,-0.32
