# Sentiment Analysis on Media Topics

In [1]:
import pandas as pd
from tqdm import tqdm


## 1. Importing Data

In [2]:
media = pd.read_csv('data/media_concat.csv')
media

Unnamed: 0,MatchDateTime,Station,Snippet
0,11/15/2017 13:26:03,BBCNEWS,that is thought to be a danger point. that wou...
1,11/4/2017 22:08:32,BBCNEWS,with. excess heat is killing people. so we are...
2,11/15/2017 21:47:38,BBCNEWS,to hold the world's temperature rise as close ...
3,11/4/2017 12:14:30,BBCNEWS,contains is not news in the sense that this is...
4,11/4/2017 23:12:06,BBCNEWS,administration's view on climate change. the s...
...,...,...,...
50063,8/6/2017 5:29:44,BBCNEWS,still meets its climate change targets. italia...
50064,8/6/2017 21:07:34,BBCNEWS,to cap energy prices during june's election ca...
50065,8/8/2017 20:21:34,BBCNEWS,climate change? in some ways it does but in so...
50066,8/10/2017 4:50:27,BBCNEWS,getting concerned and we don't know how it wil...


## 2. Cleaning Data

### 2.1. Changing Column Names

In [3]:
media.rename(columns = {'MatchDateTime': 'date', 'Station': 'station', 'Snippet': 'text'}, inplace = True)
media

Unnamed: 0,date,station,text
0,11/15/2017 13:26:03,BBCNEWS,that is thought to be a danger point. that wou...
1,11/4/2017 22:08:32,BBCNEWS,with. excess heat is killing people. so we are...
2,11/15/2017 21:47:38,BBCNEWS,to hold the world's temperature rise as close ...
3,11/4/2017 12:14:30,BBCNEWS,contains is not news in the sense that this is...
4,11/4/2017 23:12:06,BBCNEWS,administration's view on climate change. the s...
...,...,...,...
50063,8/6/2017 5:29:44,BBCNEWS,still meets its climate change targets. italia...
50064,8/6/2017 21:07:34,BBCNEWS,to cap energy prices during june's election ca...
50065,8/8/2017 20:21:34,BBCNEWS,climate change? in some ways it does but in so...
50066,8/10/2017 4:50:27,BBCNEWS,getting concerned and we don't know how it wil...


### 2.2. Converting Data Type

In [4]:
media['date'] = pd.to_datetime(media['date']).dt.date

In [5]:
media.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50068 entries, 0 to 50067
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     50068 non-null  object
 1   station  50068 non-null  object
 2   text     50068 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB


### 2.3. Cleaning Texts

In [6]:
import nltk
from nltk.corpus import stopwords
import string

# Download NLTK stopwords and punkt
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marcelzhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marcelzhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
import re

def text_light_clean(x):
    x = x.encode('ascii', 'ignore').decode()  # remove unicode characters
    x = re.sub(r'https*\S+', ' ', x) # remove links
    x = re.sub(r'http*\S+', ' ', x) 

    x = re.sub(r'\s{2,}', ' ', x) # Replace any sequence of two or more whitespace characters with a single space.
    x = re.sub(r'\s[^\w\s]\s', '', x) #  Remove any punctuation that is surrounded by whitespace characters.
    
    # deleting stop words    
    stop_words = set(stopwords.words('english'))
    x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    
    return x

In [8]:
media['clean_text'] = media.text.apply(text_light_clean)

## 3. Classifying Topic

In [9]:
from bertopic import BERTopic
import time

start_time = time.time()

doc = media.clean_text.tolist()

topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")

topics, probs = topic_model.fit_transform(doc)

end_time = time.time()
time_taken_seconds = end_time - start_time
time_taken_minutes = time_taken_seconds / 60
print(f"Time taken: {time_taken_minutes:.2f} minutes")

topic_model.get_topic_info()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Unnamed: 0,Topic,Count,Name
0,-1,24116,-1_climate_change_people_think
1,0,906,0_australia_morrison_bushfires_australian
2,1,708,1_paris_withdraw_agreement_accord
3,2,657,2_meat_food_eat_dairy
4,3,435,3_school_students_schools_strike
...,...,...,...
542,541,10,541_china_craft_apologists_superpower
543,542,10,542_reef_ranger_queensland_barrier
544,543,10,543_sacked_commissioned_inquest_kissi
545,544,10,544_colder_hell_outside_544


In [19]:
import pickle

# save the model
with open('model/topic_model_media.pkl', 'wb') as f:
    pickle.dump(topic_model, f)

  self._set_arrayXarray(i, j, x)


In [10]:
start_time = time.time()

topic_model.reduce_topics(doc, nr_topics="auto")

end_time = time.time()
time_taken_seconds = end_time - start_time
time_taken_minutes = time_taken_seconds / 60
print(f"Time taken: {time_taken_minutes:.2f} minutes")

Time taken: 0.05 minutes


In [11]:
# assign topic labels to every text
media['topic'] = topics

# filter out the outliers
media = media[media['topic'] != -1]

## 4. Analyzing Sentiment

### 4.1. Getting Sentiment Scores for Texts

In [12]:
## Using SentimentIntensityAnalyzer

# import nltk
# nltk.download('vader_lexicon')
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# import pandas as pd

# # Instantiate the sentiment analyzer
# sia = SentimentIntensityAnalyzer()

# # Define a function to apply the sentiment analyzer to a given text
# def get_sentiment_scores(text):
#     return sia.polarity_scores(text)

In [13]:
## Using roberta
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Define a function to apply the sentiment analyzer to a given text
def get_sentiment_scores(text):
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    # Define weights for negative, neutral, and positive sentiments
    weights = np.array([-1, 0, 1])

    # Compute the compound sentiment score
    compound_score = np.dot(scores, weights)
    
    return compound_score

In [15]:
# Test on a given text
text = "Donald Trump better pray there is no global warming, because he's one easily melted precious snowflake."
get_sentiment_scores(text)

-0.859273349866271

In [16]:
start_time = time.time()

# Apply the sentiment analyzer to each text
media["sentiment"] = media["clean_text"].apply(get_sentiment_scores)

end_time = time.time()
time_taken_seconds = end_time - start_time
time_taken_minutes = time_taken_seconds / 60
print(f"Time taken: {time_taken_minutes:.2f} minutes")

Time taken: 52.41 minutes


### 4.2. Getting Sentiment Scores for Topics

In [17]:
# Calculate the average sentiment per topic
topic_sen = media.groupby('topic')['sentiment'].mean().round(2).reset_index()
topic_sen.columns = ['Topic', 'Avg_Sentiment']

# Merge the topic counts and average sentiment dataframes
topic_info_sen = pd.merge(topic_model.get_topic_info(), topic_sen, on='Topic')

# Display the resulting dataframe
topic_info_sen

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
0,0,19982,0_climate_change_president_global,-0.34
1,1,657,1_food_meat_eat_less,-0.14
2,2,435,2_school_students_children_protest,-0.22
3,3,353,3_democrats_republicans_republican_party,-0.19
4,4,316,4_greenhouse_atmosphere_dioxide_gases,-0.19
...,...,...,...,...
64,64,13,64_tan_sunscreen_calamitous_50,-0.09
65,65,13,65_parks_park_national_visiting,-0.05
66,66,13,66_measles_meningitis_vaccination_changewhy,-0.33
67,67,11,67_davenport_iowa_unequally_relocate,-0.05


In [18]:
topic_info_sen.head(30)

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
0,0,19982,0_climate_change_president_global,-0.34
1,1,657,1_food_meat_eat_less,-0.14
2,2,435,2_school_students_children_protest,-0.22
3,3,353,3_democrats_republicans_republican_party,-0.19
4,4,316,4_greenhouse_atmosphere_dioxide_gases,-0.19
5,5,306,5_children_kids_child_somebody,-0.36
6,6,285,6_tax_carbon_taxes_dividend,-0.22
7,7,224,7_attenborough_david_sir_bbc,-0.32
8,8,217,8_tucker_carlson_warming_question,0.11
9,9,185,9_cortez_ocasio_alexandria_congresswoman,-0.02
