# Sentiment Analysis on Media Topics

In [7]:
import pandas as pd
from tqdm import tqdm

## 1. Importing Data

In [8]:
media = pd.read_csv('data/media_concat.csv')
media

Unnamed: 0,MatchDateTime,Station,Snippet
0,11/15/2017 13:26:03,BBCNEWS,that is thought to be a danger point. that wou...
1,11/4/2017 22:08:32,BBCNEWS,with. excess heat is killing people. so we are...
2,11/15/2017 21:47:38,BBCNEWS,to hold the world's temperature rise as close ...
3,11/4/2017 12:14:30,BBCNEWS,contains is not news in the sense that this is...
4,11/4/2017 23:12:06,BBCNEWS,administration's view on climate change. the s...
...,...,...,...
50063,8/6/2017 5:29:44,BBCNEWS,still meets its climate change targets. italia...
50064,8/6/2017 21:07:34,BBCNEWS,to cap energy prices during june's election ca...
50065,8/8/2017 20:21:34,BBCNEWS,climate change? in some ways it does but in so...
50066,8/10/2017 4:50:27,BBCNEWS,getting concerned and we don't know how it wil...


## 2. Cleaning Data

### 2.1. Changing Column Names

In [9]:
media.rename(columns = {'MatchDateTime': 'date', 'Station': 'station', 'Snippet': 'text'}, inplace = True)
media

Unnamed: 0,date,station,text
0,11/15/2017 13:26:03,BBCNEWS,that is thought to be a danger point. that wou...
1,11/4/2017 22:08:32,BBCNEWS,with. excess heat is killing people. so we are...
2,11/15/2017 21:47:38,BBCNEWS,to hold the world's temperature rise as close ...
3,11/4/2017 12:14:30,BBCNEWS,contains is not news in the sense that this is...
4,11/4/2017 23:12:06,BBCNEWS,administration's view on climate change. the s...
...,...,...,...
50063,8/6/2017 5:29:44,BBCNEWS,still meets its climate change targets. italia...
50064,8/6/2017 21:07:34,BBCNEWS,to cap energy prices during june's election ca...
50065,8/8/2017 20:21:34,BBCNEWS,climate change? in some ways it does but in so...
50066,8/10/2017 4:50:27,BBCNEWS,getting concerned and we don't know how it wil...


### 2.2. Converting Data Type

In [11]:
media['date'] = pd.to_datetime(media['date']).dt.date

In [13]:
media.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50068 entries, 0 to 50067
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     50068 non-null  object
 1   station  50068 non-null  object
 2   text     50068 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB


### 2.3. Cleaning Texts

In [17]:
import nltk
from nltk.corpus import stopwords
import string

# Download NLTK stopwords and punkt
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marcelzhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marcelzhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
import re

def text_light_clean(x):
    x = x.encode('ascii', 'ignore').decode()  # remove unicode characters
    x = re.sub(r'https*\S+', ' ', x) # remove links
    x = re.sub(r'http*\S+', ' ', x) 

    x = re.sub(r'\s{2,}', ' ', x) # Replace any sequence of two or more whitespace characters with a single space.
    x = re.sub(r'\s[^\w\s]\s', '', x) #  Remove any punctuation that is surrounded by whitespace characters.
    
    # deleting stop words    
    stop_words = set(stopwords.words('english'))
    x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    
    return x

In [19]:
media['clean_text'] = media.text.apply(text_light_clean)

## 3. Classifying Topic

In [21]:
from bertopic import BERTopic

doc = media.clean_text.tolist()

topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")

topics, probs = topic_model.fit_transform(doc)

topic_model.get_topic_info()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Unnamed: 0,Topic,Count,Name
0,-1,23846,-1_climate_president_change_people
1,0,1099,0_brexit_labour_party_eu
2,1,916,1_australia_morrison_bushfires_australian
3,2,615,2_meat_food_eat_dairy
4,3,602,3_paris_withdraw_accord_agreement
...,...,...,...
522,521,10,521_kit_monumental_dear_caution
523,522,10,522_oxygen_ocean_reoxygenate_irony
524,523,10,523_dominated_was_2019_theoretical
525,524,10,524_economic_reframe_unmitigated_counterargument


## 4. Analyzing Sentiment

### 4.1. Getting Sentiment Scores for Texts

In [23]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

# Instantiate the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Define a function to apply the sentiment analyzer to a given text
def get_sentiment_scores(text):
    return sia.polarity_scores(text)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/marcelzhang/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [24]:
# Apply the sentiment analyzer to each text in your DataFrame
media["sen_scores"] = media["clean_text"].apply(get_sentiment_scores)

# Extract the compound score into a separate column in the DataFrame
media["sentiment"] = media["sen_scores"].apply(lambda x: x["compound"])

# Drop the original "sentiment_scores" column if you don't need it anymore
media.drop("sen_scores", axis=1, inplace=True)

media

Unnamed: 0,date,station,text,clean_text,sentiment
0,2017-11-15,BBCNEWS,that is thought to be a danger point. that wou...,thought danger point. would mean steep cuts gr...,-0.6808
1,2017-11-04,BBCNEWS,with. excess heat is killing people. so we are...,with. excess heat killing people. saying assau...,-0.8481
2,2017-11-15,BBCNEWS,to hold the world's temperature rise as close ...,hold world's temperature rise close possible 1...,-0.6808
3,2017-11-04,BBCNEWS,contains is not news in the sense that this is...,contains news sense consensus view climate sci...,0.4404
4,2017-11-04,BBCNEWS,administration's view on climate change. the s...,"administration's view climate change. study, c...",0.1531
...,...,...,...,...,...
50063,2017-08-06,BBCNEWS,still meets its climate change targets. italia...,still meets climate change targets. italian po...,-0.8834
50064,2017-08-06,BBCNEWS,to cap energy prices during june's election ca...,"cap energy prices june's election campaign, sh...",0.9042
50065,2017-08-08,BBCNEWS,climate change? in some ways it does but in so...,climate change? ways ways redouble efforts pol...,-0.2960
50066,2017-08-10,BBCNEWS,getting concerned and we don't know how it wil...,getting concerned know play impact have. japan...,0.3400


### 4.2. Getting Sentiment Scores for Topics

In [25]:
# assign topic labels to every text
media['topic'] = topics

# Calculate the average sentiment per topic
topic_sen = media.groupby('topic')['sentiment'].mean().round(2).reset_index()
topic_sen.columns = ['Topic', 'Avg_Sentiment']

# Merge the topic counts and average sentiment dataframes
topic_info_sen = pd.merge(topic_model.get_topic_info(), topic_sen, on='Topic')

# Display the resulting dataframe
topic_info_sen

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
0,-1,23846,-1_climate_president_change_people,0.08
1,0,1099,0_brexit_labour_party_eu,0.15
2,1,916,1_australia_morrison_bushfires_australian,-0.14
3,2,615,2_meat_food_eat_dairy,0.12
4,3,602,3_paris_withdraw_accord_agreement,0.31
...,...,...,...,...
522,521,10,521_kit_monumental_dear_caution,0.11
523,522,10,522_oxygen_ocean_reoxygenate_irony,-0.31
524,523,10,523_dominated_was_2019_theoretical,0.33
525,524,10,524_economic_reframe_unmitigated_counterargument,0.38


In [26]:
topic_info_sen.head(30)

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
0,-1,23846,-1_climate_president_change_people,0.08
1,0,1099,0_brexit_labour_party_eu,0.15
2,1,916,1_australia_morrison_bushfires_australian,-0.14
3,2,615,2_meat_food_eat_dairy,0.12
4,3,602,3_paris_withdraw_accord_agreement,0.31
5,4,419,4_hurricanes_storms_hurricane_storm,0.11
6,5,398,5_zero_2050_net_target,0.11
7,6,384,6_london_rebellion_extinction_arrested,-0.41
8,7,325,7_poland_conference_talks_200,0.08
9,8,305,8_republicans_democrats_republican_party,0.14
