# Sentiment Analysis Test

In [1]:
from bertopic import BERTopic
import pandas as pd
from tqdm import tqdm

In [2]:
# Load the pre-cleaned media snippets (raw Snippet plus the light-cleaned,
# heavy-cleaned and lemmatized variants of each row) and display the frame.
df = pd.read_csv(filepath_or_buffer='data/media_clean_comparison.csv')
df

Unnamed: 0,MatchDateTime,Station,Snippet,light_text,heavy_text,lemmatized_text
0,7/7/2017 15:28:59,CNN,"this is a delicate thing, but he wants to look...","this is a delicate thing, but he wants to look...",delicate thing wants look like person world r...,delicate thing want look like person world rea...
1,12/24/2019 21:45:38,BBCNEWS,handicraft master. she is 84 years old and she...,handicraft master. she is years old and she vi...,handicraft master years old visiting niece ma...,handicraft master year old visiting niece make...
2,8/24/2019 23:18:50,MSNBC,france's main objectives as host of this weeke...,france main objectives as host of this weekend...,france main objectives host weekend group seve...,france main objective host weekend group seven...
3,7/6/2019 16:28:05,BBCNEWS,to think of them as robust. but scientists say...,to think of them as robust. but scientists say...,think robust scientists say actually fragile ...,think robust scientist say actually fragile th...
4,3/16/2019 20:13:03,FOXNEWS,[applause] perhaps most importantly to the fut...,[applause] perhaps most importantly to the fut...,applause perhaps importantly future human ra...,applause perhaps importantly future human race...
...,...,...,...,...,...,...
10009,7/8/2017 23:18:02,BBCNEWS,the latest headlines: the 620 summit has ended...,the latest headlines: the summit has ended in ...,latest headlines summit ended germany world ...,latest headline summit ended germany world lea...
10010,7/22/2019 13:38:34,CNN,then i left my business to combat climate chan...,then i left my business to combat climate chan...,left business combat climate change fix democ...,left business combat climate change fix democr...
10011,8/18/2019 23:43:15,BBCNEWS,we have a finite pool of worries. we can only ...,we have a finite pool of worries. we can only ...,finite pool worries worry much people worryi...,finite pool worry worry much people worrying b...
10012,7/17/2019 21:49:54,BBCNEWS,that gives me the shivers. although the kipper...,that gives me the shivers. although the kipper...,gives shivers although kipper unsettles well ...,give shiver although kipper unsettles well rec...


In [3]:
import nltk
# Download the 'vader_lexicon' NLTK package before the analyzer is created;
# the logged output of this cell shows it is skipped when already up to date.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd  # NOTE(review): pandas is already imported in the first cell; redundant here

# Instantiate the sentiment analyzer once; get_sentiment_scores below reuses
# this single instance for every text it scores.
sia = SentimentIntensityAnalyzer()


# Define a function to apply the sentiment analyzer to a given text
def get_sentiment_scores(text):
    """Return the VADER polarity scores for one piece of text.

    Parameters
    ----------
    text : str
        Text to score. A missing value (``None`` or the float NaN that
        pandas uses for an empty CSV cell) is scored as an empty string.

    Returns
    -------
    dict
        The dictionary produced by ``sia.polarity_scores``; the cells in
        this notebook read its ``"compound"`` entry.
    """
    # A cleaned text that ended up empty is read back from CSV as NaN (a
    # float). Passing that non-string to the analyzer fails, so score it as
    # an empty string instead. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    return sia.polarity_scores(text)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/marcelzhang/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Sentiment Scores for Light-Cleaned text

In [4]:
# Score every light-cleaned text and keep only the compound score.
# The full score dict is never stored in the frame, so there is no
# temporary column to create and then drop in place.
df["light_sen"] = df["light_text"].apply(
    lambda text: get_sentiment_scores(text)["compound"]
)

df

Unnamed: 0,MatchDateTime,Station,Snippet,light_text,heavy_text,lemmatized_text,light_sen
0,7/7/2017 15:28:59,CNN,"this is a delicate thing, but he wants to look...","this is a delicate thing, but he wants to look...",delicate thing wants look like person world r...,delicate thing want look like person world rea...,0.8496
1,12/24/2019 21:45:38,BBCNEWS,handicraft master. she is 84 years old and she...,handicraft master. she is years old and she vi...,handicraft master years old visiting niece ma...,handicraft master year old visiting niece make...,-0.3400
2,8/24/2019 23:18:50,MSNBC,france's main objectives as host of this weeke...,france main objectives as host of this weekend...,france main objectives host weekend group seve...,france main objective host weekend group seven...,-0.6808
3,7/6/2019 16:28:05,BBCNEWS,to think of them as robust. but scientists say...,to think of them as robust. but scientists say...,think robust scientists say actually fragile ...,think robust scientist say actually fragile th...,0.3528
4,3/16/2019 20:13:03,FOXNEWS,[applause] perhaps most importantly to the fut...,[applause] perhaps most importantly to the fut...,applause perhaps importantly future human ra...,applause perhaps importantly future human race...,-0.4951
...,...,...,...,...,...,...,...
10009,7/8/2017 23:18:02,BBCNEWS,the latest headlines: the 620 summit has ended...,the latest headlines: the summit has ended in ...,latest headlines summit ended germany world ...,latest headline summit ended germany world lea...,0.0000
10010,7/22/2019 13:38:34,CNN,then i left my business to combat climate chan...,then i left my business to combat climate chan...,left business combat climate change fix democ...,left business combat climate change fix democr...,0.3400
10011,8/18/2019 23:43:15,BBCNEWS,we have a finite pool of worries. we can only ...,we have a finite pool of worries. we can only ...,finite pool worries worry much people worryi...,finite pool worry worry much people worrying b...,-0.9477
10012,7/17/2019 21:49:54,BBCNEWS,that gives me the shivers. although the kipper...,that gives me the shivers. although the kipper...,gives shivers although kipper unsettles well ...,give shiver although kipper unsettles well rec...,0.3382


## Sentiment Scores for Heavy-Cleaned text

In [5]:
# Score every heavy-cleaned text and keep only the compound score.
# The full score dict is never stored in the frame, so there is no
# temporary column to create and then drop in place.
df["heavy_sen"] = df["heavy_text"].apply(
    lambda text: get_sentiment_scores(text)["compound"]
)

df

Unnamed: 0,MatchDateTime,Station,Snippet,light_text,heavy_text,lemmatized_text,light_sen,heavy_sen
0,7/7/2017 15:28:59,CNN,"this is a delicate thing, but he wants to look...","this is a delicate thing, but he wants to look...",delicate thing wants look like person world r...,delicate thing want look like person world rea...,0.8496,0.7425
1,12/24/2019 21:45:38,BBCNEWS,handicraft master. she is 84 years old and she...,handicraft master. she is years old and she vi...,handicraft master years old visiting niece ma...,handicraft master year old visiting niece make...,-0.3400,-0.3400
2,8/24/2019 23:18:50,MSNBC,france's main objectives as host of this weeke...,france main objectives as host of this weekend...,france main objectives host weekend group seve...,france main objective host weekend group seven...,-0.6808,-0.6808
3,7/6/2019 16:28:05,BBCNEWS,to think of them as robust. but scientists say...,to think of them as robust. but scientists say...,think robust scientists say actually fragile ...,think robust scientist say actually fragile th...,0.3528,0.4417
4,3/16/2019 20:13:03,FOXNEWS,[applause] perhaps most importantly to the fut...,[applause] perhaps most importantly to the fut...,applause perhaps importantly future human ra...,applause perhaps importantly future human race...,-0.4951,-0.1779
...,...,...,...,...,...,...,...,...
10009,7/8/2017 23:18:02,BBCNEWS,the latest headlines: the 620 summit has ended...,the latest headlines: the summit has ended in ...,latest headlines summit ended germany world ...,latest headline summit ended germany world lea...,0.0000,0.0000
10010,7/22/2019 13:38:34,CNN,then i left my business to combat climate chan...,then i left my business to combat climate chan...,left business combat climate change fix democ...,left business combat climate change fix democr...,0.3400,0.3400
10011,8/18/2019 23:43:15,BBCNEWS,we have a finite pool of worries. we can only ...,we have a finite pool of worries. we can only ...,finite pool worries worry much people worryi...,finite pool worry worry much people worrying b...,-0.9477,-0.9477
10012,7/17/2019 21:49:54,BBCNEWS,that gives me the shivers. although the kipper...,that gives me the shivers. although the kipper...,gives shivers although kipper unsettles well ...,give shiver although kipper unsettles well rec...,0.3382,-0.0516


## Sentiment Scores for Lemmatized Text

In [6]:
# Score every lemmatized text and keep only the compound score.
# The full score dict is never stored in the frame, so there is no
# temporary column to create and then drop in place.
df["lem_sen"] = df["lemmatized_text"].apply(
    lambda text: get_sentiment_scores(text)["compound"]
)

df

Unnamed: 0,MatchDateTime,Station,Snippet,light_text,heavy_text,lemmatized_text,light_sen,heavy_sen,lem_sen
0,7/7/2017 15:28:59,CNN,"this is a delicate thing, but he wants to look...","this is a delicate thing, but he wants to look...",delicate thing wants look like person world r...,delicate thing want look like person world rea...,0.8496,0.7425,0.7645
1,12/24/2019 21:45:38,BBCNEWS,handicraft master. she is 84 years old and she...,handicraft master. she is years old and she vi...,handicraft master years old visiting niece ma...,handicraft master year old visiting niece make...,-0.3400,-0.3400,-0.3612
2,8/24/2019 23:18:50,MSNBC,france's main objectives as host of this weeke...,france main objectives as host of this weekend...,france main objectives host weekend group seve...,france main objective host weekend group seven...,-0.6808,-0.6808,-0.7579
3,7/6/2019 16:28:05,BBCNEWS,to think of them as robust. but scientists say...,to think of them as robust. but scientists say...,think robust scientists say actually fragile ...,think robust scientist say actually fragile th...,0.3528,0.4417,0.4417
4,3/16/2019 20:13:03,FOXNEWS,[applause] perhaps most importantly to the fut...,[applause] perhaps most importantly to the fut...,applause perhaps importantly future human ra...,applause perhaps importantly future human race...,-0.4951,-0.1779,-0.1779
...,...,...,...,...,...,...,...,...,...
10009,7/8/2017 23:18:02,BBCNEWS,the latest headlines: the 620 summit has ended...,the latest headlines: the summit has ended in ...,latest headlines summit ended germany world ...,latest headline summit ended germany world lea...,0.0000,0.0000,0.0000
10010,7/22/2019 13:38:34,CNN,then i left my business to combat climate chan...,then i left my business to combat climate chan...,left business combat climate change fix democ...,left business combat climate change fix democr...,0.3400,0.3400,0.3400
10011,8/18/2019 23:43:15,BBCNEWS,we have a finite pool of worries. we can only ...,we have a finite pool of worries. we can only ...,finite pool worries worry much people worryi...,finite pool worry worry much people worrying b...,-0.9477,-0.9477,-0.9485
10012,7/17/2019 21:49:54,BBCNEWS,that gives me the shivers. although the kipper...,that gives me the shivers. although the kipper...,gives shivers although kipper unsettles well ...,give shiver although kipper unsettles well rec...,0.3382,-0.0516,-0.0516


In [7]:
# Heavy-cleaned text of the row at position 10012, selected by column label
# so the lookup does not depend on the column order of the frame.
df["heavy_text"].iloc[10012]

'gives shivers  although kipper unsettles well  recycling  climate change  think anti eu point made  reason behind it '

In [8]:
# Light-cleaned text of the row at position 10012, selected by column label
# so the lookup does not depend on the column order of the frame.
df["light_text"].iloc[10012]

'that gives me the shivers. although the kipper unsettles me as well. it recycling, climate change! think there was an anti-eu point being made. a reason behind it.'

In [9]:
# Raw snippet of the row at position 10012, selected by column label so the
# lookup does not depend on the column order of the frame.
df["Snippet"].iloc[10012]

"that gives me the shivers. although the kipper unsettles me as well. it's recycling, climate change!” think there was an anti-eu point being made. a reason behind it."

# Fitting BERTopic

## Light-Cleaned Text

In [10]:
# text = df.light_text.tolist()

In [11]:
# topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2", min_topic_size = 15)

In [12]:
# topics, probs = topic_model.fit_transform(text)

In [13]:
# df['light_topic'] = topics

In [14]:
# # Calculate the average sentiment per topic
# light_avg_sen = df.groupby('light_topic')['light_sen'].mean().round(2).reset_index()
# light_avg_sen.columns = ['Topic', 'Avg_Sentiment']

# # Merge the topic counts and average sentiment dataframes
# light_sen_df = pd.merge(topic_model.get_topic_info(), light_avg_sen, on='Topic')

# # Display the resulting dataframe
# light_sen_df

## Heavy-Cleaned Text

In [15]:
# Plain Python list of the heavy-cleaned documents, in row order.
text_heavy = df["heavy_text"].tolist()

In [16]:
# Topic model for the heavy-cleaned text, configured with the
# "all-MiniLM-L6-v2" embedding model and a minimum topic size of 10.
topic_model_heavy = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    min_topic_size=10,
)

In [17]:
# Fit the model on the heavy-cleaned documents. topics_heavy is later stored
# as one topic id per row of df; probs_heavy is not used again in the cells
# shown in this notebook.
# NOTE(review): no random seed is set anywhere in this notebook, so topic ids
# and counts may not reproduce exactly on a fresh run — confirm.
topics_heavy, probs_heavy = topic_model_heavy.fit_transform(text_heavy)

In [18]:
# Store each row's topic id next to its text. Topic -1 is the largest group
# in the summary table below (presumably BERTopic's outlier bucket — confirm).
df['heavy_topic'] = topics_heavy

In [19]:
# Mean heavy-text sentiment per heavy-text topic, rounded to two decimals
# and shaped as a two-column frame: Topic, Avg_Sentiment.
heavy_avg_sen = (
    df.groupby('heavy_topic')['heavy_sen']
    .mean()
    .round(2)
    .rename_axis('Topic')
    .reset_index(name='Avg_Sentiment')
)

# Join the averages onto the model's per-topic summary using the Topic key.
heavy_sen_df = topic_model_heavy.get_topic_info().merge(heavy_avg_sen, on='Topic')

# Show the combined table.
heavy_sen_df

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
0,-1,4739,-1_climate_change_people_global,0.06
1,0,436,0_hoax_president_believe_trump,0.00
2,1,209,1_brexit_europe_labour_party,0.23
3,2,187,2_dioxide_gases_greenhouse_carbon,0.02
4,3,164,3_paris_agreement_accord_withdraw,0.33
...,...,...,...,...
123,122,11,122_laughing_laughter_yourmom_fawkes,0.51
124,123,11,123_terrorism_international_ireland_agreements,-0.40
125,124,11,124_insects_soil_insect_invasive,0.09
126,125,11,125_conditioning_units_hfcs_chemicals,0.02


In [35]:
# First 20 rows of the heavy-text topic/sentiment table.
heavy_sen_df.iloc[:20]

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
0,-1,4739,-1_climate_change_people_global,0.06
1,0,436,0_hoax_president_believe_trump,0.0
2,1,209,1_brexit_europe_labour_party,0.23
3,2,187,2_dioxide_gases_greenhouse_carbon,0.02
4,3,164,3_paris_agreement_accord_withdraw,0.33
5,4,141,4_children_kids_young_grandchildren,0.05
6,5,134,5_meat_food_eat_less,0.13
7,6,133,6_fires_wildfires_fire_california,-0.27
8,7,124,7_ice_arctic_glacier_glaciers,0.13
9,8,118,8_london_rebellion_police_extinction,-0.41


In [31]:
# Representative documents for topic 1 of the heavy-text model.
topic_model_heavy.get_representative_docs(topic=1)

['joined  recognise  could could follow  party need change stop happening again  need change  one things aware moment  look threats challenges face us  notjust brexit  brexit  climate change  think working away  solution ',
 'ambitions  cut climate change  talking reducing emissions quickly possible  say key talk mps  labour deputy leader says party must promise another brexit referendum order',
 'know  green deal climate change  powers trade unions  think two questions answer  though  one whether bit london whether labour need somebody outside capital  also brexit  closely linked labour brexit policy  andrew marr']

## Lemmatized Text

In [21]:
# Plain Python list of the lemmatized documents, in row order.
text_lem = df["lemmatized_text"].tolist()

In [22]:
# Topic model for the lemmatized text, configured with the
# "all-MiniLM-L6-v2" embedding model and a minimum topic size of 10.
topic_model_lem = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    min_topic_size=10,
)

In [23]:
# Fit the model on the lemmatized documents. topics_lem is later stored as
# one topic id per row of df; probs_lem is not used again in the cells shown
# in this notebook.
# NOTE(review): no random seed is set anywhere in this notebook, so topic ids
# and counts may not reproduce exactly on a fresh run — confirm.
topics_lem, probs_lem = topic_model_lem.fit_transform(text_lem)

In [24]:
# Store each row's topic id next to its text. Topic -1 is the largest group
# in the summary table below (presumably BERTopic's outlier bucket — confirm).
df['lem_topic'] = topics_lem

In [25]:
# Mean lemmatized-text sentiment per lemmatized-text topic, rounded to two
# decimals and shaped as a two-column frame: Topic, Avg_Sentiment.
lem_avg_sen = (
    df.groupby('lem_topic')['lem_sen']
    .mean()
    .round(2)
    .rename_axis('Topic')
    .reset_index(name='Avg_Sentiment')
)

# Join the averages onto the model's per-topic summary using the Topic key.
lem_sen_df = topic_model_lem.get_topic_info().merge(lem_avg_sen, on='Topic')

# Show the combined table.
lem_sen_df

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
0,-1,4842,-1_climate_change_global_people,0.06
1,0,220,0_paris_agreement_accord_withdraw,0.36
2,1,197,1_brexit_party_labour_election,0.22
3,2,190,2_ice_arctic_glacier_greenland,0.14
4,3,169,3_biden_democratic_candidate_voter,0.27
...,...,...,...,...
122,121,11,121_session_early_meeting_skipped,0.25
123,122,11,122_obama_coal_regulation_championing,0.33
124,123,11,123_woman_cindy_michelle_aside,0.43
125,124,10,124_crime_serious_maguire_downgrade,-0.55


In [32]:
# Representative documents for topic 1 of the lemmatized-text model.
topic_model_lem.get_representative_docs(topic=1)

['honest addressed thing really important mecrime climate change heard tory say anything climate change focus brexit brexit brexit',
 'know green deal climate change power trade union think two question answer though one whether bit london whether labour need somebody outside capital also brexit closely linked labour brexit policy andrew marr',
 'ambition cut climate change talking reducing emission quickly possible say key talk mp labour deputy leader say party must promise another brexit referendum order']

In [34]:
# First 20 rows of the lemmatized-text topic/sentiment table.
lem_sen_df.iloc[:20]

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
0,-1,4842,-1_climate_change_global_people,0.06
1,0,220,0_paris_agreement_accord_withdraw,0.36
2,1,197,1_brexit_party_labour_election,0.22
3,2,190,2_ice_arctic_glacier_greenland,0.14
4,3,169,3_biden_democratic_candidate_voter,0.27
5,4,130,4_london_rebellion_protest_police,-0.35
6,5,130,5_meat_food_eat_le,0.1
7,6,129,6_president_hoax_question_spicer,0.06
8,7,127,7_child_kid_young_grandchild,0.09
9,8,127,8_china_chinese_hoax_emitter,0.17


## Lemmatized Topics × Light-Cleaned (Original) Sentiment

In [27]:
# Average the light-cleaned sentiment (light_sen) within each
# lemmatized-text topic. A dedicated name is used so that `lem_avg_sen`,
# which holds the lem_sen averages from the earlier cell, is not silently
# rebound to a different quantity.
lem_topic_light_avg_sen = (
    df.groupby('lem_topic')['light_sen']
    .mean()
    .round(2)
    .rename_axis('Topic')
    .reset_index(name='Avg_Sentiment')
)

# Merge the topic counts and average sentiment dataframes on the Topic key
combined_sen_df = pd.merge(
    topic_model_lem.get_topic_info(),
    lem_topic_light_avg_sen,
    on='Topic',
)

# Display the resulting dataframe
combined_sen_df

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
0,-1,4842,-1_climate_change_global_people,0.04
1,0,220,0_paris_agreement_accord_withdraw,0.34
2,1,197,1_brexit_party_labour_election,0.20
3,2,190,2_ice_arctic_glacier_greenland,0.14
4,3,169,3_biden_democratic_candidate_voter,0.25
...,...,...,...,...
122,121,11,121_session_early_meeting_skipped,0.11
123,122,11,122_obama_coal_regulation_championing,0.34
124,123,11,123_woman_cindy_michelle_aside,0.33
125,124,10,124_crime_serious_maguire_downgrade,-0.47
