# Sentiment Analysis on NYT Topics

## 1. Importing Data

In [1]:
import pandas as pd
import os

# Path to the folder containing the CSV files
folder_path = "data/nyt"

# Get a list of all CSV files in the folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

# Create an empty list to store data frames
dfs = []

# Loop through each CSV file and read it into a data frame
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    dfs.append(df)

# Concatenate all data frames into one
result = pd.concat(dfs, ignore_index=True)

In [2]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16478 entries, 0 to 16477
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    16478 non-null  object
 1   snippet  16478 non-null  object
 2   date     16478 non-null  object
 3   url      16478 non-null  object
dtypes: object(4)
memory usage: 515.1+ KB


In [3]:
result

Unnamed: 0,title,snippet,date,url
0,Feminism Lost. Now What?,“It’s amazing to me the lightning speed at whi...,2016-12-30,https://www.nytimes.com/2016/12/30/opinion/sun...
1,"Fish Seek Cooler Waters, Leaving Some Fisherme...",Catch limits for fishermen are often based on ...,2016-12-30,https://www.nytimes.com/2016/12/30/science/fis...
2,"‘Church Militant’ Theology Is Put to New, and ...",Its use by a right-wing website and others has...,2016-12-30,https://www.nytimes.com/2016/12/30/us/church-m...
3,"In Banning Ivory Trade, China Saw Benefits for...",The decision to bring the world’s largest ivor...,2017-01-02,https://www.nytimes.com/2017/01/02/world/asia/...
4,Can Carbon Capture Technology Prosper Under Tr...,"Progress has come in fits and starts, but supp...",2017-01-02,https://www.nytimes.com/2017/01/02/science/don...
...,...,...,...,...
16473,The Coronavirus and Carbon Emissions,"Also this week, a contentious idea: tweaking o...",2020-02-26,https://www.nytimes.com/2020/02/26/climate/nyt...
16474,Laurene Powell Jobs Is Putting Her Own Dent in...,An interview with the 35th-richest person in t...,2020-02-27,https://www.nytimes.com/2020/02/27/business/la...
16475,U.K. Court Blocks Heathrow Airport Expansion o...,The Court of Appeal said the government failed...,2020-02-27,https://www.nytimes.com/2020/02/27/world/europ...
16476,Which States Are Doing the Most Sustainable Bu...,Here’s a look at the leading states for LEED c...,2020-02-27,https://www.nytimes.com/2020/02/27/realestate/...


In [4]:
nyt = result[['date', 'url', 'title', 'snippet']]

In [5]:
nyt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16478 entries, 0 to 16477
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     16478 non-null  object
 1   url      16478 non-null  object
 2   title    16478 non-null  object
 3   snippet  16478 non-null  object
dtypes: object(4)
memory usage: 515.1+ KB


## 2. Cleaning Data

### 2.2. Converting Data Type

In [6]:
nyt = nyt.drop_duplicates()

In [7]:
nyt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13214 entries, 0 to 16471
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     13214 non-null  object
 1   url      13214 non-null  object
 2   title    13214 non-null  object
 3   snippet  13214 non-null  object
dtypes: object(4)
memory usage: 516.2+ KB


In [None]:
nyt['date'] = pd.to_datetime(nyt['date']).dt.date

In [8]:
# retain only rows where date is between 2017-01-01 and 2020-01-31
mask = (nyt['date'] >= '2017-01-01') & (nyt['date'] <= '2020-01-31')
nyt = nyt.loc[mask]

In [9]:
nyt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12844 entries, 3 to 16424
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     12844 non-null  object
 1   url      12844 non-null  object
 2   title    12844 non-null  object
 3   snippet  12844 non-null  object
dtypes: object(4)
memory usage: 501.7+ KB


### 2.3. Cleaning Texts

In [10]:
import re

def text_light_clean(x):
    x = x.encode('ascii', 'ignore').decode()  # remove unicode characters
    x = re.sub(r'https*\S+', ' ', x) # remove links
    x = re.sub(r'http*\S+', ' ', x) 

    x = re.sub(r'\s{2,}', ' ', x) # Replace any sequence of two or more whitespace characters with a single space.
    x = re.sub(r'\s[^\w\s]\s', '', x) #  Remove any punctuation that is surrounded by whitespace characters.
    
    return x

In [11]:
nyt['text'] = nyt['title'] + ' ' + nyt['snippet']

In [12]:
nyt['clean_text'] = nyt.text.apply(text_light_clean)

In [30]:
nyt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12844 entries, 3 to 16424
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        12844 non-null  object
 1   url         12844 non-null  object
 2   title       12844 non-null  object
 3   snippet     12844 non-null  object
 4   text        12844 non-null  object
 5   clean_text  12844 non-null  object
dtypes: object(6)
memory usage: 702.4+ KB


## 3. Classifying Topic

In [54]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import time

start_time = time.time()

# we add this to remove stopwords, for lower volumes of data stopwords can cause issues
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

doc = nyt.clean_text.tolist()

topic_model = BERTopic(
    min_topic_size = 15,
    vectorizer_model=vectorizer_model,
    language='english', calculate_probabilities=True,
    low_memory=True,
    # automatically reduce number of topiccs
#     nr_topics="auto",
    verbose=True
)

topics, probs = topic_model.fit_transform(doc)

end_time = time.time()
time_taken_seconds = end_time - start_time
time_taken_minutes = time_taken_seconds / 60
print(f"Time taken: {time_taken_minutes:.2f} minutes")

Batches:   0%|          | 0/402 [00:00<?, ?it/s]

2023-03-08 15:03:24,069 - BERTopic - Transformed documents to Embeddings
2023-03-08 15:03:31,374 - BERTopic - Reduced dimensionality
2023-03-08 15:03:39,373 - BERTopic - Clustered reduced embeddings


Time taken: 2.71 minutes


In [53]:
# min_topic_size=15, reduced topics
# topic_model.get_topic_info().head(30)

Unnamed: 0,Topic,Count,Name
0,-1,4942,-1_new_trump_climate_president
1,0,790,0_trump_pollution_emissions_air
2,1,770,1_yankees_game_league_team
3,2,693,2_art_fashion_museum_new
4,3,453,3_climate_climate change_change_paris
5,4,369,4_hurricane_puerto_storm_rico
6,5,298,5_trade_china_xi_tariffs
7,6,283,6_afghan_saudi_syria_afghanistan
8,7,229,7_reading_books_contest_book
9,8,189,8_facebook_social_tech_google


In [36]:
# min_topic_size=10, reduced topics
# topic_model.get_topic_info().head(30)

Unnamed: 0,Topic,Count,Name
0,-1,5344,-1_new_trump_climate_president
1,0,1921,0_trump_climate_president_change
2,1,633,1_yankees_game_mets_injury
3,2,471,2_species_scientists_birds_coral
4,3,290,3_art_museum_artists_film
5,4,263,4_hurricane_storm_harvey_flooding
6,5,222,5_china_trade_xi_chinas
7,6,208,6_restaurant_food_chef_meat
8,7,204,7_fed_tax_rates_budget
9,8,198,8_facebook_tech_silicon_silicon valley


In [51]:
# min_topic_size=30, reduced topics
# topic_model.get_topic_info().head(30)

Unnamed: 0,Topic,Count,Name
0,-1,4718,-1_new_trump_climate_change
1,0,5936,0_trump_new_climate_president
2,1,550,1_fashion_art_museum_artists
3,2,144,2_restaurant_chef_food_cook
4,3,116,3_open_williams_tennis_federer
5,4,100,4_places_52_52 places_traveler
6,5,98,5_epa_pruitt_scott_scott pruitt
7,6,88,6_puerto_puerto rico_rico_maria
8,7,87,7_couple_met_couple met_groom
9,8,80,8_health_care_health care_medicaid


In [28]:
# min_topic_size=20, reduced topics
# topic_model.get_topic_info().head(30)

Unnamed: 0,Topic,Count,Name
0,-1,5053,-1_new_trump_president_climate
1,0,974,0_democrats_democratic_trump_president
2,1,791,1_yankees_game_mets_league
3,2,436,2_hurricane_storm_puerto_rico
4,3,318,3_climate_climate change_change_science
5,4,256,4_art_museum_artists_film
6,5,248,5_afghan_saudi_syria_iran
7,6,203,6_fashion_designer_fashion week_mens
8,7,174,7_police_man_officers_officer
9,8,174,8_ice_heat_temperatures_arctic


In [56]:
# min_topic_size=15
topic_model.get_topic_info().head(30)

Unnamed: 0,Topic,Count,Name
0,-1,5275,-1_trump_new_president_climate
1,0,228,0_fashion_designer_wear_fashion week
2,1,217,1_reads_books_reading_reading great
3,2,217,2_hurricane_storm_harvey_hurricanes
4,3,178,3_yankees_mets_injury_game
...,...,...,...
135,134,16,134_ceos_councils_business leaders_advisory
136,135,16,135_solar_power_solar power_solar panels
137,136,16,136_buttigieg_pete_pete buttigieg_mr buttigieg
138,137,16,137_powell_yellen_fed_jerome powell


In [41]:
# min_topic_size=20
# topic_model.get_topic_info().head(30)

Unnamed: 0,Topic,Count,Name
0,-1,4807,-1_new_trump_climate_president
1,0,416,0_democratic_democrats_2020_biden
2,1,394,1_species_scientists_birds_coral
3,2,213,2_china_trade_xi_chinas
4,3,209,3_fashion_designer_fashion week_wear
5,4,208,4_hurricane_storm_harvey_storms
6,5,196,5_police_man_officers_officer
7,6,179,6_facebook_tech_google_silicon
8,7,176,7_yankees_mets_injury_game
9,8,168,8_art_museum_artists_museums


In [17]:
# min_topic_size=10
# topic_model.get_topic_info().head(50)

Unnamed: 0,Topic,Count,Name
0,-1,4836,-1_trump_new_climate_change
1,0,222,0_fashion_designer_wear_fashion week
2,1,217,1_art_museum_artists_museums
3,2,176,2_yankees_mets_injury_game
4,3,146,3_reading_reading great_great reads_reads web
5,4,136,4_democrats_governor_orourke_beto
6,5,133,5_macron_france_merkel_emmanuel
7,6,118,6_open_williams_tennis_federer
8,7,106,7_warriors_nba_knicks_golden state
9,8,103,8_police_man_officers_officer


In [61]:
import pickle

# save the model
with open('model/topic_model_nyt.pkl', 'wb') as f:
    pickle.dump(topic_model, f)

  self._set_arrayXarray(i, j, x)


In [None]:
# topic_model.reduce_topics(doc, nr_topics="auto")

In [57]:
# assign topic labels to every text
nyt['topic'] = topics

# filter out the outliers
# nyt = nyt[nyt['topic'] != -1]

## 4. Analyzing Sentiment

### 4.1. Getting Sentiment Scores for Texts

In [31]:
## Using roberta
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
# Define a function to apply the sentiment analyzer to a given text
def get_sentiment_scores(text):
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    # Define weights for negative, neutral, and positive sentiments
    weights = np.array([-1, 0, 1])

    # Compute the compound sentiment score
    compound_score = np.dot(scores, weights)
    
    return compound_score

In [33]:
# Test on a given text
text = "Donald Trump better pray there is no global warming, because he's one easily melted precious snowflake."
get_sentiment_scores(text)

-0.859273349866271

In [34]:
from tqdm import tqdm

tqdm.pandas()

start_time = time.time()

# Apply the sentiment analyzer to each text
nyt["sentiment"] = nyt["text"].progress_apply(get_sentiment_scores)

end_time = time.time()
time_taken_seconds = end_time - start_time
time_taken_minutes = time_taken_seconds / 60
print(f"Time taken: {time_taken_minutes:.2f} minutes")

100%|█████████████████████████████████████| 12844/12844 [24:47<00:00,  8.63it/s]

Time taken: 24.79 minutes





In [35]:
nyt.to_csv('data/nyt_text_sentiment.csv', index=False)

### 4.2. Getting Sentiment Scores for Topics

In [58]:
# Calculate the average sentiment per topic
topic_sen = nyt.groupby('topic')['sentiment'].mean().round(2).reset_index()
topic_sen.columns = ['Topic', 'Avg_Sentiment']

# Merge the topic counts and average sentiment dataframes
topic_info_sen = pd.merge(topic_model.get_topic_info(), topic_sen, on='Topic')

In [59]:
topic_info_sen

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
0,-1,5275,-1_trump_new_president_climate,-0.09
1,0,228,0_fashion_designer_wear_fashion week,0.29
2,1,217,1_reads_books_reading_reading great,0.27
3,2,217,2_hurricane_storm_harvey_hurricanes,-0.32
4,3,178,3_yankees_mets_injury_game,0.05
...,...,...,...,...
135,134,16,134_ceos_councils_business leaders_advisory,-0.32
136,135,16,135_solar_power_solar power_solar panels,0.17
137,136,16,136_buttigieg_pete_pete buttigieg_mr buttigieg,0.10
138,137,16,137_powell_yellen_fed_jerome powell,0.14


In [60]:
topic_info_sen.to_csv('data/nyt_topic_sentiment.csv', index=False)

In [49]:
topic_info_sen.head(30)

Unnamed: 0,Topic,Count,Name,Avg_Sentiment
0,-1,5066,-1_new_trump_climate_change,-0.09
1,0,794,0_trump_china_president_trade,-0.22
2,1,770,1_art_fashion_new_songs,0.23
3,2,707,2_yankees_game_mets_league,0.08
4,3,397,3_hurricane_storm_flooding_harvey,-0.43
5,4,302,4_climate_climate change_change_warming,-0.19
6,5,238,5_afghan_afghanistan_taliban_syria,-0.42
7,6,204,6_reading_reads_books_contest,0.38
8,7,161,7_wildfires_fires_california_wildfire,-0.52
9,8,155,8_minister_prime_prime minister_brexit,-0.21
