In [1]:
#%pip install bertopic

In [2]:
from bertopic import BERTopic
import pandas as pd
from tqdm import tqdm

In [3]:
import os

# Path to the folder containing the CSV files
folder_path = "data/media"

# Get a list of all CSV files in the folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

# Create an empty list to store data frames
dfs = []

# Loop through each CSV file and read it into a data frame
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    dfs.append(df)

# Concatenate all data frames into one
result = pd.concat(dfs, ignore_index=True)

In [4]:
result_slim = result.loc[:, ['MatchDateTime', 'Station', 'Snippet']]

In [5]:
media_20 = result_slim.sample(frac=0.2)

In [6]:
media_20.Snippet

46046    you will spend a lot of 2020 working on the cl...
15713    carbon tax. no, i think there are very good an...
18184    (burke) otherwise known as sand trap scavenger...
44070    responsibilities when it comes to climate chan...
7381     regimes to refuse to welcome a global climate ...
                               ...                        
1293     there is no climate change. if it's hot out, t...
1810     how climate change denial is destroying our po...
21392    towards a more sustainable form of energy, red...
24708    humans have some type of impact on climate cha...
10569    leyland: right after the election when you wou...
Name: Snippet, Length: 10014, dtype: object

# Prepossessing Data

In [7]:
import nltk
from nltk.corpus import stopwords
import string

# Download NLTK stopwords and punkt
nltk.download('stopwords')
nltk.download('punkt')

# Define function to preprocess text
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    
    
    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    # Remove duplicates
    words = list(set(words))
    # Join words back into a string
    text = ' '.join(words)
    return text


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marcelzhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marcelzhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Define function to preprocess text
def preprocess_text_no_token(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    
    words = text.split()
    words = [word for word in words if word not in stop_words]
    # Remove duplicates
    words = list(set(words))
    # Join words back into a string
    text = ' '.join(words)
    return text

In [9]:
# Apply preprocess_text_no_token to the DataFrame
media_20['Preprocessed_Snippet'] = media_20['Snippet'].apply(preprocess_text_no_token)

In [10]:
media_20

Unnamed: 0,MatchDateTime,Station,Snippet,Preprocessed_Snippet
46046,12/30/2019 12:09:18,BBCNEWS,you will spend a lot of 2020 working on the cl...,working say like summit slowburn spend financi...
15713,4/23/2019 14:44:48,CNN,"carbon tax. no, i think there are very good an...",answers move quickly ton cap things tax renewa...
18184,1/19/2020 12:48:39,MSNBC,(burke) otherwise known as sand trap scavenger...,weve change known otherwise trap twentyninth s...
44070,9/6/2017 3:51:58,BBCNEWS,responsibilities when it comes to climate chan...,change us cannot world responsibilities techno...
7381,12/10/2018 21:55:52,CNN,regimes to refuse to welcome a global climate ...,change conclusion warns landmark effects refus...
...,...,...,...,...
1293,1/30/2018 0:46:16,MSNBC,"there is no climate change. if it's hot out, t...",know change data deal scientists nan lying sli...
1810,11/24/2018 13:05:40,CNN,how climate change denial is destroying our po...,change man report sitting destroying say denia...
21392,5/30/2017 13:52:26,MSNBC,"towards a more sustainable form of energy, red...",run change form compliance companies footprint...
24708,8/30/2019 4:08:33,MSNBC,humans have some type of impact on climate cha...,change natural type region extent put promoted...


In [11]:
# Apply preprocess_text to the DataFrame
media_20['Tokenized_Snippet'] = media_20['Snippet'].apply(preprocess_text)

In [12]:
media_20

Unnamed: 0,MatchDateTime,Station,Snippet,Preprocessed_Snippet,Tokenized_Snippet
46046,12/30/2019 12:09:18,BBCNEWS,you will spend a lot of 2020 working on the cl...,working say like summit slowburn spend financi...,working say like summit slowburn spend financi...
15713,4/23/2019 14:44:48,CNN,"carbon tax. no, i think there are very good an...",answers move quickly ton cap things tax renewa...,answers move quickly ton cap things tax renewa...
18184,1/19/2020 12:48:39,MSNBC,(burke) otherwise known as sand trap scavenger...,weve change known otherwise trap twentyninth s...,weve change known otherwise trap twentyninth s...
44070,9/6/2017 3:51:58,BBCNEWS,responsibilities when it comes to climate chan...,change us cannot world responsibilities techno...,change us world responsibilities technology co...
7381,12/10/2018 21:55:52,CNN,regimes to refuse to welcome a global climate ...,change conclusion warns landmark effects refus...,change conclusion warns landmark effects refus...
...,...,...,...,...,...
1293,1/30/2018 0:46:16,MSNBC,"there is no climate change. if it's hot out, t...",know change data deal scientists nan lying sli...,know change data deal scientists nan lying sli...
1810,11/24/2018 13:05:40,CNN,how climate change denial is destroying our po...,change man report sitting destroying say denia...,change man report sitting destroying say denia...
21392,5/30/2017 13:52:26,MSNBC,"towards a more sustainable form of energy, red...",run change form compliance companies footprint...,run change form compliance companies footprint...
24708,8/30/2019 4:08:33,MSNBC,humans have some type of impact on climate cha...,change natural type region extent put promoted...,change natural type region extent put promoted...


# Fitting BERTopic

## Original Text

In [13]:
text = media_20.Snippet.tolist()

In [14]:
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")

In [15]:
topics, probs = topic_model.fit_transform(text)

In [16]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,4606,-1_the_and_to_climate
1,0,313,0_paris_agreement_trump_president
2,1,291,1_london_police_protests_protesters
3,2,226,2_weather_temperatures_extreme_record
4,3,219,3_he_him_his_think
...,...,...,...
108,107,11,107_primary_humans_wrong_cause
109,108,11,108_wine_beer_english_england
110,109,11,109_market_moreover_failure_economies
111,110,11,110_filmed_dinner_suspended_field


## Preprocessed Text

In [17]:
text_pre = media_20.Preprocessed_Snippet.tolist()

In [18]:
topic_model_pre = BERTopic(embedding_model="all-MiniLM-L6-v2")

In [19]:
topics_pre, probs_pre = topic_model_pre.fit_transform(text_pre)

In [20]:
topic_model_pre.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,4788,-1_climate_change_global_warming
1,0,353,0_paris_agreement_accord_president
2,1,332,1_care_health_gun_immigration
3,2,291,2_london_police_protests_protest
4,3,173,3_children_school_schools_kids
...,...,...,...
108,107,11,107_fracking_shale_underground_ban
109,108,10,108_parties_voters_debacle_lose
110,109,10,109_tillerson_rex_kelly_russia
111,110,10,110_fuels_dioxide_fossil_avoid


In [21]:
topic_model_pre.get_topic(topic=0)

[('paris', 0.05615720044842678),
 ('agreement', 0.03769333367947843),
 ('accord', 0.029603085714951155),
 ('president', 0.017190735459819386),
 ('trump', 0.01658042089513091),
 ('decision', 0.015128219779468556),
 ('states', 0.013183347897855064),
 ('united', 0.012892456219534377),
 ('pull', 0.01288050350263076),
 ('withdraw', 0.01241285834667872)]

## Tokenized Text

In [22]:
text_tok = media_20.Tokenized_Snippet.tolist()

In [23]:
topic_model_tok = BERTopic(embedding_model="all-MiniLM-L6-v2")

In [24]:
topics_tok, probs_tok = topic_model_tok.fit_transform(text_tok)

In [25]:
topic_model_tok.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,4946,-1_climate_change_global_warming
1,0,344,0_australia_fires_wildfires_california
2,1,253,1_london_police_protests_protesters
3,2,230,2_care_health_immigration_democrats
4,3,180,3_paris_agreement_accord_decision
...,...,...,...
107,106,11,106_enhanced_commitments_implemented_agreement
108,107,11,107_tucker_carlson_swamped_idiotic
109,108,11,108_tomorrow_august_olivia_7th
110,109,11,109_brexit_labour_services_election
