In [1]:
# Imports
import os
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from top2vec import Top2Vec

import torch
from tqdm.auto import tqdm
tqdm.pandas()


Load each category's dataset into its own DataFrame.

In [2]:
# One CSV per news category; each is kept in its own DataFrame.
business_df = pd.read_csv('../data/business/business.csv')
entertainment_df = pd.read_csv('../data/entertainment/entertainment.csv')
politics_df = pd.read_csv('../data/politics/politics.csv')
sport_df = pd.read_csv('../data/sport/sport.csv')
tech_df = pd.read_csv('../data/tech/tech.csv')

# check head of each df
# A notebook cell only renders its final expression, so a stack of bare
# `.head(3)` calls would show the last frame alone; display each one explicitly.
for category_df in (business_df, entertainment_df, politics_df, sport_df, tech_df):
    display(category_df.head(3))

Unnamed: 0,Category,Text,Filename,Subcategory
0,tech,Ink helps drive democracy in Asia\n\nThe Kyrgy...,data/tech/001.txt,
1,tech,China net cafe culture crackdown\n\nChinese au...,data/tech/002.txt,
2,tech,Microsoft seeking spyware trojan\n\nMicrosoft ...,data/tech/003.txt,


In [3]:
# spaCy provides the tokenization/lemmatization used by the cleaning function;
# re is used there for whitespace normalization.
# NOTE(review): these imports would conventionally sit in the first (imports) cell.
import spacy
import re
# Load the small English pipeline once at module level so it can be reused for every document.
nlp = spacy.load('en_core_web_sm')

Prepare the text-cleaning step: define a function that normalizes whitespace, lemmatizes, lower-cases, and drops punctuation.

In [4]:
# define data cleaning function
def clean_text(text):
    """Normalize whitespace, lemmatize, lower-case, and drop punctuation.

    Parameters
    ----------
    text : str or None
        Raw document text. A missing value (``None`` or a float NaN, e.g. an
        empty cell in a loaded DataFrame) is treated as an empty document.

    Returns
    -------
    str
        The lower-cased lemmas of every non-punctuation token, joined by
        single spaces. An empty string for missing input.
    """
    # Missing cells would otherwise make re.sub raise TypeError mid-apply.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # remove extra whitespace (runs of spaces/newlines collapse to one space)
    text = re.sub(r'\s+', ' ', text).strip()
    # lemmatize with the module-level spaCy pipeline
    doc = nlp(text)
    return ' '.join([token.lemma_.lower() for token in doc if not token.is_punct])
# quick test on small sample
# Sample: a headline and the sentence that follows it, each newline-terminated.
original_text = (
    "Ink helps drive democracy in Asia\n"
    "The Kyrgyz Republic, a small, mountainous state of the former Soviet republic, is using invisible ink and ultraviolet readers in the country's elections as part of a drive to prevent multiple voting.\n"
)
# Run the cleaner once and show the input and output for a visual check.
cleaned_text = clean_text(original_text)
print(f"Original text: {original_text}")
print(f"Cleaned text: {cleaned_text}")


Original text: Ink helps drive democracy in Asia
The Kyrgyz Republic, a small, mountainous state of the former Soviet republic, is using invisible ink and ultraviolet readers in the country's elections as part of a drive to prevent multiple voting.

Cleaned text: ink help drive democracy in asia the kyrgyz republic a small mountainous state of the former soviet republic be use invisible ink and ultraviolet reader in the country 's election as part of a drive to prevent multiple voting


In [None]:
# apply data cleaning function to each df
# Every document goes through the spaCy pipeline, so this cell is slow.
# progress_apply (registered on pandas objects by tqdm.pandas()) works like
# apply but reports progress. Looping avoids five copy-pasted statements.
for category_df in (business_df, entertainment_df, politics_df, sport_df, tech_df):
    category_df["CleanText"] = category_df["Text"].progress_apply(clean_text)

# check head
business_df.head(3)