In [3]:
import bz2
import numpy as np
import pandas as pd
import re
import string
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
import bz2
from google.colab import files
from pathlib import Path
import warnings
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
warnings.filterwarnings("ignore")

# Uploading & getting the data

In [2]:
class ReviewsData():

    def __init__(self) -> None:
        pass

    # Uploading data function
    def upload_data(self):
        # Create folder for save
        Path("datasets").mkdir(exist_ok=True)

        # Upload & save files to target directory
        files.upload(target_dir='/content/datasets')

    # Data extraction function
    def get_data(self, file: str) -> pd.DataFrame:
        # Crete list to collect data
        data = list()
        with bz2.open(file, 'rt', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split(' ', 1)  # separate by the first space
                # Check the amount of parts
                if len(parts) == 2:
                    label, text = parts

                    # Get unix time and text without it
                    ts = parts[1].rsplit(' ', 1)[1]
                    text_wo_ts = parts[1].rsplit(' ', 1)[0]

                    # Add the data to list
                    data.append((label, text_wo_ts, ts))

        # Create data frame
        df = pd.DataFrame(
            data,
            columns=['label', 'text', 'timestamp']
        )
        df.label = df.label.str.extract(r'(\d+)').astype(int)  # convert label
        return df

In [5]:
data = ReviewsData()
data.upload_data()

Saving train.ft.txt.bz2 to /content/datasets/train.ft.txt.bz2


In [6]:
# Examine format of our data
file = r'/content/datasets/train.ft.txt.bz2'

a = bz2.BZ2File(file)

b = 0
for i in a:
    x = i.decode(encoding='utf-8').strip()
    b += 1
    print(x)  # We can see label separated by space from the review
    if b == 10:
        break

__label__5 Not great: 1. These are supposed to be 2.5 thick, but they are in fact 34 thick on the thin part, and 1 34 thick at the thickest. This is important because lower frequencies require thicker media to attenuate.br 2. These are delivered collapsed in vacuum packaging. Four out of 12 panels in my trial package have not puffed up after soaking in water and allowing them to dry for 24 hours.br 3. Three of the twelve panels in my pack have cosmetic defects- lines across the face of the panels.br 4. They are accurately cut to 12 square dimensions, so they would likely install easily.br br I cannot recommend these. 1623343987315
__label__3 Routines can be complicated, cuing is lacking: I must be in the minority, but I do feel compelled to share my opinion of this workout anyway. I just did it and I wasnt crazy about it.br br What I did like: the music is good, its not second or third-rate remixes. The instructor is very charming. The production values are good.br br There are seven n

In [7]:
data = ReviewsData()
amazon_rev = data.get_data(r'/content/datasets/train.ft.txt.bz2')

In [8]:
len(amazon_rev)

3040000

In [9]:
amazon_rev.head()

Unnamed: 0,label,text,timestamp
0,5,Not great: 1. These are supposed to be 2.5 thi...,1623343987315
1,3,"Routines can be complicated, cuing is lacking:...",1584312978350
2,5,Five Stars: Great,1507615214308
3,3,These are good but...: The right ear goes out ...,1617653290138
4,1,AnToy: This is a fancy hot plate. With a lid.b...,1542085172678


In [10]:
amazon_rev['timestamp'] = pd.to_datetime(amazon_rev['timestamp'], unit='ms')
amazon_rev.head()

Unnamed: 0,label,text,timestamp
0,5,Not great: 1. These are supposed to be 2.5 thi...,2021-06-10 16:53:07.315
1,3,"Routines can be complicated, cuing is lacking:...",2020-03-15 22:56:18.350
2,5,Five Stars: Great,2017-10-10 06:00:14.308
3,3,These are good but...: The right ear goes out ...,2021-04-05 20:08:10.138
4,1,AnToy: This is a fancy hot plate. With a lid.b...,2018-11-13 04:59:32.678


In [11]:
amazon_rev.to_csv('orig_df.csv')

# Cleaning data

In [2]:
# Create a class for text processing bf tokenization, stemming and lemmatization
class TextProcessing():

    # Download 'stopwords' to handle stopwords
    nltk.download('stopwords')
    stopword = stopwords.words('english')  # add English language

    def __init__(self):
        # Set patterns for re module to handle special symbols
        self.html_pattern = re.compile('<.*?>')
        self.url_pattern = re.compile(r'https?://\S+|www\.\S+')
        self.hashtag_pattern = re.compile(r'\@\w+|\#')
        self.punctuations = string.punctuation

    # Set the lowercase to text
    def to_lowercase(self, text):
        return str(text).lower()

    # Remove html tags
    def remove_html_tags(self, text):
        return self.html_pattern.sub('', text)

    # Remove links
    def remove_url(self, text):
        return self.url_pattern.sub('', text)

    # Remove hashtags
    def remove_hashtags(self, text):
        return self.hashtag_pattern.sub('', text)

    # Remove commas, dots, colons, etc.
    def remove_punctuation(self, text):
        return re.sub(f"[{re.escape(self.punctuations)}]", " ", text)

    # Correct spelling of the words
    def spell_correction(self, text):
        return ''.join(TextBlob(text))

    # Remove emojies
    def remove_emoji(self, text):
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)

    # Remove stopwords
    def remove_stopwords(self, text):
        new_text = list()

        for i in text.split():
            if i in self.stopword:
                new_text.append('')
            else:
                new_text.append(i)
        x = new_text[:]
        new_text.clear()
        return ' '.join(x)

    def remove_digits(self, text):
        return re.sub(r'\d+', '', text)

    def remove_extra_spaces(self, text):
        return re.sub(r'\s+', ' ', text).strip()

    # Create an aggregating function to use all the functions above
    def cleaning(self, text):
        text = self.to_lowercase(text)
        text = self.remove_html_tags(text)
        text = self.remove_hashtags(text)
        text = self.remove_emoji(text)
        text = self.remove_url(text)
        text = self.remove_stopwords(text)
        text = self.spell_correction(text)
        text = self.remove_extra_spaces(text)
        text = self.remove_digits(text)
        text = self.remove_punctuation(text)
        return text  # return the result text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
cleaning = TextProcessing()

In [4]:
# Clean data using pre-defined class
df = pd.read_csv('orig_df.csv')
data_to_clean = df.copy()
data_to_clean.text = data_to_clean.text.apply(cleaning.cleaning)
data_to_clean.head()

Unnamed: 0.1,Unnamed: 0,label,text,timestamp
0,0,5,great supposed thick fact thick thin pa...,2021-06-10 16:53:07.315
1,1,3,routines complicated cuing lacking must mino...,2020-03-15 22:56:18.350
2,2,5,five stars great,2017-10-10 06:00:14.308
3,3,3,good but right ear goes often put back ca...,2021-04-05 20:08:10.138
4,4,1,antoy fancy hot plate lid br toy,2018-11-13 04:59:32.678


In [5]:
data_to_clean.text.iloc[0]

'great    supposed   thick  fact  thick thin part    thick thickest  important lower frequencies require thicker media attenuate br   delivered collapsed vacuum packaging  four  panels trial package puffed soaking water allowing dry  hours br   three twelve panels pack cosmetic defects  lines across face panels br   accurately cut  square dimensions  would likely install easily br br cannot recommend these '

In [6]:
data_to_clean.to_csv('text_processed_data.csv', index=False)

# Data Processing

In [2]:
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
pre_processing = pd.read_csv('text_processed_data.csv')
pre_processing.drop(columns=['Unnamed: 0'], inplace=True)

In [20]:
pre_processing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040000 entries, 0 to 3039999
Data columns (total 3 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   label      int64 
 1   text       object
 2   timestamp  object
dtypes: int64(1), object(2)
memory usage: 69.6+ MB


In [5]:
pre_processing.timestamp = pd.to_datetime(pre_processing.timestamp, utc=True)

In [27]:
pre_processing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040000 entries, 0 to 3039999
Data columns (total 3 columns):
 #   Column     Dtype              
---  ------     -----              
 0   label      int64              
 1   text       object             
 2   timestamp  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), int64(1), object(1)
memory usage: 69.6+ MB


In [None]:
# Tokenize and lemmatize data
lemma = WordNetLemmatizer()

def process_text(text):
    tokens = word_tokenize(text)
    return ' '.join(lemma.lemmatize(token) for token in tokens)

pre_processing['processed_text'] = pre_processing['text'].apply(lambda x: process_text(x))

In [None]:
pre_processing.processed_text.iloc[0]

'great supposed thick fact thick thin part thick thickest important lower frequency require thicker medium attenuate br delivered collapsed vacuum packaging four panel trial package puffed soaking water allowing dry hour br three twelve panel pack cosmetic defect line across face panel br accurately cut square dimension would likely install easily br br can not recommend these'

In [1]:
pre_processing.to_csv('train_processed.csv')

NameError: name 'pre_processing' is not defined

# Modeling

In [34]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloa

In [6]:
# I used low memory PC, that's why took only 100k processed rows
small_data = pre_processing[:100000]
timestamps = small_data.timestamp.dt.date.to_list()
text = small_data.text.to_list()

In [7]:
print(len(timestamps), len(text))

100000 100000


In [8]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

# 2. Embeddings with optimization
encoder = SentenceTransformer(
    "paraphrase-MiniLM-L3-v2",  # lightweight model
    device="cpu"
)

# 3. Cut on batches
batch_size = 1000
embeddings = []
for i in range(0, len(text), batch_size):
    batch = text[i:i+batch_size]
    batch_emb = encoder.encode(
        batch,
        show_progress_bar=True,
        convert_to_numpy=True,
        num_workers=4,  # Use all the CPU cores
        batch_size=64
    )
    embeddings.append(batch_emb)
embeddings = np.vstack(embeddings)

# 4. Оptimize components
umap_model = UMAP(
    n_components=10,  # Reduced dimensionality
    n_neighbors=15,   # Less neighbors
    min_dist=0.05,    # More distance
    metric='cosine',
    low_memory=True
)

hdbscan_model = HDBSCAN(
    min_cluster_size=50,  # Inscrease the size of cluster
    min_samples=10,       # Decrease min_samples
    gen_min_span_tree=False,  # Turn off, low memory
    prediction_data=False
)

# 5. Modeling with optimized settings
topic_model = BERTopic(
    embedding_model=encoder,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    n_gram_range=(1, 1),
    low_memory=True,
    nr_topics=50,
    top_n_words=5,
    calculate_probabilities=False,
    verbose=True
)

# 6. Learning
topics, _ = topic_model.fit_transform(text, embeddings)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

2025-07-21 10:59:01,220 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-21 11:03:13,593 - BERTopic - Dimensionality - Completed ✓
2025-07-21 11:03:13,601 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-21 11:03:36,701 - BERTopic - Cluster - Completed ✓
2025-07-21 11:03:36,703 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-07-21 11:03:39,949 - BERTopic - Representation - Completed ✓
2025-07-21 11:03:39,953 - BERTopic - Topic reduction - Reducing number of topics
2025-07-21 11:03:40,158 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-21 11:03:43,110 - BERTopic - Representation - Completed ✓
2025-07-21 11:03:43,137 - BERTopic - Topic reduction - Reduced number of topics from 309 to 50


In [24]:
topics_over_time = topic_model.topics_over_time(text, timestamps, datetime_format="%b%M")

5680it [17:51,  5.30it/s]


In [31]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10, normalize_frequency=True)

As you can see, from 2000 to 2020 how the Internet changed things on Amazon. Frequency of the reviews grew from lower than 5 to five and higher. In 2015 Amazon launched cloud and streaming services like Amazon Prime, Amazon Video, so you can see that lots of people started leaving the reviews like five-four starts great & product price great good. Most topics through the time is connected with gifts, household goods and repair goods.