In [None]:
import pandas as pd
import pyarrow.parquet as pq
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import numpy as np
import os
from scipy.sparse import save_npz, vstack

# Location of the raw labelled reviews and of the directory that receives
# every artefact produced by this notebook.
file_path = "../data/raw/labeled_reviews.parquet"
out_dir = "../data/processed/"

# Output files: the label vector, the sparse TF-IDF matrix and the fitted
# vectorizer, all placed under out_dir.
y_labels_path = os.path.join(out_dir, "y_labels.npy")
x_tfidf_path = os.path.join(out_dir, "X_tfidf_features.npz")
vectorizer_path = os.path.join(out_dir, "tfidf_vectorizer.joblib")

# Probe the NLTK stop-word corpus; a LookupError means it is not installed
# locally yet, in which case it is downloaded once.
try:
  stopwords.words("english")
except LookupError:
  print("words downloading...")
  nltk.download("stopwords")

# A set gives constant-time membership tests when filtering tokens.
stop_words = set(stopwords.words("english"))

def clean_text(text):
  """Normalise a raw review string into space-separated lowercase tokens.

  The text is lower-cased, HTML-like tags are stripped, every character
  other than ``a-z`` and whitespace is dropped, and tokens found in the
  module-level ``stop_words`` set are removed.

  Args:
    text: The raw string to clean (must support ``str.lower``).

  Returns:
    The remaining tokens joined by single spaces; an empty string when
    nothing survives the filtering.
  """
  text = text.lower()

  # Replace each tag with a space rather than nothing, so that words
  # separated only by markup (e.g. "great<br />product") stay two tokens
  # instead of fusing into "greatproduct".
  text = re.sub(r'<.*?>', ' ', text)

  # Digits and punctuation are deleted outright; whitespace is preserved
  # so that the split below still sees token boundaries.
  text = re.sub(r'[^a-z\s]', '', text)

  return " ".join(word for word in text.split() if word not in stop_words)

def _add_cleaned_text(df):
  """Add ``full_text`` and ``cleaned_text`` columns to *df* and return the latter.

  ``full_text`` is the review summary and body joined by a space, with
  missing values treated as empty strings so they do not turn into the
  literal token "None"/"nan". ``cleaned_text`` is ``clean_text`` applied to
  ``full_text``. Both the fitting sample and every transformed chunk go
  through this one helper so their preprocessing cannot drift apart.

  Args:
    df: DataFrame holding ``summary`` and ``reviewText`` columns; it is
      modified in place.

  Returns:
    The newly created ``cleaned_text`` column.
  """
  df["full_text"] = (
    df["summary"].fillna("").astype(str)
    + " "
    + df["reviewText"].fillna("").astype(str)
  )
  df["cleaned_text"] = df["full_text"].apply(clean_text)
  return df["cleaned_text"]


# Every artefact below is written under out_dir, so make sure it exists.
os.makedirs(out_dir, exist_ok=True)

parquet_file = pq.ParquetFile(file_path)

# --- Step 1: fit the vocabulary on the first 200000 rows -------------------
# NOTE(review): only the first batch of the file is used for fitting; if the
# file is ordered (e.g. by category) this sample may not be representative
# of the whole dataset — confirm.
sample = next(parquet_file.iter_batches(batch_size=200000))
df_sample = sample.to_pandas()

tfidf_vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
tfidf_vectorizer.fit(_add_cleaned_text(df_sample))

# Persist the fitted vectorizer so the same vocabulary can be reused later.
joblib.dump(tfidf_vectorizer, vectorizer_path)

# --- Step 2: transform the whole file chunk by chunk -----------------------
all_tfidf_chunks = []
all_labels = []

# Only the columns actually consumed below are read from disk.
batch_iterator = parquet_file.iter_batches(
  batch_size=100000,
  columns=["summary", "reviewText", "class"],
)

for batch in batch_iterator:
  df_chunk = batch.to_pandas()

  X_chunk_tfidf = tfidf_vectorizer.transform(_add_cleaned_text(df_chunk))

  all_tfidf_chunks.append(X_chunk_tfidf)
  all_labels.extend(df_chunk['class'].values)

# --- Step 3: stitch the chunks together and save ---------------------------
# Rows of the stacked matrix are in file order, matching y_final.
X_final_tfidf = vstack(all_tfidf_chunks)

y_final = np.array(all_labels)

save_npz(x_tfidf_path, X_final_tfidf)

np.save(y_labels_path, y_final)

# Re-open a handle on the raw parquet file.
parquet_file = pq.ParquetFile(file_path)

# Load five columns, each into its own single-column DataFrame. The reads
# happen one after another in the order the column names are listed.
reviewerIDs, summaries, overalls, classes, categories = (
  pd.read_parquet(file_path, columns=[column_name])
  for column_name in ("reviewerID", "summary", "overall", "class", "category")
)



['reviewerID', 'reviewText', 'summary', 'overall', 'helpful', 'unixReviewTime', 'class', 'category']
              reviewerID
0          A3HVRXV0LVJN7
1         A1BJGDS0L1IO6I
2          A1YX2RBMS1L9L
3         A180NNPPKWCCU0
4         A30P2CYOUYAJM8
...                  ...
25263218  A3SDSFPFY8WS8C
25263219   AVJ4N5LBKAOG5
25263220  A3JB0C3QWIN61Q
25263221   AJ062DSRHW9RX
25263222   AVJ84HKPIGBMH

[25263223 rows x 1 columns]
                                                    summary
0                                                     A++++
1                                           ITEM NOT SENT!!
2                                             Great product
3                                                   Perfect
4                                            Cool purchase.
...                                                     ...
25263218                      Awesome toy Price not Awesome
25263219  Excellent quality and fun to fly. Certainly wo...
25263220                      

In [None]:
# Restrict the load to the columns this cell works with.
columns_to_load = [
  'reviewText',
  'summary',
  'overall',
  'class',
  'category',
]
df = pd.read_parquet(file_path, columns=columns_to_load)
print("Data loaded successfully with selected columns!")

# Summarise dtypes, non-null counts and memory use of the loaded frame.
df.info()

# Show the distinct values of the category column.
distinct_categories = df["category"].unique()
print(distinct_categories)