In [1]:
# Mount Google Drive at /content/drive so that later cells can read files
# stored beneath that directory.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import glob
import pandas as pd

In [3]:
# Folder that holds the merged review parquet files. The path resolves inside
# "My Drive".
# NOTE(review): a folder living on a Shared drive would presumably be mounted
# under "/content/drive/Shared drives/..." instead — confirm which is intended.
shared_path = "/content/drive/My Drive/Big Data A3"

# Fail fast when the folder is missing. isdir (rather than exists) also rejects
# the case where the path resolves to a regular file, which would otherwise
# pass the check and silently yield an empty listing below.
if not os.path.isdir(shared_path):
    raise FileNotFoundError("Couldn't find 'Big Data A3' — check your shared drive path")

# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the listing reproducible from one session to the next.
files = sorted(glob.glob(os.path.join(shared_path, "*.parquet")))
print(files)

['/content/drive/My Drive/Big Data A3/Gift_Cards_Merged.parquet', '/content/drive/My Drive/Big Data A3/Digital_Music_Merged.parquet', '/content/drive/My Drive/Big Data A3/Health_and_Personal_Care_Merged.parquet', '/content/drive/My Drive/Big Data A3/All_Beauty_Merged.parquet', '/content/drive/My Drive/Big Data A3/Amazon_Fashion_Merged.parquet', '/content/drive/My Drive/Big Data A3/Appliances_Merged.parquet', '/content/drive/My Drive/Big Data A3/Cell_Phones_Merged.parquet', '/content/drive/My Drive/Big Data A3/Health_and_Household_Merged.parquet', '/content/drive/My Drive/Big Data A3/Baby_Products_Merged.parquet', '/content/drive/My Drive/Big Data A3/Arts_Crafts_and_Sewing_Merged.parquet', '/content/drive/My Drive/Big Data A3/CDs_and_Vinyl_Merged.parquet', '/content/drive/My Drive/Big Data A3/Kindle_Store_Merged.parquet', '/content/drive/My Drive/Big Data A3/Industrial_and_Scientific_Merged.parquet', '/content/drive/My Drive/Big Data A3/Magazine_Subscriptions.parquet', '/content/drive/M

In [4]:
# Explicit, ordered list of the parquet files to load. Every file lives in the
# same Drive folder, so each path is assembled from the common folder prefix and
# the file stem. The order is significant for later cells that walk partitions
# in sequence. Note that "Magazine_Subscriptions" is the only stem without the
# "_Merged" suffix.
parquet_paths = [
    f"/content/drive/My Drive/Big Data A3/{stem}.parquet"
    for stem in (
        "Gift_Cards_Merged",
        "Digital_Music_Merged",
        "Health_and_Personal_Care_Merged",
        "All_Beauty_Merged",
        "Amazon_Fashion_Merged",
        "Appliances_Merged",
        "Cell_Phones_Merged",
        "Health_and_Household_Merged",
        "Baby_Products_Merged",
        "Arts_Crafts_and_Sewing_Merged",
        "CDs_and_Vinyl_Merged",
        "Kindle_Store_Merged",
        "Industrial_and_Scientific_Merged",
        "Magazine_Subscriptions",
        "Musical_Instruments_Merged",
        "Office_Products_Merged",
        "Movies_and_TV_Merged",
        "Patio_Lawn_and_Garden_Merged",
        "Handmade_Products_Merged",
        "Beauty_and_Personal_Care_Merged",
        "Subscription_Boxes_Merged",
        "Grocery_and_Gourmet_Merged",
        "Pet_Supplies_Merged",
        "Software_Merged",
        "Electronics_Merged",
        "Sports_and_Outdoors_Merged",
        "Automotive_Merged",
        "Video_Games_Merged",
        "Tools_and_Home_Improvement_Merged",
        "Toys_and_Games_Merged",
        "Clothes_Shoes_and_Jewelry_Merged",
        "Home_and_Kitchen_Merged",
    )
]

In [5]:
import dask.dataframe as dd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [6]:
# Build one Dask DataFrame per parquet file, restricted to the two columns the
# model needs (review text and star rating). Order follows `parquet_paths`.
dfs = []
for parquet_file in parquet_paths:
    dfs.append(dd.read_parquet(parquet_file, columns=['text', 'rating']))

In [7]:
# Combine the per-file Dask DataFrames in `dfs` into a single Dask DataFrame,
# keeping them in list order.
df = dd.concat(dfs)

In [8]:
# Drop rows where either the review text ('text') or the 'rating' is null, so
# every remaining row has both a feature and a label source.
df = df.dropna(subset=['text', 'rating'])

In [9]:
# Convert the star rating into a binary sentiment label: ratings strictly above
# 3 become 1, everything else (including exactly 3) becomes 0.
# The comparison is evaluated as a vectorised column operation instead of
# calling a Python lambda once per row; the resulting int64 labels are the same.
df['sentiment'] = (df['rating'] > 3).astype('int64')

In [10]:
# Cap on how many leading Dask partitions get materialised; partitions beyond
# this count are never computed.
chunk_size = 5

# One pandas sample per processed partition: each delayed partition is computed
# into memory and a reproducible 20% of its rows (fixed seed) is kept.
# NOTE(review): only the first `chunk_size` partitions contribute, so the sample
# presumably covers just the earliest file(s) in `parquet_paths` — confirm that
# this is intended.
chunks = [
    delayed_partition.compute().sample(frac=0.2, random_state=42)
    for delayed_partition in df.to_delayed()[:chunk_size]
]

In [11]:
# Stack the per-partition samples into one pandas DataFrame and renumber the
# rows 0..n-1 so the original per-partition index values are discarded.
sampled_df = pd.concat(objs=chunks, ignore_index=True)

In [12]:
# Hold out 20% of the sampled reviews for evaluation. The split is seeded for
# reproducibility and stratified on the sentiment label so both partitions keep
# the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    sampled_df['text'],
    sampled_df['sentiment'],
    stratify=sampled_df['sentiment'],
    random_state=42,
    test_size=0.2,
)

In [13]:
# TF-IDF feature extractor for the review text.
#   token_pattern - any run of one or more word characters counts as a token
#   lowercase     - text is lower-cased before tokenising
#   stop_words    - the built-in English stop-word list is removed
#   min_df        - terms appearing in fewer than 5 documents are dropped
#   max_df        - terms appearing in more than 80% of documents are dropped
vectorizer = TfidfVectorizer(
    token_pattern=r'\b\w+\b',
    lowercase=True,
    stop_words='english',
    min_df=5,
    max_df=0.8,
)

In [14]:
# Fit the vectorizer on the training text only and transform it in one step;
# the held-out text is then transformed with that already-fitted vectorizer,
# so nothing is fitted on the test split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [15]:
# Fit a logistic-regression classifier, constructed with the library's default
# settings, on the TF-IDF training matrix and the binary sentiment labels.
clf = LogisticRegression()
clf.fit(X=X_train_vec, y=y_train)

In [16]:
# Predict a sentiment label for every held-out review using the fitted
# classifier and the test-set TF-IDF matrix.
y_pred = clf.predict(X_test_vec)

In [18]:
# Score the held-out predictions: a per-class precision/recall/F1 report and
# the confusion matrix of true labels versus predicted labels.
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

# Each heading is followed by its result on the next line.
print("Classification Report: ", report, sep="\n")
print("Confusion Matrix: ", matrix, sep="\n")

Classification Report: 
              precision    recall  f1-score   support

           0       0.82      0.72      0.77     38429
           1       0.90      0.94      0.92    103935

    accuracy                           0.88    142364
   macro avg       0.86      0.83      0.84    142364
weighted avg       0.88      0.88      0.88    142364

Confusion Matrix: 
[[27696 10733]
 [ 6094 97841]]
