In [1]:
import sys
# Show which interpreter backs this kernel so the package installs below can be checked against it.
print(sys.executable)


C:\Users\KORISNIK\AppData\Local\Programs\Python\Python313\python.exe


In [2]:
!{sys.executable} -m pip install nltk


Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.7.34-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Downloading regex-2025.7.34-cp313-cp313-win_amd64.whl (275 kB)
Downloading click-8.2.1-py3-none-any.whl (102 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, click, nltk

   ---------------------------------------- 0/4 [tqdm]
   ---------- ----------------------------- 1/4 [regex]
   -------------------- ------------------- 2/4 [click]
   -------------------- ------------------- 2/4 [click]
   ------------------------------ --------- 3/4 [nltk]
   ------------------------------ --------- 3/4 [nltk]
   --------------------------

In [3]:
import sys
!"{sys.executable}" -m pip install pandas numpy tqdm matplotlib seaborn scikit-learn


Collecting pandas
  Downloading pandas-2.3.1-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.1-cp310-cp310-win_amd64.whl (11.3 MB)
   ---------------------------------------- 0.0/11.3 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.3 MB 4.2 MB/s eta 0:00:03
   ----- ---------------------------------- 1.6/11.3 MB 4.4 MB/s eta 0:00:03
   -------- ------------------------------- 2.4/11.3 MB 4.3 MB/s eta 0:00:03
   ----------- ---------------------------- 3.1/11.3 MB 4.7 MB/s eta 0:00:02
   -------------- ------------------------- 4.2/11.3 MB 4.5 MB/s eta 0:00:02
   ------------------ --------------------- 5.2/11.3 MB 4.5 MB/s eta 0:00:02
   -------------------- 

In [4]:
#  00_data_preprocessing.ipynb

# ======================
#  1. LIBRARY IMPORTS
# ======================
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import os

# Widen pandas' notebook display so wide frames are easier to read
for option_name, option_value in (('display.max_columns', 50), ('display.width', 1000)):
    pd.set_option(option_name, option_value)

# Fetch the NLTK stopword corpus without console output, then keep the English list as a set
nltk.download('stopwords', quiet=True)
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

In [5]:
# ======================
#  2. DATA LOADING
# ======================
def load_review_chunks(paths):
    """
    Load every review CSV in ``paths`` and stack them into one DataFrame.

    Only the ``product_id``, ``review_text`` and ``rating`` columns are read;
    the first two are parsed with pandas' ``string`` dtype.

    Args:
        paths: Iterable of CSV file paths, read in the order given.

    Returns:
        DataFrame containing the rows of all files, re-indexed from 0.

    Raises:
        ValueError: If ``paths`` is empty.
    """
    paths = list(paths)  # accept any iterable (e.g. a generator) and allow the emptiness check
    if not paths:
        # pd.concat([]) would otherwise fail with the opaque "No objects to concatenate"
        raise ValueError("No review files to load: 'paths' is empty")

    review_dfs = [
        pd.read_csv(
            path,
            dtype={'product_id': 'string', 'review_text': 'string'},
            usecols=['product_id', 'review_text', 'rating']
        )
        for path in paths
    ]

    return pd.concat(review_dfs, ignore_index=True)

# Discover the review chunk files, then read the reviews and the product metadata.
# Any failure is reported and re-raised so the notebook stops here.
try:
    chunk_names = [name for name in os.listdir("../data") if name.startswith('reviews_')]
    review_paths = sorted(f"../data/{name}" for name in chunk_names)
    reviews_df = load_review_chunks(review_paths)

    product_dtypes = {'product_id': 'string', 'brand_name': 'category'}
    products_df = pd.read_csv("../data/product_info.csv", dtype=product_dtypes)

    print("Data loaded successfully!")
    print(f"Reviews: {len(reviews_df):,} rows | Products: {len(products_df):,} rows")

except Exception as e:
    print(f"Error loading data: {str(e)}")
    raise

Data loaded successfully!
Reviews: 1,094,411 rows | Products: 8,494 rows


In [6]:
# ======================
#  3. DATA MERGING
# ======================
def merge_datasets(reviews, products):
    """Left-join selected product columns onto every review row via ``product_id``.

    Reviews without a matching product are kept; their product columns are missing.
    """
    product_columns = ['product_id', 'brand_name', 'primary_category', 'price_usd', 'loves_count']
    product_subset = products[product_columns]
    return pd.merge(reviews, product_subset, how='left', on='product_id')

# Attach product metadata to the reviews and report how many rows came out
merged_df = merge_datasets(reviews=reviews_df, products=products_df)
merged_row_count = len(merged_df)
print(f"\nMerged DataFrame: {merged_row_count:,} reviews")


Merged DataFrame: 1,094,411 reviews


In [7]:
# ======================
#  4. DATA CLEANING (FIXED VERSION)
# ======================
def clean_data(df):
    """Clean the merged review frame and attach binary sentiment labels.

    Steps, applied to a copy (the input frame is left untouched):
      1. Drop rows missing ``review_text`` or ``rating``.
      2. Fill missing ``price_usd`` / ``loves_count`` with the column median.
      3. Fill missing ``brand_name`` / ``primary_category`` with ``'Unknown'``.
      4. Drop 3-star reviews and add ``sentiment``: 1 for ratings >= 4, else 0.

    Args:
        df: DataFrame with the columns ``review_text``, ``rating``, ``price_usd``,
            ``loves_count``, ``brand_name`` and ``primary_category``.

    Returns:
        A new DataFrame with the remaining rows and an integer ``sentiment`` column.
    """
    # dropna already returns a new frame; the extra .copy() guarantees independence from the input
    cleaned = df.dropna(subset=['review_text', 'rating']).copy()

    # Numerical imputation with per-column medians
    num_cols = ['price_usd', 'loves_count']
    cleaned[num_cols] = cleaned[num_cols].fillna(cleaned[num_cols].median())

    # Categorical imputation
    cat_cols = ['brand_name', 'primary_category']
    for col in cat_cols:
        # isinstance on the dtype replaces the deprecated pd.api.types.is_categorical_dtype
        if isinstance(cleaned[col].dtype, pd.CategoricalDtype):
            # A categorical can only be filled with a declared category; add_categories
            # raises if the category already exists, so only add it when missing.
            if 'Unknown' not in cleaned[col].cat.categories:
                cleaned[col] = cleaned[col].cat.add_categories('Unknown')
        cleaned[col] = cleaned[col].fillna('Unknown')

    # Neutral (3-star) reviews carry no label and are removed before labelling
    sentiment_df = cleaned[cleaned['rating'] != 3].copy()
    sentiment_df['sentiment'] = (sentiment_df['rating'] >= 4).astype(int)

    return sentiment_df

# Run the cleaning pipeline, then report the surviving row count and the class balance
cleaned_df = clean_data(merged_df)
remaining_reviews = len(cleaned_df)
print(f"\nAfter cleaning: {remaining_reviews:,} reviews remaining")
print("Sentiment distribution:")
sentiment_share = cleaned_df['sentiment'].value_counts(normalize=True)
print(sentiment_share)


After cleaning: 1,011,215 reviews remaining
Sentiment distribution:
sentiment
1    0.887204
0    0.112796
Name: proportion, dtype: float64


  if pd.api.types.is_categorical_dtype(df[col]):


In [8]:
# ======================
#  5. FEATURE ENGINEERING
# ======================
def engineer_features(df):
    """Return a copy of ``df`` with text-length and price-bin features added.

    Added columns:
      * ``review_length`` - number of characters in ``review_text``.
      * ``word_count``    - number of whitespace-separated tokens in ``review_text``.
      * ``price_bin``     - integer quantile bin of ``price_usd`` (up to 5 bins).

    The input frame is not modified, so re-running the calling cell is idempotent.

    Args:
        df: DataFrame with ``review_text`` and ``price_usd`` columns.

    Returns:
        A new DataFrame with the three feature columns appended.
    """
    review_text = df['review_text']
    return df.assign(
        # Text features
        review_length=review_text.str.len(),
        word_count=review_text.str.split().str.len(),
        # duplicates='drop' merges identical quantile edges instead of raising when many
        # rows share one price; fewer than five bins may result in that case.
        price_bin=pd.qcut(df['price_usd'], q=5, labels=False, duplicates='drop'),
    )

# Derive the length, word-count and price-bin features from the cleaned reviews
enhanced_df = engineer_features(df=cleaned_df)


In [10]:
# ======================
#  6. TEXT PREPROCESSING (FIXED VERSION)
# ======================
from tqdm import tqdm  # progress-bar library; imported here rather than in the import cell at the top
tqdm.pandas()  # registers `progress_apply` on pandas objects

def preprocess_text(text):
    """Normalise one review: lowercase, strip markup/non-letters, drop stopwords.

    Non-string input (e.g. a missing value) yields an empty string. Any error
    raised during cleaning is printed and also yields an empty string.
    """
    if not isinstance(text, str):
        return ""

    try:
        lowered = text.lower()
        # Replace HTML tags and every character that is not a-z or whitespace with a space
        letters_only = re.sub(r'<.*?>|[^a-z\s]', ' ', lowered)
        kept_tokens = []
        for token in letters_only.split():
            # Skip stopwords and tokens of one or two characters
            if token in stop_words or len(token) <= 2:
                continue
            kept_tokens.append(token)
        return ' '.join(kept_tokens).strip()
    except Exception as e:
        print(f"Error processing text: {str(e)}")
        return ""

# Clean every review text, showing a tqdm progress bar, and store the result as a new column
clean_text = enhanced_df['review_text'].progress_apply(preprocess_text)
enhanced_df['clean_text'] = clean_text

# ======================
#  FIXED CATEGORICAL WARNING
# ======================
def clean_data(df):
    """Fill missing ``brand_name`` / ``primary_category`` values with ``'Unknown'``.

    NOTE(review): this redefinition shadows the earlier ``clean_data`` in this
    notebook. It performs only the categorical imputation; it does not drop
    rows, impute numeric columns or add the ``sentiment`` column.

    Args:
        df: DataFrame with ``brand_name`` and ``primary_category`` columns.

    Returns:
        A copy of ``df`` with missing values in those two columns replaced.
    """
    df = df.copy()

    # Categorical imputation
    cat_cols = ['brand_name', 'primary_category']
    for col in cat_cols:
        if isinstance(df[col].dtype, pd.CategoricalDtype):  # Modern check
            # add_categories raises if 'Unknown' is already a category (e.g. when the
            # frame was cleaned before), so only add it when missing.
            if 'Unknown' not in df[col].cat.categories:
                df[col] = df[col].cat.add_categories('Unknown')
        df[col] = df[col].fillna('Unknown')

    return df

100%|█████████████████████████████████████████████████████████████████████| 1011215/1011215 [00:15<00:00, 65348.33it/s]


In [13]:
# ======================
# 💾 7. DATA EXPORT (FIXED VERSION)
# ======================
try:
    # Make sure the output folder is present before anything is written
    os.makedirs("../data/processed", exist_ok=True)

    # Preferred format is Parquet via pyarrow; fall back to CSV only on ImportError
    try:
        enhanced_df.to_parquet(
            "../data/processed/full_dataset.parquet",
            engine='pyarrow',
            index=False,
        )
    except ImportError:
        enhanced_df.to_csv("../data/processed/full_dataset.csv", index=False)
        print("ℹ️ Saved full_dataset.csv (pyarrow not available)")
    else:
        print("✅ Saved full_dataset.parquet using pyarrow")

    # One row per distinct product/brand/price/category combination, written as CSV
    catalog_columns = ['product_id', 'brand_name', 'price_usd', 'primary_category']
    product_catalog = enhanced_df[catalog_columns].drop_duplicates()
    product_catalog.to_csv("../data/processed/product_catalog.csv", index=False)
    print("✅ Saved product_catalog.csv")

except Exception as e:
    # The failure is reported together with troubleshooting hints; it is not re-raised
    print(f"❌ Export failed: {str(e)}")
    for hint in (
        "Please verify:",
        "1. '../data/processed' directory exists",
        "2. You have write permissions",
    ):
        print(hint)

✅ Saved full_dataset.parquet using pyarrow
✅ Saved product_catalog.csv


In [12]:
# ======================
#  FINAL OUTPUT FORMATTING
# ======================
from IPython.display import Markdown

# Collect the derived figures first so the Markdown template below stays readable
positive_share = cleaned_df['sentiment'].mean()
catalog_columns = ['product_id', 'brand_name', 'price_usd', 'primary_category']
unique_products = len(enhanced_df[catalog_columns].drop_duplicates())

# Last expression of the cell: rendered by the notebook as formatted Markdown
Markdown(f"""
## Data Processing Results

### Input Data
- **Product Records Loaded:** {len(products_df):,}
- **Review Records Loaded:** {len(reviews_df):,}
- **Merged Dataset:** {len(merged_df):,} reviews

### Cleaning Results
- **Final Cleaned Reviews:** {len(cleaned_df):,} 
- **Sentiment Distribution:**
  - Positive (4-5 stars): {positive_share:.1%}
  - Negative (1-2 stars): {1 - positive_share:.1%}

### Output Files
1. `full_dataset.parquet` ({len(enhanced_df):,} records)
2. `product_catalog.csv` ({unique_products:,} unique products)
""")


## Data Processing Results

### Input Data
- **Product Records Loaded:** 8,494
- **Review Records Loaded:** 1,094,411
- **Merged Dataset:** 1,094,411 reviews

### Cleaning Results
- **Final Cleaned Reviews:** 1,011,215 
- **Sentiment Distribution:**
  - Positive (4-5 stars): 88.7%
  - Negative (1-2 stars): 11.3%

### Output Files
1. `full_dataset.parquet` (1,011,215 records)
2. `product_catalog.csv` (2,347 unique products)
