<a href="https://colab.research.google.com/github/ANJU-9676/pythonfiless/blob/main/sentiment_analysis_for_product_reviews_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
preprocess.py
- Functions to load raw CSV, map ratings -> labels (if needed), clean text,
  and save an optional processed CSV ('data/processed_reviews.csv').

Usage:
>>> from preprocess import load_and_preprocess
>>> df = load_and_preprocess("data/reviews.csv", save_processed=True)
"""

import re
import os
import pandas as pd
import nltk

from nltk.corpus import stopwords

# ensure nltk resources
try:
    _ = stopwords.words("english")
except LookupError:
    nltk.download("stopwords")

STOPWORDS = set(stopwords.words("english"))

DEFAULT_LABELS = ("negative", "neutral", "positive")


def map_rating_to_label(r):
    """Map a numeric rating to a sentiment label (customize thresholds here).

    Thresholds: ``r <= 2`` -> "negative", ``2 < r < 4`` -> "neutral",
    ``r >= 4`` -> "positive".

    Args:
        r: Rating value; anything ``float()`` accepts (number or numeric string).

    Returns:
        "negative", "neutral" or "positive", or None when ``r`` cannot be
        converted to a float or is NaN (i.e. the rating is missing).
    """
    import math  # local import: the module header does not import math

    try:
        r = float(r)
    except (TypeError, ValueError, OverflowError):
        return None
    # NaN fails every comparison below and would otherwise fall through to
    # "positive"; a missing rating carries no sentiment.
    if math.isnan(r):
        return None
    if r <= 2.0:
        return "negative"
    if r < 4.0:
        return "neutral"
    return "positive"


def clean_text(s, keep_exclamation_question=True, keep_emojis=False):
    """
    Normalize a raw review string into a space-joined token string.

    Steps, in order:
    - lowercase
    - replace URLs with a space
    - replace every character that is not a-z, 0-9, whitespace or an
      apostrophe with a space; '!' and '?' survive when
      keep_exclamation_question is True, and non-ASCII characters (emojis
      and any other non-ASCII text) survive when keep_emojis is True
    - split on whitespace
    - negation handling: a negation word ('not', 'no', 'never', or any
      contraction ending in "n't" such as "don't") is dropped and the
      following token is emitted as 'not_<token>'; a negation word that is
      the last token is emitted unchanged so the signal is not lost
    - drop remaining tokens that are in STOPWORDS

    Returns "" for non-string input.
    """
    if not isinstance(s, str):
        return ""
    s = s.lower()
    # remove urls
    s = re.sub(r"http\S+|www\S+|https\S+", " ", s)

    # Build one character whitelist so that keep_emojis is actually honoured:
    # a fixed ASCII-only whitelist would strip non-ASCII text regardless.
    allowed = r"a-z0-9\s'"
    if keep_exclamation_question:
        allowed += "!?"
    if keep_emojis:
        allowed += "\u0080-\U0010ffff"
    s = re.sub("[^" + allowed + "]", " ", s)

    negators = ("not", "no", "never")
    out_tokens = []
    pending_negation = None
    for t in s.split():
        if pending_negation is not None:
            # attach the negation to this token -> 'not_good'
            out_tokens.append("not_" + t)
            pending_negation = None
            continue
        # Apostrophes are kept above, so contractions arrive as whole tokens
        # ("don't"); they must be caught here, before the stopword check
        # would discard them.
        if t in negators or t.endswith("n't"):
            pending_negation = t
            continue
        if t in STOPWORDS:
            continue
        out_tokens.append(t)
    if pending_negation is not None:
        # negation with nothing after it: keep the word itself
        out_tokens.append(pending_negation)
    return " ".join(out_tokens)


def load_and_preprocess(path="data/reviews.csv", review_col="review", label_col=None,
                        rating_col="rating", save_processed=False, processed_path="data/processed_reviews.csv"):
    """
    Load a reviews CSV, derive a 'label' column, clean the review text, and
    drop rows whose review is missing or cleans down to an empty string.

    Label resolution order: the `label_col` column when given and present;
    else an existing 'label' column; else `rating_col` mapped through
    map_rating_to_label; else every label is None (label externally).
    Rows without a label are kept - the caller decides whether to drop them.

    Args:
        path: CSV file to read.
        review_col: Column holding the review text. When absent, the first
            column whose name contains "review" or "text" is used instead.
        label_col: Optional column to copy into 'label'.
        rating_col: Numeric rating column used when no label column exists.
        save_processed: When True, write the result to `processed_path`.
        processed_path: Destination CSV; parent directories are created.

    Returns:
        DataFrame with columns 'review' (raw), 'label', 'clean_review'.

    Raises:
        FileNotFoundError: `path` does not exist.
        ValueError: no usable review/text column was found.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"{path} not found. Place your CSV at this path.")

    df = pd.read_csv(path)
    if label_col and label_col in df.columns:
        df["label"] = df[label_col]
    elif "label" not in df.columns:
        if rating_col in df.columns:
            df["label"] = df[rating_col].apply(map_rating_to_label)
        else:
            # no label/rating provided; user must label externally
            df["label"] = None

    if review_col not in df.columns:
        # try common alternatives
        candidates = [c for c in df.columns if "review" in c.lower() or "text" in c.lower()]
        if not candidates:
            raise ValueError("No review/text column found. Provide 'review' column or specify review_col.")
        review_col = candidates[0]

    df = df[[review_col, "label"]].rename(columns={review_col: "review"})
    # Drop missing reviews BEFORE astype(str): NaN would otherwise become the
    # literal text "nan" and be treated as a real review.
    df = df.dropna(subset=["review"]).copy()
    df["review"] = df["review"].astype(str)
    df["clean_review"] = df["review"].apply(clean_text)
    # Reviews that clean down to nothing carry no features.
    df = df[df["clean_review"] != ""].copy()

    if save_processed:
        os.makedirs(os.path.dirname(processed_path) or ".", exist_ok=True)
        df.to_csv(processed_path, index=False)

    return df


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Colab-ready helper: upload/mount CSV and run train.py
# Paste this into a Colab cell and run.

import os
import shutil
import subprocess
from pathlib import Path

DATA_LOCAL = "data/product_reviews.csv"
SRC_TRAIN_MODULE = "src.train"   # we will call python -m src.train --data <path>
ALTERNATE_DRIVE_PATH = "/content/drive/MyDrive/product_reviews.csv"  # example if you put file in Drive

# ensure data dir exists
os.makedirs("data", exist_ok=True)

def try_run_training(data_path):
    """Run the training module in a child process and stream its output.

    Executes ``<interpreter> -m <SRC_TRAIN_MODULE> --data <data_path>`` with
    stderr merged into stdout, echoing each output line as it arrives.

    Args:
        data_path: Path of the CSV handed to the training module (str or
            path-like).

    Returns:
        The child process's exit code.
    """
    import sys  # local import: this cell's header does not import sys

    print(f"\n==> Running training with data: {data_path}\n")
    # sys.executable keeps the child on the same interpreter (and installed
    # packages) as this notebook; -m avoids import problems when running from
    # the project root.
    cmd = [sys.executable or "python", "-m", SRC_TRAIN_MODULE, "--data", str(data_path)]
    # show command for debugging
    print("Command:", " ".join(cmd))
    # The context manager closes the pipe and waits for the child even if
    # streaming is interrupted.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as proc:
        for line in proc.stdout:
            print(line, end="")
    print(f"\nProcess exited with code {proc.returncode}")
    return proc.returncode

# 1) If file already exists in data/, run training directly
if os.path.exists(DATA_LOCAL):
    print(f"Found CSV at {DATA_LOCAL}.")
    try_run_training(DATA_LOCAL)
else:
    # 2) If you previously mounted Drive and put the file there, try that
    if os.path.exists(ALTERNATE_DRIVE_PATH):
        print(f"Found CSV in Drive at {ALTERNATE_DRIVE_PATH}. Copying to data/ and running.")
        shutil.copy(ALTERNATE_DRIVE_PATH, DATA_LOCAL)
        try_run_training(DATA_LOCAL)
    else:
        # 3) Prompt to upload the file manually from your Windows PC
        print("""
CSV not found in data/ and not found in the example Drive path.
You can either:
  A) Upload the CSV now from your computer (recommended for a single file), or
  B) Mount Google Drive and place the CSV in Drive, then re-run this cell.

Choose Upload (A) by running the upload block below when prompted.
""")
        # Interactive upload
        from google.colab import files
        uploaded = files.upload()  # this opens a file chooser in Colab
        if not uploaded:
            raise SystemExit("No files uploaded. Upload product_reviews.csv and re-run the cell.")

        # Move the first uploaded csv (or the one named product_reviews.csv) into data/
        moved = False
        for fn in uploaded.keys():
            # if the uploaded file is the exact CSV we expect, move it to data/
            if fn.lower().endswith(".csv"):
                dest = DATA_LOCAL
                print(f"Moving uploaded file {fn} -> {dest}")
                shutil.move(fn, dest)
                moved = True
                break

        if not moved:
            # nothing matched .csv (unlikely), pick the first file
            first = list(uploaded.keys())[0]
            dest = DATA_LOCAL
            print(f"No .csv detected; moving first uploaded file {first} -> {dest}")
            shutil.move(first, dest)

        # finally run training
        try_run_training(DATA_LOCAL)

# NOTE:
# - Make sure your repo (with src/train.py) is present in the Colab VM.
#   If you haven't uploaded your project files, upload the `src/` folder or clone your repo first:
#     !git clone <your-repo-url>
# - Ensure src/train.py uses parse_known_args() (the version we discussed).
# - If training fails due to missing packages, run in a cell before this:
#     !pip install -r requirements.txt
#   or individually:
#     !pip install scikit-learn pandas joblib



CSV not found in data/ and not found in the example Drive path.
You can either:
  A) Upload the CSV now from your computer (recommended for a single file), or
  B) Mount Google Drive and place the CSV in Drive, then re-run this cell.

Choose Upload (A) by running the upload block below when prompted.



Saving product_reviews.csv to product_reviews.csv
Moving uploaded file product_reviews.csv -> data/product_reviews.csv

==> Running training with data: data/product_reviews.csv

Command: python -m src.train --data data/product_reviews.csv
/usr/bin/python3: Error while finding module specification for 'src.train' (ModuleNotFoundError: No module named 'src')

Process exited with code 1


In [None]:
!pip install scikit-learn pandas joblib




In [None]:
# ============================
# SINGLE CODE FOR COLAB
# Upload CSV → Save to data/ → Train model
# ============================

# 1) Upload the CSV from your Windows PC (Downloads folder)
from google.colab import files
uploaded = files.upload()   # Choose: product_reviews.csv

# 2) Move uploaded CSV into data/ folder
import os, shutil
os.makedirs("data", exist_ok=True)

csv_name = list(uploaded.keys())[0]        # get uploaded filename
destination = "data/product_reviews.csv"   # train.py will use this
shutil.move(csv_name, destination)

print("File saved to:", destination)

# 3) Install required packages
!pip install scikit-learn pandas joblib

# 4) Run your training script using the uploaded file
!python src/train.py --data data/product_reviews.csv


Saving product_reviews.csv to product_reviews.csv
File saved to: data/product_reviews.csv
python3: can't open file '/content/src/train.py': [Errno 2] No such file or directory


In [None]:
# src/preprocess.py
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

def load_data(path="data/reviews.csv"):
    """Read the reviews CSV and make sure it has a 'label' column.

    When the file has a 'rating' column but no 'label' column, labels are
    derived from the rating: ``<= 2`` -> 'negative', ``2 < r < 4`` ->
    'neutral', ``>= 4`` -> 'positive'; a missing rating yields a missing
    label. Rows with a missing 'review' are dropped.

    Args:
        path: CSV file to read.

    Returns:
        The loaded DataFrame.
    """
    df = pd.read_csv(path)
    # map rating -> label if necessary
    if 'rating' in df.columns and 'label' not in df.columns:
        def map_rating(r):
            # NaN fails every comparison and would otherwise become 'positive'
            if pd.isna(r):
                return None
            if r <= 2:
                return 'negative'
            # range check (not == 3) so fractional ratings such as 2.5 or 3.5
            # are neutral instead of falling through to 'positive'
            if r < 4:
                return 'neutral'
            return 'positive'
        df['label'] = df['rating'].apply(map_rating)
    df = df.dropna(subset=['review'])
    return df

def clean_text(s, keep_emojis=False):
    """Lowercase, strip URLs and punctuation, and remove stopwords.

    Args:
        s: Raw review text; non-string input yields "".
        keep_emojis: When True, non-ASCII characters (emojis and any other
            non-ASCII text) are kept instead of being replaced by spaces.

    Returns:
        The surviving tokens joined by single spaces.
    """
    if not isinstance(s, str):
        return ""
    s = s.lower()
    s = re.sub(r"http\S+|www\S+|https\S+", "", s)
    allowed = r"a-z0-9\s!?"  # keep punctuation tokens like ! ?
    if keep_emojis:
        # without this the whitelist strips emojis regardless of the flag
        allowed += "\u0080-\U0010ffff"
    s = re.sub("[^" + allowed + "]", " ", s)
    tokens = [w for w in s.split() if w not in STOPWORDS]
    return " ".join(tokens)


In [None]:
# ========================================================
# 🔥 FINAL SINGLE CODE FOR COLAB (UPLOAD → LOAD → TRAIN)
# ========================================================

# 1️⃣ Upload your CSV file from Windows (Downloads folder)
from google.colab import files
uploaded = files.upload()   # choose product_reviews.csv

# 2️⃣ Save it into /content/data/
import os, shutil

os.makedirs("data", exist_ok=True)
csv_name = list(uploaded.keys())[0]               # get uploaded filename
csv_path = "data/product_reviews.csv"             # final path we will use

shutil.move(csv_name, csv_path)
print("CSV successfully saved to:", csv_path)

# 3️⃣ Define load_data() so it reads the uploaded file
import pandas as pd

def load_data(path="data/product_reviews.csv"):
    """Read a reviews CSV into a DataFrame.

    Args:
        path: CSV file to read; defaults to the location the uploaded file
            is moved to.
    """
    return pd.read_csv(path)

# 4️⃣ Test loading (this will NOT fail now)
df = load_data()
print("Loaded rows:", len(df))
print(df.head())

# 5️⃣ If you want basic analysis (optional)
df['length'] = df['review'].astype(str).apply(lambda x: len(x.split()))
print(df.groupby('label')['length'].describe())

# 6️⃣ Train your model (assuming src/train.py exists)
!python src/train.py --data data/product_reviews.csv


Saving product_reviews.csv to product_reviews.csv
CSV successfully saved to: data/product_reviews.csv
Loaded rows: 200
                                         review     label
0  This product is amazing and works perfectly!  positive
1         Terrible product, completely useless.  negative
2   Very good quality, totally worth the money.  positive
3             Very bad quality, waste of money.  negative
4      Excellent purchase, I am very satisfied.  positive
          count  mean       std  min  25%  50%  75%  max
label                                                   
negative  100.0   5.6  0.804030  4.0  5.0  6.0  6.0  7.0
positive  100.0   6.2  1.172065  4.0  5.0  6.5  7.0  8.0
python3: can't open file '/content/src/train.py': [Errno 2] No such file or directory


In [None]:
# Single cell: show ALL filtered reviews+labels (scrollable) + full interactive filters
import os, math, shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets

# CONFIG: path to CSV
CSV_PATH = "data/product_reviews.csv"   # change if needed

# Load data
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV not found at: {CSV_PATH}. Upload or change CSV_PATH.")

df = pd.read_csv(CSV_PATH)
if "review" not in df.columns or "label" not in df.columns:
    raise ValueError("CSV must include 'review' and 'label' columns.")
df = df.copy()
df["review"] = df["review"].astype(str)
df["label"] = df["label"].astype(str)
df["length"] = df["review"].apply(lambda s: len(str(s).split()))
df["char_count"] = df["review"].apply(len)

# Compute group stats
def compute_group_stats(df):
    """Summarize the 'length' column per 'label'.

    Returns a DataFrame indexed by label with the columns count, mean, std,
    min, 25%, 50%, 75% and max. A label with a single row has no sample
    standard deviation; its std is reported as 0.0.
    """
    lengths_by_label = df.groupby("label")["length"]
    # Named aggregation: each key becomes the output column name directly.
    summary = lengths_by_label.agg(**{
        "count": "count",
        "mean": "mean",
        "std": "std",
        "min": "min",
        "25%": lambda col: col.quantile(0.25),
        "50%": lambda col: col.quantile(0.5),
        "75%": lambda col: col.quantile(0.75),
        "max": "max",
    })
    summary["std"] = summary["std"].fillna(0.0)
    return summary

group_stats = compute_group_stats(df)

# -------- Widgets ----------
label_choices = sorted(df["label"].unique().tolist())
# SelectMultiple with all labels selected by default
label_sel = widgets.SelectMultiple(options=label_choices, value=tuple(label_choices), description="Labels:", rows=6)
select_all_btn = widgets.Button(description="Select All", layout=widgets.Layout(width='110px'))
clear_all_btn  = widgets.Button(description="Clear All",  layout=widgets.Layout(width='110px'))

keyword_txt = widgets.Text(value="", description="Keyword:")
regex_toggle = widgets.Checkbox(value=False, description="Regex (keyword)")

min_len = widgets.IntText(value=int(df["length"].min()), description="Min words:")
max_len = widgets.IntText(value=int(df["length"].max()), description="Max words:")

# Group-level sliders
gs = group_stats
def make_range_widget(stat_name, low, high, is_float=False):
    """Build a range slider spanning [low, high] with the full range selected.

    Args:
        stat_name: Text used as the slider description.
        low: Lower bound of the slider.
        high: Upper bound of the slider.
        is_float: Build a FloatRangeSlider instead of an IntRangeSlider.
            For the integer slider the bounds are truncated with int().
    """
    if is_float:
        return widgets.FloatRangeSlider(value=(low, high), min=low, max=high, step=(high-low)/100 if high>low else 0.1, description=stat_name, continuous_update=False)
    else:
        return widgets.IntRangeSlider(value=(int(low), int(high)), min=int(low), max=int(high), step=max(1, int((high-low)/50)) if high>low else 1, description=stat_name, continuous_update=False)

def _outward_int_bounds(stat_name):
    """Integer (low, high) that fully contains the per-label values of a stat."""
    # Quantiles can be fractional (e.g. a median of 6.5). Truncating the upper
    # bound with int() would produce a slider whose default range already
    # excludes that label, so round the bounds outwards instead.
    return int(np.floor(gs[stat_name].min())), int(np.ceil(gs[stat_name].max()))

count_rng = make_range_widget("count", *_outward_int_bounds("count"))
mean_rng  = make_range_widget("mean", float(np.floor(gs['mean'].min())), float(np.ceil(gs['mean'].max())), is_float=True)
std_rng   = make_range_widget("std", float(np.floor(gs['std'].min())), float(np.ceil(gs['std'].max())), is_float=True)
min_rng   = make_range_widget("min", *_outward_int_bounds("min"))
p25_rng   = make_range_widget("25%", *_outward_int_bounds("25%"))
p50_rng   = make_range_widget("50%", *_outward_int_bounds("50%"))
p75_rng   = make_range_widget("75%", *_outward_int_bounds("75%"))
max_rng   = make_range_widget("max", *_outward_int_bounds("max"))

group_filter_toggle = widgets.Checkbox(value=False, description="Enable group-level label filters")

apply_btn = widgets.Button(description="Apply filters", button_style="primary")
export_btn = widgets.Button(description="Export filtered CSV", button_style="success")
out = widgets.Output(layout={'border': '1px solid black'})

# -------- Helper functions ----------
def labels_meeting_group_filters(gs_df):
    """List the labels whose group statistics fall inside every slider range.

    Args:
        gs_df: Per-label statistics table indexed by label, with the columns
            count, mean, std, min, 25%, 50%, 75% and max.

    Returns:
        All labels when the group-filter checkbox is off; otherwise only the
        labels for which each statistic lies within its slider's selected
        range (both ends inclusive).
    """
    if not group_filter_toggle.value:
        return list(gs_df.index)

    # (statistic column, slider, cast applied to the slider bounds)
    slider_specs = (
        ('count', count_rng, int),
        ('mean', mean_rng, float),
        ('std', std_rng, float),
        ('min', min_rng, int),
        ('25%', p25_rng, int),
        ('50%', p50_rng, int),
        ('75%', p75_rng, int),
        ('max', max_rng, int),
    )
    keep = pd.Series(True, index=gs_df.index)
    for column, slider, cast in slider_specs:
        lower, upper = slider.value[0], slider.value[1]
        values = gs_df[column]
        keep &= (values >= cast(lower)) & (values <= cast(upper))
    return list(gs_df.index[keep])

def get_filtered_df():
    """Return a copy of the rows of `df` that pass every active filter.

    Filters applied (all must hold):
    - label is among the selected labels (skipped when nothing is selected)
    - label passes the group-level filters
    - keyword, if any, occurs in the review or the label: interpreted as a
      case-sensitive regular expression when the Regex checkbox is ticked,
      otherwise as a case-insensitive literal substring
    - word count lies within [min_len, max_len]
    """
    gs_cur = compute_group_stats(df)
    labels_allowed_by_group = labels_meeting_group_filters(gs_cur)
    mask = pd.Series(True, index=df.index)
    selected_labels = list(label_sel.value)
    if selected_labels:
        mask &= df['label'].isin(selected_labels)
    mask &= df['label'].isin(labels_allowed_by_group)
    kw = keyword_txt.value.strip()
    if kw:
        if regex_toggle.value:
            mask &= (df['review'].str.contains(kw, regex=True, na=False) | df['label'].str.contains(kw, regex=True, na=False))
        else:
            # str.contains defaults to regex=True; pass regex=False so that
            # characters such as '(' or '.' are matched literally instead of
            # raising or acting as wildcards.
            mask &= (df['review'].str.contains(kw, case=False, regex=False, na=False) | df['label'].str.contains(kw, case=False, regex=False, na=False))
    mask &= (df['length'] >= int(min_len.value)) & (df['length'] <= int(max_len.value))
    return df[mask].copy()

def render_html_table(dframe, max_height_px=400):
    """Wrap the full table for `dframe` in a scrollable, height-limited box.

    Every row is rendered (cell contents HTML-escaped, index omitted); the
    surrounding div caps the visible height at `max_height_px` pixels and
    scrolls the rest. Returns an IPython HTML display object.
    """
    table_markup = dframe.to_html(index=False, classes="table table-striped", escape=True)
    container_open = (
        f'<div style="max-height:{max_height_px}px; overflow:auto; '
        'border:1px solid #ccc; padding:8px;">'
    )
    wrapped = "\n    " + container_open + "\n      " + table_markup + "\n    </div>\n    "
    return HTML(wrapped)

# -------- Actions ----------
def apply_filters(b=None):
    """Recompute the filtered view and redraw summary, plots and table in `out`.

    Args:
        b: Button instance passed by ipywidgets on click; unused, and None
            when the function is called directly for the initial render.
    """
    filtered = get_filtered_df()
    gs_cur = compute_group_stats(df)
    labels_allowed = labels_meeting_group_filters(gs_cur)
    with out:
        clear_output(wait=True)
        print(f"Total rows in file: {len(df)}")
        print(f"Rows after filter: {len(filtered)}")
        print("\nLabel counts (filtered):")
        display(filtered['label'].value_counts())
        print("\nCurrent group-level stats (per label):")
        display(gs_cur)
        if group_filter_toggle.value:
            print("Labels allowed by group filters:", labels_allowed)
        else:
            print("Group filters disabled — all labels allowed.")

        if filtered.empty:
            # Nothing to describe or plot; the plotting calls below are not
            # meaningful on an empty frame, so stop with a clear message.
            print("\nNo rows match the current filters.")
            return

        print("\nGrouped descriptive stats for 'length' (filtered):")
        display(filtered.groupby("label")["length"].describe())

        # Plots
        sns.set(style="whitegrid")
        plt.figure(figsize=(6,3))
        sns.countplot(data=filtered, x="label")
        plt.title("Label counts (filtered)")
        plt.tight_layout(); plt.show()

        plt.figure(figsize=(8,3.5))
        sns.histplot(filtered["length"], bins=15)
        plt.title("Histogram of review length (words) - combined (filtered)")
        plt.tight_layout(); plt.show()

        if filtered["label"].nunique() > 0:
            g = sns.FacetGrid(filtered, col="label", height=3.5, aspect=1)
            g.map_dataframe(sns.histplot, x="length", hue="label", palette="Set2", legend=False, bins=15)
            plt.subplots_adjust(top=0.85)
            g.fig.suptitle("Faceted histogram by label (filtered)")
            plt.show()

        # a per-label KDE comparison needs at least two labels
        if filtered["label"].nunique() > 1:
            plt.figure(figsize=(8,3.5))
            sns.kdeplot(data=filtered, x="length", hue="label", fill=True, palette="Set2", legend=True)
            plt.title("KDE of length by label (filtered)")
            plt.tight_layout(); plt.show()

        plt.figure(figsize=(6,3.5))
        sns.boxplot(data=filtered, x="label", y="length")
        plt.title("Boxplot: length by label (filtered)")
        plt.tight_layout(); plt.show()

        print("\nFiltered data (scrollable):")
        display(render_html_table(filtered[["review","label","length","char_count"]], max_height_px=450))

def select_all_labels(b):
    """Button callback: select every available label in the label picker."""
    every_label = (*label_choices,)
    label_sel.value = every_label

def clear_all_labels(b):
    """Button callback: deselect every label in the label picker."""
    label_sel.value = ()

def export_filtered(b):
    """Button callback: write the currently filtered rows to a CSV file.

    The file is written to 'filtered_reviews.csv' in the working directory
    (index omitted); a confirmation line and a link to the file are then
    appended to the `out` widget.
    """
    out_path = "filtered_reviews.csv"
    rows = get_filtered_df()
    rows.to_csv(out_path, index=False)
    link_markup = f'<a href="{out_path}" target="_blank">Download filtered_reviews.csv</a>'
    with out:
        print(f"Exported {len(rows)} rows to {out_path}")
        display(HTML(link_markup))

# wire up
apply_btn.on_click(apply_filters)
select_all_btn.on_click(select_all_labels)
clear_all_btn.on_click(clear_all_labels)
export_btn.on_click(export_filtered)

# layout
label_buttons = widgets.HBox([select_all_btn, clear_all_btn, export_btn])
row_filters_box = widgets.VBox([widgets.HTML(value="<b>Row-level filters</b>"),
                                widgets.Label("Select labels (multi-select)"),
                                label_sel, label_buttons,
                                keyword_txt, regex_toggle,
                                widgets.HBox([min_len, max_len]),
                                apply_btn])

group_filters_box = widgets.VBox([widgets.HTML(value="<b>Group-level label filters</b>"),
                                  group_filter_toggle,
                                  count_rng, mean_rng, std_rng, min_rng, p25_rng, p50_rng, p75_rng, max_rng])

display(widgets.HBox([row_filters_box, group_filters_box], layout=widgets.Layout(align_items='flex-start')))
display(out)

# initial run
apply_filters()


HBox(children=(VBox(children=(HTML(value='<b>Row-level filters</b>'), Label(value='Select labels (multi-select…

Output(layout=Layout(border='1px solid black'))

In [None]:
# Run this once at the top of your notebook to guarantee 'clean_review' exists
import os, re, pandas as pd

CSV_PATH = "data/product_reviews.csv"   # adjust if needed

if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV not found at: {CSV_PATH}")

df = pd.read_csv(CSV_PATH)

# If you already have 'clean_review', do nothing (but normalize it a bit)
if "clean_review" in df.columns:
    print("Found existing column 'clean_review' — normalizing values.")
    df["clean_review"] = df["clean_review"].astype(str).str.strip()
else:
    # If 'review' exists, create 'clean_review' from it
    if "review" in df.columns:
        print("Column 'clean_review' not found — creating from 'review' with basic cleaning.")
        def simple_clean(text):
            """Lowercase, replace punctuation with spaces, collapse whitespace.

            Word characters, apostrophes and hyphens are kept; every other
            non-whitespace character becomes a space.
            """
            s = str(text).lower()
            # remove punctuation except apostrophes and hyphens
            s = re.sub(r"[^\w\s'-]", " ", s)
            # strip last: replaced punctuation at either end ("useless.")
            # would otherwise leave leading/trailing whitespace behind
            return re.sub(r"\s+", " ", s).strip()
        df["clean_review"] = df["review"].apply(simple_clean)
    else:
        raise KeyError("Neither 'clean_review' nor 'review' columns found in the CSV. Please provide one of them.")

# If you rely on 'label' too, ensure it's string
if "label" not in df.columns:
    raise KeyError("Missing required column 'label' in the CSV.")
df["label"] = df["label"].astype(str)

# Save back to CSV (optional) or continue using df in-memory
# df.to_csv(CSV_PATH, index=False)  # uncomment if you want to persist

print("Sample rows (clean_review, label):")
print(df[["clean_review","label"]].head(8).to_string(index=False))

# Assign back to the notebook variable expected by the rest of the code
# If your notebook expects variable name `df` (as in previous cells), this is already done.
# If other cells import the CSV again, make sure they use this df or re-run them after this cell.


Column 'clean_review' not found — creating from 'review' with basic cleaning.
Sample rows (clean_review, label):
                                 clean_review    label
 this product is amazing and works perfectly  positive
         terrible product completely useless  negative
   very good quality totally worth the money  positive
             very bad quality waste of money  negative
      excellent purchase i am very satisfied  positive
              stopped working after one week  negative
the build quality is great and feels premium  positive
             cheap material and poorly built  negative


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load data
df = pd.read_csv("data/product_reviews.csv")   # change path if needed

# Prepare features
if "clean_review" in df.columns:
    X = df["clean_review"].astype(str)
elif "review" in df.columns:
    X = df["review"].astype(str)
else:
    raise KeyError("No 'review' or 'clean_review' column found.")

# Prepare labels
if "label" not in df.columns:
    raise KeyError("No 'label' column found.")

y = df["label"].astype(str)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# ----- OUTPUT -----
print("Train size:", len(X_train))
print("Test size :", len(X_test))

print("\nSample X_train values:")
print(X_train.head(10).to_string(index=True))  # show first 10 training feature rows

print("\nSample y_train values:")
print(y_train.head(10).to_string(index=True))  # show first 10 training label rows


Train size: 160
Test size : 40

Sample X_train values:
109             I am very disappointed, not recommended.
150    Fast delivery and the product exceeded expecta...
157                 Performance is awful and unreliable.
0           This product is amazing and works perfectly!
116                  Value for money. I would buy again.
170    Fast delivery and the product exceeded expecta...
33                Does not match the description at all.
87                      Cheap material and poorly built.
147                     Cheap material and poorly built.
115             Horrible experience, will not buy again.

Sample y_train values:
109    negative
150    positive
157    negative
0      positive
116    positive
170    positive
33     negative
87     negative
147    negative
115    negative


In [None]:
# TfidfVectorizer is used below, so import it here rather than relying on
# another cell having been run first.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# TF-IDF features feeding a class-balanced logistic regression.
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=20000, ngram_range=(1,2))),
    ("clf", LogisticRegression(max_iter=2000, class_weight='balanced'))
])

param_grid = {
    "tfidf__max_features": [10000, 20000],
    "tfidf__ngram_range": [(1,1), (1,2)],
    "clf__C": [0.1, 1, 10]
}

grid = GridSearchCV(pipe, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
# Fit on the training split only (X_train / y_train from the train-test split
# cell). Fitting on the full DataFrame would let the model see the rows later
# used as X_test, so any score reported on that test set would be leaked.
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)
model = grid.best_estimator_


{'clf__C': 0.1, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 1)} 1.0


In [None]:
# Single-file end-to-end sentiment pipeline (fixed)
import os
import re
import joblib
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# -------- CONFIG --------
CSV_PATH = "data/product_reviews.csv"
MODEL_OUT = "models/tfidf_logreg.pkl"
TEST_SIZE = 0.20
RANDOM_STATE = 42
# ------------------------

# 1) Load CSV
df = pd.read_csv(CSV_PATH)

if "label" not in df.columns:
    raise KeyError("CSV must have 'label' column.")

if "clean_review" not in df.columns and "review" not in df.columns:
    raise KeyError("CSV must include 'clean_review' or 'review'.")

# 2) Ensure clean_review exists
def clean(t):
    """Lowercase, replace punctuation with spaces, collapse whitespace.

    Word characters, apostrophes and hyphens are kept; every other
    non-whitespace character becomes a space.
    """
    t = str(t).lower()
    t = re.sub(r"[^\w\s'-]", " ", t)
    # strip last: replaced punctuation at either end ("useless.") would
    # otherwise leave leading/trailing whitespace behind
    return re.sub(r"\s+", " ", t).strip()

if "clean_review" in df.columns:
    df["clean_review"] = df["clean_review"].astype(str).apply(clean)
else:
    df["clean_review"] = df["review"].astype(str).apply(clean)

# 3) Prepare X, y
X = df["clean_review"]
y = df["label"].astype(str)

# 4) Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

print(f"Train size: {len(X_train)}")
print(f"Test size : {len(X_test)}")
print("-" * 60)

# 5) Pipeline
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=15000, ngram_range=(1,2))),
    ("clf", LogisticRegression(max_iter=2000))
])

# 6) Train
pipeline.fit(X_train, y_train)
print("Training complete.")
print("-" * 60)

# 7) Predict
y_pred = pipeline.predict(X_test)

print("Classification Report:\n")
print(classification_report(y_test, y_pred, digits=4))

# --- FIXED PART HERE ---
labels = np.unique(
    np.concatenate([
        y_test.unique(),           # pandas Series unique()
        np.unique(y_pred)          # numpy correct usage
    ])
)

cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
print("\nConfusion Matrix:")
display(cm_df)

# 8) Save model
os.makedirs(os.path.dirname(MODEL_OUT), exist_ok=True)
joblib.dump(pipeline, MODEL_OUT)
print(f"\nSaved model to: {MODEL_OUT}")


Train size: 160
Test size : 40
------------------------------------------------------------
Training complete.
------------------------------------------------------------
Classification Report:

              precision    recall  f1-score   support

    negative     1.0000    1.0000    1.0000        20
    positive     1.0000    1.0000    1.0000        20

    accuracy                         1.0000        40
   macro avg     1.0000    1.0000    1.0000        40
weighted avg     1.0000    1.0000    1.0000        40


Confusion Matrix:


Unnamed: 0,negative,positive
negative,20,0
positive,0,20



Saved model to: models/tfidf_logreg.pkl



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [None]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)  # correct variable name

print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
print(cm)


              precision    recall  f1-score   support

    negative       1.00      1.00      1.00        20
    positive       1.00      1.00      1.00        20

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40

[[20  0]
 [ 0 20]]


In [None]:
import os
import joblib

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists when this cell is run on a fresh runtime.
os.makedirs("models", exist_ok=True)
joblib.dump(model, "models/tfidf_logreg.pkl")


['models/tfidf_logreg.pkl']

In [None]:
# src/predict.py
import joblib
model = joblib.load("models/tfidf_logreg.pkl")
def predict_text(text):
    """Return the loaded model's predicted label for a single review string."""
    # the model expects a batch, so wrap the text and unwrap the one result
    (label,) = model.predict([text])
    return label

print(predict_text("Battery died in 2 days - terrible!"))


negative


In [None]:
# src/app.py
from fastapi import FastAPI
import joblib
app = FastAPI()
model = joblib.load("models/tfidf_logreg.pkl")

@app.post("/predict")
async def predict(payload: dict):
    """Classify the "text" field of the JSON body.

    A missing "text" key is treated as the empty string. The response holds
    the predicted label and the highest class probability as "confidence".
    """
    batch = [payload.get("text", "")]
    predicted = model.predict(batch)[0]
    confidence = float(model.predict_proba(batch).max())
    return {"label": predicted, "confidence": confidence}


In [None]:
# src/app.py
from fastapi import FastAPI
import joblib
app = FastAPI()
model = joblib.load("models/tfidf_logreg.pkl")

@app.post("/predict")
async def predict(payload: dict):
    """Classify the "text" field of the JSON body.

    A missing "text" key is treated as the empty string. The response holds
    the predicted label and the highest class probability as "confidence".
    """
    batch = [payload.get("text", "")]
    predicted = model.predict(batch)[0]
    confidence = float(model.predict_proba(batch).max())
    return {"label": predicted, "confidence": confidence}


In [None]:
!pip install datasets transformers accelerate torch scikit-learn




In [None]:
# ==========================
# SINGLE CODE — UPLOAD + SAVE + LOAD CSV
# ==========================

import os, shutil
import pandas as pd
from google.colab import files

print("📌 Step 1: Upload your CSV file")
uploaded = files.upload()   # choose your file: product_reviews.csv / reviews.csv etc.

# If nothing was uploaded (e.g. the dialog was cancelled) csv_path would stay
# None and pd.read_csv would fail with an unrelated-looking error, so stop
# here with a clear message instead.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and choose a CSV file.")

# Create data/ folder
os.makedirs("data", exist_ok=True)

# Move every uploaded file into data/. When several files are selected, the
# last one moved is the one loaded in step 2.
csv_path = None
for fname in uploaded:
    csv_path = os.path.join("data", fname)
    shutil.move(fname, csv_path)
    print(f"✅ File saved as: {csv_path}")

print("\n📌 Step 2: Loading the CSV…")

# Load the file
df = pd.read_csv(csv_path)

print("✅ Loaded successfully!")
print("\n🔹 First 5 rows:")
print(df.head())

print("\n🔹 Columns in file:")
print(df.columns.tolist())


📌 Step 1: Upload your CSV file


Saving product_reviews.csv to product_reviews.csv
✅ File saved as: data/product_reviews.csv

📌 Step 2: Loading the CSV…
✅ Loaded successfully!

🔹 First 5 rows:
                                         review     label
0  This product is amazing and works perfectly!  positive
1         Terrible product, completely useless.  negative
2   Very good quality, totally worth the money.  positive
3             Very bad quality, waste of money.  negative
4      Excellent purchase, I am very satisfied.  positive

🔹 Columns in file:
['review', 'label']


In [None]:
# Resume training WITHOUT W&B prompts (single cell)
import os

# WANDB_DISABLED switches wandb off; WANDB_MODE=offline is an optional second
# guard in case something still tries to call wandb.
os.environ.update({"WANDB_DISABLED": "true", "WANDB_MODE": "offline"})

# If you created dataset, tokenizer, model earlier in the session, reuse them.
# Otherwise re-import minimal pieces (safe to run even if already imported).
from datasets import load_dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np

# --- Config (adjust model_name / CSV if needed) ---
model_name = "distilbert-base-uncased"
csv_path = "data/product_reviews.csv"

# Tokenizer that matches the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the "review" column of a batch, truncated/padded to 128 tokens."""
    reviews = batch["review"]
    return tokenizer(reviews, truncation=True, padding="max_length", max_length=128)


# Load the CSV and tokenize it batch by batch.
dataset = load_dataset("csv", data_files=csv_path).map(tokenize, batched=True)

# label encoding (recreate if needed)
# Sort the distinct labels so the label <-> id mapping is the same on every
# run; iterating a bare set of strings can yield a different order in each
# interpreter session, which would silently change which id means which class.
label_list = sorted(set(dataset["train"]["label"]))
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}


def encode_labels(example):
    """Replace the example's "label" value with its integer id from label2id."""
    example["label"] = label2id[example["label"]]
    return example


dataset = dataset.map(encode_labels)

# metric
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score an evaluation pass: unpack (logits, labels) and return the metric result."""
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

# load model (classification head will be randomly init if needed — that's fine)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(label_list), id2label=id2label, label2id=label2id
)

# TrainingArguments: set report_to="none" (if supported) and keep save_steps/logging_steps
train_args = TrainingArguments(
    output_dir="bert_output",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    save_steps=500,
    logging_steps=50,
    # try to ensure no remote reporting:
    report_to="none"  # if old HF, this may be ignored but WANDB_DISABLED handles it
)

# Hold out 20% of the rows for evaluation. Evaluating on the same rows the
# model was trained on only measures memorisation, so the reported accuracy
# would say nothing about unseen reviews. The seed keeps the split repeatable.
splits = dataset["train"].train_test_split(test_size=0.2, seed=42)

# Create Trainer and train
# NOTE(review): newer transformers releases warn that `tokenizer=` is
# deprecated in favour of `processing_class=`; kept as-is so the cell still
# runs on older versions — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Starting training (W&B disabled). If this cell runs, wandb won't prompt you.")
trainer.train()
print("Training finished.")


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training (W&B disabled). If this cell runs, wandb won't prompt you.




Step,Training Loss
50,0.4317


Training finished.


In [None]:
# quick_demo.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import re

# tiny dataset: (review text, sentiment label) pairs
data = [
    ("Love it! Arrived quickly and works as expected", "positive"),
    ("Terrible product. Broke in one day", "negative"),
    ("Okay, battery life could be better", "neutral"),
    ("Awesome quality, highly recommend!", "positive"),
    ("Stopped working after a week", "negative"),
]

# One row per pair, with the two tuple fields named review / label.
df = pd.DataFrame.from_records(data, columns=["review", "label"])

def clean(s):
    """Lowercase the text and delete anything that starts with "http" up to the next whitespace."""
    lowered = s.lower()
    return re.sub(r"http\S+", "", lowered)

# Normalise the review text before vectorising.
df['review'] = df['review'].apply(clean)

# 60/40 split with a fixed seed.
# NOTE(review): with only five rows the test split has two samples, so the
# scores printed below are not meaningful; this cell is a smoke test only.
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['label'], test_size=0.4, random_state=42)

# TF-IDF over unigrams and bigrams feeding a logistic-regression classifier.
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2))),
    ("clf", LogisticRegression())
])
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)

# zero_division=0 reports undefined precision/recall as 0 (the same numbers as
# the default) without emitting an UndefinedMetricWarning for every class that
# has no predicted or no true samples.
print(classification_report(y_test, pred, zero_division=0))


              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       2.0
    positive       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
