In [13]:
# sentiment_analysis.py
"""
Sentiment Analysis of Product Reviews
Author: [Daphne Christy J]
Description:
This script classifies product reviews as Positive, Neutral, or Negative using
Natural Language Processing (NLP) and a machine learning model.
"""

import pandas as pd
import numpy as np
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import joblib

# =========================
#  DATA LOADING
# =========================
def load_data(file_path):
    """
    Load CSV file containing reviews with 'review' and 'sentiment' columns.
    """
    # Try specifying the delimiter. If it's not a comma, change 'sep=',' to the correct one.
    # If there are bad lines causing errors, you can use error_bad_lines=False (for older pandas)
    # or on_bad_lines='skip' or 'warn' (for newer pandas versions).
    df = pd.read_csv(file_path, sep='\t', on_bad_lines='skip', names=['review', 'sentiment'], header=0)
    df.dropna(subset=['review', 'sentiment'], inplace=True)
    return df

# =========================
#  TEXT PREPROCESSING
# =========================
def clean_text(text):
    """
    Lowercase, remove punctuation and special characters
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# =========================
#  MODEL TRAINING
# =========================
def train_model(df):
    """
    Train a simple TF-IDF + Naive Bayes classifier
    """
    df['clean_review'] = df['review'].apply(clean_text)
    X = df['clean_review']
    y = df['sentiment']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    model = MultinomialNB()
    model.fit(X_train_vec, y_train)

    # Evaluate
    y_pred = model.predict(X_test_vec)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    # Save model and vectorizer
    joblib.dump(model, "sentiment_model.pkl")
    joblib.dump(vectorizer, "vectorizer.pkl")
    print(" Model and vectorizer saved!")

    return model, vectorizer

# =========================
#  PREDICTION FUNCTION
# =========================
def predict_sentiment(review, model, vectorizer):
    review_clean = clean_text(review)
    review_vec = vectorizer.transform([review_clean])
    prediction = model.predict(review_vec)[0]
    return prediction

# =========================
#  MAIN FUNCTION
# =========================
if __name__ == "__main__":
    # Example CSV file path
    file_path = "reviews.csv"  # Replace with your CSV file
    df = load_data(file_path)

    model, vectorizer = train_model(df)

    # Test predictions
    test_reviews = [
        "Love it!!",
        "Total waste of money.",
        "It’s okay, nothing special."
    ]

    print("\n Sample Predictions:")
    for rev in test_reviews:
        pred = predict_sentiment(rev, model, vectorizer)
        print(f"Review: {rev}\nPredicted Sentiment: {pred}\n")

Accuracy: 0.0

Classification Report:
               precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       1.0
    Positive       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0

 Model and vectorizer saved!

 Sample Predictions:
Review: Love it!!
Predicted Sentiment: Positive

Review: Total waste of money.
Predicted Sentiment: Positive

Review: It’s okay, nothing special.
Predicted Sentiment: Neutral



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
import pandas as pd

file_path = "reviews.csv"  # Replace with your CSV file

try:
    # Attempt to read with semicolon delimiter and skip bad lines
    df = pd.read_csv(file_path, sep=';', on_bad_lines='skip')
    print("Successfully loaded with semicolon delimiter, skipping bad lines.")
    display(df.head())
except Exception as e:
    print(f"Could not load with semicolon delimiter: {e}")
    # If semicolon doesn't work, you might need to try other delimiters or inspect the file manually.
    print("Please manually inspect the 'reviews.csv' file to determine the correct delimiter and check for any issues on line 4.")

Successfully loaded with semicolon delimiter, skipping bad lines.


Unnamed: 0,review\tsentiment
0,Love it!!\tPositive
1,Total waste of money.\tNegative
2,"It’s okay, nothing special.\tNeutral"
3,"Excellent quality, will buy again\tPositive"
4,"Very bad, do not recommend\tNegative"
