<a href="https://colab.research.google.com/github/Bentleybb/Binli_DataAnalyst.github.io/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from nltk.corpus import movie_reviews
from nltk.sentiment import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

from scipy.sparse import hstack, csr_matrix
import numpy as np

nltk.download("movie_reviews")
nltk.download("vader_lexicon")

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:

def load_movie_reviews_dataset():

    texts = []
    labels = []

    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            texts.append(movie_reviews.raw(fileid))
            labels.append(category)

    print(f"Loaded {len(texts)} movie reviews.")
    return texts, labels


# VADER Feature Extractor
def get_vader_features(texts, sia):

    vader_features = []
    for t in texts:
        s = sia.polarity_scores(t)
        vader_features.append([
            s["compound"],
            s["pos"],
            s["neu"],
            s["neg"]
        ])
    return np.array(vader_features, dtype=float)


# Train Hybrid Model
def train_hybrid_model(texts, labels):

    sia = SentimentIntensityAnalyzer()

    # TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(
        stop_words="english",
        max_features=5000
    )

    X_tfidf = tfidf_vectorizer.fit_transform(texts)
    y = np.array(labels)

    # VADER features
    X_vader = get_vader_features(texts, sia)
    X_vader_sparse = csr_matrix(X_vader)

    # Hybrid features = TF-IDF + VADER
    X_hybrid = hstack([X_tfidf, X_vader_sparse])

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_hybrid,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y
    )

    # Logistic Regression classifier
    model = LogisticRegression(max_iter=300, n_jobs=-1)
    model.fit(X_train, y_train)

    print("Hybrid model training complete (TF-IDF + VADER).")

    return tfidf_vectorizer, sia, model, X_test, y_test


# Evaluate Model
def evaluate_model(model, X_test, y_test):
    """
    Print accuracy, classification report, and confusion matrix.
    """
    y_pred = model.predict(X_test)

    print("\n========== Evaluation on Test Set ==========")
    print(classification_report(y_test, y_pred, digits=3))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("============================================\n")


# Predict Single Review
def predict_single_review(text, tfidf_vectorizer, sia, model):

    # TF-IDF features
    X_tfidf = tfidf_vectorizer.transform([text])

    # VADER features
    s = sia.polarity_scores(text)
    vader_vector = np.array([[s["compound"], s["pos"], s["neu"], s["neg"]]])
    X_vader = csr_matrix(vader_vector)

    # Hybrid
    X_hybrid = hstack([X_tfidf, X_vader])

    # Prediction
    pred_label = model.predict(X_hybrid)[0]

    # Probability distribution
    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(X_hybrid)[0]
        classes = model.classes_
        proba_dict = {cls: float(p) for cls, p in zip(classes, probs)}
    else:
        proba_dict = {}

    return pred_label, proba_dict, s


# Interactive CLI App
def interactive_app(tfidf_vectorizer, sia, model):

    print("=== Hybrid Sentiment Analysis App (TF-IDF + VADER + ML) ===")
    print("Type a review and I will analyze its sentiment.")
    print("Type 'quit' or 'exit' to stop.\n")

    while True:
        text = input("Enter a review: ").strip()
        if text.lower() in ("quit", "exit"):
            print("Exiting application. Goodbye!")
            break
        if not text:
            continue

        pred_label, proba, vader_scores = predict_single_review(
            text, tfidf_vectorizer, sia, model
        )

        print("\n--- Result ---")
        print(f"Input: {text}")
        print(f"Predicted Sentiment: {pred_label.upper()}")

        if proba:
            print("\nClass Probabilities:")
            for cls, p in proba.items():
                print(f"  {cls}: {p:.3f}")

        print("\nVADER Scores:")
        for k, v in vader_scores.items():
            print(f"  {k}: {v:.4f}")

        print("------------------------\n")


# Main
def main():
    texts, labels = load_movie_reviews_dataset()
    tfidf_vectorizer, sia, model, X_test, y_test = train_hybrid_model(texts, labels)
    evaluate_model(model, X_test, y_test)
    interactive_app(tfidf_vectorizer, sia, model)


if __name__ == "__main__":
    main()



Loaded 2000 movie reviews.
Hybrid model training complete (TF-IDF + VADER).

              precision    recall  f1-score   support

         neg      0.799     0.735     0.766       200
         pos      0.755     0.815     0.784       200

    accuracy                          0.775       400
   macro avg      0.777     0.775     0.775       400
weighted avg      0.777     0.775     0.775       400

Confusion Matrix:
[[147  53]
 [ 37 163]]

=== Hybrid Sentiment Analysis App (TF-IDF + VADER + ML) ===
Type a review and I will analyze its sentiment.
Type 'quit' or 'exit' to stop.

Enter a review: damn good 

--- Result ---
Input: damn good
Predicted Sentiment: POS

Class Probabilities:
  neg: 0.281
  pos: 0.719

VADER Scores:
  neg: 0.4820
  neu: 0.0000
  pos: 0.5180
  compound: 0.0516
------------------------

Enter a review: i fall sallep, i dont know why people like it. the movie is kind of strange 

--- Result ---
Input: i fall sallep, i dont know why people like it. the movie is kin