# Loading

In [None]:
from utils import load_master_df, balance_master_df_classes

# Load the master dataset and run it through the class-balancing helper in one step.
# NOTE(review): the balancing strategy lives in utils.balance_master_df_classes
# and is not visible here — confirm before relying on equal class counts.
master_df = balance_master_df_classes(load_master_df())

# BoW (Bag of Words)

- Predictor: BoW vector
- Target: sentiment_category

Steps:
1. Split the sentences and sentiment_category into training and test sets
2. Fit the BoW vectorizer to the training sentences
3. Transform both the training sentences and test sentences to BoW vectors using the trained BoW vectorizer
4. Test and validate with different models

In [14]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Define features (X) and target (y)
X = master_df['sentence']  # or 'tokenized_sentence' if you prefer to feed token lists to a custom vectorizer
y = master_df['sentiment_category']

# Hold out 20% of the rows for testing. `stratify=y` keeps the class proportions
# of the target the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1111, stratify=y)

###############################################################################
# 1) Vectorize using Bag-of-Words (BoW)
###############################################################################
# Fit the vocabulary on the training sentences only, then reuse it for the test
# sentences so no information from the test set reaches the vectorizer.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# Initialize models (all seeded so repeated runs give the same results)
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=1111),
    "Decision Tree": DecisionTreeClassifier(random_state=1111),
    "Random Forest": RandomForestClassifier(random_state=1111),
}

# Number of highest-ranked features to list for each model.
top_n = 10
# Vocabulary terms aligned with the columns of the BoW matrices. The vectorizer is
# already fitted, so this is computed once instead of on every loop iteration.
feature_names = np.asarray(bow_vectorizer.get_feature_names_out())

print("========== USING BAG OF WORDS (BoW) ==========\n")
for model_name, model in models.items():
    # Train
    model.fit(X_train_bow, y_train)
    # Predict
    y_pred = model.predict(X_test_bow)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    # 'weighted' = mean of the per-class F1 scores weighted by class support
    # (identical to the macro average when every class has the same support).
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-Score (weighted): {f1:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, digits=4))
    print(f"Feature Importances / Coefficients (Top {top_n}):")

    # Derive one importance score per feature, whichever way the model exposes it.
    if hasattr(model, 'feature_importances_'):
        # For DecisionTree and RandomForest
        importances = model.feature_importances_
    elif hasattr(model, 'coef_'):
        # For Logistic Regression
        # If multi-class, model.coef_.shape is (n_classes, n_features), so
        # average the absolute coefficients across classes -> shape (n_features,).
        importances = np.mean(np.abs(model.coef_), axis=0)
    else:
        # Model exposes neither attribute: print the header only.
        importances = None

    if importances is not None:
        # Sort by descending importance and keep the first `top_n` entries.
        sorted_idx = np.argsort(importances)[::-1]
        for idx in sorted_idx[:top_n]:
            print(f"{feature_names[idx]}: {importances[idx]:.4f}")
    print("-"*50)


Model: Logistic Regression
Accuracy: 0.8633
F1-Score (weighted): 0.8626
Classification Report:
              precision    recall  f1-score   support

    negative     0.8771    0.9100    0.8933       400
     neutral     0.8705    0.7900    0.8283       400
    positive     0.8436    0.8900    0.8662       400

    accuracy                         0.8633      1200
   macro avg     0.8637    0.8633    0.8626      1200
weighted avg     0.8637    0.8633    0.8626      1200

Feature Importances / Coefficients (Top 10):
pleased: 1.4685
low: 1.3310
suck: 1.3239
defective: 1.2859
unfortunately: 1.2550
junk: 1.2208
excellent: 1.2178
hell: 1.1913
awesome: 1.1778
best: 1.1662
--------------------------------------------------
Model: Decision Tree
Accuracy: 0.8717
F1-Score (weighted): 0.8706
Classification Report:
              precision    recall  f1-score   support

    negative     0.8542    0.9375    0.8939       400
     neutral     0.8997    0.7850    0.8385       400
    positive     0.86

# TF-IDF (Term Frequency–Inverse Document Frequency)
- Predictor: TF-IDF vector
- Target: sentiment_category

Steps:
1. Reuse the train/test split of the sentences and sentiment_category created in the BoW section
2. Fit the TF-IDF vectorizer to the training sentences
3. Transform both the training sentences and test sentences to TF-IDF vectors using the trained TF-IDF vectorizer
4. Test and validate with different models

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary/weights on the training sentences only, then apply the
# fitted vectorizer to the test sentences. X_train / X_test / y_train / y_test are
# the split created in the earlier BoW cell.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Number of highest-ranked features to list for each model.
top_n = 10
# Vocabulary terms aligned with the columns of the TF-IDF matrices. The vectorizer
# is already fitted, so this is computed once instead of on every loop iteration.
feature_names = np.asarray(tfidf_vectorizer.get_feature_names_out())

print("========== USING TF-IDF ==========\n")
# NOTE: this loop refits the estimator objects stored in `models` in place, so
# after this cell runs they hold the TF-IDF fits rather than any earlier ones.
for model_name, model in models.items():
    # Train
    model.fit(X_train_tfidf, y_train)
    # Predict
    y_pred = model.predict(X_test_tfidf)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    # 'weighted' = mean of the per-class F1 scores weighted by class support
    # (identical to the macro average when every class has the same support).
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-Score (weighted): {f1:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, digits=4))

    print(f"Feature Importances / Coefficients (Top {top_n}):")

    # Derive one importance score per feature, whichever way the model exposes it.
    if hasattr(model, 'feature_importances_'):
        # For DecisionTree and RandomForest
        importances = model.feature_importances_
    elif hasattr(model, 'coef_'):
        # For Logistic Regression: average the absolute coefficients across the
        # class axis so each feature gets a single score.
        importances = np.mean(np.abs(model.coef_), axis=0)
    else:
        # Model exposes neither attribute: print the header only.
        importances = None

    if importances is not None:
        # Sort by descending importance and keep the first `top_n` entries.
        sorted_idx = np.argsort(importances)[::-1]
        for idx in sorted_idx[:top_n]:
            print(f"{feature_names[idx]}: {importances[idx]:.4f}")

    print("-"*50)


Model: Logistic Regression
Accuracy: 0.8158
F1-Score (weighted): 0.8153
Classification Report:
              precision    recall  f1-score   support

    negative     0.8469    0.8575    0.8522       400
     neutral     0.7916    0.7500    0.7702       400
    positive     0.8077    0.8400    0.8235       400

    accuracy                         0.8158      1200
   macro avg     0.8154    0.8158    0.8153      1200
weighted avg     0.8154    0.8158    0.8153      1200

Feature Importances / Coefficients (Top 10):
easy: 2.2783
pleased: 1.8293
great: 1.7523
best: 1.7181
excellent: 1.6575
fast: 1.6376
low: 1.5074
suck: 1.4837
love: 1.4656
lens: 1.4244
--------------------------------------------------
Model: Decision Tree
Accuracy: 0.8675
F1-Score (weighted): 0.8655
Classification Report:
              precision    recall  f1-score   support

    negative     0.8770    0.9450    0.9097       400
     neutral     0.8776    0.7525    0.8102       400
    positive     0.8498    0.9050    