In [1]:
import pandas as pd

# Replace 'path_to_dataset.csv' with the uploaded file's name
data = pd.read_csv("path_to_dataset.csv", encoding= 'latin-1')
print(data.head())

                        Review_Text  Sentiment
0             This book is amazing!          1
1              Not worth the money.          0
2                I loved the story!          1
3  Poor quality, very disappointed.          0
4        The writing was beautiful.          1


In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec

data = pd.read_csv("path_to_dataset.csv", encoding= 'latin-1')

data.columns = ["Review_Text", "Sentiment"] # Renaming the selected columns

# Step 2: Preprocess the Text
def preprocess_text(text):
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)
    return text

data["Review_Text"] = data["Review_Text"].apply(preprocess_text)

# ... (rest of the code remains the same)

# Step 3: Feature Extraction
# Option 1: TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_features = tfidf_vectorizer.fit_transform(data["Review_Text"])

# Option 2: Word2Vec
sentences = [review.split() for review in data["Review_Text"]]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

def get_word2vec_features(text):
    words = text.split()
    vector = np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv] or [np.zeros(100)], axis=0)
    return vector

word2vec_features = np.array([get_word2vec_features(review) for review in data["Review_Text"]])
# Step 4: Train a Classifier
def train_and_evaluate(features, labels):
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    # Using models that can handle dense feature vectors
    models = {
        "Logistic Regression": LogisticRegression(),
        "SVM": SVC(),
    }

    results = {}
    for model_name, model in models.items():
        # Fit the model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Evaluate
        accuracy = accuracy_score(y_test, y_pred)
        # Changed 'binary' to 'weighted' for multiclass averaging
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)

        results[model_name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1,
        }
    return results
# TF-IDF Model Results
print("Evaluating TF-IDF features")
tfidf_results = train_and_evaluate(tfidf_features, data["Sentiment"])

# Word2Vec Model Results
print("Evaluating Word2Vec features")
word2vec_results = train_and_evaluate(word2vec_features, data["Sentiment"])

# Step 5: Compare Results
print("TF-IDF Results:")
print(pd.DataFrame(tfidf_results).T)

print("\nWord2Vec Results:")
print(pd.DataFrame(word2vec_results).T)



Evaluating TF-IDF features
Evaluating Word2Vec features
TF-IDF Results:
                     Accuracy  Precision  Recall  F1-Score
Logistic Regression       0.0        1.0     0.0       0.0
SVM                       0.0        1.0     0.0       0.0

Word2Vec Results:
                     Accuracy  Precision  Recall  F1-Score
Logistic Regression       0.0        1.0     0.0       0.0
SVM                       0.0        1.0     0.0       0.0
