Task 2: Sentiment Analysis with Natural Language Processing

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
df = pd.read_csv(r"C:\Users\dhana\Downloads\large_synthetic_reviews.csv")

# Initialize stopwords, lemmatizer, and punctuation list
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back into a string
    return " ".join(processed_tokens)

# Apply preprocessing to the "Review Text" column
df["Processed Review"] = df["Review Text"].apply(preprocess_text)

# Display the first few rows of the processed dataset
df.head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dhana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dhana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Review Text,Sentiment,Processed Review
0,Extremely durable and reliable.,Positive,extremely durable reliable
1,Extremely durable and reliable.,Positive,extremely durable reliable
2,"Great customer support, resolved my issue quic...",Positive,great customer support resolved issue quickly
3,The worst product I've ever used.,Negative,worst product ive ever used
4,"Battery life is awful, wouldn't recommend.",Negative,battery life awful wouldnt recommend


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

doc = pd.read_csv(r"C:\Users\dhana\Downloads\large_synthetic_reviews.csv")
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(doc)
feature_names = vectorizer.get_feature_names_out()
print("Feature Names:", feature_names)
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())


Feature Names: ['review' 'sentiment' 'text']
TF-IDF Matrix:
 [[0.70710678 0.         0.70710678]
 [0.         1.         0.        ]]


In [17]:
vectorizer_large = TfidfVectorizer(max_features=5000)
X_large_tfidf = vectorizer_large.fit_transform(doc["Review Text"])
y_large = doc["Sentiment"]
X_train_large, X_test_large, y_train_large, y_test_large = train_test_split(
    X_large_tfidf, y_large, test_size=0.2, random_state=42, stratify=y_large)
model_large = LogisticRegression()
model_large.fit(X_train_large, y_train_large)
y_pred_large = model_large.predict(X_test_large)
accuracy_large = accuracy_score(y_test_large, y_pred_large)
classification_rep_large = classification_report(y_test_large, y_pred_large)
print(f"Accuracy: {accuracy_large * 100:.2f}%")
print("Classification Report:\n", classification_rep_large)

Accuracy: 100.00%
Classification Report:
               precision    recall  f1-score   support

    Negative       1.00      1.00      1.00        20
    Positive       1.00      1.00      1.00        20

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40



In [19]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate individual metrics
precision = precision_score(y_test_large, y_pred_large, pos_label="Positive")
recall = recall_score(y_test_large, y_pred_large, pos_label="Positive")
f1 = f1_score(y_test_large, y_pred_large, pos_label="Positive")

# Print Evaluation Metrics
print(f"Accuracy: {accuracy_large * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")


Accuracy: 100.00%
Precision: 1.00
Recall: 1.00
F1-Score: 1.00
