In [1]:
#import pandas module for creating dataframe
import pandas as pd
import csv

#import regular expression library
import re

# Load the raw reviews into a DataFrame. The path is relative to the notebook's
# working directory; the cleaning steps below read the 'Text' column from it.
data = pd.read_csv("Review.csv")

#import string module for string manipulation
import string

# Translation table that deletes every character listed in string.punctuation.
# It is built once here so that each call does not have to rebuild it.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with all ASCII punctuation characters removed.

    Parameters
    ----------
    text : str
        A single document. Values that are not strings (for example the
        float NaN pandas uses for a missing cell) are returned unchanged so
        the function can be applied to a column that contains gaps.

    Returns
    -------
    str
        ``text`` without any character found in ``string.punctuation``;
        all other characters, including whitespace and digits, are kept.
    """
    if not isinstance(text, str):
        # Nothing to strip from a missing / non-text value.
        return text
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every review in the 'Text' column and keep the result in a new 'clean_punctuation' column
data['clean_punctuation'] = data['Text'].map(remove_punctuation)

# Normalise the punctuation-free text to lower case, stored in a new 'clean_lower' column
data['clean_lower'] = data['clean_punctuation'].str.lower()

# Pattern matching a single digit (\d) or hyphen (-). Written as a raw string so
# the backslash reaches the regex engine instead of being read as a (invalid)
# string escape, and compiled once so it is not re-parsed on every call.
_DIGIT_OR_HYPHEN = re.compile(r"[\d-]")


def remove_numbers(text):
    """Return ``text`` with every digit and hyphen removed.

    Parameters
    ----------
    text : str
        A single document. Values that are not strings (for example the
        float NaN pandas uses for a missing cell) are returned unchanged.

    Returns
    -------
    str
        ``text`` with each digit or hyphen replaced by the empty string.
    """
    if not isinstance(text, str):
        # Nothing to strip from a missing / non-text value.
        return text
    return _DIGIT_OR_HYPHEN.sub('', text)

#applying the remove_numbers function to the 'clean_lower' column and storing the result in a new column 'clean_number'
data['clean_number'] = data['clean_lower'].apply(remove_numbers)

# Path to the CSV file you want to write into
csv_file_path = "Reviews.csv"

# Helper columns created while cleaning; they are not part of the output file.
intermediate_columns = ['clean_punctuation', 'clean_lower', 'clean_number']

# Keep every original column and replace 'Text' with its fully cleaned version.
# The later cells read both 'Score' and 'Text' from this file, so writing the
# text column alone would leave them without labels.
# NOTE(review): assumes Review.csv carries the 'Score' column - confirm.
data_to_write = (
    data
    .drop(columns=intermediate_columns, errors='ignore')
    .assign(Text=data['clean_number'])
)

# Writing data to the CSV file (header row first, no index column).
# NOTE(review): the file is opened with the platform default encoding, while the
# later cells read it as ISO-8859-1 - confirm the two agree for non-ASCII text.
with open(csv_file_path, mode='w', newline='') as file:
    data_to_write.to_csv(file, index=False)

print("Data has been written to", csv_file_path)

Data has been written to Reviews.csv


# Machine Learning

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import pandas as pd

# Read the CSV file
df = pd.read_csv('Reviews.csv', encoding='ISO-8859-1')

# Extract 'Score' and 'Text' columns and drop rows where either is missing:
# CountVectorizer rejects a missing (NaN) document, and a missing score would
# otherwise be silently labelled as negative by the comparison below.
data = df[['Score', 'Text']].dropna()

# Split data into features and labels
texts = data['Text']

# Convert labels to binary sentiment classes (positive: 1, negative: 0).
# Only scores above 3 count as positive, so a score of 3 is treated as negative.
labels = (data['Score'] > 3).astype(int)

# Split data into training (60%) and testing (40%) sets; the fixed seed keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.4, random_state=42
)

# Extract features (bag of words representation). The vocabulary is fitted on
# the training texts only and then reused to encode the test texts.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Initialize classifiers
nb_classifier = MultinomialNB()
svm_classifier = SVC(kernel='linear')

# Train classifiers
nb_classifier.fit(X_train, y_train)
svm_classifier.fit(X_train, y_train)

# Predict sentiment using classifiers and calculate classification report
nb_classification_report = classification_report(y_test, nb_classifier.predict(X_test))
svm_classification_report = classification_report(y_test, svm_classifier.predict(X_test))

# Print classification report for Naive Bayes
print("\nClassification Report for Naive Bayes:")
print(nb_classification_report)

# Print classification report for SVM
print("\nClassification Report for SVM:")
print(svm_classification_report)


Classification Report for Naive Bayes:
              precision    recall  f1-score   support

           0       0.79      0.30      0.44       494
           1       0.83      0.98      0.90      1706

    accuracy                           0.83      2200
   macro avg       0.81      0.64      0.67      2200
weighted avg       0.82      0.83      0.79      2200


Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.61      0.61      0.61       494
           1       0.89      0.89      0.89      1706

    accuracy                           0.82      2200
   macro avg       0.75      0.75      0.75      2200
weighted avg       0.82      0.82      0.82      2200



# Lexicon

In [4]:
import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import classification_report

# Read the CSV file, decoding it as ISO-8859-1
df = pd.read_csv('Reviews.csv', encoding='ISO-8859-1')

# Extract 'Score' and 'Text' columns, dropping any rows with missing values
# in either column so every remaining row has both a rating and a review text
data = df[['Score', 'Text']].dropna() 

# Convert 'Score' to corresponding sentiment labels
def score_to_sentiment(score):
    """Map a numeric review score to a three-way sentiment label.

    Parameters
    ----------
    score : int or float
        The review rating.

    Returns
    -------
    str
        'positive' for scores above 3, 'negative' for scores below 3 and
        'neutral' otherwise (a score of exactly 3, or a value such as NaN
        that compares as neither above nor below 3).
    """
    # Compare against the midpoint on both sides so that a fractional score
    # such as 2.5 is labelled negative rather than falling through to positive.
    if score > 3:
        return 'positive'
    if score < 3:
        return 'negative'
    return 'neutral'

data['Sentiment'] = data['Score'].apply(score_to_sentiment)

# Split data into features and labels
texts = data['Text']
labels = data['Sentiment']

# Create the VADER analyzer once and reuse it for every review; it does not
# depend on the text, so rebuilding it inside the loop is repeated work.
analyzer = SentimentIntensityAnalyzer()

# Lexicon-based sentiment analysis: label each review with both tools
tb_pred = []
vader_pred = []
for text in texts:
    # TextBlob: the sign of the polarity score decides the label
    tb_polarity = TextBlob(text).sentiment.polarity
    if tb_polarity > 0:
        tb_label = 'positive'
    elif tb_polarity < 0:
        tb_label = 'negative'
    else:
        tb_label = 'neutral'
    tb_pred.append(tb_label)

    # VADER: use the thresholds given in the VADER documentation, which are
    # inclusive (compound >= 0.05 positive, <= -0.05 negative, else neutral)
    vader_compound = analyzer.polarity_scores(text)['compound']
    if vader_compound >= 0.05:
        vader_label = 'positive'
    elif vader_compound <= -0.05:
        vader_label = 'negative'
    else:
        vader_label = 'neutral'
    vader_pred.append(vader_label)

# Print classification reports
print("\nClassification Report for TextBlob:")
print(classification_report(labels, tb_pred, target_names=['negative', 'neutral', 'positive']))

print("\nClassification Report for VADER:")
print(classification_report(labels, vader_pred, target_names=['negative', 'neutral', 'positive']))


Classification Report for TextBlob:
              precision    recall  f1-score   support

    negative       0.56      0.37      0.44       828
     neutral       0.12      0.02      0.04       428
    positive       0.83      0.95      0.88      4243

    accuracy                           0.79      5499
   macro avg       0.50      0.45      0.45      5499
weighted avg       0.73      0.79      0.75      5499


Classification Report for VADER:
              precision    recall  f1-score   support

    negative       0.61      0.40      0.48       828
     neutral       0.15      0.05      0.08       428
    positive       0.84      0.96      0.90      4243

    accuracy                           0.80      5499
   macro avg       0.53      0.47      0.48      5499
weighted avg       0.75      0.80      0.77      5499



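# Machine learning shows higher accuracy than the lexicon-based approach

Note: the comparison is indicative only. The machine-learning models were scored on a two-class task (score > 3 vs. the rest) using a 40% hold-out split, whereas the lexicon methods were scored on a three-class task (negative / neutral / positive) over the full dataset.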

## Machine Learning (Naive Bayes & SVM)
### Strengths:
- Generally more accurate.
- Adaptable – can be trained on specific datasets.
- Can capture complex patterns in language.
### Weaknesses:
- Requires labeled training data.
- Naive Bayes can be sensitive to class imbalances.


## Lexicon-Based (TextBlob & VADER)
### Strengths:
- Simpler to implement.
- Doesn't require training data.
- Works well for general sentiment analysis.
### Weaknesses:
- Less accurate than machine learning models.
- Struggles with sarcasm, irony, and context-dependent sentiment.
- Requires up-to-date lexicons for new words and phrases.