In [18]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

For this sentiment analysis, I used the Amazon Customer Reviews Dataset. This dataset contains product reviews from Amazon and is widely used for sentiment analysis tasks. It is publicly available on the Amazon Customer Reviews Dataset page on the Amazon Web Services (AWS) website.

In [19]:
# Load the tab-separated reviews file; dtype=str keeps every column as text.
DATA_FILE = 'amazonreviews.tsv'
df = pd.read_csv(DATA_FILE, dtype=str, sep='\t')

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [20]:
# Size of the loaded dataset as a (rows, columns) tuple
df.shape

(10000, 2)

# Pre-processing the data

In [21]:
# Remove rows with a missing label or review.
# The per-column null counts are kept in a variable so they can be inspected;
# as a bare mid-cell expression the result was computed and then discarded.
null_counts = df.isnull().sum()
df = df.dropna()

# Remove reviews that are empty or contain only whitespace.
# A vectorised check replaces the itertuples() loop, which unpacked exactly
# three fields (and so failed on any extra column) and relied on
# str.isspace(), which is False for '' and let truly empty strings through.
blanks = df.index[df['review'].str.strip() == '']
df = df.drop(blanks)

# Split the data set into train and test portions:
# 33% of the rows are held out, with a fixed seed so the split is repeatable.
X = df['review']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Model 1 : Logistic Regression

In [22]:
# Train the model: TF-IDF features feeding a logistic-regression classifier.
# NOTE(review): lowercase=False keeps the original casing here, whereas the
# Linear SVC pipeline uses TfidfVectorizer's defaults - confirm this is intended.
lr_model = Pipeline([
    ('tfidf', TfidfVectorizer(lowercase=False)),
    ('clf', LogisticRegression(solver='lbfgs')),
])
lr_model.fit(X_train, y_train)

# Predict on the held-out split and collect the metrics as a dict so they can
# be tabulated in a DataFrame.
predictions = lr_model.predict(X_test)
report = classification_report(y_test, predictions, output_dict=True)

# One row per class/average, rounded to two decimals.
df_report = pd.DataFrame(report).transpose().round(2)

# Shade the table with a light-blue gradient; the styled frame is the cell output.
cm = sns.light_palette("blue", as_cmap=True)
df_report.style.background_gradient(cmap=cm)

Unnamed: 0,precision,recall,f1-score,support
neg,0.84,0.88,0.86,1649.0
pos,0.87,0.83,0.85,1651.0
accuracy,0.85,0.85,0.85,0.85
macro avg,0.85,0.85,0.85,3300.0
weighted avg,0.85,0.85,0.85,3300.0


# Model 2 : Linear SVC

In [17]:
# Fit a TF-IDF + linear support-vector pipeline on the training split.
svc_steps = [
    ('tfidf', TfidfVectorizer()),
    ('classifier', LinearSVC()),
]
my_model = Pipeline(svc_steps)
my_model.fit(X_train, y_train)

# Score the held-out reviews and tabulate the metrics, rounded to two decimals.
predictions = my_model.predict(X_test)
report = classification_report(y_test, predictions, output_dict=True)
df_report = pd.DataFrame(report).T.round(2)

# Shade the table with a light-blue gradient; the styled frame is the cell output.
cm = sns.light_palette("blue", as_cmap=True)
df_report.style.background_gradient(cmap=cm)

Unnamed: 0,precision,recall,f1-score,support
neg,0.86,0.89,0.87,1649.0
pos,0.89,0.85,0.87,1651.0
accuracy,0.87,0.87,0.87,0.87
macro avg,0.87,0.87,0.87,3300.0
weighted avg,0.87,0.87,0.87,3300.0


# Model 3 : Vader's Algorithm

In [26]:
# Setup for the VADER model.
import nltk
# Download the 'vader_lexicon' NLTK data package
# (presumably required by SentimentIntensityAnalyzer - TODO confirm).
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
# Metric helpers; classification_report is the one used to build the report tables.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report
import pandas as pd

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...


In [32]:
# Create the VADER sentiment intensity analyzer.
vader = SentimentIntensityAnalyzer()

# Ground-truth labels, in row order.
# NOTE(review): this scores every row of df, while the two trained models are
# scored on X_test only - confirm the comparison is meant to span different sets.
true_labels = df['label'].tolist()

# A review is labelled 'pos' when its compound score is >= 0, otherwise 'neg'.
predicted_labels = [
    'pos' if vader.polarity_scores(review)['compound'] >= 0 else 'neg'
    for review in df['review']
]



In [33]:
# Summarise VADER's predictions against the true labels as a rounded metrics table.
report = classification_report(true_labels, predicted_labels, output_dict=True)
df_report = pd.DataFrame(report).T.round(2)

# Shade the table with a light-blue gradient; the styled frame is the cell output.
cm = sns.light_palette("blue", as_cmap=True)
df_report.style.background_gradient(cmap=cm)

Unnamed: 0,precision,recall,f1-score,support
neg,0.86,0.52,0.64,5097.0
pos,0.64,0.91,0.75,4903.0
accuracy,0.71,0.71,0.71,0.71
macro avg,0.75,0.71,0.7,10000.0
weighted avg,0.75,0.71,0.7,10000.0


# Analysis of the three models

Logistic Regression:
Evaluation Metrics:
Accuracy: 85%
Precision: 85% (macro average; 87% for the positive class)
Recall: 85%
F1-score: 85%

Logistic Regression is a linear classification algorithm that estimates the probability of an input belonging to each class using a logistic function. It assumes a linear relationship between the input features and the log-odds of the output classes.
In the context of sentiment analysis, Logistic Regression learns the relationship between the input features (typically a bag-of-words representation of the text) and the sentiment labels (positive or negative). During training, it adjusts the model parameters to maximize the likelihood of the observed sentiment labels given the input features.

Linear SVC:
Evaluation Metrics:
Accuracy: 87%
Precision: 87% (macro average; 89% for the positive class)
Recall: 87%
F1-score: 87%

Linear SVC is a variant of Support Vector Machines (SVM) that uses a linear kernel. SVM is a powerful supervised learning algorithm for classification tasks, including sentiment analysis. The goal of SVM is to find an optimal hyperplane that separates data points of different classes with the largest margin. In the case of sentiment analysis, Linear SVC aims to find a linear decision boundary that separates positive and negative sentiment samples. During training, it identifies a subset of training samples, called support vectors, that are closest to the decision boundary.
The model then learns the optimal weights for the linear decision boundary, maximizing the margin between support vectors of different sentiment classes.

VADER's ALGORITHM:
Evaluation Metrics:
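Accuracy: 71%
Precision: 64% (positive class; macro average 75%)
Recall: 91% (positive class; macro average 71%)
F1-score: 75% (positive class; macro average 70%)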

VADER is a rule-based sentiment analysis algorithm specifically designed for sentiment analysis of social media text.
It utilizes a lexicon that assigns sentiment scores to words based on their semantic orientation (positive, negative, or neutral). VADER incorporates rules for handling negations, intensifiers, and punctuation to improve sentiment analysis accuracy.
The algorithm calculates a sentiment score for a given text based on the presence and intensity of positive and negative words.
VADER considers the overall sentiment of the text by combining the individual word scores and applying sentiment intensity modifiers.

Therefore, based on the evaluation reports, I conclude that the Linear SVC model is the best choice for this sentiment analysis task. It outperforms Logistic Regression and VADER in accuracy, precision, recall, and F1-score, and it handles high-dimensional feature spaces effectively while still providing good performance. Despite being computationally efficient, VADER does not outperform the other two models. Note that VADER was scored on all 10,000 reviews, whereas the two trained models were scored on the 3,300-review test split, so the comparison is indicative rather than exact.
