In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
import re


In [40]:
#downloading stopwords from nltk library
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aritra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [41]:
stop_words = set(stopwords.words('English'))

In [42]:
# Loading the dataset
file_path = 'C:\\Users\\Aritra\\Desktop\\amazon_reviews\\amazon_reviews.csv'
df = pd.read_csv(file_path)

In [43]:
#checking the data
print(df.head())

   Unnamed: 0  reviewerName  overall  \
0           0           NaN      4.0   
1           1          0mie      5.0   
2           2           1K3      4.0   
3           3           1m2      5.0   
4           4  2&amp;1/2Men      5.0   

                                          reviewText  reviewTime  day_diff  \
0                                         No issues.  2014-07-23       138   
1  Purchased this for my device, it worked as adv...  2013-10-25       409   
2  it works as expected. I should have sprung for...  2012-12-23       715   
3  This think has worked out great.Had a diff. br...  2013-11-21       382   
4  Bought it with Retail Packaging, arrived legit...  2013-07-13       513   

   helpful_yes  helpful_no  total_vote  score_pos_neg_diff  \
0            0           0           0                   0   
1            0           0           0                   0   
2            0           0           0                   0   
3            0           0           0    

In [44]:
#Step 1: Creating a binary sentiment column from the overall column named 'overall'
#Ratings of 4 and 5 are considered positive(1), 1 and 2 are negative(0) and dropping 3 star reviews
df = df[df['overall'] != 3]
df['sentiment'] = df['overall'].apply(lambda rating: 1 if rating >= 4 else 0)

In [45]:
#step 2: Cleaning the text by preprocessing function
def preprocess_text(text):
    
    #Then converting the text to lowercase
    text = text.lower()
    
    #Removing special characters, numbers and punctuations
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', ' ', text)
    
    #Removing single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    
    #Removing multiple spaces
    text = re.sub(r'\s+', ' ', text)
    
    #Removing stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    return text
    

In [48]:
#Checking for missing values
print(df['reviewText'].isnull().sum())  


1


In [52]:
#Filling NaN values in the 'reviewText' column with an empty string
df['reviewText'] = df['reviewText'].fillna('')


In [54]:
#Dropping rows where 'reviewText' is NaN
df = df.dropna(subset=['reviewText'])

#Applying the preprocessing function
df['cleaned_review'] = df['reviewText'].apply(preprocess_text)


In [57]:
#Step 3: Spiltting the data into training and testing data
X = df['cleaned_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [58]:
#step 4: Converting the text data into TF and IDF features
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

In [59]:
#Step 5: Training the Logistic Regression Model
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)

In [60]:
#Step 6: Predicting and Evaluating the Model
y_pred = lr_model.predict(X_test_tfidf)

In [61]:
#Calculating the accuracy of model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9382198952879581


In [62]:
#Detailed classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.08      0.14        61
           1       0.94      1.00      0.97       894

    accuracy                           0.94       955
   macro avg       0.78      0.54      0.56       955
weighted avg       0.92      0.94      0.92       955

