In [6]:
#Imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
import seaborn as sns
from time import time
import gensim
import random
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline 



In [7]:

file_path = 'Reviews.csv'

# Load data from CSV file into a DataFrame
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
print(df.head())


   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

In [8]:
#Filter rows where HelpfulnessDenominator is 0 and calculate the resulting DataFrame shape.
df[df.HelpfulnessDenominator == 0].shape


(270052, 10)

In [9]:
#Get information about the number of data points in the dataset
print("Number of datapoints: ",df.shape[0])
print("Number of features: ",df.shape[1])

Number of datapoints:  568454
Number of features:  10


In [26]:
# Define the number of reviews to extract from each rating class, to balance the classes, there is a strong imbalance in data. 
num_reviews_per_class = 29770

# Create empty list to store balanced data
balanced_data = []

# Iterate through each rating class
for score in range(1, 6):
    # Filter reviews for the current rating class
    reviews_for_score = df[df['Score'] == score]
    
    # Select a fixed number of reviews from the current rating class
    selected_reviews = reviews_for_score.head(num_reviews_per_class)
    
    # Append selected reviews to the balanced data list
    balanced_data.append(selected_reviews)

# Concatenate the balanced data DataFrames
balanced_data = pd.concat(balanced_data, ignore_index=True)

# Optional: Shuffle the balanced dataset
balanced_data = balanced_data.sample(frac=1).reset_index(drop=True)

# Display the balanced dataset
print("Balanced Dataset:")
print(balanced_data.head())
print("Number of data points in our data:", len(balanced_data))



Balanced Dataset:
       Id   ProductId          UserId ProfileName  HelpfulnessNumerator  \
0  222531  B004E4CCSQ   AUITG1DJ3QUGK    E. Swope                     0   
1  334856  B000EDGB2E  A366ISPASB3NHK   MomMcDuck                     0   
2   56482  B0000VYKZA   AM4UVNWCW9NJE  iansomniak                     0   
3   73054  B0000CGE3R  A2E0VRQ9ERB2V6    N. Brede                     1   
4    3437  B005K4Q1VI  A1Y5J68F22DRUR  Allison H.                     1   

   HelpfulnessDenominator  Score        Time  \
0                       0      3  1328140800   
1                       0      3  1348704000   
2                       2      2  1192233600   
3                       1      3  1151971200   
4                       2      3  1322611200   

                                             Summary  \
0  Convenient, green, not bad tasting, but nutrit...   
1                                       Disappointed   
2   Well, if this doesn't wake you up, NOTHING will.   
3                 

In [24]:
def partition(x):
    if x < 3:
        return 0
    return 1

#changing reviews with score less than 3 to be positive and vice-versa
actual_score = balanced_data['Score']
positiveNegative = actual_score.map(partition) 
balanced_data['Score'] = positiveNegative

print("Number of data points in our data:", balanced_data.shape)
print(balanced_data.head(3))



Number of data points in our data: (148849, 10)
       Id   ProductId         UserId            ProfileName  \
0  202406  B000FDDOGE  AJ9LNP24VF4PF    Stepping Heavenward   
1   19313  B0030MZCN0  AIFGUCOUOOFJ3                  jilly   
2   48259  B004SRH2B6  AH7FLBGNDOBDP  Amber "the PlantNerd"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     0                       0      1  1313193600   
1                     0                       0      1  1341532800   
2                     2                       2      1  1309824000   

                                    Summary  \
0  Very happy with taste, but price went up   
1                  Best Earl Grey tea ever!   
2      One of the best Cocowaters available   

                                                Text  
0  My daughter and I have been on a 'wheat free' ...  
1  This is by far the best Earl Grey tea ever. Th...  
2  This is a really good product, the flavor is n...  
Total num

In [19]:
print("Number of data points in our data:", len(balanced_data))


Number of data points in our data: 148849


In [31]:
# Filter the balanced_data DataFrame

#sorting data into positive and negative data 
display = balanced_data[(balanced_data['Score'] != 3) & (balanced_data['UserId'] == "AR5J8UI46CURR")]

# Sort the filtered DataFrame by ProductID
display = display.sort_values(by='ProductId')

# Display the first few rows of the filtered DataFrame
print(display.head())


Empty DataFrame
Columns: [Id, ProductId, UserId, ProfileName, HelpfulnessNumerator, HelpfulnessDenominator, Score, Time, Summary, Text]
Index: []


In [37]:
# Sorting data according to ProductId in ascending order
sorted_data = balanced_data.sort_values(by='ProductId', ascending=True)

# Displaying the first 5 rows of the sorted DataFrame
print(sorted_data.head(5))


            Id   ProductId          UserId  \
4587    150523  0006641040  A2P4F2UO0UMP8C   
63086   150529  0006641040   A25ACLV5KPB4W   
126068  150497  0006641040  A1HKYQOFC8ZZCH   
21455   150528  0006641040   AGQWHPNDZGUD0   
94146   150527  0006641040  A367OXCD2R6MC1   

                                  ProfileName  HelpfulnessNumerator  \
4587    Elizabeth A. Curry "Lovely Librarian"                     0   
63086                     Matt Hetling "Matt"                     0   
126068             Maria Apolloni "lanarossa"                     2   
21455                                  Bolt81                     0   
94146           Laurel "I love the internet."                     0   

        HelpfulnessDenominator  Score        Time  \
4587                         0      4  1096675200   
63086                        1      4  1108425600   
126068                       2      1  1334707200   
21455                        1      3  1296864000   
94146                        1 

In [44]:
# Deduplication of entries
final = sorted_data.drop_duplicates(subset=["UserId", "ProfileName", "Time", "Text"], keep='first', inplace=False)

# Get the shape of the deduplicated DataFrame
print(final.shape)


(117061, 10)


Observation: Additionally, it was noted that in the two rows provided below, the HelpfulnessNumerator exceeds the HelpfulnessDenominator, which is not feasible in practice. Therefore, these two rows were also excluded from the calculations.

In [40]:
# Filter the DataFrame to select rows where the Score is not equal to 3 and the Id is either 44737 or 64422
display = df[(df['Score'] != 3) & ((df['Id'] == 44737) | (df['Id'] == 64422))]

# Sort the filtered DataFrame by ProductID
display = display.sort_values(by='ProductId')

# Display the first few rows of the filtered DataFrame
display.head()


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
64421,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
44736,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [45]:
# Filter out rows where HelpfulnessNumerator is greater than HelpfulnessDenominator
final = final[final['HelpfulnessNumerator'] <= final['HelpfulnessDenominator']]

In [54]:
# Filter the balanced_data DataFrame for rows where 'Score' is equal to 1
score_1_reviews = balanced_data[balanced_data['Score'] == 3]

# Print the total number of reviews with score 1
total_score_1_reviews = len(score_1_reviews)
print("Total number of reviews with score 1:", total_score_1_reviews)


Total number of reviews with score 1: 29770


In [49]:
from sklearn.model_selection import train_test_split

# Split the balanced_data into features (X) and target variable (y)
X = balanced_data['Text']  # Features
y = balanced_data['Score']  # Target variable

# Step 1: Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

# Step 2: Preprocess the training data
# Apply tokenization, lowercasing, removing stopwords, etc. to X_train

# Step 3: Apply the same preprocessing steps to the testing data
# Ensure that the preprocessing steps are applied consistently to X_test

# Train your sentiment analysis model on X_train and y_train
# Evaluate the model on X_test and y_test to assess its performance


Shape of X_train: (119079,)
Shape of X_test: (29770,)
Shape of y_train: (119079,)
Shape of y_test: (29770,)
