Getting data for each review

In [128]:
data_path = 'finefoods_training.txt'

# Read the whole file in one go; undecodable bytes are silently skipped
# (errors='ignore') rather than aborting the load.
with open(data_path, mode='r', encoding='utf-8', errors='ignore') as review_file:
    raw_text = review_file.read()

# Records in the file are separated by a blank line.
reviews = raw_text.split('\n\n')

Creating a dictionary for each review

In [129]:
def _parse_review(review):
    """Turn one raw review chunk into a {field: value} dict.

    Each line is split at the first ': '; lines without that separator are
    skipped. A chunk with no 'key: value' line therefore yields an empty dict.
    """
    fields = {}
    for raw_line in review.split('\n'):
        key, separator, value = raw_line.partition(': ')
        if separator:
            fields[key] = value
    return fields


# One dict per chunk, in file order.
review_data = [_parse_review(review) for review in reviews]

Creating a DataFrame from all reviews

In [4]:
import pandas as pd

In [130]:
# One row per parsed review dict; keys missing from a dict become NaN in that row.
df = pd.DataFrame(review_data)

Cleaning up the DataFrame

In [6]:
import math

In [131]:
# Keep only rows that carry a timestamp. The mask is computed in one vectorized
# pass instead of dropping rows one at a time inside iterrows(), which copied
# the whole frame on every drop. pd.to_numeric still raises ValueError on a
# value that cannot be parsed as a number, as the float() call did before.
review_times = pd.to_numeric(df['review/time'])
df = df[review_times.notna()].reset_index(drop=True)

new_headers = ['Product ID', 'User ID', 'Profile Name', 'Helpfulness', 'Score', 'Time', 'Summary', 'Text']

# NOTE(review): the rename is positional, so it assumes the parsed columns
# appear in exactly this order -- confirm against the raw file. A column-count
# mismatch raises ValueError here.
df.columns = new_headers

In [132]:
# Scores were parsed from text, so go through float before truncating to int.
# A single vectorized cast replaces the row-by-row df.at loop and leaves the
# column with a real integer dtype instead of Python ints in an object column.
df['Score'] = df['Score'].astype(float).astype(int)

Hashing function

In [8]:
import hashlib as hl

def hash_user_id(user_id):
    """Map a user-id string to a deterministic integer in [0, 10**8).

    The id is UTF-8 encoded, hashed with SHA-256, and the digest (read as a
    big-endian integer) is reduced modulo 10**8.
    """
    digest = hl.sha256(user_id.encode('utf-8')).digest()
    return int.from_bytes(digest, 'big') % 100_000_000

Before trying an NLP model, let's check whether the length of the review has any effect

In [135]:
def numerize_df(df):
  """Add numeric feature columns to `df` and return it.

  The frame is modified in place (and also returned) with four new columns:

  - 'Hashed ID': hash_user_id() applied to 'User ID'.
  - 'Has Nickname': 1 if 'Profile Name' contains a double-quote character,
    else 0. A missing profile name counts as 0.
  - 'Summary Length' / 'Text Length': character counts of 'Summary' / 'Text'.
    A missing value yields NaN.

  All four columns are stored as floats. Column-wise operations replace the
  previous iterrows()/df.at loop.
  """
  df['Hashed ID'] = df['User ID'].map(hash_user_id).astype(float)

  # regex=False: look for the literal quote character; na=False: treat a
  # missing profile name as "no nickname" instead of failing.
  has_quote = df['Profile Name'].str.contains('"', regex=False, na=False)
  df['Has Nickname'] = has_quote.astype(float)

  df['Summary Length'] = df['Summary'].str.len().astype(float)
  df['Text Length'] = df['Text'].str.len().astype(float)

  return df

In [136]:
# Adds 'Hashed ID', 'Has Nickname', 'Summary Length' and 'Text Length' to df.
df = numerize_df(df)

In [138]:
columns_for_numeric = [
    'Score',
    'Hashed ID',
    'Has Nickname',
    'Time',
    'Summary Length',
    'Text Length',
]

# Select the modelling columns and cast them all to float in one step.
numeric_df = df[columns_for_numeric].astype(float)

In [139]:
# Snapshot of the features before any shifting.
orig_numeric_df = numeric_df.copy()

columns_to_edit = ['Hashed ID', 'Has Nickname', 'Time', 'Summary Length', 'Text Length']

# Shift every feature so that its smallest value becomes zero.
for column in columns_to_edit:
  feature = numeric_df[column]
  numeric_df[column] = feature - min(feature)

In [140]:
# Inspect the min-shifted feature table.
numeric_df

Unnamed: 0,Score,Hashed ID,Has Nickname,Time,Summary Length,Text Length
0,5.0,92583352.0,0.0,342144000.0,20.0,206.0
1,1.0,89551015.0,0.0,385257600.0,16.0,133.0
2,4.0,68815001.0,1.0,257299200.0,20.0,452.0
3,2.0,2470041.0,0.0,346204800.0,13.0,162.0
4,5.0,9853436.0,1.0,389059200.0,10.0,83.0
...,...,...,...,...,...,...
1995,5.0,78773042.0,0.0,304128000.0,29.0,362.0
1996,5.0,32593894.0,1.0,318211200.0,29.0,1673.0
1997,4.0,80049914.0,1.0,288921600.0,13.0,218.0
1998,4.0,82034771.0,0.0,304041600.0,26.0,388.0


Performing Naive Bayes on the numeric data

In [142]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

target_column = 'Score'

# Separate the label from the feature columns.
y = numeric_df[target_column]
X = numeric_df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=27
)

gnb = GaussianNB()

# Fit on the training rows, then predict the held-out rows.
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

n_test = X_test.shape[0]
n_mislabeled = (y_test != y_pred).sum()
print("Number of mislabeled points out of a total of %d points: %d" % (n_test, n_mislabeled))

Number of mislabeled points out of a total of 400 points: 135


This clearly isn't very accurate.  Let's try scaling the features to have the same mean and variance

In [143]:
# Start from the (unscaled) label column.
scaled_df = pd.DataFrame({'Score': numeric_df['Score']})

# Standardize each feature: subtract its mean, divide by its standard deviation.
# NOTE(review): the statistics are computed over all rows, before the
# train/test split, so the test rows influence the scaling -- confirm this is
# acceptable.
for column in columns_to_edit:
  feature = numeric_df[column]
  scaled_df[column] = (feature - feature.mean()) / feature.std()

In [144]:
# Inspect the standardized feature table.
scaled_df

Unnamed: 0,Score,Hashed ID,Has Nickname,Time,Summary Length,Text Length
0,5.0,1.485565,-0.594911,0.254615,-0.128544,-0.340943
1,1.0,1.379774,-0.594911,1.115581,-0.408368,-0.520830
2,4.0,0.656344,1.680083,-1.439712,-0.128544,0.265252
3,2.0,-1.658272,-0.594911,0.335708,-0.618236,-0.449368
4,5.0,-1.400683,1.680083,1.191498,-0.828104,-0.644040
...,...,...,...,...,...,...
1995,5.0,1.003756,-0.594911,-0.504554,0.501060,0.043474
1996,5.0,-0.607323,1.680083,-0.223316,0.501060,3.274048
1997,4.0,1.048303,1.680083,-0.808221,-0.618236,-0.311372
1998,4.0,1.117550,-0.594911,-0.506279,0.291192,0.107543


Trying Naive Bayes again

In [145]:
# Separate the label from the standardized feature columns.
y = scaled_df[target_column]
X = scaled_df.drop(columns=[target_column])

# Same 80/20 split and seed as before, so the runs are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=27
)

# Refit the existing classifier on the standardized data and predict the held-out rows.
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

n_test = X_test.shape[0]
n_mislabeled = (y_test != y_pred).sum()
print("Number of mislabeled points out of a total of %d points: %d" % (n_test, n_mislabeled))

Number of mislabeled points out of a total of 400 points: 147


This is actually even less accurate.  Time to try using Natural Language Processing for the review summary and text

In [29]:
import spacy as sp
from textblob import TextBlob

First step is to define a function to analyze the sentiment of a string (to be used both for the review and title of the review).

In [74]:
# Load the 'en_core_web_sm' spaCy pipeline once; analyze_sentiment reuses this global on every call.
nlp = sp.load('en_core_web_sm')

def analyze_sentiment(text):
  """Return the TextBlob polarity of `text` after lemmatizing it with spaCy.

  The text is run through the global `nlp` pipeline, each token is replaced
  by its lemma, the lemmas are joined with single spaces, and the polarity of
  the resulting string is returned.
  """
  lemmas = [token.lemma_ for token in nlp(text)]
  return TextBlob(" ".join(lemmas)).sentiment.polarity

Generating a sentiment score for each review

In [146]:
# Score every review body and summary. Series.map applies analyze_sentiment
# to each value and builds the whole column at once, replacing the
# row-by-row df.at writes; the columns are created even for an empty frame.
df['Text Sentiment'] = df['Text'].map(analyze_sentiment)
df['Summary Sentiment'] = df['Summary'].map(analyze_sentiment)

Getting mean and standard deviation of these new values in preparation for scaling

In [147]:
# Order matters: index 0 is the summary sentiment, index 1 the text sentiment.
sentiment_columns = ['Summary Sentiment', 'Text Sentiment']

means = [df[name].mean() for name in sentiment_columns]
stds = [df[name].std() for name in sentiment_columns]

Scaling the sentiment values

In [148]:
# Standardize both sentiment columns with the precomputed statistics
# (means[0]/stds[0] -> summary, means[1]/stds[1] -> text). The assignment is
# aligned on the row index, so df and scaled_df must share the same index.
scaled_df['Summary Sentiment'] = (df['Summary Sentiment'] - means[0]) / stds[0]
scaled_df['Text Sentiment'] = (df['Text Sentiment'] - means[1]) / stds[1]

Removing the unhelpful Summary Length and Text Length columns

In [150]:
# Keep the label plus the features used by the final model; the two length
# columns are left out.
model_columns = [
    'Score',
    'Hashed ID',
    'Has Nickname',
    'Time',
    'Summary Sentiment',
    'Text Sentiment',
]
new_scaled_df = scaled_df[model_columns]

In [152]:
# Separate the label from the sentiment-based feature set.
y = new_scaled_df[target_column]
X = new_scaled_df.drop(columns=[target_column])

# Same 80/20 split and seed as the earlier runs, so the results are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=27
)

# Refit the existing classifier on the new features and predict the held-out rows.
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

n_test = X_test.shape[0]
n_mislabeled = (y_test != y_pred).sum()
print("Number of mislabeled points out of a total of %d points: %d" % (n_test, n_mislabeled))

Number of mislabeled points out of a total of 400 points: 121


**Conclusion**

Extracted Features:

-UserID: Nothing too helpful could be extracted from this; I simply hashed the text to get a numerical value.

-Profile Name: I used a binary flag set to 1 if the profile name contained a quoted nickname (a `"` character), 0 otherwise.

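-Time: Used the review timestamp directly as a numeric feature; it was only shifted and standardized along with the other features.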


Predictive information from reviews:
I first tried using the character count of both the summary and the review text, thinking that a longer review would indicate dissatisfaction with the product.  This did not lead to very accurate results (135 of 400 test reviews mislabeled, and 147 after standardizing the features), so I instead used an NLP approach to score the positive/negative sentiment of each review, with the help of spaCy and TextBlob.  This reduced the error to 121 of 400 mislabeled reviews.