In [53]:
import csv
import json
import string
from collections import defaultdict

import numpy as np
import pandas as pd
import sklearn
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [3]:
#load data/yelp_reviews.csv into a DataFrame; every later cell derives from `data`
data = pd.read_csv('data/yelp_reviews.csv')

In [4]:
#split data into train and test partitions (scikit-learn's default proportions)
#a fixed random_state keeps the split -- and every metric computed from it --
#reproducible across kernel restarts
trainData, testData = train_test_split(data, random_state=42)

In [5]:
#count the missing (NaN) entries in the attributes column
data['attributes'].isna().sum()

15050

In [6]:
#Function to clean free text (used here on the attributes column)
#Returns string of cleaned words
def clean_text(text):
    """Lower-case ``text``, remove ASCII punctuation and normalise whitespace.

    Parameters
    ----------
    text : str or missing value
        Raw text. Anything that is not a ``str`` (a NaN float, ``None``, ...)
        is treated as missing.

    Returns
    -------
    str
        The cleaned words separated by single spaces; ``''`` for missing or
        empty input.
    """
    #missing values (NaN floats, None, numpy scalars) carry no text
    if not isinstance(text, str):
        return ''
    #delete every character listed in string.punctuation in a single pass
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    #split()/join collapses runs of any whitespace (spaces, tabs, newlines) of
    #any length to one space and trims both ends; a single replace("  ", " ")
    #pass would leave doubled spaces behind for runs of three or more
    return ' '.join(text.split())

In [32]:
#clean the attribute string of every row in each split, then renumber the
#resulting Series from 0 so positions no longer follow the shuffled index
train_attributes = (
    trainData['attributes']
    .apply(clean_text)
    .reset_index(drop=True)
)
test_attributes = (
    testData['attributes']
    .apply(clean_text)
    .reset_index(drop=True)
)

In [11]:
## stars_y is the business star rating. We will be predicting that
#read the target column in one call instead of materialising a full row
#Series per record with iloc[i]; the result is still a plain Python list
y_train = trainData['stars_y'].tolist()
y_test = testData['stars_y'].tolist()

In [45]:
#tally every contiguous 1- to 5-word n-gram found in the training attributes
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for atr in train_attributes:
    tokens = atr.split()
    for size in range(1, 6):
        #an n-gram of `size` words can start at positions 0 .. len(tokens) - size
        for start in range(len(tokens) - size + 1):
            wordCount[' '.join(tokens[start:start + size])] += 1

#(count, n-gram) pairs ordered from most to least frequent;
#ties fall back to the n-gram text in descending order
counts = sorted(((freq, gram) for gram, freq in wordCount.items()), reverse=True)

In [46]:
#vocabulary: the n-gram text of the first 1000 entries of `counts`
words = [gram for _freq, gram in counts[:1000]]

In [49]:
#map each vocabulary n-gram to its position in `words`, and keep the same
#entries as a set
wordId = {gram: idx for idx, gram in enumerate(words)}
wordSet = set(words)

In [50]:
def _ngrams(tokens, max_n=5):
    """Return every contiguous 1- to ``max_n``-gram of ``tokens`` as a space-joined string."""
    return [' '.join(tokens[start:start + size])
            for size in range(1, max_n + 1)
            for start in range(len(tokens) - size + 1)]


def feature(datum, word_ids=None):
    """Build the n-gram count vector for one cleaned text string.

    Parameters
    ----------
    datum : str
        Whitespace-separated text.
    word_ids : dict, optional
        Mapping from n-gram string to column index; the indices are expected
        to be ``0 .. len(word_ids) - 1``. Defaults to the notebook-level
        ``wordId`` mapping.

    Returns
    -------
    list of int
        One count per vocabulary entry, followed by a constant ``1`` that
        serves as the offset term.
    """
    if word_ids is None:
        word_ids = wordId
    feat = [0] * len(word_ids)
    for gram in _ngrams(datum.split()):
        #dict lookup is O(1); scanning the `words` list was O(len(words)) per n-gram
        idx = word_ids.get(gram)
        if idx is not None:
            feat[idx] += 1
    feat.append(1) #offset
    return feat

In [51]:
#training design matrix: one feature vector per cleaned attribute string
X = list(map(feature, train_attributes))
y = y_train

In [55]:
#ridge regression: MSE + 1.0 l2
#fit_intercept=False because feature() already appends a constant 1 as the offset
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False).fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

In [56]:
#mean squared error of the model on the data it was fitted to
mean_squared_error(y_true=y_train, y_pred=predictions)

0.45412757830695855

In [57]:
#featurize the test attributes against the same vocabulary used for X
#(the former `y_test = y_test` self-assignment was a no-op and is dropped)
X_test = [feature(atr) for atr in test_attributes]

In [58]:
#predict targets for the test feature matrix with the fitted model
predictions_test = clf.predict(X_test)

In [59]:
#mean squared error of the model on the test split
mean_squared_error(y_true=y_test, y_pred=predictions_test)

0.4558486969476772