In [1]:
import csv
import pandas as pd
import numpy as np
import sklearn
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

## Import Data


In [2]:
data = pd.read_csv('data/yelp_reviews.csv')

In [3]:
data.head(1)

Unnamed: 0,review_id,user_id,business_id,stars_x,useful_x,funny_x,cool_x,text,date,name_x,...,state,postal_code,latitude,longitude,stars_y,review_count_y,is_open,attributes,categories,hours
0,d6GD13VfPuCqSYWqOfkwbQ,-TCvSRyBCbKqJ7QPYt2JMw,g8OnV26ywJlZpezdBnOWUQ,4.0,0,0,0,"This place is awesome. Gigantic portion, frie...",2010-08-05 06:29:08,Keith,...,NV,89146,36.144713,-115.240092,4.0,3463,1,"{'RestaurantsAttire': ""'casual'"", 'BusinessAcc...","American (New), Restaurants, Breakfast & Brunch","{'Monday': '7:30-21:0', 'Tuesday': '7:30-21:0'..."


In [4]:
data.shape

(500000, 43)

### Split Data

In [6]:
trainData, testData = sklearn.model_selection.train_test_split(data,test_size = .20,train_size = .80)

## Clean Review Text

In [5]:
#Function to clean review text
#Returns string of cleaned words
def clean_text(text):
    punct = string.punctuation
    text = text.replace("\n", ' ').replace("\t", ' ').lower().strip()
    text = [c for c in text if not (c in punct)]
    text = ''.join(text)
    return text.strip().replace("  ",' ')

In [7]:
# Useful data structures
train_texts = []
test_texts = []

In [8]:
#Populate data structures

#Clean train texts
for i in range(len(trainData)):
    row = trainData.iloc[i]
    text = row.text
    train_texts.append(clean_text(text))

#Clean test texts
for i in range(len(testData)):
    row = testData.iloc[i]
    text = row.text
    test_texts.append(clean_text(text))

In [9]:
# stars_x is the review star rating. We will be predicting that
y_train = [trainData.iloc[i].stars_x for i in range(len(trainData))]

In [10]:
y_test = [testData.iloc[i].stars_y for i in range(len(testData))]

#### Implement TFIDF Word Vectorizor and apply Ridge Regression

In [11]:
#Training
vectorizer = TfidfVectorizer(strip_accents='unicode', lowercase=True,stop_words='english')
X = vectorizer.fit_transform(train_texts)
y = y_train

In [12]:
reg = Ridge(alpha = 1.5)
reg.fit(X,y)

Ridge(alpha=1.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [13]:
y_preds = reg.predict(X)

In [14]:
mean_squared_error(y, y_preds)

0.5824477310251903

In [15]:
# Testing on test data
testData = vectorizer.transform(test_texts)

In [16]:
y_test_preds = reg.predict(testData)

In [17]:
mean_squared_error(y_test, y_test_preds)

1.278195377547233