In [47]:
import csv
import pandas as pd
import numpy as np
import sklearn
import string
import json
import datetime

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [48]:
data = pd.read_csv('yelp_reviews.csv')

In [49]:
data = data[['user_id', 'business_id', 'stars_x', 'text', 'date', 'name_x', 'name_y', 'stars_y', 'review_count_y', 'is_open', 'attributes', 'categories', 'hours']].rename(columns={'stars_x':'review_rating', 'name_x':'user_name', 'name_y':'business_name', 'stars_y':'business_rating', 'review_count_y':'num_reviews'})

In [50]:
trainData, testData = sklearn.model_selection.train_test_split(data,test_size = .25,train_size = .75)

## Clean Review Text

In [51]:
#Function to clean review text
#Returns string of cleaned words
def clean_text(text):
    punct = string.punctuation
    text = text.replace("\n", ' ').replace("\t", ' ').lower().strip()
    text = [c for c in text if not (c in punct)]
    text = ''.join(text)
    return text.strip().replace("  ",' ')

In [52]:
# Clean up the text data
data['text'] = data.text.apply(clean_text)

## ML Pipeline: Text Preprocessing + Regression


In [47]:
text_reg = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 4))),
    ('tfidf', TfidfTransformer()),
    ('ridge', Ridge(alpha = 1.5)),
])

In [48]:
text_reg.fit(trainData.text, trainData.review_rating)

Pipeline(steps=[('vect', CountVectorizer(ngram_range=(1, 4))),
                ('tfidf', TfidfTransformer()), ('ridge', Ridge(alpha=1.5))])

In [49]:
preds = text_reg.predict(testData.text)
preds[:10]

array([3.65292101, 1.72560006, 1.64344974, 3.8674253 , 1.6839272 ,
       5.85893053, 4.26804874, 5.57323894, 4.44188238, 3.87413115])

In [50]:
mean_squared_error(testData.review_rating, preds)

0.658704641319846

In [None]:
# With the pipeline 0.658704641319846