In [12]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm

%matplotlib inline

In [23]:
%matplotlib inline

In [13]:
dataset = pd.read_json('./data/dataset.json')
dataset.head()

Unnamed: 0,name,business_id,text,review stars,business stars,cool
0,Secret Pizza,iCQpiavjjPzJ5_3gPD5Ebg,The pizza was okay. Not the best I've had. I p...,2,4.0,0
1,Leticia's Mexican Cocina,pomGBqfbxcqPv14c3XH-ZQ,I love this place! My fiance And I go here atl...,5,4.0,0
10,National Car Rental,yFumR3CWzpfvTH2FCthvVw,I have been an Emerald Club member for a numbe...,5,4.0,0
100,Other Mama,7wHLFohwCw8l6WS-feLjeg,I had an amazing time here. They were very bus...,5,4.5,1
1000,Noodle Pot,5Lcit9Zt6TF9bmKoFMhF0g,"Terrible service period, since we don't speak ...",1,4.0,0


In [14]:
X = dataset['text'].tolist()
y = dataset['business stars'].tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [24]:
_pipe = Pipeline([
    ('tfidf',TfidfVectorizer(stop_words='english',lowercase=True,analyzer='word',token_pattern=r'\w+')),
    ('linear', LinearRegression())
])


_pipe.fit(X_train, y_train)
predict_linear = _pipe.predict(X_test)

In [25]:
print(mean_squared_error(y_test,predict_linear))

1.4095771769238785


In [26]:
_pipe_knr = Pipeline([
    ('tfidf',TfidfVectorizer(stop_words='english',lowercase=True,analyzer='word',token_pattern=r'\w+')),
    ('KNR', KNeighborsRegressor(n_neighbors=5))
])
_pipe_knr.fit(X_train, y_train)
predict_knr = _pipe_knr.predict(X_test)

In [27]:
print(mean_squared_error(y_test,predict_knr))

1.2282103433762381


## Experiments using GridSearch

In [5]:
# Diminuindo dataset para GridSearch

n_df = dataset.sample(5000, replace=True)
n_X_train = n_df['text']
n_y_train = n_df['review stars']

In [6]:
parameters = {'KNR__n_neighbors': (3, 4, 5, 6, 7, 8, 9) ,
              'KNR__weights': ('uniform', 'distance'),
              'KNR__leaf_size': (10, 20, 30, 40)
             }

_pipe_knr = Pipeline([
    ('tfidf',TfidfVectorizer(stop_words='english',lowercase=True,analyzer='word',token_pattern=r'\w+')),
    ('KNR', KNeighborsRegressor())
])

gs_knr = GridSearchCV(_pipe_knr, parameters, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
gs_knr.fit(n_X_train, n_y_train)
print('Best score :' + str(gs_knr.best_score_))
print('Best params :' + str(gs_knr.best_params_))

Best score :-1.5395214690797348
Best params :{'KNR__leaf_size': 10, 'KNR__n_neighbors': 9, 'KNR__weights': 'distance'}


In [10]:
_pipe_knr = Pipeline([
    ('tfidf',TfidfVectorizer(stop_words='english',lowercase=True,analyzer='word',token_pattern=r'\w+')),
    ('KNR', KNeighborsRegressor(n_neighbors=9, leaf_size=10, weights='distance'))
])
_pipe_knr.fit(n_X_train, n_y_train)
predict_knr = _pipe_knr.predict(X_test)
print(mean_squared_error(y_test,predict_knr))

0.7963976416797578
