In [None]:
# Install the language-identification package imported below as `langid`.
# NOTE(review): consider `%pip install` with a pinned version so the package
# lands in the kernel's own environment — confirm against the target runtime.
!pip install py3langid

In [None]:
import pandas as pd
import nltk
import py3langid as langid
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.probability import FreqDist
import pickle

import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Fetch every NLTK resource this notebook relies on, in the original order.
nltk_resources = (
  "stopwords",
  "punkt",
  "names",
  "wordnet",
  "averaged_perceptron_tagger",
  "averaged_perceptron_tagger_eng",
  "tagsets",
  "tagsets_json",
  "punkt_tab",
)
for resource in nltk_resources:
  nltk.download(resource)

In [None]:
# Both collections are only used for `token in ...` membership tests inside the
# per-token filtering loops of this notebook, so store them as sets: lookups
# become constant-time instead of a linear scan over a list on every token.
stopwords = set(nltk.corpus.stopwords.words("english"))
names = set(nltk.corpus.names.words())

In [None]:
# Load the raw review table and display it.
reviews_csv = "restaurant_reviews.csv"
df = pd.read_csv(reviews_csv)
df

In [None]:
# Discard the rows whose Rating holds the literal text "Like".
is_like_rating = df["Rating"] == "Like"
df = df[~is_like_rating]

In [None]:
# Keep only the columns used downstream. `.copy()` makes `df` an independent
# frame, so the column assignment in the following cell does not raise
# SettingWithCopyWarning.
df = df[["Review", "Rating", "Time"]].copy()

In [None]:
# Cast Rating to float64 (rows holding the text "Like" were removed in an earlier cell).
df["Rating"] = df["Rating"].astype("float64")

In [None]:
# Drop rows without review text. `.copy()` detaches the filtered frame from its
# parent, so the column assignments in the following cells (Time, Hour,
# Weekday, Month, Language) do not raise SettingWithCopyWarning.
df = df[~df["Review"].isna()].copy()

In [None]:
# Summarise the cleaned frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Histogram of the Rating column.
df["Rating"].hist()

In [None]:
# Parse Time into pandas datetimes so the `.dt` accessor can be used in the next cells.
df["Time"] = pd.to_datetime(df["Time"])

In [None]:
# Hour component of each review timestamp.
df["Hour"] = df["Time"].dt.hour

In [None]:
# Day-of-week number of each review timestamp (pandas `.dt.weekday`).
df["Weekday"] = df["Time"].dt.weekday

In [None]:
# Month component of each review timestamp.
df["Month"] = df["Time"].dt.month

In [None]:
# Display the frame with the derived Hour / Weekday / Month columns.
df

In [None]:
# Box plot of Rating, one box per Month.
df.boxplot("Rating", by="Month")

In [None]:
def _detect_language(review_text):
  """Return the first element of ``langid.classify`` for one review."""
  return langid.classify(review_text)[0]

# Tag every review with its detected language, then display the frame.
df["Language"] = df["Review"].apply(_detect_language)
df

In [None]:
# Keep only the reviews detected as English. `.copy()` detaches the filtered
# frame from its parent, so the Rank / Quartile assignments in the following
# cells do not raise SettingWithCopyWarning.
df = df[df["Language"] == "en"].copy()
df

In [None]:
# Rank reviews by Rating; method="first" gives tied ratings distinct ranks in order of appearance.
df["Rank"] = df["Rating"].rank(method="first")

In [None]:
# Cut the ranks into four quantile bins labelled 1 to 4.
df["Quartile"] = pd.qcut(df["Rank"], 4, labels=[1, 2, 3, 4])

In [None]:
# Lemmatizer shared by the token-cleaning functions defined below.
lemmatizer = WordNetLemmatizer()
# NOTE(review): `stemmer` is not referenced by any cell visible here — confirm it is still needed.
stemmer = PorterStemmer()

In [None]:
# Extra tokens that get_tokens should exclude; currently empty.
tokens_unwanted = []

In [None]:
def get_tokens(text):
  """Tokenize a review and return its cleaned, lemmatized content words.

  The text is split with ``word_tokenize``. Tokens that are not purely
  alphabetic, that appear in ``names`` or ``tokens_unwanted``, or that are
  stopwords are dropped. Surviving tokens are lowercased and lemmatized, then
  POS-tagged, and only tokens whose tag starts with NN, VB, RB or JJ are kept.

  Args:
    text: The raw review string.

  Returns:
    A list of lowercase, lemmatized tokens in their original order.
  """
  cleaned_tokens = []

  for token in word_tokenize(text):
    if not token.isalpha(): continue
    # Names and unwanted tokens are matched on the original casing, as before.
    if token in names: continue
    if token in tokens_unwanted: continue
    token = token.lower()
    # The stopword list is lowercase, so test the lowercased token; testing the
    # raw token let capitalised stopwords such as "The" or "Very" slip through.
    if token in stopwords: continue
    cleaned_tokens.append(lemmatizer.lemmatize(token))

  # Keep nouns, verbs, adverbs and adjectives (matched by tag prefix).
  kept_tag_prefixes = ("NN", "VB", "RB", "JJ")
  return [word for word, tag in pos_tag(cleaned_tokens) if tag[:2] in kept_tag_prefixes]


In [None]:
# Collect the cleaned tokens of every review, both overall and per rating quartile.
full_vocabulary = []
vocabulary = {}
for category in df["Quartile"].unique():
  vocabulary[category] = []

for review_text, review_quartile in zip(df["Review"], df["Quartile"]):
  review_tokens = get_tokens(review_text)
  full_vocabulary.extend(review_tokens)
  vocabulary[review_quartile].extend(review_tokens)

In [None]:
# The 20 most frequent cleaned tokens across all reviews.
FreqDist(full_vocabulary).most_common(20)

In [None]:
# The 20 most frequent cleaned tokens among reviews in quartile 4.
FreqDist(vocabulary[4]).most_common(20)

In [None]:
# Union of the 250 most frequent tokens of each rating quartile.
most_common_tokens = set()

for quartile in df["Quartile"].unique():
  top_pairs = FreqDist(vocabulary[quartile]).most_common(250)
  most_common_tokens |= {token for token, _count in top_pairs}

In [None]:
# Number of distinct tokens selected as features.
selected_token_count = len(most_common_tokens)
print(selected_token_count)

In [None]:
# Persist the selected token set; the prediction section reloads it from this file.
with open("most_common.tokens", "wb") as tokens_file:
  pickle.dump(most_common_tokens, tokens_file)

In [None]:
def get_tokens_training(text):
  """Turn a review into a space-joined string of selected feature tokens.

  Tokens from ``word_tokenize`` are skipped when they appear in ``stopwords``
  or ``names`` or are not purely alphabetic. The rest are lowercased and
  lemmatized, and kept only if the result is in ``most_common_tokens``.

  Args:
    text: The raw review string.

  Returns:
    The kept tokens, in order, joined by single spaces.
  """
  kept = []

  for raw in word_tokenize(text):
    if raw in stopwords or raw in names or not raw.isalpha():
      continue
    lemma = lemmatizer.lemmatize(raw.lower())
    if lemma in most_common_tokens:
      kept.append(lemma)

  return " ".join(kept)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Renumber the rows from 0, discarding the index left over from the earlier row filters.
df = df.reset_index(drop=True)

In [None]:
# Build the space-joined feature-token string for every review, then display the frame.
df["Tokens String"] = df["Review"].apply(get_tokens_training)
df

In [None]:
# TF-IDF features restricted to the selected token vocabulary.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split in a later cell, so test rows influence the IDF weights — consider
# fitting on the training rows only.
token_documents = df["Tokens String"]
vectorizer = TfidfVectorizer(vocabulary=most_common_tokens)
X = vectorizer.fit_transform(token_documents)

In [None]:
# Persist the fitted vectorizer; the prediction section reloads it from this file.
with open("vectorizer.pkl", "wb") as vectorizer_file:
  pickle.dump(vectorizer, vectorizer_file)

In [None]:
# Regression target: the numeric rating of each review.
y = df["Rating"]

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Hold out 20% of the rows for evaluation, using a fixed seed for the split.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Inspect the training feature matrix.
X_train

In [None]:
# Seed the regressor so repeated runs train the same model and report the same
# metrics; 0 matches the seed already used for the train/test split.
model = AdaBoostRegressor(random_state=0)

In [None]:
# Fit on a dense copy of the TF-IDF training matrix.
model.fit(X_train.toarray(), y_train)

In [None]:
# Predicted ratings for the held-out test rows.
model.predict(X_test.toarray())

In [None]:
# Mean absolute error of the predictions on the held-out test rows.
test_predictions = model.predict(X_test.toarray())
mean_absolute_error(y_test, test_predictions)

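# MAE = 1.19 means the predictions differ from the true rating by 1.19 on average; it is an average error, not a guaranteed bound.
# For example: for a prediction of 5, the typical error range is 3.81 to 6.19 (values above the maximum possible rating cannot actually occur).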

In [None]:
# Mean squared error of the predictions on the held-out test rows.
test_predictions = model.predict(X_test.toarray())
mean_squared_error(y_test, test_predictions)

In [None]:
# R² score of the predictions on the held-out test rows.
test_predictions = model.predict(X_test.toarray())
r2_score(y_test, test_predictions)

In [None]:
# Persist the trained regressor; the prediction section reloads it from this file.
with open("model.pkl", "wb") as model_file:
  pickle.dump(model, model_file)

## Modelo en producción

In [None]:
import pandas as pd
import nltk
import py3langid as langid
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.probability import FreqDist
import pickle
from sklearn.ensemble import AdaBoostRegressor

In [None]:
def _read_pickle(path):
  """Load and return the object pickled at ``path``."""
  # pickle can execute arbitrary code on load: only read files this notebook wrote.
  with open(path, "rb") as handle:
    return pickle.load(handle)

# Reload the artifacts saved by the training section.
loaded_model = _read_pickle("model.pkl")
loaded_vectorizer = _read_pickle("vectorizer.pkl")
loaded_mct = _read_pickle("most_common.tokens")

In [None]:
# Corpora used for token filtering.
nltk.download("stopwords")
nltk.download("names")
# This section re-imports everything so it can run on its own, and
# get_tokens_predict calls word_tokenize and WordNetLemmatizer. Fetch the
# tokenizer and WordNet data here too (the training section downloads them),
# so a fresh kernel does not fail on a missing resource.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")

In [None]:
# Lemmatizer used by get_tokens_predict.
lemmatizer = WordNetLemmatizer()
# Both collections are only used for `token in ...` membership tests in
# get_tokens_predict, so store them as sets: lookups become constant-time
# instead of a linear scan over a list on every token.
stopwords = set(nltk.corpus.stopwords.words("english"))
names = set(nltk.corpus.names.words())

In [None]:
def get_tokens_predict(text):
  """Turn a review into a space-joined string of known feature tokens.

  Tokens from ``word_tokenize`` are skipped when they appear in ``stopwords``
  or ``names`` or are not purely alphabetic. The rest are lowercased and
  lemmatized, and kept only if the result is in ``loaded_mct``.

  Args:
    text: The raw review string.

  Returns:
    The kept tokens, in order, joined by single spaces.
  """
  kept = []

  for raw in word_tokenize(text):
    if raw in stopwords or raw in names or not raw.isalpha():
      continue
    lemma = lemmatizer.lemmatize(raw.lower())
    if lemma in loaded_mct:
      kept.append(lemma)

  return " ".join(kept)


In [None]:
# Reviews to score.
# NOTE(review): this is the same CSV the training section reads — confirm that is intended.
predict_df = pd.read_csv("restaurant_reviews.csv")

In [None]:
# Drop rows without review text, then renumber from 0 (the old index is kept as an "index" column).
has_review = predict_df["Review"].notna()
predict_df = predict_df[has_review].reset_index()

In [None]:
# Build the space-joined feature-token string for every review to score.
predict_df["Tokens String"] = predict_df["Review"].apply(get_tokens_predict)

In [None]:
# Vectorize with the reloaded vectorizer (transform only, no refit).
X = loaded_vectorizer.transform(predict_df["Tokens String"])

In [None]:
# Predict a rating for every review from a dense copy of the TF-IDF features.
predictions = loaded_model.predict(X.toarray())

In [None]:
# Show the scored reviews with the predictions attached as a "Prediction" column.
prediction_column = pd.Series(predictions, name="Prediction")
pd.concat([predict_df, prediction_column], axis=1)