In [1]:
import os
import pickle
import re

import pandas as pd
import pandas_gbq
import requests
from google.cloud import bigquery
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.utils import resample

from tfm_module.json_processing import review_df



# Load the previously trained model from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open this file if it comes from a trusted source.
# NOTE(review): `model` is not referenced again in the cells shown here; the
# classifier used further down is re-trained and stored in `clf`.
with open('modelo_entrenado_v1.pk1', 'rb') as f:
    model = pickle.load(f)

Reviews for training: 18917
                                                text  liked
1  I've taken a lot of spin classes over the year...    1.0
3  Wow!  Yummy, different,  delicious.   Our favo...    1.0
5  I am a long term frequent customer of this est...    0.0
6  Loved this tour! I grabbed a groupon and the p...    1.0
7  Amazingly amazing wings and homemade bleu chee...    1.0


In [2]:
# Display the training reviews DataFrame imported from tfm_module.json_processing.
review_df

Unnamed: 0,text,liked
1,I've taken a lot of spin classes over the year...,1.0
3,"Wow! Yummy, different, delicious. Our favo...",1.0
5,I am a long term frequent customer of this est...,0.0
6,Loved this tour! I grabbed a groupon and the p...,1.0
7,Amazingly amazing wings and homemade bleu chee...,1.0
...,...,...
29992,So glad Urban Village took our reservation for...,1.0
29994,I've eaten here several times but today was pr...,1.0
29995,The best sandwich in the Reno/Sparks area! I'v...,1.0
29998,This is the biggest joke of a club of all time...,0.0


In [3]:
# Undersample the larger class so both 'liked' classes are equally represented
# (avoids biasing the classifier towards the majority class).
df_majority = review_df[review_df['liked']==1]
df_minority = review_df[review_df['liked']==0]

# Do not assume which label is the majority: resample(replace=False) raises a
# ValueError when asked for more rows than the frame holds, so swap if needed.
if len(df_majority) < len(df_minority):
    df_majority, df_minority = df_minority, df_majority

# Target size for each class = size of the smaller class.
balance = len(df_minority)

# random_state pins the sampled rows so the notebook is reproducible.
df_majority_undersampled = resample(df_majority, replace=False, n_samples=balance, random_state=42)
review_df = pd.concat([df_majority_undersampled, df_minority])

In [4]:
# Reset the index manually before the next step (drop=True discards the old
# row labels instead of keeping them as a column).
review_df = review_df.reset_index(drop=True)

In [5]:
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
ps = PorterStemmer()

# Build the stop-word set once: rebuilding it for every token (as a
# comprehension condition would) re-reads the NLTK corpus on each word.
english_stopwords = set(stopwords.words('english'))

# One processed string per row of review_df, in row order.
reviews_procesadas = []

# Iterate over the values directly so the loop does not depend on the index
# labels being exactly 0..n-1.
for text in review_df['text']:
    # Only non-empty strings can be cleaned; missing values (NaN is truthy and
    # would make re.sub raise TypeError) get the same placeholder as empty text.
    if isinstance(text, str) and text:
        # Keep letters only, lower-case, tokenize on whitespace.
        tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        # Drop stop words and reduce every remaining token to its Porter stem.
        stems = [ps.stem(word) for word in tokens if word not in english_stopwords]
        reviews_procesadas.append(' '.join(stems))
    else:
        reviews_procesadas.append('To discard')

In [6]:
# Replace the raw review text with the processed text in the same training DataFrame.
review_df['text'] = reviews_procesadas
# NOTE: this binds a second name to the same DataFrame object; no copy is made.
df_modificado = review_df

In [7]:
# Features (processed review text) and target (liked flag) for the classifier.
X, y = review_df['text'], review_df['liked']

In [8]:
# Train/test split: 30% of the rows are held out as the test set;
# random_state fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split.
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [10]:
from sklearn.linear_model import LogisticRegression
# Train a logistic-regression classifier on the TF-IDF features of the training split.
# NOTE(review): X_test_vect / y_test are not used to evaluate the model in the
# cells shown here.
clf = LogisticRegression(solver='lbfgs', random_state = 12)
clf.fit(X_train_vect,y_train)

In [11]:
# Yelp API private key.
# The secret is read from the environment so it never lives in the notebook:
# export YELP_API_KEY before starting the kernel (a KeyError here means it is
# unset). Any key that was ever stored in a notebook should be revoked.
API_KEY = os.environ['YELP_API_KEY']
headers = {'Authorization': f'Bearer {API_KEY}'}
params = {'term': 'McDonalds', 'location': 'New York'}

url = 'https://api.yelp.com/v3/businesses/search'
# timeout keeps the cell from hanging forever if the API does not answer.
response = requests.get(url, headers=headers, params=params, timeout=30)
# Fail with a clear HTTP error (bad key, rate limit, ...) instead of a
# KeyError on the missing 'businesses' field below.
response.raise_for_status()

businesses = response.json()['businesses']

# Collect the Yelp id of every business returned by the search.
business_ids = [business['id'] for business in businesses]

if not business_ids:
    print("No businesses found for the search term.")

business_ids

['0l-l2j5NAIqtychRNs17Dw',
 'LR6nRVRoP0g_wqqlIE8Ynw',
 'PGbKlytumI2WaIefhtCALw',
 'B_HkncKcyD0ZL3LsHmnzqg',
 'Um1WimKhVLw3KB-OzRYfyQ',
 'YV0VaJgV2zotzI7XNUaMCQ',
 'oXVZ9OqrC55fp-3hDBvRCQ',
 'jPIZ3FR5LNcwPuUHi2Fe4g',
 'EW7kFuXTsXRQ4Kp7j6qxHw',
 '0Ao9422D82WgsiUbr_t_Qg',
 'in7QM4vNUTHVZIi7Outetg',
 'NBVvD__J9JjomKtdds6Gvg',
 'cLXIlxKwnvUPg8JoxdfNwg',
 'hcVN7B1tw4jvRTiORTFKjA',
 '_yInwn1jLMDuPaSuap-Jrg',
 'Tr942SPajn138V7ZDx7Ckw',
 'AiOb7w3Z9jS8XJdamgbU_A',
 'J8n4z6pqHt8tiHkAaA0JVg',
 'RbhDdQfSY4x_CM4HQDSHUA',
 'p4EQbVuLddcTrgw_K8n8OA']

In [12]:
url = 'https://api.yelp.com/v3/businesses/{}/reviews'

def get_reviews_dataframe(business_ids):
    """Download the reviews of each business and stack them into one DataFrame.

    Uses the module-level ``url`` template and ``headers`` for the requests.

    Parameters
    ----------
    business_ids : iterable of str
        Yelp business ids; one request is sent per id.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``text``, ``rating``, ``time_created`` and ``user``,
        with a fresh 0..n-1 index. Empty (same columns) when ``business_ids``
        is empty.

    Raises
    ------
    requests.HTTPError
        If any request returns an error status.
    """
    columns = ['id', 'text', 'rating', 'time_created', 'user']
    # NOTE(review): confirm 'desc' is a sort_by value the reviews endpoint accepts.
    params = {'sort_by': 'desc'}

    dfs = []
    for business_id in business_ids:
        response = requests.get(url.format(business_id), headers=headers, params=params, timeout=30)
        # Surface HTTP failures directly rather than as a KeyError on 'reviews'.
        response.raise_for_status()
        reviews = response.json()['reviews']
        dfs.append(pd.DataFrame(reviews, columns=columns))

    # pd.concat raises ValueError on an empty list, so handle "no ids" explicitly.
    if not dfs:
        return pd.DataFrame(columns=columns)

    return pd.concat(dfs, ignore_index=True)

In [13]:
# Fetch the reviews of every collected business id into a single DataFrame.
# NOTE: `reviews_df` (reviews fetched from the API) is a different object from
# `review_df` (the training data).
reviews_df = get_reviews_dataframe(business_ids)

In [14]:
ps = PorterStemmer()

# Build the stop-word set once; doing it inside pre_p's comprehension would
# reload the NLTK word list for every token of every review.
english_stopwords = set(stopwords.words('english'))

test = "Stay away from this place"

def pre_p(a):
    """Clean a raw review string and vectorize it with the fitted TF-IDF vectorizer.

    Steps: replace every non-letter with a space, lower-case, split on
    whitespace, drop English stop words, Porter-stem the remaining tokens,
    re-join them and transform the result with the module-level ``vectorizer``.

    Parameters
    ----------
    a : str
        Raw review text.

    Returns
    -------
    numpy.ndarray
        Dense array with a single row (one document), as returned by
        ``vectorizer.transform([...]).toarray()``.
    """
    p = re.sub('[^a-zA-Z]',' ',a)
    p = p.lower()
    p = p.split()
    p = [ps.stem(word) for word in p if word not in english_stopwords]
    p = ' '.join(p)
    ejemplo = vectorizer.transform([p])
    return ejemplo.toarray()

# Smoke test: probability of the positive class (column 1) for a sample sentence.
prediction =clf.predict_proba(pre_p(test))

print(prediction[0][1])


0.31240831406558495


In [15]:
# Run pre_p on every review text; each 'pro_text' cell holds the dense array
# that pre_p returns for that review.
reviews_df['pro_text'] = reviews_df['text'].apply(pre_p)

In [16]:
# Score every vectorized review: take the second column of predict_proba
# ([0][1]) for each row.
vectorized_reviews = reviews_df['pro_text']
scores = [clf.predict_proba(features)[0][1] for features in vectorized_reviews]

reviews_df['score'] = pd.Series(scores)

In [17]:
# Cast every uploaded column to string so the BigQuery load does not hit
# type problems, then drop the intermediate vectorized-text column.
string_columns = ['id', 'text', 'rating', 'time_created', 'user', 'score']
for column_name in string_columns:
    reviews_df[column_name] = reviews_df[column_name].astype(str)

reviews_df = reviews_df.drop(columns='pro_text')


In [18]:
# No-op: rebinds reviews_df to itself.
reviews_df = reviews_df

In [20]:
from google.cloud import bigquery
client = bigquery.Client()

# Fully-qualified destination table, in the form <project>.<dataset>.<table>.
table_id = 'ms-gauss-test.predictions.predictions' # replace with your table ID

# Every uploaded column is stored as STRING (the DataFrame columns were cast
# to str beforehand). WRITE_TRUNCATE replaces the table contents on each load.
string_columns = ['id', 'text', 'rating', 'time_created', 'user', 'score']
job_config = bigquery.LoadJobConfig(
    schema = [
        bigquery.SchemaField(column_name, bigquery.enums.SqlTypeNames.STRING)
        for column_name in string_columns
    ],
    write_disposition = "WRITE_TRUNCATE"
)

job = client.load_table_from_dataframe(
    reviews_df, table_id, job_config=job_config)

# Block until the load job finishes; raises if the job failed.
job.result()

table = client.get_table(table_id)  # Make an API request.
print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id
    )
)

Loaded 60 rows and 6 columns to ms-gauss-test.predictions.predictions
