In [11]:
import os
from bs4 import BeautifulSoup
import pandas as pd
import Levenshtein
import openai
import dotenv
from numpy.random import choice
import time
import ast

In [2]:
from dotenv import load_dotenv
import os

# Let python-dotenv populate the environment (presumably from a local .env file),
# then hand the key to the OpenAI client. The key is None when OPENAI_API_KEY is unset.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [29]:
def get_GPT(prompt, model="gpt-3.5-turbo", retries=5, backoff_factor=2, temp=0.7):
    """Ask an OpenAI chat model to score a restaurant against the list of vibes.

    A fixed system message carrying the labelling instructions is sent together
    with `prompt` as the user message. Transient API failures are retried with
    exponential backoff.

    Args:
        prompt: Text of the user message (formatted into a string before sending).
        model: Chat model name passed to openai.ChatCompletion.create.
        retries: Number of retries after the first attempt, so at most
            retries + 1 requests are made.
        backoff_factor: Base of the backoff; the wait before the next attempt is
            backoff_factor ** attempt seconds (attempt counted from 0).
        temp: Sampling temperature.

    Returns:
        The message content of the first completion choice, as returned by the API.

    Raises:
        openai.error.RateLimitError, openai.error.APIError, openai.error.Timeout,
        openai.error.APIConnectionError: re-raised once all retries are used up.
        Any other exception propagates immediately without a retry.
    """
    system_message = """Ignore all previous instructions.Your task is now to help label data. You must respond
with a list of -1, 0, and 1, for example [0, 1, 0, -1 ...1] that is the length of the list of vibes.
You will receive a restaurant name, and three reviews from Google Maps. Based on your
analysis of the reviews, you must consider each vibe in the list of vibes and assign it a
score, 1 if this vibe is applicable to the restaurant, 0 if it is not relevant or there is not
enough information to decide, and -1 if this vibe is especially untrue of this restaurant."""

    # Network-level failures surface from the openai client as Timeout /
    # APIConnectionError; the previous handler named an un-imported
    # RequestException, which raised NameError instead of retrying.
    retryable_errors = (
        openai.error.RateLimitError,
        openai.error.APIError,
        openai.error.Timeout,
        openai.error.APIConnectionError,
    )

    for attempt in range(retries + 1):
        try:
            completion = openai.ChatCompletion.create(
                model=model,
                stop="\n",  # the answer is expected on a single line
                temperature=temp,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": f"{prompt}"},
                ],
            )
            return completion.choices[0].message.content
        except retryable_errors as e:
            if isinstance(e, openai.error.RateLimitError):
                print("RateLimit :(")
            else:
                print(f"{type(e).__name__}: {e}")
            if attempt >= retries:
                # Out of retries: surface the last error to the caller.
                raise
            time.sleep(backoff_factor ** attempt)

In [76]:
def format_prompt(row):
    """Build the user prompt for one restaurant row.

    `row['reviews']` is a string holding a Python literal (a list of dicts with a
    'text' key); it is parsed with ast.literal_eval. Every review is flattened to a
    single line and numbered from 1. `row['name']` is placed under the DATA header.

    Returns:
        The prompt text: name, numbered reviews, the fixed list of vibes and the
        trailing "Vector of values for each vibe:" cue.
    """
    numbered_lines = []
    for position, review in enumerate(ast.literal_eval(row['reviews']), start=1):
        # Newlines inside a review would break the one-review-per-line layout.
        flattened = review['text'].replace("\n", " ")
        numbered_lines.append(f"{position}) {flattened}\n")
    review_section = "".join(numbered_lines)

    return (
        f"DATA:\n{row['name']}\n\n"
        f"Top 3 review previews:\n{review_section}\n"
        "List of vibes:\n[Quirkadelic, Rainbow, Artsy, Hipster, Cozy, Industrial, Exotic, Sleek, Nostalgic, Glamorous, Sophisticated, Funky, Ethnic, Underground, Spirited, Folksy, Edgy, Innovative, Sustainable, Cosmopolitan, Garden-inspired, Invigorating, Chic]\n\n"
        "Vector of values for each vibe:"
    )

In [84]:
# Smoke test on a single restaurant: build the prompt, query the model with
# temperature 0 and parse the reply as a Python literal.
# NOTE(review): `row` is not assigned in any cell shown above this one; it is
# presumably left over from an earlier kernel session (e.g. df.iloc[0]) - confirm.
prompt = (format_prompt(row))
v = get_GPT(prompt, temp=0)
ast.literal_eval(v)

[0, -1, 0, -1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1]

In [82]:
# Example score vector, one entry per vibe, in the same order as vibelist.
vibevals = [
    0, 0, 0, 0, 1, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 1, 0,
    0, 0, 0, 1, 0, 0, 0,
]

# Names of the vibes, in the order they are listed in the prompt.
vibelist = [
    "Quirkadelic",
    "Rainbow",
    "Artsy",
    "Hipster",
    "Cozy",
    "Industrial",
    "Exotic",
    "Sleek",
    "Nostalgic",
    "Glamorous",
    "Sophisticated",
    "Funky",
    "Ethnic",
    "Underground",
    "Spirited",
    "Folksy",
    "Edgy",
    "Innovative",
    "Sustainable",
    "Cosmopolitan",
    "Garden-inspired",
    "Invigorating",
    "Chic",
]

# Show each vibe next to its example score.
for vibe, val in zip(vibelist, vibevals):
    print(vibe, val)

Quirkadelic 0
Rainbow 0
Artsy 0
Hipster 0
Cozy 1
Industrial 0
Exotic 0
Sleek 0
Nostalgic 0
Glamorous 0
Sophisticated 0
Funky 0
Ethnic 0
Underground 0
Spirited 1
Folksy 0
Edgy 0
Innovative 0
Sustainable 0
Cosmopolitan 1
Garden-inspired 0
Invigorating 0
Chic 0


In [3]:
# List the entries of the current working directory (used here to locate the input workbook).
os.listdir()

['.ipynb_checkpoints',
 'data_labeler_restaurant_reviews.ipynb',
 'data_reviews.xlsx']

In [7]:
# Read the review workbook; the first spreadsheet column becomes the row index.
df = pd.read_excel("data_reviews.xlsx", index_col=0)

In [30]:
# Peek at the first restaurant to see the available columns.
df.iloc[0]

name                                                      Bilbao Berria
reviews               [{'id': 'NK-bOEiAbi1bfZXUk86Vwg', 'url': 'http...
total                                                               168
possible_languages                       ['en', 'de', 'es', 'fr', 'pt']
Name: 0, dtype: object

In [115]:
# Label every restaurant with GPT-4. Each row gets up to five attempts, lowering the
# temperature by 0.1 per attempt (0.5 -> 0.1), until the reply parses to a list with
# one score per vibe. Rows that never produce a valid reply get a row of None values,
# so `vectors` always has exactly one correctly sized entry per row of `df`.
vectors = []
for _index, row in df.iterrows():
    prompt = format_prompt(row)
    v = None  # reset per row so a failed row cannot inherit an earlier vector
    for attempt in range(5):
        temp = 0.5 - (attempt / 10)
        response = get_GPT(prompt, model="gpt-4", temp=temp)
        try:
            candidate = ast.literal_eval(response)
        except (ValueError, SyntaxError, TypeError):
            # The reply was not a Python literal; show it and try again.
            print(response)
            continue
        if not isinstance(candidate, (list, tuple)):
            # Parsed, but not a sequence of scores; show it and try again.
            print(response)
            continue
        if len(candidate) == len(vibelist):
            v = list(candidate)
            break
    if v is None:
        print(f"No valid vibe vector for {row['name']!r}; filling the row with missing values")
        v = [None] * len(vibelist)
    vectors.append(v)

In [116]:
# One column per vibe, one row per restaurant. The scores reuse df's index because
# concat(axis=1) aligns on index labels: df's index comes from the spreadsheet
# (index_col=0), and a fresh 0..n-1 index would misalign rows whenever the two differ.
df2 = pd.DataFrame(vectors, columns=vibelist, index=df.index)
df_dataout = pd.concat([df, df2], axis=1)

In [117]:
# Save the labelled data to output.xlsx; index=False leaves the row index out of the sheet.
df_dataout.to_excel("output.xlsx", index=False)

In [119]:
# Column totals as a quick look at how often each vibe was assigned.
# sum() runs over every column, so the string columns (name, reviews,
# possible_languages) come out as concatenated text rather than a meaningful total.
df_dataout.sum()

name                  Bilbao BerriaLa Alcoba AzulMomoTerrace la Isab...
reviews               [{'id': 'NK-bOEiAbi1bfZXUk86Vwg', 'url': 'http...
total                                                              1659
possible_languages    ['en', 'de', 'es', 'fr', 'pt']['en', 'es', 'de...
Quirkadelic                                                           2
Rainbow                                                              -1
Artsy                                                                 2
Hipster                                                               9
Cozy                                                                 39
Industrial                                                           -2
Exotic                                                                5
Sleek                                                                 2
Nostalgic                                                             5
Glamorous                                                       