# Data Exploration and NLP Modeling 
## By BROSSEAU Alexandre & COGORDAN Alexandre

In [1]:
import nltk
import requests
import time
import pandas as pd
import os
import re
import gensim
import gensim.corpora as corpora
import spacy
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import streamlit as st
import numpy as np
import tensorflow as tf
import tensorboard as tb

from scipy.spatial.distance import euclidean
from scipy.spatial.distance import cosine
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from textblob import TextBlob
from collections import Counter
from transformers import pipeline
from langchain import PromptTemplate, LLMChain
from dotenv import find_dotenv, load_dotenv
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, Word2Vec
from sklearn.manifold import TSNE
from tensorboard.plugins import projector

load_dotenv()

/Users/alexandrecogordan/miniconda3/envs/tensorflow/lib/python3.10/site-packages/huggingface_hub/inference/_text_generation.py:121: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  @validator("best_of")
/Users/alexandrecogordan/miniconda3/envs/tensorflow/lib/python3.10/site-packages/huggingface_hub/inference/_text_generation.py:140: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  @validator("repetition_penalty")
/Users/alexandrecogorda

True

## Web scraping

### We load the reviews dataframe we've created so far

In [2]:
# Load the previously saved reviews, keeping only the columns used below.
df = pd.read_csv(
    'yelp_reviews.csv',
    usecols=['text', 'rating', 'location'],
)

### We load our API key to start querying the Yelp API

In [3]:
# Read the Yelp API key from the environment (populated by load_dotenv()).
api_key = os.getenv('YELP_API_KEY')

# Fail early with an explicit message: os.getenv returns None when the
# variable is unset, which would otherwise surface as an opaque TypeError
# on the string concatenation below.
if not api_key:
    raise RuntimeError(
        'YELP_API_KEY is not set; define it in the environment or in the .env file'
    )

# Authorization header sent with every Yelp API request in this notebook.
headers = {'Authorization': 'Bearer ' + api_key}

### We get the businesses' IDs

In [4]:
def get_all_business_data(base_url):
    """Fetch every page of a business search and flatten the results.

    The URL is requested repeatedly, each time with its ``offset`` query
    parameter set to the number of businesses collected so far. Paging stops
    as soon as the API answers with a non-200 status or returns an empty
    ``businesses`` list.

    Authentication relies on the module-level ``headers`` dict.

    Args:
        base_url: Search endpoint URL. An ``offset`` parameter is appended
            (or rewritten if one is already present) between requests.

    Returns:
        A list with one dict per business, holding the keys ``business_id``,
        ``business_name``, ``business_price``, ``business_url``,
        ``business_review_count``, ``business_display_address``,
        ``business_image_url``, ``business_display_phone`` and
        ``business_categories``. Fields absent from the response are ``None``.

    Raises:
        requests.exceptions.RequestException: If a request fails or exceeds
            the timeout.
    """
    list_of_businesses = []

    while True:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(base_url, headers=headers, timeout=30)
        if response.status_code != 200:
            break

        businesses = response.json().get('businesses', [])
        if not businesses:
            break  # No more businesses are returned: last page reached.

        for business in businesses:
            # Guard against a missing or null 'location' entry so one
            # incomplete record does not abort the whole download.
            location = business.get('location') or {}
            list_of_businesses.append({
                'business_id': business.get('id'),
                'business_name': business.get('name'),
                'business_price': business.get('price'),
                'business_url': business.get('url'),
                'business_review_count': business.get('review_count'),
                'business_display_address': location.get('display_address'),
                'business_image_url': business.get('image_url'),
                'business_display_phone': business.get('display_phone'),
                'business_categories': business.get('categories'),
            })

        # Point the next request at the first business not fetched yet.
        offset = len(list_of_businesses)
        if 'offset=' in base_url:
            base_url = base_url.rsplit('offset=', 1)[0] + f'offset={offset}'
        else:
            # Use '?' when the URL has no query string yet.
            separator = '&' if '?' in base_url else '?'
            base_url += f'{separator}offset={offset}'

        # Pause between pages to stay gentle on the API.
        time.sleep(1)

    return list_of_businesses


In [9]:
# Cities to query, written the way they are inserted into the `location`
# query parameter of the search URL.
cities = ['San+Diego', 'San+Jose', 'Seattle', 'Maryville', 'Salt+Lake+City', 'Oklahoma+City', 'Austin', 'Louisville', 'Indianapolis', 'Portland', 'Santa+Cruz', 'Jacksonville', 'Hudson', 'Dallas', 'Phoenix', 'Asheville']

# Collect one DataFrame per city and concatenate once after the loop, so the
# cell does not depend on a `business_df` defined by some earlier run.
business_frames = []

for city in cities:
    url = f'https://api.yelp.com/v3/businesses/search?location={city}&term=restaurants&categories=french&price=3&price=4&sort_by=best_match'
    business_data = get_all_business_data(url)
    business_frames.append(pd.DataFrame(business_data))
    print(len(business_data))
    # Stop after the first city. Compare by value: `is` tests object
    # identity, which is not guaranteed to hold for equal strings.
    if city == 'San+Diego':
        break

business_df = pd.concat(business_frames)

### We get the reviews for each business

In [6]:
def get_reviews(restaurant_ids, city):
    """Download reviews for a collection of businesses in one city.

    Each business id is queried on the ``/v3/businesses/<id>/reviews``
    endpoint using the module-level ``headers``. Collection stops as soon as
    25 reviews have been gathered across all the businesses.

    Args:
        restaurant_ids: Iterable of business id strings. A DataFrame whose
            ids are stored in column ``0`` is also accepted.
        city: Label stored in the ``location`` field of every review.

    Returns:
        A list of dicts with the keys ``text``, ``rating`` and ``location``;
        at most 25 entries.

    Raises:
        requests.exceptions.RequestException: If a request fails or exceeds
            the timeout.
    """
    max_reviews = 25  # Cap on the total number of reviews returned per call.

    # Accept a DataFrame holding the ids in column 0 as well as a plain
    # list of ids.
    if isinstance(restaurant_ids, pd.DataFrame):
        restaurant_ids = restaurant_ids[0]

    list_of_reviews = []

    # Iterate over the ids themselves; indexing `restaurant_ids[0][i]` on a
    # list picked single characters out of the first id.
    for business_id in restaurant_ids:
        url2 = "https://api.yelp.com/v3/businesses/" + business_id + "/reviews?sort_by=yelp_sort"
        response = requests.get(url2, headers=headers, timeout=30)
        reviews_data = response.json()

        try:
            for review in reviews_data['reviews']:
                list_of_reviews.append(
                    {'text': review['text'], 'rating': review['rating'], 'location': city}
                )

                if len(list_of_reviews) == max_reviews:
                    return list_of_reviews
        except (KeyError, TypeError):
            # The payload has no usable 'reviews' entry (for example an
            # error response); skip this business.
            print("No reviews for this restaurant")

    return list_of_reviews

#### New Orleans

In [7]:
import requests

# Search for French restaurants in New Orleans, price tiers 3 and 4.
new_orleans_url = 'https://api.yelp.com/v3/businesses/search?location=New+Orleans&term=restaurants&categories=french&price=3&price=4&sort_by=best_match'
new_orleans_restaurants_data = get_all_business_data(new_orleans_url)

# Keep only the business ids; they are what the reviews lookup needs.
new_orleans_restaurants_ids = [
    entry['business_id'] for entry in new_orleans_restaurants_data
]

new_orleans_list_of_reviews = get_reviews(new_orleans_restaurants_ids, 'New Orleans')
print(len(new_orleans_list_of_reviews))

No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
0


#### New York City

In [8]:
# Search for French restaurants in New York City, price tiers 3 and 4.
nyc_url = 'https://api.yelp.com/v3/businesses/search?location=New+York+City&term=restaurants&categories=french&price=3&price=4&sort_by=best_match'
nyc_restaurants_data = get_all_business_data(nyc_url)

# Keep only the business ids; they are what the reviews lookup needs.
nyc_restaurants_ids = [
    entry['business_id'] for entry in nyc_restaurants_data
]

nyc_list_of_reviews = get_reviews(nyc_restaurants_ids, 'New York City')
print(len(nyc_list_of_reviews))

No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant


KeyboardInterrupt: 

#### Chicago

In [135]:
# Search for French restaurants in Chicago, price tiers 3 and 4.
chicago_url = 'https://api.yelp.com/v3/businesses/search?location=Chicago&term=restaurants&categories=french&price=3&price=4&sort_by=best_match'
chicago_restaurants_data = get_all_business_data(chicago_url)

# Keep only the business ids; they are what the reviews lookup needs.
chicago_restaurants_ids = [
    entry['business_id'] for entry in chicago_restaurants_data
]

chicago_list_of_reviews = get_reviews(chicago_restaurants_ids, 'Chicago')
print(len(chicago_list_of_reviews))

25


#### Los Angeles

In [136]:
# Search for French restaurants in Los Angeles, price tiers 3 and 4.
los_angeles_url = 'https://api.yelp.com/v3/businesses/search?location=Los+Angeles&term=restaurants&categories=french&price=4&price=3&sort_by=best_match'
los_angeles_restaurants_data = get_all_business_data(los_angeles_url)

# Keep only the business ids; they are what the reviews lookup needs.
los_angeles_restaurants_ids = [
    entry['business_id'] for entry in los_angeles_restaurants_data
]

los_angeles_list_of_reviews = get_reviews(los_angeles_restaurants_ids, 'Los Angeles')
print(len(los_angeles_list_of_reviews))

25


#### San Francisco

In [137]:
# Search for French restaurants in San Francisco, price tiers 3 and 4.
sf_url = 'https://api.yelp.com/v3/businesses/search?location=San+Francisco&term=restaurants&categories=french&price=4&price=3&sort_by=best_match'
san_francisco_restaurants = get_all_business_data(sf_url)

# Keep only the business ids; they are what the reviews lookup needs.
san_francisco_restaurants_ids = [
    entry['business_id'] for entry in san_francisco_restaurants
]

sf_list_of_reviews = get_reviews(san_francisco_restaurants_ids, 'San Francisco')
print(len(sf_list_of_reviews))

25


#### Philadelphia

In [138]:
# Search for French restaurants in Philadelphia, price tiers 3 and 4.
philadelphia_url = 'https://api.yelp.com/v3/businesses/search?location=Philadelphia&term=restaurants&categories=french&price=4&price=3&sort_by=best_match'
philadelphia_restaurants = get_all_business_data(philadelphia_url)

# Keep only the business ids; they are what the reviews lookup needs.
philadelphia_restaurants_ids = [
    entry['business_id'] for entry in philadelphia_restaurants
]

philadelphia_list_of_reviews = get_reviews(philadelphia_restaurants_ids, 'Philadelphia')
print(len(philadelphia_list_of_reviews))

24


#### Las Vegas

In [139]:
# Search for French restaurants in Las Vegas, price tiers 3 and 4.
las_vegas_url = 'https://api.yelp.com/v3/businesses/search?location=Las+Vegas&term=restaurants&categories=french&price=4&price=3&sort_by=best_match'
las_vegas_restaurants = get_all_business_data(las_vegas_url)

# Keep only the business ids; they are what the reviews lookup needs.
las_vegas_restaurants_ids = [
    entry['business_id'] for entry in las_vegas_restaurants
]

las_vegas_list_of_reviews = get_reviews(las_vegas_restaurants_ids, 'Las Vegas')
print(len(las_vegas_list_of_reviews))

25


#### Houston

In [140]:
# Search for French restaurants in Houston, price tiers 3 and 4.
houston_url = 'https://api.yelp.com/v3/businesses/search?location=Houston&term=restaurants&categories=french&price=4&price=3&sort_by=best_match'
houston_restaurants = get_all_business_data(houston_url)

# Keep only the business ids; they are what the reviews lookup needs.
houston_restaurants_ids = [
    entry['business_id'] for entry in houston_restaurants
]

houston_list_of_reviews = get_reviews(houston_restaurants_ids, 'Houston')
print(len(houston_list_of_reviews))

25


#### Phoenix

In [141]:
# Search for French restaurants in Phoenix, price tiers 3 and 4.
phoenix_url = 'https://api.yelp.com/v3/businesses/search?location=Phoenix&term=restaurants&categories=french&price=4&price=3&sort_by=best_match'
phoenix_restaurants = get_all_business_data(phoenix_url)

# Keep only the business ids; they are what the reviews lookup needs.
phoenix_restaurants_ids = [
    entry['business_id'] for entry in phoenix_restaurants
]

phoenix_list_of_reviews = get_reviews(phoenix_restaurants_ids, 'Phoenix')
print(len(phoenix_list_of_reviews))

12


#### Miami

In [142]:
# Search for French restaurants in Miami, price tiers 3 and 4.
miami_url = 'https://api.yelp.com/v3/businesses/search?location=Miami&term=restaurants&categories=french&price=4&price=3&sort_by=best_match'
miami_restaurants = get_all_business_data(miami_url)

# Keep only the business ids; they are what the reviews lookup needs.
miami_restaurants_ids = [
    entry['business_id'] for entry in miami_restaurants
]

miami_list_of_reviews = get_reviews(miami_restaurants_ids, 'Miami')
print(len(miami_list_of_reviews))

15


### Merge

In [153]:
# Prefixes of the per-city `<prefix>_list_of_reviews` variables built above.
cities = ['new_orleans', 'nyc', 'chicago', 'los_angeles', 'sf', 'philadelphia', 'las_vegas', 'houston', 'phoenix', 'miami']

# One DataFrame per city, looked up by variable name in the notebook's
# global namespace.
ouput_dfs = [
    pd.DataFrame(
        globals()[f'{city_key}_list_of_reviews'],
        columns=['text', 'rating', 'location'],
    )
    for city_key in cities
]

# Stack the freshly downloaded reviews, then append them to the reviews
# already held in `df`.
output = pd.concat(ouput_dfs, ignore_index=True)
df = pd.concat([df, output], ignore_index=True)

In [157]:
# Remove rows that are exact duplicates of an earlier row, modifying `df`
# in place.
df.drop_duplicates(inplace=True)
# Cell output: number of reviews for each rating value.
df['rating'].value_counts()

rating
5    367
4    134
3     69
2     29
1     19
Name: count, dtype: int64

In [158]:
# Save the merged, de-duplicated reviews to disk without the index column.
df.to_csv('pre-yelp_reviews.csv', index=False)
# Cell output: display the resulting DataFrame.
df

Unnamed: 0,text,rating,location
0,Robyn gave amazing service! So attentive and f...,5,Los Angeles
1,Headed downtown on a Thursday evening for a Ki...,5,Los Angeles
2,"Been here a few times, in just recent weeks. T...",4,Los Angeles
3,Service is fast. Staff is friendly. The food i...,5,Los Angeles
4,Walked by and asked to see a menu. Very helpfu...,3,Los Angeles
...,...,...,...
613,The Steak Tartare is absolutely yummy! Just as...,5,Phoenix
614,The culinary journey begins right at your tabl...,5,Miami
615,"Very nice ambiance. We went there at night, an...",4,New York City
616,M. whatever ... this is a hard pass.... I know...,1,New York City
