# Data Exploration and NLP Modeling 
## By BROSSEAU Alexandre & COGORDAN Alexandre

In [90]:
import nltk
import requests
import time
import pandas as pd
import os
import re
import gensim
import gensim.corpora as corpora
import spacy
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import streamlit as st
import numpy as np
import tensorflow as tf
import tensorboard as tb

from scipy.spatial.distance import euclidean
from scipy.spatial.distance import cosine
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from textblob import TextBlob
from collections import Counter
from transformers import pipeline
from langchain import PromptTemplate, LLMChain
from dotenv import find_dotenv, load_dotenv
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, Word2Vec
from sklearn.manifold import TSNE
from tensorboard.plugins import projector

load_dotenv()

True

## Web scraping

### We get the requests and the dataframe we've created so far

In [91]:
df = pd.read_csv('yelp_reviews.csv', usecols=['text', 'rating', 'location'])

### We call our API key to start web scraping

In [92]:
api_key = os.getenv('YELP_API_KEY')
headers = {'Authorization': 'Bearer ' + api_key}

### We get the businesses' IDs

In [93]:
def get_all_business_data(base_url):
    
    #all_business_ids = []
    list_of_businesses = []

    while True:
        response = requests.get(base_url, headers=headers)
        if response.status_code != 200:
            break  

        data = response.json()
        businesses = data.get('businesses', [])

        # business_info = data.get('businesses')

        if not businesses:
            break  # Break the loop if no more businesses are returned

        for business in businesses:
            #business_id = business.get('id') # 'name', 'price', 'url', 'review_count', 'display_address', 'image_url', 'display_phone', 'categories'
            business_dict = {'restaurant_id': business.get('id'), 'business_name': business.get('name'), 'business_price': business.get('price'), 'business_url': business.get('url'), 'business_review_count': business.get('review_count'), 'business_display_address': business.get('location').get('display_address'), 'business_image_url': business.get('image_url'), 'business_display_phone': business.get('display_phone'), 'business_categories': business.get('categories')}
            list_of_businesses.append(business_dict)
            #if business_id:
                #all_business_ids.append(business_id)

        # Update the offset in the URL for the next request
        if 'offset=' in base_url:
            base_url = base_url.rsplit('offset=', 1)[0] + f'offset={len(list_of_businesses)}'
        else:
            base_url += f'&offset={len(list_of_businesses)}'

        time.sleep(1)  

    return list_of_businesses

### We get the reviews from the business

In [94]:
def get_reviews(business_data, city):
    restaurant_ids = [restaurant['restaurant_id'] for restaurant in business_data]

    business_df = pd.DataFrame()
    count = 0
    
    for restaurant_id in restaurant_ids:
        url2 = "https://api.yelp.com/v3/businesses/" + restaurant_id + "/reviews?sort_by=yelp_sort"
        response = requests.get(url2, headers=headers)
        reviews_data = response.json()
        
        try:
            for review in reviews_data['reviews']:
                business_df.loc[count, 'text'] = review['text']
                business_df.loc[count, 'rating'] = review['rating']
                business_df.loc[count, 'location'] = city
                business_df.loc[count, 'restaurant_id'] = restaurant_id
                count += 1

        except:
            print("No reviews for this restaurant")
        
    return business_df

In [95]:
# cities = ['San+Diego', 'San+Jose', 'Seattle', 'Maryville', 'Salt+Lake+City', 'Oklahoma+City', 'Austin', 'Louisville', 'Indianapolis', 'Portland', 'Santa+Cruz', 'Jacksonville', 'Hudson', 'Dallas', 'Phoenix', 'Asheville']
# 'Memphis', 'Boston', 'Seattle', 'Denver', 'Washington', 'Nashville', 'Baltimore', 'Oklahoma+City', 'Louisville', 'Portland', 'Las+Vegas', 'Milwaukee', 'Albuquerque', 'Tucson', 'Fresno', 'Sacramento', 'Long+Beach', 'Kansas+City', 'Mesa'
# 'Atlanta', 'Raleigh', 'Miami', 'Omaha', 'Oakland', 'Tulsa', 'Minneapolis', 'Cleveland', 'Wichita', 'New+York', 'Los+Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'San+Antonio', 'San+Diego', 'Dallas', 'San+Jose', 'Austin', 'Jacksonville', 'Indianapolis', 'San+Francisco', 'Columbus', 'Fort+Worth', 'Charlotte', 'Detroit', 'El+Paso', 'Arlington', 'New+Orleans', 'Bakersfield', 'Tampa', 'Honolulu', 'Aurora', 'Anaheim', 'Santa+Ana', 'St.+Louis',

cities = [
    'Riverside', 'Corpus+Christi', 'Lexington', 'Pittsburgh', 'Anchorage', 'Stockton', 'Cincinnati', 'Saint+Paul',
    'Toledo', 'Newark', 'Greensboro', 'Plano', 'Henderson', 'Lincoln', 'Buffalo', 'Fort+Wayne', 'Jersey+City',
    'Chula+Vista', 'Orlando', 'St.+Petersburg', 'Norfolk', 'Chandler', 'Laredo', 'Madison', 'Durham', 'Lubbock',
    'Winston-Salem', 'Garland', 'Glendale', 'Hialeah', 'Reno', 'Baton+Rouge', 'Irvine', 'Chesapeake', 'Irving',
    'Scottsdale', 'North+Las+Vegas', 'Fremont', 'Gilbert', 'San+Bernardino', 'Boise', 'Birmingham', 'Rochester',
    'Richmond', 'Spokane', 'Des+Moines', 'Montgomery', 'Modesto', 'Fayetteville', 'Tacoma', 'Shreveport', 'Fontana',
    'Oxnard', 'Aurora', 'Moreno+Valley', 'Akron', 'Yonkers', 'Columbus', 'Augusta', 'Little+Rock', 'Amarillo', 'Mobile',
    'Huntington+Beach', 'Glendale', 'Grand+Rapids', 'Salt+Lake+City', 'Tallahassee', 'Huntsville', 'Worcester',
    'Knoxville', 'Grand+Prairie', 'Newport+News', 'Brownsville', 'Santa+Clarita', 'Overland+Park', 'Providence',
    'Garden+Grove', 'Chattanooga', 'Oceanside', 'Santa+Rosa', 'Fort+Lauderdale', 'Rancho+Cucamonga', 'Port+St.+Lucie',
    'Ontario', 'Vancouver', 'Tempe', 'Springfield', 'Lancaster', 'Eugene', 'Pembroke+Pines', 'Salem', 'Cape+Coral',
    'Peoria', 'Sioux+Falls', 'Springfield', 'Elk+Grove', 'Rockford', 'Palmdale', 'Corona', 'Salinas', 'Pomona', 'Pasadena',
    'Joliet', 'Paterson', 'Kansas+City', 'Torrance', 'Syracuse', 'Bridgeport', 'Hayward', 'Fort+Collins', 'Escondido',
    'Lakewood', 'Naperville', 'Dayton', 'Hollywood', 'Sunnyvale', 'Alexandria', 'Mesquite', 'Hampton', 'Pasadena',
    'Orange', 'Savannah', 'Cary', 'Fullerton', 'Warren', 'Clarksville', 'McKinney', 'McAllen', 'New+Haven', 'Sterling+Heights',
    'West+Valley+City', 'Columbia', 'Killeen', 'Topeka', 'Thousand+Oaks', 'Cedar+Rapids', 'Olathe', 'Elizabeth', 'Waco',
    'Hartford', 'Visalia', 'Gainesville', 'Simi+Valley', 'Stamford', 'Bellevue', 'Concord', 'Miramar', 'Coral+Springs',
    'Lafayette', 'Charleston', 'Carrollton', 'Roseville', 'Thornton', 'Beaumont', 'Allentown', 'Surprise', 'Evansville',
    'Abilene', 'Frisco', 'Independence', 'Santa+Clara', 'Springfield', 'Vallejo', 'Victorville', 'Athens', 'Peoria',
    'Lansing', 'Ann+Arbor', 'El+Monte', 'Denton', 'Berkeley', 'Provo', 'Downey', 'Midland', 'Norman', 'Waterbury',
    'Costa+Mesa', 'Inglewood', 'Manchester', 'Murfreesboro', 'Columbia', 'Elgin', 'Clearwater', 'Miami+Gardens',
    'Rochester', 'Pueblo', 'Lowell', 'Wilmington', 'Arvada', 'Ventura', 'Westminster', 'West+Covina', 'Gresham',
    'Fargo', 'Norwalk', 'Carlsbad', 'Fairfield', 'Cambridge', 'Wichita+Falls', 'High+Point', 'Billings', 'Green+Bay',
    'West+Jordan', 'Richmond', 'Murrieta', 'Burbank', 'Palm+Bay', 'Everett', 'Flint', 'Antioch', 'Erie', 'South+Bend',
    'Daly+City', 'Centennial', 'Temecula', 'Rialto', 'Thornton', 'El+Paso', 'San+Mateo', 'Midland', 'Davenport', 'Santa+Monica',
    'Sandy+Springs', 'Boulder', 'Hillsboro', 'Frisco', 'Greeley', 'San+Bernardino', 'Jurupa+Valley', 'Kenosha', 'Rochester',
    'Olathe', 'Newport+Beach', 'Topeka', 'Athens', 'Santa+Clarita', 'Simi+Valley', 'Columbia', 'Concord', 'Lafayette',
    'Charleston', 'Carrollton', 'Roseville', 'Thornton'
]

business_df = pd.DataFrame()
business_data = pd.DataFrame()

for city in cities:
    url = (f'https://api.yelp.com/v3/businesses/search?location={city}&term=restaurants&categories=french&price=3&price=4&sort_by=best_match')
    business_id = get_all_business_data(url)
    business_data = pd.concat([business_data, pd.DataFrame(business_id)], ignore_index=True)
    business_df = pd.concat([business_df, get_reviews(business_id, city)], ignore_index=True)

From the moment you're greeted! Ambiance is amazing! Love the decor, it's intimate and beautiful!

Food is...' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  business_df.loc[count, 'text'] = review['text']
  business_df.loc[count, 'location'] = city
  business_df.loc[count, 'restaurant_id'] = restaurant_id

When you walk in, the...' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  business_df.loc[count, 'text'] = review['text']
  business_df.loc[count, 'location'] = city
  business_df.loc[count, 'restaurant_id'] = restaurant_id
  business_df.loc[count, 'text'] = review['text']
  business_df.loc[count, 'location'] = city
  business_df.loc[count, 'restaurant_id'] = restaurant_id
  business_df.loc[count, 'text'] = review['text']
  business_df.loc[count, 'location'] = city
  business_df.loc[count, 'restaurant_id'] = restaurant_id
  business_df.loc[count, 'text'] = review['text']
  business_df.loc[count, '

No reviews for this restaurant



I loved everything. The wine was phenomenal. 
I...' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  business_df.loc[count, 'text'] = review['text']
  business_df.loc[count, 'location'] = city
  business_df.loc[count, 'restaurant_id'] = restaurant_id
  business_df.loc[count, 'text'] = review['text']
  business_df.loc[count, 'location'] = city
  business_df.loc[count, 'restaurant_id'] = restaurant_id

We made reservations beacuse...' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  business_df.loc[count, 'text'] = review['text']
  business_df.loc[count, 'location'] = city
  business_df.loc[count, 'restaurant_id'] = restaurant_id
  business_df.loc[count, 'text'] = review['text']
  business_df.loc[count, 'location'] = city
  business_df.loc[count, 'restaurant_id'] = restaurant_id
Great drinks are provided on the menu as well as great food...' has dtype incompatible with float64, please explicitly cast to 

No reviews for this restaurant


  business_df.loc[count, 'text'] = review['text']
  business_df.loc[count, 'location'] = city
  business_df.loc[count, 'restaurant_id'] = restaurant_id

I would honestly...' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  business_df.loc[count, 'text'] = review['text']
  business_df.loc[count, 'location'] = city
  business_df.loc[count, 'restaurant_id'] = restaurant_id
  business_df.loc[count, 'text'] = review['text']
  business_df.loc[count, 'location'] = city
  business_df.loc[count, 'restaurant_id'] = restaurant_id
After my last few downtown restaurant visits I will be honest that my expectation was not very high
. From the time we walked...' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  business_df.loc[count, 'text'] = review['text']
  business_df.loc[count, 'location'] = city
  business_df.loc[count, 'restaurant_id'] = restaurant_id
  business_df.loc[count, 'text'] = review['text']
  business_d

No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant


Il faudra juste faire un merge demain!

In [96]:
business_df['location'].unique()
# 2-Ba1KvwdY7MZQ3CSqB_1b1G8L_yFEHMN3GKT9wJYQD7rcI6GMzwEH1Q9p_fkJ-SdB01Nd63EcOZrMtiEC63V9zLLqRIpwrz7q2ne5mUwZ-utvwdbEbIntIkAdKjZXYx

array(['Atlanta', 'Raleigh', 'Miami', 'Omaha', 'Oakland', 'Tulsa',
       'Minneapolis', 'Cleveland', 'New+York', 'Los+Angeles', 'Chicago',
       'Houston', 'Phoenix', 'Philadelphia', 'San+Antonio', 'San+Diego',
       'Dallas', 'San+Jose', 'Austin', 'Jacksonville', 'Indianapolis',
       'San+Francisco', 'Columbus', 'Fort+Worth', 'Charlotte', 'Detroit',
       'El+Paso', 'Arlington', 'New+Orleans', 'Tampa', 'Honolulu',
       'Anaheim', 'Santa+Ana', 'St.+Louis', 'Riverside'], dtype=object)

In [97]:
business_df.to_csv('df-export-three.csv', index=False)
business_data.to_csv('id-export-three.csv', index=False)

#### New Orleans

In [7]:
import requests

new_orleans_url = ('https://api.yelp.com/v3/businesses/search?location=New+Orleans&term=restaurants&categories=french&price=3&price=4&sort_by=best_match')

new_orleans_restaurants_data = get_all_business_data(new_orleans_url)

new_orleans_restaurants_ids = [restaurant['business_id'] for restaurant in new_orleans_restaurants_data]

new_orleans_list_of_reviews = get_reviews(new_orleans_restaurants_ids,'New Orleans')

print(len(new_orleans_list_of_reviews))

No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
0


#### New York City

In [8]:
nyc_url = ('https://api.yelp.com/v3/businesses/search?location=New+York+City&term=restaurants&categories=french&price=3&price=4&sort_by=best_match')

nyc_restaurants_data = get_all_business_data(nyc_url)

nyc_restaurants_ids = [restaurant['business_id'] for restaurant in nyc_restaurants_data]

nyc_list_of_reviews = get_reviews(nyc_restaurants_ids,'New York City')

print(len(nyc_list_of_reviews))

No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant
No reviews for this restaurant


KeyboardInterrupt: 

#### Chicago

In [135]:
chicago_url = ('https://api.yelp.com/v3/businesses/search?location=Chicago&term=restaurants&categories=french&price=3&price=4&sort_by=best_match')

chicago_restaurants_data = get_all_business_data(chicago_url)

chicago_restaurants_ids = [restaurant['business_id'] for restaurant in chicago_restaurants_data]   

chicago_list_of_reviews = get_reviews(chicago_restaurants_ids,'Chicago')

print(len(chicago_list_of_reviews))

25


#### Los Angeles

In [136]:
los_angeles_url = "https://api.yelp.com/v3/businesses/search?location=Los+Angeles&term=restaurants&categories=french&price=4&price=3&sort_by=best_match"

los_angeles_restaurants_data = get_all_business_data(los_angeles_url)

los_angeles_restaurants_ids = [restaurant['business_id'] for restaurant in los_angeles_restaurants_data]

los_angeles_list_of_reviews = get_reviews(los_angeles_restaurants_ids,'Los Angeles')

print(len(los_angeles_list_of_reviews))

25


#### San Francisco

In [137]:
sf_url = "https://api.yelp.com/v3/businesses/search?location=San+Francisco&term=restaurants&categories=french&price=4&price=3&sort_by=best_match"

san_francisco_restaurants = get_all_business_data(sf_url)

san_francisco_restaurants_ids = [restaurant['business_id'] for restaurant in san_francisco_restaurants]

sf_list_of_reviews = get_reviews(san_francisco_restaurants_ids,'San Francisco')

print(len(sf_list_of_reviews))

25


#### Philadelphia

In [138]:
philadelphia_url = "https://api.yelp.com/v3/businesses/search?location=Philadelphia&term=restaurants&categories=french&price=4&price=3&sort_by=best_match"

philadelphia_restaurants = get_all_business_data(philadelphia_url)

philadelphia_restaurants_ids = [restaurant['business_id'] for restaurant in philadelphia_restaurants]

philadelphia_list_of_reviews = get_reviews(philadelphia_restaurants_ids,'Philadelphia')

print(len(philadelphia_list_of_reviews))

24


#### Las Vegas

In [139]:
las_vegas_url = "https://api.yelp.com/v3/businesses/search?location=Las+Vegas&term=restaurants&categories=french&price=4&price=3&sort_by=best_match"

las_vegas_restaurants = get_all_business_data(las_vegas_url)

las_vegas_restaurants_ids = [restaurant['business_id'] for restaurant in las_vegas_restaurants]

las_vegas_list_of_reviews = get_reviews(las_vegas_restaurants_ids,'Las Vegas')

print(len(las_vegas_list_of_reviews))

25


#### Houston

In [140]:
houston_url = "https://api.yelp.com/v3/businesses/search?location=Houston&term=restaurants&categories=french&price=4&price=3&sort_by=best_match"

houston_restaurants = get_all_business_data(houston_url)

houston_restaurants_ids = [restaurant['business_id'] for restaurant in houston_restaurants]

houston_list_of_reviews = get_reviews(houston_restaurants_ids,'Houston')

print(len(houston_list_of_reviews))

25


#### Phoenix

In [141]:
phoenix_url = "https://api.yelp.com/v3/businesses/search?location=Phoenix&term=restaurants&categories=french&price=4&price=3&sort_by=best_match"

phoenix_restaurants = get_all_business_data(phoenix_url)

phoenix_restaurants_ids = [restaurant['business_id'] for restaurant in phoenix_restaurants]

phoenix_list_of_reviews = get_reviews(phoenix_restaurants_ids,'Phoenix')

print(len(phoenix_list_of_reviews))

12


#### Miami

In [142]:
miami_url = "https://api.yelp.com/v3/businesses/search?location=Miami&term=restaurants&categories=french&price=4&price=3&sort_by=best_match"

miami_restaurants = get_all_business_data(miami_url)

miami_restaurants_ids = [restaurant['business_id'] for restaurant in miami_restaurants]

miami_list_of_reviews = get_reviews(miami_restaurants_ids,'Miami')

print(len(miami_list_of_reviews))

15


### Merge

In [153]:
ouput_dfs = []

cities = ['new_orleans', 'nyc', 'chicago', 'los_angeles', 'sf', 'philadelphia', 'las_vegas', 'houston', 'phoenix', 'miami']

for city in cities:
    reviews_list = globals()[f'{city}_list_of_reviews']
    ouput_df = pd.DataFrame(reviews_list, columns=['text', 'rating', 'location'])
    ouput_dfs.append(ouput_df)

output = pd.concat(ouput_dfs, ignore_index=True)
df = pd.concat([df, output], ignore_index=True)

In [157]:
df.drop_duplicates(inplace=True)
df['rating'].value_counts()

rating
5    367
4    134
3     69
2     29
1     19
Name: count, dtype: int64

In [158]:
df.to_csv('pre-yelp_reviews.csv', index=False)
df

Unnamed: 0,text,rating,location
0,Robyn gave amazing service! So attentive and f...,5,Los Angeles
1,Headed downtown on a Thursday evening for a Ki...,5,Los Angeles
2,"Been here a few times, in just recent weeks. T...",4,Los Angeles
3,Service is fast. Staff is friendly. The food i...,5,Los Angeles
4,Walked by and asked to see a menu. Very helpfu...,3,Los Angeles
...,...,...,...
613,The Steak Tartare is absolutely yummy! Just as...,5,Phoenix
614,The culinary journey begins right at your tabl...,5,Miami
615,"Very nice ambiance. We went there at night, an...",4,New York City
616,M. whatever ... this is a hard pass.... I know...,1,New York City
