# Data Exploration and NLP Modeling 
## By BROSSEAU Alexandre & COGORDAN Alexandre

In [128]:
import json
import os
import time
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

import pandas as pd
import requests

## Web scraping

### We load the reviews DataFrame we've built so far

In [129]:
# Load the reviews gathered so far from the CSV file kept next to the notebook.
df = pd.read_csv(filepath_or_buffer='yelp_reviews.csv')

### We set up our API key to authenticate the Yelp API requests

In [130]:
# The Yelp API key is read from the environment instead of being written in the
# notebook: a key stored in source is exposed to anyone who can read the file or
# its history. Define it before starting Jupyter, e.g. `export YELP_API_KEY=...`.
# NOTE: a key that has already been committed should be regenerated.
api_key = os.environ.get('YELP_API_KEY', '')
if not api_key:
    # Warn instead of raising so the cells that do not call the API remain usable.
    print('YELP_API_KEY is not set: Yelp API requests will be rejected until it is defined.')
# Authorization header sent with every request to the Yelp API.
headers = {'Authorization': 'Bearer ' + api_key}

### We get the businesses' IDs

In [131]:
def get_all_business_ids(base_url):
    """Collect the IDs of every business returned by a paginated search URL.

    The URL is requested repeatedly, moving its ``offset`` query parameter forward
    by the number of businesses received, until a page comes back empty or a
    request returns a status other than 200.

    Parameters
    ----------
    base_url : str
        Full search URL including its query string. An ``offset`` parameter is
        optional; when present, its value is the starting position.

    Returns
    -------
    list
        Business IDs in the order they were received. The list is partial when a
        request fails part-way through.

    Notes
    -----
    Uses the module-level ``headers`` dict for every request.
    """
    all_business_ids = []

    # Split the URL once so that only `offset` is rewritten between pages; every
    # other parameter (including repeated ones such as `price`) is preserved,
    # wherever it sits in the query string.
    url_parts = urlsplit(base_url)
    query_pairs = parse_qsl(url_parts.query, keep_blank_values=True)
    other_params = [(key, value) for key, value in query_pairs if key != 'offset']
    offset_values = [value for key, value in query_pairs if key == 'offset']
    offset = int(offset_values[-1]) if offset_values and offset_values[-1].isdigit() else 0

    while True:
        page_query = urlencode(other_params + [('offset', offset)])
        page_url = urlunsplit(url_parts._replace(query=page_query))

        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(page_url, headers=headers, timeout=10)
        if response.status_code != 200:
            # Report the failure so a partial result is not mistaken for a full one.
            print(f"Business search stopped at offset {offset}: HTTP {response.status_code}")
            break

        data = response.json()
        businesses = data.get('businesses', [])
        if not businesses:
            break  # Break the loop if no more businesses are returned

        for business in businesses:
            business_id = business.get('id')
            if business_id:
                all_business_ids.append(business_id)

        # Advance by the number of entries received (not by the number of IDs kept),
        # so a page containing entries without an `id` cannot be requested forever.
        offset += len(businesses)

        time.sleep(1)  # Pause between pages to stay gentle with the API

    return all_business_ids


### We get the reviews for each business

**Note:** the review `offset` must be increased by 25 for each new batch of requests.

In [132]:
def get_reviews(restaurant_ids, city, offset=25, max_reviews=25):
    """Fetch reviews for a list of businesses and tag each one with its city.

    Businesses are queried one after the other, and collection stops as soon as
    ``max_reviews`` reviews have been gathered **in total for this call** (the cap
    is not per restaurant).

    Parameters
    ----------
    restaurant_ids : sequence of str
        Business IDs to query, in order.
    city : str
        Value stored under the ``'location'`` key of every review.
    offset : int, default 25
        Value of the ``offset`` query parameter sent with each reviews request.
    max_reviews : int or None, default 25
        Maximum number of reviews returned by the call; ``None`` removes the cap.

    Returns
    -------
    list of dict
        One ``{'text', 'rating', 'location'}`` dict per review.

    Notes
    -----
    Uses the module-level ``headers`` dict for every request.
    """
    list_of_reviews = []
    if max_reviews is not None and max_reviews <= 0:
        return list_of_reviews

    for restaurant_id in restaurant_ids:
        url2 = (
            "https://api.yelp.com/v3/businesses/" + restaurant_id
            + f"/reviews?offset={offset}&limit=25&sort_by=yelp_sort"
        )
        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url2, headers=headers, timeout=10)

        try:
            # ValueError: body is not JSON; KeyError/TypeError: payload has no
            # usable 'reviews' list or a review lacks 'text'/'rating'.
            for review in response.json()['reviews']:
                list_of_reviews.append(
                    {'text': review['text'], 'rating': review['rating'], 'location': city}
                )

                # Stop once the overall cap for this call has been reached
                if max_reviews is not None and len(list_of_reviews) >= max_reviews:
                    return list_of_reviews
        except (KeyError, TypeError, ValueError):
            # Only data problems are swallowed; interrupts and network errors propagate.
            print("No reviews for this restaurant")

    return list_of_reviews

#### New Orleans

In [133]:
import requests

# Search query: French restaurants in New Orleans, price levels 3 and 4.
new_orleans_url = (
    'https://api.yelp.com/v3/businesses/search'
    '?location=New+Orleans&term=restaurants&categories=french'
    '&price=3&price=4&sort_by=best_match&limit=50&offset=0'
)

# Resolve the matching business IDs, then fetch their reviews tagged with the city.
new_orleans_restaurant_ids = get_all_business_ids(new_orleans_url)
new_orleans_list_of_reviews = get_reviews(new_orleans_restaurant_ids, 'New Orleans')

# Number of reviews collected for this city.
print(len(new_orleans_list_of_reviews))

25


#### New York City

In [134]:
# Search query: French restaurants in New York City, price levels 3 and 4.
nyc_url = (
    'https://api.yelp.com/v3/businesses/search'
    '?location=New+York+City&term=restaurants&categories=french'
    '&price=3&price=4&sort_by=best_match&limit=50&offset=0'
)

# Resolve the matching business IDs, then fetch their reviews tagged with the city.
nyc_restaurant_ids = get_all_business_ids(nyc_url)
nyc_list_of_reviews = get_reviews(nyc_restaurant_ids, 'New York City')

# Number of reviews collected for this city.
print(len(nyc_list_of_reviews))

25


#### Chicago

In [135]:
# Search query: French restaurants in Chicago, price levels 3 and 4.
chicago_url = (
    'https://api.yelp.com/v3/businesses/search'
    '?location=Chicago&term=restaurants&categories=french'
    '&price=3&price=4&sort_by=best_match&limit=50&offset=0'
)

# Resolve the matching business IDs, then fetch their reviews tagged with the city.
chicago_restaurant_ids = get_all_business_ids(chicago_url)
chicago_list_of_reviews = get_reviews(chicago_restaurant_ids, 'Chicago')

# Number of reviews collected for this city.
print(len(chicago_list_of_reviews))

25


#### Los Angeles

In [136]:
# Search query: French restaurants in Los Angeles, price levels 3 and 4.
los_angeles_url = (
    "https://api.yelp.com/v3/businesses/search"
    "?location=Los+Angeles&term=restaurants&categories=french"
    "&price=4&price=3&sort_by=best_match&limit=50&offset=0"
)

# Resolve the matching business IDs, then fetch their reviews tagged with the city.
los_angeles_restaurants = get_all_business_ids(los_angeles_url)
los_angeles_list_of_reviews = get_reviews(los_angeles_restaurants, 'Los Angeles')

# Number of reviews collected for this city.
print(len(los_angeles_list_of_reviews))

25


#### San Francisco

In [137]:
# Search query: French restaurants in San Francisco, price levels 3 and 4.
sf_url = (
    "https://api.yelp.com/v3/businesses/search"
    "?location=San+Francisco&term=restaurants&categories=french"
    "&price=4&price=3&sort_by=best_match&limit=50&offset=0"
)

# Resolve the matching business IDs, then fetch their reviews tagged with the city.
san_francisco_restaurants = get_all_business_ids(sf_url)
sf_list_of_reviews = get_reviews(san_francisco_restaurants, 'San Francisco')

# Number of reviews collected for this city.
print(len(sf_list_of_reviews))

25


#### Philadelphia

In [138]:
# Search query: French restaurants in Philadelphia, price levels 3 and 4.
philadelphia_url = (
    "https://api.yelp.com/v3/businesses/search"
    "?location=Philadelphia&term=restaurants&categories=french"
    "&price=4&price=3&sort_by=best_match&limit=50&offset=0"
)

# Resolve the matching business IDs, then fetch their reviews tagged with the city.
philadelphia_restaurants = get_all_business_ids(philadelphia_url)
philadelphia_list_of_reviews = get_reviews(philadelphia_restaurants, 'Philadelphia')

# Number of reviews collected for this city.
print(len(philadelphia_list_of_reviews))

24


#### Las Vegas

In [139]:
# Search query: French restaurants in Las Vegas, price levels 3 and 4.
las_vegas_url = (
    "https://api.yelp.com/v3/businesses/search"
    "?location=Las+Vegas&term=restaurants&categories=french"
    "&price=4&price=3&sort_by=best_match&limit=50&offset=0"
)

# Resolve the matching business IDs, then fetch their reviews tagged with the city.
las_vegas_restaurants = get_all_business_ids(las_vegas_url)
las_vegas_list_of_reviews = get_reviews(las_vegas_restaurants, 'Las Vegas')

# Number of reviews collected for this city.
print(len(las_vegas_list_of_reviews))

25


#### Houston

In [140]:
# Search query: French restaurants in Houston, price levels 3 and 4.
houston_url = (
    "https://api.yelp.com/v3/businesses/search"
    "?location=Houston&term=restaurants&categories=french"
    "&price=4&price=3&sort_by=best_match&limit=50&offset=0"
)

# Resolve the matching business IDs, then fetch their reviews tagged with the city.
houston_restaurants = get_all_business_ids(houston_url)
houston_list_of_reviews = get_reviews(houston_restaurants, 'Houston')

# Number of reviews collected for this city.
print(len(houston_list_of_reviews))

25


#### Phoenix

In [141]:
# Search query: French restaurants in Phoenix, price levels 3 and 4.
phoenix_url = (
    "https://api.yelp.com/v3/businesses/search"
    "?location=Phoenix&term=restaurants&categories=french"
    "&price=4&price=3&sort_by=best_match&limit=50&offset=0"
)

# Resolve the matching business IDs, then fetch their reviews tagged with the city.
phoenix_restaurants = get_all_business_ids(phoenix_url)
phoenix_list_of_reviews = get_reviews(phoenix_restaurants, 'Phoenix')

# Number of reviews collected for this city.
print(len(phoenix_list_of_reviews))

12


#### Miami

In [142]:
# Search query: French restaurants in Miami, price levels 3 and 4.
miami_url = (
    "https://api.yelp.com/v3/businesses/search"
    "?location=Miami&term=restaurants&categories=french"
    "&price=4&price=3&sort_by=best_match&limit=50&offset=0"
)

# Resolve the matching business IDs, then fetch their reviews tagged with the city.
miami_restaurants = get_all_business_ids(miami_url)
miami_list_of_reviews = get_reviews(miami_restaurants, 'Miami')

# Number of reviews collected for this city.
print(len(miami_list_of_reviews))

15


### Merge

In [153]:
# Prefixes of the module-level `<prefix>_list_of_reviews` lists built by the city cells.
cities = ['new_orleans', 'nyc', 'chicago', 'los_angeles', 'sf', 'philadelphia', 'las_vegas', 'houston', 'phoenix', 'miami']

# One DataFrame per city, built from the review list looked up by name.
ouput_dfs = [
    pd.DataFrame(globals()[f'{city}_list_of_reviews'], columns=['text', 'rating', 'location'])
    for city in cities
]

# Stack the freshly fetched reviews, then append them to the ones loaded earlier.
output = pd.concat(ouput_dfs, ignore_index=True)
df = pd.concat([df, output], ignore_index=True)

In [157]:
# Discard rows that are exact duplicates, then show how many reviews each rating has.
df = df.drop_duplicates()
df['rating'].value_counts()

rating
5    367
4    134
3     69
2     29
1     19
Name: count, dtype: int64

In [158]:
# Write the merged reviews back to the CSV (index column omitted), then display them.
df.to_csv(path_or_buf='yelp_reviews.csv', index=False)
df

Unnamed: 0,text,rating,location
0,Robyn gave amazing service! So attentive and f...,5,Los Angeles
1,Headed downtown on a Thursday evening for a Ki...,5,Los Angeles
2,"Been here a few times, in just recent weeks. T...",4,Los Angeles
3,Service is fast. Staff is friendly. The food i...,5,Los Angeles
4,Walked by and asked to see a menu. Very helpfu...,3,Los Angeles
...,...,...,...
613,The Steak Tartare is absolutely yummy! Just as...,5,Phoenix
614,The culinary journey begins right at your tabl...,5,Miami
615,"Very nice ambiance. We went there at night, an...",4,New York City
616,M. whatever ... this is a hard pass.... I know...,1,New York City


## Data Cleaning

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from textblob import TextBlob
from textblob import Word
from collections import Counter

In [8]:
# Reload the saved reviews so the cleaning steps start from the file on disk.
df = pd.read_csv(filepath_or_buffer='yelp_reviews.csv')

In [9]:
# Remove exact duplicate rows, then rows missing the text, the rating or the location.
df = (
    df
    .drop_duplicates()
    .dropna(subset=['text', 'rating', 'location'])
)

**TODO:** Should the spelling correction be done before or after the tokenization?

In [10]:
# English stop words used to filter tokens. The corpus is downloaded on demand so
# the cell also works on a machine where the NLTK data has not been installed yet.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

 
def preprocess_text(text):
    """Return the lowercase, purely alphabetic, non-stop-word tokens of ``text``.

    The text is lowercased, split with ``word_tokenize``, and every token that is
    not made only of letters or that belongs to the module-level ``stop_words``
    set is dropped. Token order is preserved.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation, numbers and mixed tokens, as well as stop words.
        if not token.isalpha() or token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Make sure every review is a plain string before it is tokenized.
df['text'] = df['text'].astype(str)

# The former `str(TextBlob(x))` round-trip was dropped: converting a TextBlob back
# to `str` returns the original text, so it corrected nothing and only cost time.
# TODO: decide whether spelling correction (e.g. `TextBlob(x).correct()` on the raw
# text, or `Word(word).spellcheck()` on each token) is worth its runtime, and
# whether it belongs before or after tokenization.

# Tokenize, lowercase and filter each review.
df['tokens'] = df['text'].apply(preprocess_text)


In [11]:
# Word Frequency Analysis
all_words = [word for tokens in df['tokens'] for word in tokens]
word_freq = Counter(all_words)

# N-gram Analysis
bigrams = ngrams(all_words, 2)
bigram_freq = Counter(bigrams)

# Example: Display most common words and bigrams
print(word_freq.most_common(10))
print(bigram_freq.most_common(10))

[('food', 138), ('service', 106), ('restaurant', 88), ('place', 77), ('great', 77), ('french', 71), ('came', 67), ('dinner', 61), ('good', 57), ('menu', 51)]
[(('french', 'onion'), 18), (('onion', 'soup'), 18), (('new', 'york'), 14), (('food', 'service'), 13), (('service', 'great'), 12), (('amazing', 'service'), 9), (('dining', 'experience'), 9), (('service', 'excellent'), 9), (('first', 'time'), 9), (('food', 'good'), 8)]


## Summary, Translation & Generation