# Data Exploration and NLP Modeling 
## By BROSSEAU Alexandre & COGORDAN Alexandre

## Web scraping

In [44]:
import requests
import json
import time
import pandas as pd
import os

from dotenv import find_dotenv, load_dotenv

load_dotenv()

True

### We get the requests and the dataframe we've created so far

In [129]:
df = pd.read_csv('yelp_reviews.csv')

### We call our API key to start web scraping

In [38]:
api_key = os.getenv('YELP_API_KEY')
headers = {'Authorization': 'Bearer ' + api_key}

### We get the businesses' IDs

In [131]:
def get_all_business_ids(base_url):
    
    all_business_ids = []

    while True:
        response = requests.get(base_url, headers=headers)
        if response.status_code != 200:
            break  

        data = response.json()
        businesses = data.get('businesses', [])
        if not businesses:
            break  # Break the loop if no more businesses are returned

        for business in businesses:
            business_id = business.get('id')
            if business_id:
                all_business_ids.append(business_id)

        # Update the offset in the URL for the next request
        if 'offset=' in base_url:
            base_url = base_url.rsplit('offset=', 1)[0] + f'offset={len(all_business_ids)}'
        else:
            base_url += f'&offset={len(all_business_ids)}'

        time.sleep(1)  

    return all_business_ids


### We get the reviews from the business

OFFSET A CHANGER (+25) toutes les nouvelles requests

In [132]:
def get_reviews(restaurant_ids, city):
    list_of_reviews = []
    count = 0
    
    for i in range(len(restaurant_ids)):
        url2 = "https://api.yelp.com/v3/businesses/" + restaurant_ids[i] + "/reviews?offset=25&limit=25&sort_by=yelp_sort"
        response = requests.get(url2, headers=headers)
        reviews_data = response.json()
        
        try:
            for review in reviews_data['reviews']:
                review_dict = {'text': review['text'], 'rating': review['rating'],'location':city}
                list_of_reviews.append(review_dict)
                count += 1
                
                # We limit the number of reviews to 25 reviews per restaurant

                if count == 25: 
                    return list_of_reviews
        except:
            print("No reviews for this restaurant")
        
    return list_of_reviews

#### New Orleans

In [133]:
import requests

new_orleans_url = ('https://api.yelp.com/v3/businesses/search?location=New+Orleans&term=restaurants&categories=french&price=3&price=4&sort_by=best_match&limit=50&offset=0')

new_orleans_restaurant_ids = get_all_business_ids(new_orleans_url)

new_orleans_list_of_reviews = get_reviews(new_orleans_restaurant_ids,'New Orleans')

print(len(new_orleans_list_of_reviews))

25


#### New York City

In [134]:
nyc_url = ('https://api.yelp.com/v3/businesses/search?location=New+York+City&term=restaurants&categories=french&price=3&price=4&sort_by=best_match&limit=50&offset=0')

nyc_restaurant_ids = get_all_business_ids(nyc_url)

nyc_list_of_reviews = get_reviews(nyc_restaurant_ids,'New York City')

print(len(nyc_list_of_reviews))

25


#### Chicago

In [135]:
chicago_url = ('https://api.yelp.com/v3/businesses/search?location=Chicago&term=restaurants&categories=french&price=3&price=4&sort_by=best_match&limit=50&offset=0')

chicago_restaurant_ids = get_all_business_ids(chicago_url)

chicago_list_of_reviews = get_reviews(chicago_restaurant_ids,'Chicago')

print(len(chicago_list_of_reviews))

25


#### Los Angeles

In [136]:
los_angeles_url = "https://api.yelp.com/v3/businesses/search?location=Los+Angeles&term=restaurants&categories=french&price=4&price=3&sort_by=best_match&limit=50&offset=0"

los_angeles_restaurants = get_all_business_ids(los_angeles_url)

los_angeles_list_of_reviews = get_reviews(los_angeles_restaurants,'Los Angeles')

print(len(los_angeles_list_of_reviews))

25


#### San Francisco

In [137]:
sf_url = "https://api.yelp.com/v3/businesses/search?location=San+Francisco&term=restaurants&categories=french&price=4&price=3&sort_by=best_match&limit=50&offset=0"

san_francisco_restaurants = get_all_business_ids(sf_url)

sf_list_of_reviews = get_reviews(san_francisco_restaurants,'San Francisco')

print(len(sf_list_of_reviews))

25


#### Philadelphia

In [138]:
philadelphia_url = "https://api.yelp.com/v3/businesses/search?location=Philadelphia&term=restaurants&categories=french&price=4&price=3&sort_by=best_match&limit=50&offset=0"

philadelphia_restaurants = get_all_business_ids(philadelphia_url)

philadelphia_list_of_reviews = get_reviews(philadelphia_restaurants,'Philadelphia')

print(len(philadelphia_list_of_reviews))

24


#### Las Vegas

In [139]:
las_vegas_url = "https://api.yelp.com/v3/businesses/search?location=Las+Vegas&term=restaurants&categories=french&price=4&price=3&sort_by=best_match&limit=50&offset=0"

las_vegas_restaurants = get_all_business_ids(las_vegas_url)

las_vegas_list_of_reviews = get_reviews(las_vegas_restaurants,'Las Vegas')

print(len(las_vegas_list_of_reviews))

25


#### Houston

In [140]:
houston_url = "https://api.yelp.com/v3/businesses/search?location=Houston&term=restaurants&categories=french&price=4&price=3&sort_by=best_match&limit=50&offset=0"

houston_restaurants = get_all_business_ids(houston_url)

houston_list_of_reviews = get_reviews(houston_restaurants,'Houston')

print(len(houston_list_of_reviews))

25


#### Phoenix

In [141]:
phoenix_url = "https://api.yelp.com/v3/businesses/search?location=Phoenix&term=restaurants&categories=french&price=4&price=3&sort_by=best_match&limit=50&offset=0"

phoenix_restaurants = get_all_business_ids(phoenix_url)

phoenix_list_of_reviews = get_reviews(phoenix_restaurants,'Phoenix')

print(len(phoenix_list_of_reviews))

12


#### Miami

In [142]:
miami_url = "https://api.yelp.com/v3/businesses/search?location=Miami&term=restaurants&categories=french&price=4&price=3&sort_by=best_match&limit=50&offset=0"

miami_restaurants = get_all_business_ids(miami_url)

miami_list_of_reviews = get_reviews(miami_restaurants,'Miami')

print(len(miami_list_of_reviews))

15


### Merge

In [153]:
ouput_dfs = []

cities = ['new_orleans', 'nyc', 'chicago', 'los_angeles', 'sf', 'philadelphia', 'las_vegas', 'houston', 'phoenix', 'miami']

for city in cities:
    reviews_list = globals()[f'{city}_list_of_reviews']
    ouput_df = pd.DataFrame(reviews_list, columns=['text', 'rating', 'location'])
    ouput_dfs.append(ouput_df)

output = pd.concat(ouput_dfs, ignore_index=True)
df = pd.concat([df, output], ignore_index=True)

In [157]:
df.drop_duplicates(inplace=True)
df['rating'].value_counts()

rating
5    367
4    134
3     69
2     29
1     19
Name: count, dtype: int64

In [158]:
df.to_csv('yelp_reviews.csv', index=False)
df

Unnamed: 0,text,rating,location
0,Robyn gave amazing service! So attentive and f...,5,Los Angeles
1,Headed downtown on a Thursday evening for a Ki...,5,Los Angeles
2,"Been here a few times, in just recent weeks. T...",4,Los Angeles
3,Service is fast. Staff is friendly. The food i...,5,Los Angeles
4,Walked by and asked to see a menu. Very helpfu...,3,Los Angeles
...,...,...,...
613,The Steak Tartare is absolutely yummy! Just as...,5,Phoenix
614,The culinary journey begins right at your tabl...,5,Miami
615,"Very nice ambiance. We went there at night, an...",4,New York City
616,M. whatever ... this is a hard pass.... I know...,1,New York City


## Data Cleaning

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from textblob import TextBlob
from textblob import Word
from collections import Counter

In [3]:
df = pd.read_csv('yelp_reviews.csv')

In [4]:
df.drop_duplicates(inplace=True)
df.dropna(subset=['text', 'rating', 'location'], inplace=True)

Do we need to to the spelling correction before ot after the tokenization ??

In [5]:
stop_words = set(stopwords.words('english'))
 
def preprocess_text(text):
    # Lowercase
    lowercase_text = text.lower()
    # Tokenization
    tokens = word_tokenize(lowercase_text)
    # Remove punctuation and stop words
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

df['text'] = df['text'].astype(str)  # Convert the column to string
df['text'] = df['text'].apply(lambda x: str(TextBlob(x)))  # Apply TextBlob to each element

df['tokens'] = df['text'].apply(preprocess_text)
#df['tokens'] = df['tokens'].apply(lambda x: [Word(word).spellcheck() for word in x])  


In [6]:
# Word Frequency Analysis
all_words = [word for tokens in df['tokens'] for word in tokens]
word_freq = Counter(all_words)

# N-gram Analysis
bigrams = ngrams(all_words, 2)
bigram_freq = Counter(bigrams)

# Example: Display most common words and bigrams
print(word_freq.most_common(10))
print(bigram_freq.most_common(10))

[('food', 181), ('service', 140), ('restaurant', 123), ('great', 102), ('place', 93), ('french', 89), ('came', 83), ('dinner', 78), ('good', 73), ('experience', 67)]
[(('french', 'onion'), 22), (('onion', 'soup'), 22), (('dining', 'experience'), 17), (('food', 'service'), 15), (('new', 'york'), 15), (('restaurant', 'week'), 13), (('service', 'great'), 12), (('first', 'time'), 12), (('amazing', 'service'), 11), (('french', 'restaurant'), 10)]


## Summary, Translation & Generation

## Part 1 - testing and such

In [7]:
from dotenv import find_dotenv, load_dotenv
from transformers import pipeline, set_seed, T5Tokenizer, T5Model

load_dotenv(find_dotenv())

True

In [8]:
df=pd.read_csv('yelp_reviews.csv')

### We're using huggingface's pipelines

### Translation

In [82]:
# EU Translation

# eu_translator_tokenizer = T5Tokenizer.from_pretrained("t5-small")
# eu_translator_model = T5Model.from_pretrained("t5-small")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [83]:
# from transformers import T5Tokenizer

# input_ids = eu_translator_tokenizer.encode(input_text, return_tensors='pt')
# outputs = eu_translator_model.generate(input_ids)
# output_text = eu_translator_tokenizer.decode(outputs[0], skip_special_tokens=True)

TypeError: The current model class (T5Model) is not compatible with `.generate()`, as it doesn't have a language model head. Please use one of the following classes instead: {'T5ForConditionalGeneration'}

In [None]:
# Chinese translation

import re

def contains_chinese(text):
    return bool(re.search('[\u4e00-\u9fff]', text))

sentence = "这是一个例子 example"
if contains_chinese(sentence):
    print("Contains Chinese characters")
else:
    print("Does not contain Chinese characters")

### Generation

In [93]:
generator = pipeline("text-generation", model="gpt2")

set_seed(42)

generated_text = df['text'].apply(lambda x: generator(x, max_length=len(x) + 50, min_length=30))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [101]:
df['generated_text'] = generated_text.apply(lambda x: x[0]['generated_text'])

In [102]:
df

Unnamed: 0,text,rating,location,summarised_text,summarised_text_two,generated_text
0,Robyn gave amazing service! So attentive and f...,5,Los Angeles,[{'summary_text': 'Best server I've had in a w...,Best server I've had in a while! And the steak...,Robyn gave amazing service! So attentive and f...
1,Headed downtown on a Thursday evening for a Ki...,5,Los Angeles,[{'summary_text': 'Headed downtown on a Thursd...,Headed downtown on a Thursday evening for a Ki...,Headed downtown on a Thursday evening for a Ki...
2,"Been here a few times, in just recent weeks. T...",4,Los Angeles,[{'summary_text': 'I've been back to the hotel...,I've been back to the hotel a few times in rec...,"Been here a few times, in just recent weeks. T..."
3,Service is fast. Staff is friendly. The food i...,5,Los Angeles,[{'summary_text': 'The food is elevated to a w...,The food is elevated to a whole 'notha level. ...,Service is fast. Staff is friendly. The food i...
4,Walked by and asked to see a menu. Very helpfu...,3,Los Angeles,[{'summary_text': 'French-influenced concept i...,French-influenced concept is a nice change of ...,Walked by and asked to see a menu. Very helpfu...
...,...,...,...,...,...,...
613,The Steak Tartare is absolutely yummy! Just as...,5,Phoenix,[{'summary_text': 'The Steak Tartare is absolu...,The Steak Tartare is absolutely yummy! Just as...,The Steak Tartare is absolutely yummy! Just as...
614,The culinary journey begins right at your tabl...,5,Miami,[{'summary_text': 'The culinary journey begins...,The culinary journey begins right at your tabl...,The culinary journey begins right at your tabl...
615,"Very nice ambiance. We went there at night, an...",4,New York City,[{'summary_text': 'Inside was filled with warm...,Inside was filled with warm lighting with some...,"Very nice ambiance. We went there at night, an..."
616,M. whatever ... this is a hard pass.... I know...,1,New York City,[{'summary_text': 'M. whatever ... this is a h...,M. whatever ... this is a hard pass.... I know...,M. whatever ... this is a hard pass.... I know...


### Summarisation

In [73]:
average_length = df['text'].str.len().mean()
min_length = df['text'].str.len().min()
max_length = df['text'].str.len().max()
print('minimum:', min_length, 
      '\naverage', average_length, 
      ' \nmaximum', max_length)

minimum: 85 
average 153.38025889967636  
maximum 159


In [11]:
summariser = pipeline("summarization", model="facebook/bart-large-cnn")

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

For the summarisation task, we decided to go for half of the average review length.

In [32]:
summarised_text = df['generated_text'].apply(lambda x: summariser(x, max_length=round(153/2), min_length=20, do_sample=False))

Your max_length is set to 76, but your input_length is only 37. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
Your max_length is set to 76, but your input_length is only 41. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)
Your max_length is set to 76, but your input_length is only 38. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)
Your max_length is set to 76, but your input_length is only 56. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your max

In [34]:
df['summarised_text'] = summarised_text[0]['summary_text']

Unnamed: 0,text,rating,location,summarised_text
0,Robyn gave amazing service! So attentive and f...,5,Los Angeles,[{'summary_text': 'Best server I've had in a w...
1,Headed downtown on a Thursday evening for a Ki...,5,Los Angeles,[{'summary_text': 'Headed downtown on a Thursd...
2,"Been here a few times, in just recent weeks. T...",4,Los Angeles,[{'summary_text': 'I've been back to the hotel...
3,Service is fast. Staff is friendly. The food i...,5,Los Angeles,[{'summary_text': 'The food is elevated to a w...
4,Walked by and asked to see a menu. Very helpfu...,3,Los Angeles,[{'summary_text': 'French-influenced concept i...
...,...,...,...,...
613,The Steak Tartare is absolutely yummy! Just as...,5,Phoenix,[{'summary_text': 'The Steak Tartare is absolu...
614,The culinary journey begins right at your tabl...,5,Miami,[{'summary_text': 'The culinary journey begins...
615,"Very nice ambiance. We went there at night, an...",4,New York City,[{'summary_text': 'Inside was filled with warm...
616,M. whatever ... this is a hard pass.... I know...,1,New York City,[{'summary_text': 'M. whatever ... this is a h...


### Further generation

In [None]:
generator = pipeline("text-generation", model="gpt2")

set_seed(42)

generated_text = df['summarised_text'].apply(lambda x: generator(x, max_length=len(x) + 50, min_length=20))

Maybe use generation with GPT-3 to make sure that the reviews make sense!

### We're using GPT-3

### Generation

In [55]:
# from typing_extensions import TypeAliasType
# from langchain import PromptTemplate, LLMChain, OpenAI

# def generate_review(review):
#     template = "Can you correct the spelling mistakes and make sure that the sentence makes sense: {review}"

#     prompt = PromptTemplate(template, input_variables=["review"])

#     generation_llm = LLMChain(llm=OpenAI(model_name="gpt-3.5-turbo", temperature=1), prompt=prompt, verbose=True)

#     generation = generation_llm.predict(review=review)

#     return generation

ImportError: cannot import name 'TypeAliasType' from 'typing_extensions' (/Users/alexandrecogordan/miniconda3/envs/tensorflow/lib/python3.10/site-packages/typing_extensions.py)

## Part 2 - Further testing

In [60]:
import re

from transformers import pipeline
from langchain import PromptTemplate, LLMChain

df = pd.read_csv('yelp_reviews.csv')

### Parameters

In [10]:
max_length_coef = 1.5
min_length_coef = 2

### Translation

In [26]:
df = df[:20] # to remove

#### We'll check the sentences with any accents (as it is a good indicator of non-english latin languages).

In [12]:
def contains_accents(text):
    return bool(re.search('[^\x00-\x7F]', text))

accent_sentences = []

for i, text in enumerate(df['text']):
    if contains_accents(text):
        accent_sentences.append(text)

accent_sentences

['生活的本質就是快樂，如果日子都過得不快樂，那人生還有什麼意義，年底了，是不是該好好清清自己的心房，遠離那些讓你不開心的人和事物呢～\n\n再介紹一家也是美美的環境， 很有自己特色的法國料理，這一家跟之前我介紹過卓別林故居的.  Republique brunch ，米其林一星 Manzke都是同一個老闆，關係企業，...']

#### From what we've seen, the only language that needed to be translated was Chinese. The rest of the reviews are in english.

In [13]:
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")

def contains_chinese(text):
    return bool(re.search('[\u4e00-\u9fff]', text))

for i, text in enumerate(df['text']):
    if contains_chinese(text):
        df.loc[i, 'text'] = translator(text)[0]['translation_text']

### Summarisation

In [14]:
average_length = df['text'].str.len().mean()
min_length = df['text'].str.len().min()
max_length = df['text'].str.len().max()
print('minimum:', min_length, 
      '\naverage', average_length, 
      ' \nmaximum', max_length)

minimum: 147 
average 157.05  
maximum 200


In [15]:
summariser = pipeline("summarization", model="facebook/bart-large-cnn")

summarised_text = df['text'].apply(lambda x: summariser(x, max_length=round(len(x)/max_length_coef), min_length=round(len(x)/min_length_coef), do_sample=False))

Your max_length is set to 106, but your input_length is only 37. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
Your max_length is set to 105, but your input_length is only 41. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)
Your max_length is set to 103, but your input_length is only 38. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)
Your max_length is set to 106, but your input_length is only 56. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your

### Generation

#### With OpenAI

In [59]:
from langchain.chat_models import ChatOpenAI

api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')

def generate_review(review):
    template = "Can you correct the spelling mistakes in that review and make sure that its phrasing is correct.: {review}"

    prompt = PromptTemplate(template=template, input_variables=["review"])

    generation_llm = LLMChain(prompt=prompt,
                              llm=ChatOpenAI(model_name="gpt-3.5-turbo", # Essayer avec gpt-4 - Il faut un compte pro pour faire les requêtes
                                         temperature=0.3), 
                              verbose=True)

    generation = generation_llm.run(review)

    return generation

generate_review(df['text'][0])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mCan you correct the spelling mistakes in that review and make sure that its phrasing is correct.: Robyn gave amazing service! So attentive and friendly, and she knew her stuff and was very knowledgeable. Best server I've had in a while! And the steak frites[0m


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

#### With HuggingFace

In [61]:
from langchain import HuggingFaceHub

def generate_review(review):
    template = "Can you correct the spelling mistakes in that review and make sure that its phrasing is correct.: {review}"

    prompt = PromptTemplate(template=template, input_variables=["review"])

    llm_chain = LLMChain(prompt=prompt, 
                        llm=HuggingFaceHub(repo_id="google/flan-t5-xl",
                                           model_kwargs={"temperature":0.25}))

    generation = llm_chain.run(review)

    return generation

generate_review(df['text'][0])



ValidationError: 1 validation error for HuggingFaceHub
temperature
  extra fields not permitted (type=value_error.extra)

## Topic Modeling

### For our topic modeling, we'll use LDA (Latent Dirichlet Allocation)