# Data Exploration and NLP Modeling 
## By BROSSEAU Alexandre & COGORDAN Alexandre

In [3]:
import nltk
import requests
import time
import pandas as pd
import os
import re
import gensim
import gensim.corpora as corpora
import spacy
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import streamlit as st
import numpy as np
import tensorflow as tf
import tensorboard as tb

from scipy.spatial.distance import euclidean
from scipy.spatial.distance import cosine
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from textblob import TextBlob
from collections import Counter
from transformers import pipeline
from langchain import PromptTemplate, LLMChain
from dotenv import find_dotenv, load_dotenv
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, Word2Vec
from sklearn.manifold import TSNE
from tensorboard.plugins import projector

load_dotenv()

/Users/alexandrecogordan/miniconda3/envs/tensorflow/lib/python3.10/site-packages/huggingface_hub/inference/_text_generation.py:121: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  @validator("best_of")
/Users/alexandrecogordan/miniconda3/envs/tensorflow/lib/python3.10/site-packages/huggingface_hub/inference/_text_generation.py:140: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  @validator("repetition_penalty")
/Users/alexandrecogorda

True

## Parameters

In [4]:
max_length_coef = 1.5
min_length_coef = 2

## Web scraping

### We get the requests and the dataframe we've created so far

In [2]:
df = pd.read_csv('yelp_reviews.csv')

NameError: name 'pd' is not defined

### We call our API key to start web scraping

In [3]:
api_key = os.getenv('YELP_API_KEY')
headers = {'Authorization': 'Bearer ' + api_key}

### We get the businesses' IDs

In [32]:
def get_all_business_ids(base_url):
    
    all_business_ids = []

    while True:
        response = requests.get(base_url, headers=headers)
        if response.status_code != 200:
            break  

        data = response.json()
        businesses = data.get('businesses', [])

        # business_info = data.get('businesses')

        if not businesses:
            break  # Break the loop if no more businesses are returned

        for business in businesses:
            business_id = business.get('id') # 'name', 'price', 'url', 'review_count', 'display_address', 'image_url', 'display_phone', 'categories'
            if business_id:
                all_business_ids.append(business_id)

        # Update the offset in the URL for the next request
        if 'offset=' in base_url:
            base_url = base_url.rsplit('offset=', 1)[0] + f'offset={len(all_business_ids)}'
        else:
            base_url += f'&offset={len(all_business_ids)}'

        time.sleep(1)  

    return all_business_ids


In [33]:
get_all_business_ids('https://api.yelp.com/v3/businesses/search?location=New%20York&limit=50')

{'businesses': [{'id': 'veq1Bl1DW3UWMekZJUsG1Q', 'alias': 'gramercy-tavern-new-york', 'name': 'Gramercy Tavern', 'image_url': 'https://s3-media2.fl.yelpcdn.com/bphoto/l2oSnhyvJfWT6bufumBMzw/o.jpg', 'is_closed': False, 'url': 'https://www.yelp.com/biz/gramercy-tavern-new-york?adjust_creative=cM1zrjoabjXir-vOTu86eg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=cM1zrjoabjXir-vOTu86eg', 'review_count': 3441, 'categories': [{'alias': 'newamerican', 'title': 'New American'}], 'rating': 4.5, 'coordinates': {'latitude': 40.73844, 'longitude': -73.98825}, 'transactions': ['pickup'], 'price': '$$$$', 'location': {'address1': '42 E 20th St', 'address2': '', 'address3': '', 'city': 'New York', 'zip_code': '10003', 'country': 'US', 'state': 'NY', 'display_address': ['42 E 20th St', 'New York, NY 10003']}, 'phone': '+12124770777', 'display_phone': '(212) 477-0777', 'distance': 3695.6399277648}, {'id': 'B3_K2kUVbYOU0VaLcj_LTw', 'alias': 'thai-villa-new-york-2', 'name': 'Thai V

KeyboardInterrupt: 

In [26]:
businesses_elements

Unnamed: 0,name,review_count,url


### We get the reviews from the business

In [132]:
def get_reviews(restaurant_ids, city):
    list_of_reviews = []
    count = 0
    
    for i in range(len(restaurant_ids)):
        url2 = "https://api.yelp.com/v3/businesses/" + restaurant_ids[0][i] + "/reviews?sort_by=yelp_sort"
        response = requests.get(url2, headers=headers)
        reviews_data = response.json()
        
        try:
            for review in reviews_data['reviews']:
                review_dict = {'text': review['text'], 'rating': review['rating'],'location':city}
                list_of_reviews.append(review_dict)
                count += 1
                
                # We limit the number of reviews to 25 reviews per restaurant

                if count == 25: 
                    return list_of_reviews
        except:
            print("No reviews for this restaurant")
        
    return list_of_reviews

#### New Orleans

In [133]:
import requests

new_orleans_url = ('https://api.yelp.com/v3/businesses/search?location=New+Orleans&term=restaurants&categories=french&price=3&price=4&sort_by=best_match')

new_orleans_restaurant_ids = get_all_business_ids(new_orleans_url)

new_orleans_list_of_reviews = get_reviews(new_orleans_restaurant_ids,'New Orleans')

print(len(new_orleans_list_of_reviews))

25


#### New York City

In [134]:
nyc_url = ('https://api.yelp.com/v3/businesses/search?location=New+York+City&term=restaurants&categories=french&price=3&price=4&sort_by=best_match')

nyc_restaurant_ids = get_all_business_ids(nyc_url)

nyc_list_of_reviews = get_reviews(nyc_restaurant_ids,'New York City')

print(len(nyc_list_of_reviews))

25


#### Chicago

In [135]:
chicago_url = ('https://api.yelp.com/v3/businesses/search?location=Chicago&term=restaurants&categories=french&price=3&price=4&sort_by=best_match')

chicago_restaurant_ids = get_all_business_ids(chicago_url)

chicago_list_of_reviews = get_reviews(chicago_restaurant_ids,'Chicago')

print(len(chicago_list_of_reviews))

25


#### Los Angeles

In [136]:
los_angeles_url = "https://api.yelp.com/v3/businesses/search?location=Los+Angeles&term=restaurants&categories=french&price=4&price=3&sort_by=best_match"

los_angeles_restaurants = get_all_business_ids(los_angeles_url)

los_angeles_list_of_reviews = get_reviews(los_angeles_restaurants,'Los Angeles')

print(len(los_angeles_list_of_reviews))

25


#### San Francisco

In [137]:
sf_url = "https://api.yelp.com/v3/businesses/search?location=San+Francisco&term=restaurants&categories=french&price=4&price=3&sort_by=best_match"

san_francisco_restaurants = get_all_business_ids(sf_url)

sf_list_of_reviews = get_reviews(san_francisco_restaurants,'San Francisco')

print(len(sf_list_of_reviews))

25


#### Philadelphia

In [138]:
philadelphia_url = "https://api.yelp.com/v3/businesses/search?location=Philadelphia&term=restaurants&categories=french&price=4&price=3&sort_by=best_match"

philadelphia_restaurants = get_all_business_ids(philadelphia_url)

philadelphia_list_of_reviews = get_reviews(philadelphia_restaurants,'Philadelphia')

print(len(philadelphia_list_of_reviews))

24


#### Las Vegas

In [139]:
las_vegas_url = "https://api.yelp.com/v3/businesses/search?location=Las+Vegas&term=restaurants&categories=french&price=4&price=3&sort_by=best_match"

las_vegas_restaurants = get_all_business_ids(las_vegas_url)

las_vegas_list_of_reviews = get_reviews(las_vegas_restaurants,'Las Vegas')

print(len(las_vegas_list_of_reviews))

25


#### Houston

In [140]:
houston_url = "https://api.yelp.com/v3/businesses/search?location=Houston&term=restaurants&categories=french&price=4&price=3&sort_by=best_match"

houston_restaurants = get_all_business_ids(houston_url)

houston_list_of_reviews = get_reviews(houston_restaurants,'Houston')

print(len(houston_list_of_reviews))

25


#### Phoenix

In [141]:
phoenix_url = "https://api.yelp.com/v3/businesses/search?location=Phoenix&term=restaurants&categories=french&price=4&price=3&sort_by=best_match"

phoenix_restaurants = get_all_business_ids(phoenix_url)

phoenix_list_of_reviews = get_reviews(phoenix_restaurants,'Phoenix')

print(len(phoenix_list_of_reviews))

12


#### Miami

In [142]:
miami_url = "https://api.yelp.com/v3/businesses/search?location=Miami&term=restaurants&categories=french&price=4&price=3&sort_by=best_match"

miami_restaurants = get_all_business_ids(miami_url)

miami_list_of_reviews = get_reviews(miami_restaurants,'Miami')

print(len(miami_list_of_reviews))

15


### Merge

In [153]:
ouput_dfs = []

cities = ['new_orleans', 'nyc', 'chicago', 'los_angeles', 'sf', 'philadelphia', 'las_vegas', 'houston', 'phoenix', 'miami']

for city in cities:
    reviews_list = globals()[f'{city}_list_of_reviews']
    ouput_df = pd.DataFrame(reviews_list, columns=['text', 'rating', 'location'])
    ouput_dfs.append(ouput_df)

output = pd.concat(ouput_dfs, ignore_index=True)
df = pd.concat([df, output], ignore_index=True)

In [157]:
df.drop_duplicates(inplace=True)
df['rating'].value_counts()

rating
5    367
4    134
3     69
2     29
1     19
Name: count, dtype: int64

In [158]:
df.to_csv('yelp_reviews.csv', index=False)
df

Unnamed: 0,text,rating,location
0,Robyn gave amazing service! So attentive and f...,5,Los Angeles
1,Headed downtown on a Thursday evening for a Ki...,5,Los Angeles
2,"Been here a few times, in just recent weeks. T...",4,Los Angeles
3,Service is fast. Staff is friendly. The food i...,5,Los Angeles
4,Walked by and asked to see a menu. Very helpfu...,3,Los Angeles
...,...,...,...
613,The Steak Tartare is absolutely yummy! Just as...,5,Phoenix
614,The culinary journey begins right at your tabl...,5,Miami
615,"Very nice ambiance. We went there at night, an...",4,New York City
616,M. whatever ... this is a hard pass.... I know...,1,New York City


## Data Cleaning, Summarisation, Translation & Generation

In [16]:
df = pd.read_csv('current_yelp_reviews.csv')

In [17]:
df.drop_duplicates(inplace=True)
df.dropna(subset=['text', 'rating', 'location'], inplace=True)

In [5]:
# Test phase, therefore we use 20 rows
df = df[0:20]

### Translation

#### We'll check the sentences with any accents (as it is a good indicator of non-english latin languages).

In [8]:
def contains_accents(text):
    return bool(re.search('[^\x00-\x7F]', text))

accent_sentences = []

for i, text in enumerate(df['text']):
    if contains_accents(text):
        accent_sentences.append(text)

accent_sentences

['生活的本質就是快樂，如果日子都過得不快樂，那人生還有什麼意義，年底了，是不是該好好清清自己的心房，遠離那些讓你不開心的人和事物呢～\n\n再介紹一家也是美美的環境， 很有自己特色的法國料理，這一家跟之前我介紹過卓別林故居的.  Republique brunch ，米其林一星 Manzke都是同一個老闆，關係企業，...',
 "OMG.\xa0 Yes !!\xa0 Top notch !!  Delicious !!  Perfection !!  Gorgeous !!  I can't say enough about 208 Rodeo.\xa0 It's in a class by itself !\n\nFirst of all the...",
 'Delicious French food with the perfect twist. The big mec is heavenly and everyone should try it. We also enjoyed the fennel salad and mushroom soup. Crème...',
 "The most amazing filet I've ever eaten. It was cooked to perfection, so tender, easy to eat. The crème brûlée is also the best ive ever had. Ozzy is an...",
 "Escargot\nRoasted beet salad vinegar être\nBaked potato tartine\nFilet with foie gras\nWatermelon salad\nMussels\nFrench onion soup\nIt's good food, good service...",
 "My friend and I came to La Sirène and overall had a good experience. \n\nI would say the only thing is this restaurant's deals were amazing and the food is...",
 "Brunch 

#### From what we've seen, the only language that needed to be translated was Chinese. The rest of the reviews are in english.

In [9]:
# We've used a chinese translator model to translate the chinese sentences
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")

def contains_chinese(text):
    return bool(re.search('[\u4e00-\u9fff]', text))

def translate_text(text):
    if contains_chinese(text):
        return translator(text)[0]['translation_text']
    else:
        return text

### Data cleaning & Preprocessing

In [10]:
stop_words = set(stopwords.words('english'))

df['text'] = df['text'].astype(str)  # Convert the column to string

def preprocessing(text):
    # Corrected spelling
    corrected_text = TextBlob(text).correct()

    # Translation
    translated_text = translate_text(str(corrected_text))

    # Lower case
    lowercase_text = translated_text.lower()

    # Tokenization
    tokenised_text = word_tokenize(lowercase_text)

    # Remove punctuation and stop words
    cleaned_text = [word for word in tokenised_text if word.isalpha() and word not in stop_words]

    return cleaned_text

df['tokens'] = df['text'].apply(preprocessing)

In [4]:
# Word Frequency Analysis
all_words = [word for tokens in df['tokens'] for word in tokens]
word_freq = Counter(all_words)

# N-gram Analysis
bigrams = ngrams(all_words, 2)
bigram_freq = Counter(bigrams)

# Tri-gram Analysis
trigrams = ngrams(all_words, 3)
trigram_freq = Counter(trigrams)

# Example: Display most common words and bigrams
print(word_freq.most_common(10))
print(bigram_freq.most_common(10))
print(trigram_freq.most_common(10))

[("'", 16712), (',', 7738), (' ', 7738), ('e', 6985), ('a', 3959), ('r', 3677), ('t', 3668), ('i', 3541), ('n', 3434), ('s', 3148)]
[(("'", ','), 7738), ((',', ' '), 7738), ((' ', "'"), 7738), (('e', "'"), 1720), (('t', "'"), 1139), (('d', "'"), 1069), (('e', 'r'), 994), (("'", 's'), 982), (('i', 'n'), 966), (('r', 'e'), 942)]
[(("'", ',', ' '), 7738), ((',', ' ', "'"), 7738), (('e', "'", ','), 1599), (('t', "'", ','), 1050), (('d', "'", ','), 984), ((' ', "'", 's'), 927), (('y', "'", ','), 697), (('s', "'", ','), 693), (("'", ']', '['), 617), ((']', '[', "'"), 617)]


### Summarisation

In [96]:
average_length = df['text'].str.len().mean()
min_length = df['text'].str.len().min()
max_length = df['text'].str.len().max()
print('minimum:', min_length, 
      '\naverage', average_length, 
      ' \nmaximum', max_length)

minimum: 147 
average 154.33333333333334  
maximum 159


In [97]:
summariser = pipeline("summarization", model="facebook/bart-large-cnn")

summarised_text = df['text'].apply(lambda x: summariser(x, max_length=round(len(x)/max_length_coef), min_length=round(len(x)/min_length_coef), do_sample=False))

df['summarised_text'] = summarised_text.apply(lambda x: x[0]['summary_text'])

Your max_length is set to 106, but your input_length is only 37. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
Your max_length is set to 105, but your input_length is only 41. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)
Your max_length is set to 103, but your input_length is only 38. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)
Your max_length is set to 106, but your input_length is only 56. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your

### Generation

#### With OpenAI

In [59]:
from langchain.chat_models import ChatOpenAI

api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')

def generate_review(review):
    template = "Can you correct the spelling mistakes in that review and make sure that its phrasing is correct.: {review}"

    prompt = PromptTemplate(template=template, input_variables=["review"])

    generation_llm = LLMChain(prompt=prompt,
                              llm=ChatOpenAI(model_name="gpt-3.5-turbo", # Essayer avec gpt-4 - Il faut un compte pro pour faire les requêtes
                                         temperature=0.3), 
                              verbose=True)

    generation = generation_llm.run(review)

    return generation

generate_review(df['text'][0])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mCan you correct the spelling mistakes in that review and make sure that its phrasing is correct.: Robyn gave amazing service! So attentive and friendly, and she knew her stuff and was very knowledgeable. Best server I've had in a while! And the steak frites[0m


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

#### With HuggingFace

In [63]:
from langchain import HuggingFaceHub

def generate_review(review):
    template = "Can you correct the spelling mistakes in that review and make sure that its phrasing is correct.: {review}"

    prompt = PromptTemplate(template=template, input_variables=["review"])

    llm_chain = LLMChain(prompt=prompt, 
                        llm=HuggingFaceHub(repo_id="google/flan-t5-xl",
                                           model_kwargs={"temperature":0.25}))

    generation = llm_chain.run(review)

    return generation

generate_review(df['text'][0])



ConnectionError: (ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: ab155e09-aaa5-47bb-91d8-89e654649a51)')

## Topic Modeling

### For our topic modeling, we'll use LDA (Latent Dirichlet Allocation)

### Lemmatisation

In [109]:
def lemmatisation(reviews, allowed_postags=["NOUN", "ADJ", "VERBS", "ADV"]):
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    reviews_out = []

    for review in reviews:
        doc = nlp(review) 
        reviews_out.append(" ".join([token.lemma_ for token in doc if token.pos_ in allowed_postags and token.lemma_ not in stopwords]))

    return reviews_out

lemmatised_text = lemmatisation(df['summarised_text'])

### Further formatting - We're removing the stopwords and outputting the reviews like keywords.

In [110]:
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return final

data_words = gen_words(lemmatised_text)

### On convertit nos mots en tuples. Ces tuples auront l'index du mot (son emplacement) et sa fréquence.

In [111]:
id2word = corpora.Dictionary(data_words)

corpus = [id2word.doc2bow(text) for text in data_words]

### The LDA Model

In [118]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

In [119]:
pyLDAvis.enable_notebook(local=True)

vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds', R=10)

pyLDAvis.display(vis)



## Embedding to Identify Similar Words / Model with pretrained-embedding

### Using word2vec

In [18]:
# Word2Vec Model based on the tokenised reviews

model_w2v = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=1, workers=4)
model_w2v.save("word2vec.model")

### Using GloVe

TO DO!

### Visualization of Embeddings

In [19]:
# Load your Word2Vec model
model_w2v = Word2Vec.load("word2vec.model")

# Prepare the embedding matrix
max_size = len(model_w2v.wv.index_to_key)
embedding_dim = model_w2v.vector_size
w2v = np.zeros((max_size, embedding_dim))

# Create a directory for TensorBoard logs if it doesn't exist
if not os.path.exists("runs"):
    os.makedirs("runs")

# Save the metadata (words)
with open("runs/metadata.tsv", "w+") as file_metadata:
    for i, word in enumerate(model_w2v.wv.index_to_key[:max_size]):
        w2v[i] = model_w2v.wv[word]
        file_metadata.write(word + '\n')

# Create a TensorFlow embedding variable
embedding_var = tf.Variable(w2v, trainable=False, name='embedding')

# Create a checkpoint from embedding
checkpoint = tf.train.Checkpoint(embedding=embedding_var)
checkpoint.save(os.path.join("runs", "embedding.ckpt"))

# Set up the projector config
config = projector.ProjectorConfig()
embedding_config = config.embeddings.add()
embedding_config.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding_config.metadata_path = 'metadata.tsv'

# Write the projector config file
projector.visualize_embeddings("runs", config)

# Start TensorBoard: tensorboard --logdir=runs.
# Access TensorBoard: http://localhost:6006

### Euclidian / Cosine distance

In [14]:
# Distance between two words
dist = euclidean(model_w2v.wv['french'], model_w2v.wv['good'])
print('distance', dist)

# Similarity between two words
similarity = 1 - cosine(model_w2v.wv['french'], model_w2v.wv['good'])
print('similarity:', similarity)

distance 0.08259755373001099
similarity: 0.7025662064552307


### Semantic search with an implementation of the cosine distance

In [24]:
def semantic_search(query, model, top_n=5):
    query_vector = model.wv[query]
    similarities = {}

    for word in model.wv.index_to_key:
        if word != query:
            similarities[word] = 1 - cosine(query_vector, model.wv[word])
    
    return sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:top_n]

# Example usage - We'll have to use this in our application for our user
results = semantic_search('service', model_w2v)
print(results)

[('experience', 0.7815940380096436), ('french', 0.7814168334007263), ('menu', 0.7722441554069519), ('restaurant', 0.7664408683776855), ('food', 0.7536170482635498)]


## Supervised Learning

### TF-IDF

Maybe use the n-grams here!

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

df = pd.read_csv('current_yelp_reviews.csv')

# Assuming 'cleaned_texts' is your preprocessed text data
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['tokens'])

# Example with a Naive Bayes classifier for a classification task
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, labels, test_size=0.2)
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate the model
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

# tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9)

NameError: name 'labels' is not defined

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

df = pd.read_csv('current_yelp_reviews.csv')

# Vectorization
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['tokens'])  # cleaned_texts is your preprocessed text data

# Splitting data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, labels, test_size=0.2)

# Training a classifier (e.g., Naive Bayes)
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Evaluation
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

NameError: name 'labels' is not defined

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))  # Assuming binary classification

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Train your model
# model.fit(...)

In [27]:
import tensorflow_hub as hub

# Load the pre-trained Universal Sentence Encoder model
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

df = pd.read_csv('current_yelp_reviews.csv')

# Embed a list of sentences
embeddings = embed(df['text'])

print(embeddings)

# Use these embeddings for further tasks

# Let's juste use BERT but still try USE

2024-01-16 00:23:44.385607: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-01-16 00:23:46.592932: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


tf.Tensor(
[[-0.05521629  0.00166136  0.01443822 ... -0.03156644 -0.03464142
  -0.00233329]
 [-0.01928866 -0.01288783  0.01626975 ... -0.04711485 -0.04085484
  -0.00710738]
 [ 0.01989324  0.01594443 -0.0640111  ...  0.03277976 -0.0025236
   0.02216191]
 ...
 [-0.00392908  0.07070225  0.07606924 ... -0.03801232  0.06217748
   0.03279895]
 [-0.05066155 -0.01717728  0.02526633 ... -0.06212594  0.00733587
   0.00824614]
 [ 0.02396129 -0.04169538  0.03485788 ... -0.03459991 -0.04811675
  -0.00109951]], shape=(618, 512), dtype=float32)


In [None]:
# Let's use gpt 2!