In [37]:
# Imports

import nltk
import requests
import time
import pandas as pd
import os
import re
import gensim
import gensim.corpora as corpora
import spacy
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import streamlit as st

from scipy.spatial.distance import euclidean
from scipy.spatial.distance import cosine
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from textblob import TextBlob
from collections import Counter
from transformers import pipeline
from langchain import PromptTemplate, LLMChain
from dotenv import find_dotenv, load_dotenv
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, Word2Vec
from sklearn.manifold import TSNE
from PIL import Image

load_dotenv()

# ---------- Parameters ---------- #

max_length_coef = 1.5
min_length_coef = 2

# ---------- Functions ---------- #

# ---------- Loading the dataset ---------- #

df = pd.read_csv('current_yelp_reviews.csv')

df.drop_duplicates(inplace=True)
df.dropna(subset=['text', 'rating', 'location'], inplace=True)

# ---------- Preprocessing ---------- #

# Translation pipeline
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].astype(str)

# Check if text contains Chinese characters
def contains_chinese(text):
    return bool(re.search('[\u4e00-\u9fff]', text))

# Translation function (from Chinese to English)
def translate_text(text):
    if contains_chinese(text):
        return translator(text)[0]['translation_text']
    else:
        return text

In [38]:
# Lemmatisation & Tokenisation function
def tokenisation(reviews, allowed_postags=["NOUN", "ADJ", "VERBS", "ADV"]):
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    reviews_out = []
    tokens = []

    for review in reviews:
        doc = nlp(review) 
        reviews_out.append(" ".join([token.lemma_ for token in doc if token.pos_ in allowed_postags and token.lemma_ not in stop_words]))
    
    for text in reviews_out:
        new = gensim.utils.simple_preprocess(text, deacc=False) # We do not remove the accent marks because we deem them important for French restaurants reviews
        tokens.append(new)

    return tokens

# Preprocessing function
def preprocessing(text):
    # Corrected spelling on lower case text
    corrected_text = str(TextBlob(text.lower()).correct())

    # Translation
    cleaned_text = translate_text(str(corrected_text))

    return cleaned_text

# Apply preprocessing and tokenisation
df['cleaned_text'] = df['text'].apply(preprocessing)
df['tokens'] = tokenisation(df['text'])

In [39]:
# Summarisation
summariser = pipeline("summarization", model="facebook/bart-large-cnn")
summarised_text = df['text'].apply(lambda x: summariser(x, max_length=round(len(x)/max_length_coef), min_length=round(len(x)/min_length_coef), do_sample=False))
df['summarised_text'] = summarised_text.apply(lambda x: x[0]['summary_text'])

Your max_length is set to 106, but your input_length is only 37. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
Your max_length is set to 105, but your input_length is only 41. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)
Your max_length is set to 103, but your input_length is only 38. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)
Your max_length is set to 106, but your input_length is only 56. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your

In [None]:
df

Unnamed: 0,text,rating,location,tokens,cleaned_text
0,Robyn gave amazing service! So attentive and f...,5,Los Angeles,"[amazing, service, attentive, friendly, stuff,...",robin gave amazing service! so attentive and f...
1,Headed downtown on a Thursday evening for a Ki...,5,Los Angeles,"[downtown, evening, king, game, time, dinner, ...",headed downtown on a thursday evening for a ki...
2,"Been here a few times, in just recent weeks. T...",4,Los Angeles,"[time, recent, week, visit, rooftop, bar, time...","been here a few times, in just recent weeks. t..."
3,Service is fast. Staff is friendly. The food i...,5,Los Angeles,"[service, fast, staff, friendly, food, whole, ...",service is fast. staff is friendly. the food i...
4,Walked by and asked to see a menu. Very helpfu...,3,Los Angeles,"[menu, helpful, staff, concept, try, ambience,...",walked by and asked to see a menu. very helpfu...
5,My husband and I had a fabulous dining experie...,5,Los Angeles,"[husband, fabulous, dining, experience, vibe, ...",my husband and i had a fabulous dining experie...
6,"Morgen was literally amazing, top tier food, t...",5,Los Angeles,"[literally, amazing, top, tier, food, top, tie...","morgen was literally amazing, top tier food, t..."
7,ADKT in West Hollywood is an absolute gem that...,5,Los Angeles,"[adkt, absolute, gem, beautifully, essence, pa...",at in west hollywood is an absolute gem that b...
8,I enjoyed our time at ADKT and the food was de...,3,Los Angeles,"[time, food, decent, thing, dark, personally, ...",i enjoyed our time at at and the food was dece...
9,This a charming family run restaurant. The own...,5,Los Angeles,"[charming, family, restaurant, owner, also, ch...",this a charming family run restaurant. the own...


In [None]:
df.to_csv('yelp_reviews.csv', index=False)

In [35]:
# ---------- Highlighting frequent words ---------- #

# Word Frequency Analysis
all_words = [word for tokens in df['tokens'] for word in tokens]
word_freq = Counter(all_words)

# N-gram Analysis
bigrams = ngrams(all_words, 2)
bigram_freq = Counter(bigrams)

# Tri-gram Analysis
trigrams = ngrams(all_words, 3)
trigram_freq = Counter(trigrams)

# ---------- Topic Modelling ---------- #

# We convert the tokens into tuples where we'll have the word index (its placement on the map) and its frequency
id2word = corpora.Dictionary(df['tokens'])
corpus = [id2word.doc2bow(text) for text in df['tokens']]

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

pyLDAvis.enable_notebook(local=True)
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds', R=10)
pyLDAvis.display(vis)



In [36]:
import streamlit as st

def display_topics(model, num_topics):
    for i in range(num_topics):
        words = model.show_topic(i)
        st.write(f"Topic {i+1}:")
        st.write(", ".join([word[0] for word in words]))

st.title("Restaurant Review Analysis")

st.header("Review Topics")
display_topics(lda_model, 10)

2024-01-17 00:39:43.685 
  command:

    streamlit run /Users/alexandrecogordan/miniconda3/envs/tensorflow/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]


A faire: Faire choisir a l'utilisateur un ou des topics qu'il aimerait aborder. (S'il est interesse par le food quality par exemple on lui recommandera un restaurant avec une tres bonne food quality - peux-t-on recuperer les restaurants qui correspondent aux avis?).

Montrer aussi les topics principaux les plus importants pour les resturants francais de chaque ville

Later on, maybe use these topics to enhance the importance of them in summarised reviews.

What I suggest is this: Topics principaux par villes. On demande a l'utilisateur de choisir ce qu'il prefere a travers un chatbot et ensuite on trouve les restaurants avec les meilleurs topics et de ce resturant on montre aussi ses meilleurs atouts avec les topics.