In [1]:
# Imports

import nltk
import requests
import time
import pandas as pd
import os
import re
import gensim
import gensim.corpora as corpora
import spacy
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import streamlit as st

from scipy.spatial.distance import euclidean
from scipy.spatial.distance import cosine
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from textblob import TextBlob
from collections import Counter
from transformers import pipeline
from langchain import PromptTemplate, LLMChain
from dotenv import find_dotenv, load_dotenv
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, Word2Vec
from sklearn.manifold import TSNE
from PIL import Image

# Load variables from a .env file into the process environment.
load_dotenv()

# ---------- Parameters ---------- #

# Length coefficients; they are not referenced in this part of the file.
# NOTE(review): presumably they scale the max/min output length of a later
# text-generation step -- confirm against the code that uses them.
max_length_coef = 1.5
min_length_coef = 2

# ---------- Functions ---------- #

# ---------- Loading the dataset ---------- #

# Keep only the first 15 rows of the CSV, then drop exact duplicate rows and
# any row missing a review text, a rating or a location.
df = (
    pd.read_csv('current_yelp_reviews.csv')
    .head(15)
    .drop_duplicates()
    .dropna(subset=['text', 'rating', 'location'])
)

# ---------- Preprocessing ---------- #

# Translation pipeline
# Hugging Face "translation" pipeline backed by the Helsinki-NLP/opus-mt-zh-en
# model; translate_text() calls it for reviews that contain Chinese characters.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")
# NLTK English stop words, stored as a set; tokenisation() filters lemmas
# against it.
stop_words = set(stopwords.words('english'))
# Force the review text to str so the string-based helpers below can be
# applied to every row.
df['text'] = df['text'].astype(str)

# Check if text contains Chinese characters
def contains_chinese(text):
    """Return True when *text* has at least one character in U+4E00..U+9FFF.

    Only that single code-point range is tested; an empty string yields False.
    """
    first_hit = re.search('[\u4e00-\u9fff]', text)
    return first_hit is not None

# Translation function (from Chinese to English)
def translate_text(text):
    """Translate *text* with the module-level ``translator`` when needed.

    Text for which ``contains_chinese`` is False is returned unchanged.
    Otherwise the whole string is sent to the translation pipeline and the
    ``'translation_text'`` field of its first result is returned.
    """
    # Guard clause: nothing to translate.
    if not contains_chinese(text):
        return text

    results = translator(text)
    return results[0]['translation_text']
    
# Lemmatisation & Tokenisation function
# Cache of loaded spaCy pipelines, keyed by model name, so that the model is
# loaded once instead of on every tokenisation() call.
_SPACY_MODEL_CACHE = {}


def _get_spacy_model(name='en_core_web_sm'):
    # Load the spaCy pipeline (parser and NER disabled) on first use only.
    if name not in _SPACY_MODEL_CACHE:
        _SPACY_MODEL_CACHE[name] = spacy.load(name, disable=['parser', 'ner'])
    return _SPACY_MODEL_CACHE[name]


def tokenisation(reviews, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatise and tokenise a collection of reviews.

    Each review is run through spaCy; lemmas whose coarse part-of-speech tag
    is in *allowed_postags* and which are not in the module-level
    ``stop_words`` set are kept, joined with spaces, and split into tokens
    by ``gensim.utils.simple_preprocess``.

    Args:
        reviews: An iterable of review strings. A single string is treated
            as one review rather than being iterated character by character.
        allowed_postags: Coarse POS tags (``token.pos_`` values) to keep.
            The default uses ``"VERB"``; the previous ``"VERBS"`` is not a
            tag spaCy emits, so verbs were silently discarded.

    Returns:
        A list with one list of token strings per review, in input order.
    """
    # Iterating a bare string would feed spaCy one character at a time.
    if isinstance(reviews, str):
        reviews = [reviews]

    nlp = _get_spacy_model()

    tokens = []
    for review in reviews:
        doc = nlp(review)
        kept_lemmas = " ".join(
            token.lemma_
            for token in doc
            if token.pos_ in allowed_postags and token.lemma_ not in stop_words
        )
        tokens.append(gensim.utils.simple_preprocess(kept_lemmas, deacc=False))

    return tokens

# Preprocessing function
def preprocessing(text):
    """Return *text* spell-corrected by TextBlob and then translated.

    The spelling correction runs first; its string result is handed to
    ``translate_text``, whose return value is the cleaned text.
    """
    spell_checked = TextBlob(text).correct()
    return translate_text(str(spell_checked))

def tokenised_text(text):
    """Lower-case one review and run it through ``tokenisation``.

    Args:
        text: A single review string.

    Returns:
        The result of ``tokenisation`` for this one review: a list holding
        a single list of tokens. The nested shape is kept so the return type
        stays a list of token lists, as before.
    """
    # Lower case
    lower_text = text.lower()

    # Lemmatization & Tokenisation
    # tokenisation() iterates over its argument, so a bare string would be
    # processed one character at a time; wrap the review in a list so it is
    # handled as a single document.
    tokens = tokenisation([lower_text])

    return tokens

# Apply preprocessing and tokenisation
# 'cleaned_text' holds the result of preprocessing() for each review text;
# 'tokens' holds the result of tokenised_text() applied to that cleaned text.
df['cleaned_text'] = df['text'].apply(preprocessing)
df['tokens'] = df['cleaned_text'].apply(tokenised_text)

/Users/alexandrecogordan/miniconda3/envs/tensorflow/lib/python3.10/site-packages/huggingface_hub/inference/_text_generation.py:121: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  @validator("best_of")
/Users/alexandrecogordan/miniconda3/envs/tensorflow/lib/python3.10/site-packages/huggingface_hub/inference/_text_generation.py:140: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  @validator("repetition_penalty")
/Users/alexandrecogorda

                                                 text  rating     location  \
0   Robyn gave amazing service! So attentive and f...       5  Los Angeles   
1   Headed downtown on a Thursday evening for a Ki...       5  Los Angeles   
2   Been here a few times, in just recent weeks. T...       4  Los Angeles   
3   Service is fast. Staff is friendly. The food i...       5  Los Angeles   
4   Walked by and asked to see a menu. Very helpfu...       3  Los Angeles   
5   My husband and I had a fabulous dining experie...       5  Los Angeles   
6   Morgen was literally amazing, top tier food, t...       5  Los Angeles   
7   ADKT in West Hollywood is an absolute gem that...       5  Los Angeles   
8   I enjoyed our time at ADKT and the food was de...       3  Los Angeles   
9   This a charming family run restaurant. The own...       5  Los Angeles   
10  Ahhh... we left with our hearts and bellies so...       5  Los Angeles   
11  A Food Affair has a nice charm and the service...       4  L