Objective: What dishes are mentioned together in the reviews? Do they form clusters? Can you identify cuisine types based on those clusters? 

In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import networkx as nx
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import re
import os

from wordcloud import WordCloud

import networkx as nx
from matplotlib.colors import ListedColormap
import plotly.express as px
import plotly.graph_objects as go
import circlify as circ

import nltk
nltk.download('averaged_perceptron_tagger_eng')
from nltk.tokenize.treebank import TreebankWordDetokenizer
from unidecode import unidecode

sns.set_context(font_scale=1.2, context='paper')

import pycirclize

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\msard\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [3]:
reviews = pd.read_csv('C:/Users/msard/OneDrive/Desktop/Data Science/Fall 2024/Text Mining/Hyderabadi-Word-Soup/data_hyderabad/10k_reviews.csv')
restaurants = pd.read_csv('C:/Users/msard/OneDrive/Desktop/Data Science/Fall 2024/Text Mining/Hyderabadi-Word-Soup/data_hyderabad/105_restaurants.csv')

#print(reviews.info())
#print(restaurants.info())

Join the needed columns

In [4]:
# Drops unnecessary columns and null rows
reviews.drop(['Reviewer', 'Metadata', 'Time', 'Pictures', 'Rating'], axis=1, inplace=True)
reviews.dropna(subset=['Review'], inplace=True)

restaurants.drop(['Links', 'Cost', 'Collections', 'Timings'], axis=1, inplace=True)
restaurants.dropna(subset=['Name', 'Cuisines'], inplace=True)

# Merges the cuisines column with the reviews
reviews = reviews.merge(restaurants[['Name', 'Cuisines']], 
                                      left_on='Restaurant', right_on='Name', 
                                      how='left').drop(columns=['Name'])

reviews.drop(['Restaurant'], axis=1, inplace=True)

reviews.head()

Unnamed: 0,Review,Cuisines
0,"The ambience was good, food was quite good . h...","Chinese, Continental, Kebab, European, South I..."
1,Ambience is too good for a pleasant evening. S...,"Chinese, Continental, Kebab, European, South I..."
2,A must try.. great food great ambience. Thnx f...,"Chinese, Continental, Kebab, European, South I..."
3,Soumen das and Arun was a great guy. Only beca...,"Chinese, Continental, Kebab, European, South I..."
4,Food is good.we ordered Kodi drumsticks and ba...,"Chinese, Continental, Kebab, European, South I..."


In [5]:
# Split Cuisines into lists
reviews['Cuisines'] = reviews['Cuisines'].str.split(', ')

# Step 1: Find all unique cuisines
unique_cuisines = sorted(set(cuisine for sublist in reviews['Cuisines'] for cuisine in sublist))

# Step 2: Create a binary vector for each row
def create_binary_vector(cuisines, all_cuisines):
    return [1 if cuisine in cuisines else 0 for cuisine in all_cuisines]

# Step 3: Apply the function to each row and store it in the 'Cuisines' column
reviews['Cuisines'] = reviews['Cuisines'].apply(lambda x: create_binary_vector(x, unique_cuisines))

reviews.head()

Unnamed: 0,Review,Cuisines
0,"The ambience was good, food was quite good . h...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ..."
1,Ambience is too good for a pleasant evening. S...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ..."
2,A must try.. great food great ambience. Thnx f...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ..."
3,Soumen das and Arun was a great guy. Only beca...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ..."
4,Food is good.we ordered Kodi drumsticks and ba...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ..."


In [14]:
def regex_cleaner(raw_text, 
            no_emojis = True, 
            no_hashtags = True,
            hashtag_retain_words = True,
            no_newlines = True,
            no_urls = True,
            no_punctuation = True):
    
    #patterns
    newline_pattern = "(\\n)"
    hashtags_at_pattern = "([#\@@\u0040\uFF20\uFE6B])"
    hashtags_ats_and_word_pattern = "([#@]\w+)"
    emojis_pattern = "([\u2600-\u27FF])"
    url_pattern = "(?:\w+:\/{2})?(?:www)?(?:\.)?([a-z\d]+)(?:\.)([a-z\d\.]{2,})(\/[a-zA-Z\/\d]+)?" ##Note that this URL pattern is *even better*
    punctuation_pattern = "[\u0021-\u0026\u0028-\u002C\u002E-\u002F\u003A-\u003F\u005B-\u005F\u2010-\u2028\ufeff`]+"
    apostrophe_pattern = "'(?=[A-Z\s])|(?<=[a-z\.\?\!\,\s])'"
    separated_words_pattern = "(?<=\w\s)([A-Z]\s){2,}"
    ##note that this punctuation_pattern doesn't capture ' this time to allow our tokenizer to separate "don't" into ["do", "n't"]
    
    if no_emojis == True:
        clean_text = re.sub(emojis_pattern,"",raw_text)
    else:
        clean_text = raw_text

    if no_hashtags == True:
        if hashtag_retain_words == True:
            clean_text = re.sub(hashtags_at_pattern,"",clean_text)
        else:
            clean_text = re.sub(hashtags_ats_and_word_pattern,"",clean_text)
        
    if no_newlines == True:
        clean_text = re.sub(newline_pattern," ",clean_text)

    if no_urls == True:
        clean_text = re.sub(url_pattern,"",clean_text)
    
    if no_punctuation == True:
        clean_text = re.sub(punctuation_pattern,"",clean_text)
        clean_text = re.sub(apostrophe_pattern,"",clean_text)

    return clean_text

def lemmatize_all(token, list_pos=["n","v","a","r","s"]):
    
    wordnet_lem = nltk.stem.WordNetLemmatizer()
    for arg_1 in list_pos:
        token = wordnet_lem.lemmatize(token, arg_1)
    return token

def main_pipeline(raw_text, 
                  print_output = True, 
                  no_stopwords = True,
                  custom_stopwords = [],
                  convert_diacritics = True, 
                  lowercase = True, 
                  lemmatized = True,
                  list_pos = ["n","v","a","r","s"],
                  stemmed = False, 
                  pos_tags_list = "no_pos",
                  tokenized_output = False,
                  **kwargs):
    
    """Preprocess strings according to the parameters"""

    clean_text = regex_cleaner(raw_text, **kwargs)
    tokenized_text = nltk.tokenize.word_tokenize(clean_text)

    tokenized_text = [re.sub("'m","am",token) for token in tokenized_text]
    tokenized_text = [re.sub("n't","not",token) for token in tokenized_text]
    tokenized_text = [re.sub("'s","is",token) for token in tokenized_text]

    if no_stopwords == True:
        stopwords = nltk.corpus.stopwords.words("english")
        tokenized_text = [item for item in tokenized_text if item.lower() not in stopwords]
    
    if convert_diacritics == True:
        tokenized_text = [unidecode(token) for token in tokenized_text]

    if lemmatized == True:
        tokenized_text = [lemmatize_all(token, list_pos=list_pos) for token in tokenized_text]
    
    if stemmed == True:
        porterstemmer = nltk.stem.PorterStemmer()
        tokenized_text = [porterstemmer.stem(token) for token in tokenized_text]
 
    if no_stopwords == True:
        tokenized_text = [item for item in tokenized_text if item.lower() not in custom_stopwords]

    if pos_tags_list == "pos_list" or pos_tags_list == "pos_tuples" or pos_tags_list == "pos_dictionary":
        pos_tuples = nltk.tag.pos_tag(tokenized_text)
        pos_tags = [pos[1] for pos in pos_tuples]
    
    if lowercase == True:
        tokenized_text = [item.lower() for item in tokenized_text]

    if print_output == True:
        print(raw_text)
        print(tokenized_text)
    
    if pos_tags_list == "pos_list":
        return (tokenized_text, pos_tags)
    elif pos_tags_list == "pos_tuples":
        return pos_tuples   
    
    else:
        if tokenized_output == True:
            return tokenized_text
        else:
            detokenizer = TreebankWordDetokenizer()
            detokens = detokenizer.detokenize(tokenized_text)
            return str(detokens)

In [18]:
reviews["Preproc_Review"] =\
      reviews["Review"].apply(lambda content :\
                                                  main_pipeline(content, 
                                                                  print_output=False,
                                                                  lemmatized=False,
                                                                  lowercase=True,
                                                                        tokenized_output=False),
                                                                  )

reviews["Review_PoS"] =\
      reviews["Review"].apply(lambda content :\
                                                  main_pipeline(content, 
                                                                  print_output=False,
                                                                  lemmatized=False,
                                                                  lowercase=False,
                                                                  pos_tags_list="pos_tuples"),
                                                                  )
reviews.head()

Unnamed: 0,Review,Cuisines,Preproc_Review,Review_PoS
0,"The ambience was good, food was quite good . h...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",ambience good food quite good saturday lunch c...,"[(ambience, RB), (good, JJ), (food, NN), (quit..."
1,Ambience is too good for a pleasant evening. S...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",ambience good pleasant evening service prompt ...,"[(Ambience, RB), (good, JJ), (pleasant, NN), (..."
2,A must try.. great food great ambience. Thnx f...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",must try great food great ambience thnx servic...,"[(must, MD), (try, VB), (great, JJ), (food, NN..."
3,Soumen das and Arun was a great guy. Only beca...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",soumen das arun great guy behavior sincerety g...,"[(Soumen, NNP), (das, NNS), (Arun, NNP), (grea..."
4,Food is good.we ordered Kodi drumsticks and ba...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",food ordered kodi drumsticks basket mutton bir...,"[(Food, NN), (ordered, VBD), (Kodi, NNP), (dru..."


In [19]:
bow_vectorizer = CountVectorizer(ngram_range=(1,2), token_pattern=r"(?u)\b\w+\b")
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), token_pattern=r"(?u)\b\w+\b")

In [20]:
reviews_bow_td_matrix = bow_vectorizer.fit_transform(reviews["Preproc_Review"]).toarray()
reviews["Review_bow_vector"] = reviews_bow_td_matrix.tolist()
reviews_bow_word_list = bow_vectorizer.get_feature_names_out()
reviews.head()

Unnamed: 0,Review,Cuisines,Preproc_Review,Review_PoS,Review_bow_vector
0,"The ambience was good, food was quite good . h...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",ambience good food quite good saturday lunch c...,"[(ambience, RB), (good, JJ), (food, NN), (quit...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Ambience is too good for a pleasant evening. S...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",ambience good pleasant evening service prompt ...,"[(Ambience, RB), (good, JJ), (pleasant, NN), (...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,A must try.. great food great ambience. Thnx f...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",must try great food great ambience thnx servic...,"[(must, MD), (try, VB), (great, JJ), (food, NN...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Soumen das and Arun was a great guy. Only beca...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",soumen das arun great guy behavior sincerety g...,"[(Soumen, NNP), (das, NNS), (Arun, NNP), (grea...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Food is good.we ordered Kodi drumsticks and ba...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",food ordered kodi drumsticks basket mutton bir...,"[(Food, NN), (ordered, VBD), (Kodi, NNP), (dru...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [22]:
reviews_tfidf_td_matrix = tfidf_vectorizer.fit_transform(reviews["Preproc_Review"])
reviews["Review_tfidf_vector"] = [row for row in reviews_tfidf_td_matrix]
reviews_tfidf_word_list = tfidf_vectorizer.get_feature_names_out()

reviews.head()

Unnamed: 0,Review,Cuisines,Preproc_Review,Review_PoS,Review_bow_vector,Review_tfidf_vector
0,"The ambience was good, food was quite good . h...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",ambience good food quite good saturday lunch c...,"[(ambience, RB), (good, JJ), (food, NN), (quit...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","(0, 8031)\t0.0567096946353229\n (0, 67501)\..."
1,Ambience is too good for a pleasant evening. S...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",ambience good pleasant evening service prompt ...,"[(Ambience, RB), (good, JJ), (pleasant, NN), (...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","(0, 8031)\t0.08084883167553647\n (0, 67501)..."
2,A must try.. great food great ambience. Thnx f...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",must try great food great ambience thnx servic...,"[(must, MD), (try, VB), (great, JJ), (food, NN...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","(0, 8031)\t0.06303637505472821\n (0, 59669)..."
3,Soumen das and Arun was a great guy. Only beca...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",soumen das arun great guy behavior sincerety g...,"[(Soumen, NNP), (das, NNS), (Arun, NNP), (grea...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","(0, 67501)\t0.0488960604396863\n (0, 59669)..."
4,Food is good.we ordered Kodi drumsticks and ba...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",food ordered kodi drumsticks basket mutton bir...,"[(Food, NN), (ordered, VBD), (Kodi, NNP), (dru...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","(0, 8031)\t0.07412009791971721\n (0, 67501)..."


In [23]:
def word_freq_calculator(td_matrix, word_list, df_output=True):
    word_counts = np.sum(td_matrix, axis=0).tolist()
    if df_output == False:
        word_counts_dict = dict(zip(word_list, word_counts))
        return word_counts_dict
    else:
        word_counts_df = pd.DataFrame({"words":word_list, "frequency":word_counts})
        word_counts_df = word_counts_df.sort_values(by=["frequency"], ascending=False)
        return word_counts_df
    
def plot_term_frequency(df, nr_terms, df_name, show=True):
    plt.figure(figsize=(10, 8))
    sns_plot = sns.barplot(x='frequency', y='words', data=df.head(nr_terms))  # Plotting top 20 terms for better visualization
    plt.title('Top 20 Term Frequencies of {}'.format(df_name))
    plt.xlabel('Frequency')
    plt.ylabel('Words')
    if show==True:
        plt.show()

    fig = sns_plot.get_figure()
    plt.close()

    return fig

In [None]:
reviews['Cuisines'].

0       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...
1       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...
2       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...
3       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...
4       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...
                              ...                        
9950    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
9951    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
9952    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
9953    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
9954    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
Name: Cuisines, Length: 9955, dtype: object

In [None]:
# Folder for saving review visualizations
folder_path = 'C:\Users\msard\OneDrive\Desktop\Data Science\Fall 2024\Text Mining\Hyderabadi-Word-Soup\coocurrence_clustering\visualizations'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Create directory for the visualizations
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Iterate through each cuisine type
for cuisine_idx, cuisine_name in enumerate(reviews['Cuisines'].unique().tolist()):
    
    # Filter reviews for this cuisine
    cuisine_reviews = reviews[reviews['Cuisines'].apply(lambda x: x[cuisine_idx] == 1)]
    
    # Aggregate the BoW vectors for the cuisine (summing over all reviews for this cuisine)
    if len(cuisine_reviews) > 0:
        cuisine_bow_vector_sum = np.sum(np.array(cuisine_reviews['Review_bow_vector'].tolist()), axis=0)
        cuisine_bow_df = word_freq_calculator([cuisine_bow_vector_sum], reviews_bow_word_list)
        
        # Plot and save the BoW visualization
        cuisine_bow_plot = plot_term_frequency(
            cuisine_bow_df, 20,
            f"{cuisine_name} Cuisine - 20 Most Common Words (BoW)",
            show=False
        )
        cuisine_bow_plot.savefig(os.path.join(folder_path, f"{cuisine_name}_BOW.png"))

        # Aggregate the TF-IDF vectors for the cuisine (summing over all reviews for this cuisine)
        cuisine_tfidf_vector_sum = np.sum(np.array(cuisine_reviews['Review_tfidf_vector'].tolist()), axis=0)
        cuisine_tfidf_df = word_freq_calculator([cuisine_tfidf_vector_sum], reviews_tfidf_word_list)
        
        # Plot and save the TF-IDF visualization
        cuisine_tfidf_plot = plot_term_frequency(
            cuisine_tfidf_df, 20,
            f"{cuisine_name} Cuisine - 20 Most Relevant Words (TF-IDF)",
            show=False
        )
        cuisine_tfidf_plot.savefig(os.path.join(folder_path, f"{cuisine_name}_TFIDF.png"))

# Display completion message
print(f"Saved BoW and TF-IDF visualizations for each cuisine type in '{folder_path}' folder.")
