## 1. Preprocessing

1.1. Setup and data loading

In [2]:
import os
import re
from collections import defaultdict, Counter

import matplotlib.pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): `tqdm` and `WordCloud` are used further down but are not imported in this cell.

In [3]:
# Folder holding the Hyderabad CSV exports. Set the HYDERABAD_DATA_DIR environment
# variable to point somewhere else; when it is unset the original location is used.
DATA_DIR = os.environ.get(
    'HYDERABAD_DATA_DIR',
    'C:/Users/msard/OneDrive/Desktop/Data Science/Fall 2024/Text Mining/Hyderabadi-Word-Soup/data_hyderabad',
)

reviews_df = pd.read_csv(os.path.join(DATA_DIR, '10k_reviews.csv'))
restaurants_df = pd.read_csv(os.path.join(DATA_DIR, '105_restaurants.csv'))

#print(reviews_df.head())
#print(restaurants_df.head())

In [None]:
def cuisines_word_cloud_generator(folder_path,two_towers_df,wc,vectorisation="bow"):
    """Write one word-cloud PNG per row of ``two_towers_df`` into ``folder_path``.

    Parameters
    ----------
    folder_path : str
        Output directory; created (including parents) when it does not exist.
    two_towers_df : pandas.DataFrame
        Must provide a ``chapter_number`` column and, depending on
        ``vectorisation``, a ``chapter_bow_vector`` or ``chapter_tfidf_vector``
        column.
    wc : object
        Renderer exposing ``generate_from_frequencies`` and ``to_file``.
    vectorisation : str, default "bow"
        ``"bow"`` uses the bag-of-words column and word list; any other value
        uses the TF-IDF column and word list.

    Notes
    -----
    Relies on the notebook-level names ``word_freq_calculator``,
    ``ttt_bow_word_list`` and ``ttt_tfidf_word_list``. Rows at positional
    index 0-8 are labelled "Part 1" in the file name, later rows "Part 2".
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(folder_path, exist_ok=True)

    # Everything that only depends on the vectorisation is resolved once, outside the loop.
    if vectorisation == "bow":
        vector_column = "chapter_bow_vector"
        word_list = ttt_bow_word_list
        file_suffix = "BOW.png"
    else:
        vector_column = "chapter_tfidf_vector"
        word_list = ttt_tfidf_word_list
        file_suffix = "TFIDF.png"

    for idx in range(len(two_towers_df)):
        part_label = "Part 1" if idx < 9 else "Part 2"
        chapter_number = two_towers_df["chapter_number"].iloc[idx]
        file_name = "WC_{}_{}_{}".format(part_label, chapter_number, file_suffix)

        chapter_frequencies = word_freq_calculator(
            [two_towers_df[vector_column].iloc[idx]], word_list, df_output=False
        )
        wc.generate_from_frequencies(chapter_frequencies)
        wc.to_file(os.path.join(folder_path, file_name))

In [None]:
# Word-cloud renderer shared by the cells below: white background, at most 120 words,
# 220x220 canvas.
wc = WordCloud(
    background_color="white",
    max_words=120,
    width=220,
    height=220,
    # Ignores its arguments and always returns the triple (0, 0, 0).
    color_func=lambda *args, **kwargs: (0, 0, 0),
)

In [None]:
# Render the per-chapter word clouds twice: once from the bag-of-words vectors and once
# from the TF-IDF vectors. The generator defined in this notebook is named
# `cuisines_word_cloud_generator`.
cuisines_word_cloud_generator("two_towers_chapters",two_towers_df,wc,"bow")

cuisines_word_cloud_generator("two_towers_chapters",two_towers_df,wc,"tfidf")

1.2. Named Entity Recognition for dish names

In [None]:
def pos_tag_integrator(two_towers_df, df_output=False):
    """Map every lower-cased word to the POS tag it received most often.

    Parameters
    ----------
    two_towers_df : pandas.DataFrame
        Needs a ``chapter_pos`` column whose cells are iterables of
        ``(word, tag)`` pairs.
    df_output : bool, default False
        When equal to ``False`` a plain ``dict`` is returned; otherwise a
        DataFrame with the columns ``words`` and ``most_common_pos_tag``.

    Returns
    -------
    dict or pandas.DataFrame
        Words appear in order of first occurrence. When several tags are tied
        for a word, the tag seen first wins.
    """
    tag_tallies = defaultdict(Counter)

    # Tally the tags of each word across all chapters, ignoring the word's casing.
    for row_number in range(len(two_towers_df)):
        for tagged_token in two_towers_df["chapter_pos"].iloc[row_number]:
            tag_tallies[tagged_token[0].lower()][tagged_token[1]] += 1

    dominant_tag = {
        word: tally.most_common(1)[0][0] for word, tally in tag_tallies.items()
    }

    if df_output == False:  # noqa: E712 - equality test kept exactly as before
        return dominant_tag

    return pd.DataFrame(
        {"words": dominant_tag.keys(), "most_common_pos_tag": dominant_tag.values()}
    )

## 2. Co-Occurrence Matrix Analysis

In [None]:
# Split the book text into sentences with the notebook's sentence tokenizer.
ttt_sentences = sent_tokenizer.tokenize(two_towers_book)

# Keep each original sentence next to its preprocessed version
# (tokenized output requested, no lemmatization, no custom stopwords, no printing).
two_towers_sentences_df = pd.DataFrame(
    {
        "sentences": ttt_sentences,
        "preproc_sentences": [
            pipeline_v1c.main_pipeline(
                sentence,
                print_output=False,
                lemmatized=False,
                tokenized_output=True,
                custom_stopwords=[],
            )
            for sentence in ttt_sentences
        ],
    }
)


In [None]:
def cooccurrence_matrix_sentence_generator(preproc_sentences):
    """Build a sentence-level co-occurrence matrix.

    Two *different* tokens co-occur once for every ordered pair of positions
    they occupy in the same sentence, so a token repeated within a sentence
    contributes once per repetition. A token never co-occurs with itself.

    Parameters
    ----------
    preproc_sentences : iterable of sequences of str
        One sequence of tokens per sentence.

    Returns
    -------
    pandas.DataFrame
        Square integer matrix labelled by word on both axes. Rows and columns
        share one ordering: descending total co-occurrence count, with ties
        kept in order of first appearance in the input.
    """
    co_occurrences = defaultdict(Counter)
    # word -> matrix position, in order of first appearance. A dict (rather than a
    # set) keeps the vocabulary order identical from one kernel run to the next.
    word_index = {}

    for sentence in tqdm(preproc_sentences):
        for token in sentence:
            word_index.setdefault(token, len(word_index))
        for token_1 in sentence:
            for token_2 in sentence:
                if token_1 != token_2:
                    co_occurrences[token_1][token_2] += 1

    vocabulary = list(word_index)
    co_matrix = np.zeros((len(vocabulary), len(vocabulary)), dtype=int)

    for word, neighbors in co_occurrences.items():
        row = word_index[word]
        for neighbor, count in neighbors.items():
            co_matrix[row, word_index[neighbor]] = count

    co_matrix_df = pd.DataFrame(co_matrix, index=vocabulary, columns=vocabulary)

    # Rank once and apply the same ranking to both axes. Sorting each axis
    # separately with a non-stable sort can order tied words differently on
    # rows and columns, which breaks positional slices such as .iloc[:n, :n].
    ranked_words = co_matrix_df.sum().sort_values(ascending=False, kind="stable").index
    return co_matrix_df.loc[ranked_words, ranked_words]

In [None]:
def cooccurrence_matrix_window_generator(preproc_sentences, window_size):
    """Build a co-occurrence matrix from a sliding window inside each sentence.

    Each token co-occurs with every token at most ``window_size`` positions to
    its left or right in the same sentence. Only the identical *position* is
    skipped, so a word repeated inside the window does co-occur with itself.

    Parameters
    ----------
    preproc_sentences : iterable of sequences of str
        One indexable sequence of tokens per sentence.
    window_size : int
        Number of neighbouring positions considered on each side.

    Returns
    -------
    pandas.DataFrame
        Square integer matrix labelled by word on both axes. Rows and columns
        share one ordering: descending total co-occurrence count, with ties
        kept in order of first appearance in the input.
    """
    co_occurrences = defaultdict(Counter)
    # word -> matrix position, in order of first appearance. A dict (rather than a
    # set) keeps the vocabulary order identical from one kernel run to the next.
    word_index = {}

    for sentence in tqdm(preproc_sentences):
        sentence_length = len(sentence)
        for i, word in enumerate(sentence):
            word_index.setdefault(word, len(word_index))
            window_start = max(0, i - window_size)
            window_stop = min(sentence_length, i + window_size + 1)
            for j in range(window_start, window_stop):
                if i != j:
                    co_occurrences[word][sentence[j]] += 1

    vocabulary = list(word_index)
    co_matrix = np.zeros((len(vocabulary), len(vocabulary)), dtype=int)

    for word, neighbors in co_occurrences.items():
        row = word_index[word]
        for neighbor, count in neighbors.items():
            co_matrix[row, word_index[neighbor]] = count

    co_matrix_df = pd.DataFrame(co_matrix, index=vocabulary, columns=vocabulary)

    # Rank once and apply the same ranking to both axes. Sorting each axis
    # separately with a non-stable sort can order tied words differently on
    # rows and columns, which breaks positional slices such as .iloc[:n, :n].
    ranked_words = co_matrix_df.sum().sort_values(ascending=False, kind="stable").index
    return co_matrix_df.loc[ranked_words, ranked_words]

In [None]:
def cooccurrence_network_generator(cooccurrence_matrix_df, n_highest_words, output=None, layout_seed=42):
    """Draw the co-occurrence network of the first ``n_highest_words`` words.

    Parameters
    ----------
    cooccurrence_matrix_df : pandas.DataFrame
        Square co-occurrence matrix with the same word labels on rows and
        columns; the top-left ``n_highest_words`` x ``n_highest_words`` corner
        is plotted. NOTE(review): presumably symmetric and already sorted by
        frequency, as produced by the matrix generators in this notebook.
    n_highest_words : int
        Number of leading rows/columns to include.
    output : str or None, default None
        Pass ``"return"`` to get the matplotlib figure back.
    layout_seed : int or None, default 42
        Seed forwarded to ``nx.spring_layout`` so the node positions are
        reproducible between runs; pass ``None`` for a random layout.

    Returns
    -------
    matplotlib.figure.Figure or None
        The figure when ``output == "return"``, otherwise ``None``.
    """
    filtered_df = cooccurrence_matrix_df.iloc[:n_highest_words, :n_highest_words]
    words = list(filtered_df.columns)
    graph = nx.Graph()

    # Node size attribute: the word's column total inside the filtered matrix.
    for word in words:
        graph.add_node(word, size=filtered_df[word].sum())

    # The graph is undirected, so each unordered pair is visited once. The weight is
    # read from the (later word, earlier word) cell, which is the value that ended up
    # on the edge when both directions were written. Pairs that never co-occur get
    # no edge instead of a zero-weight one.
    for position, word1 in enumerate(words):
        for word2 in words[position + 1:]:
            weight = filtered_df.loc[word2, word1]
            if weight != 0:
                graph.add_edge(word1, word2, weight=weight)

    figure = plt.figure(figsize=(14, 12))

    # Generate positions for the nodes
    pos = nx.spring_layout(graph, k=0.5, seed=layout_seed)

    # Edge widths scale with the co-occurrence count
    edge_weights = [0.1 * graph[u][v]['weight'] for u, v in graph.edges()]

    # Node sizes scale with the stored size attribute
    node_sizes = [data['size'] * 2 for _, data in graph.nodes(data=True)]

    # Create the network graph
    nx.draw_networkx_nodes(graph, pos, node_color='skyblue', node_size=node_sizes)
    nx.draw_networkx_edges(graph, pos, edge_color='gray', width=edge_weights)
    nx.draw_networkx_labels(graph, pos, font_weight='bold', font_size=12)

    plt.show()

    if output=="return":
        return figure

## 3. Clustering

3.1. Dimensionality reduction

3.2. Clustering algorithm

## 4. Cluster labelling

## 5. Visualization