In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import pandas as pd
from tqdm import tqdm

In [2]:
class BM25(object):
    def __init__(self, b=0.75, k1=1.6):
        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """ Fit IDF to documents X """
        self.vectorizer.fit(X)
        y = super(TfidfVectorizer, self.vectorizer).transform(X)
        self.avdl = y.sum(1).mean()

    def transform(self, q, X):
        """ Calculate BM25 between query q and documents X """
        b, k1, avdl = self.b, self.k1, self.avdl

        # apply CountVectorizer
        X = super(TfidfVectorizer, self.vectorizer).transform(X)
        len_X = X.sum(1).A1
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing
        X = X.tocsc()[:, q.indices]
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it need to be coneverted
        # to idf(t) = log [ n / df(t) ] with minus 1
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)                                                          
        return (numer / denom).sum(1).A1

In [3]:
splits = {'train': 'yelp_review_full/train-00000-of-00001.parquet', 'test': 'yelp_review_full/test-00000-of-00001.parquet'}
df_train = pd.read_parquet("hf://datasets/Yelp/yelp_review_full/" + splits["train"])
df_test = pd.read_parquet("hf://datasets/Yelp/yelp_review_full/" + splits["test"])

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
df_train.head()

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


In [5]:
print("Train set size: ", df_train.shape[0])
print("Test set size: ", df_test.shape[0])

Train set size:  650000
Test set size:  50000


In [6]:
# df_train_sub = df_train.iloc[:1000]
df_train_sub = df_train.sample(n=1000, random_state=43).reset_index()
df_train_sub.head()

Unnamed: 0,index,label,text
0,623727,0,If you are looking for a place where you are t...
1,82720,2,"Grab a coffee, and window shop. People watch. ..."
2,545814,0,I called to get a quote for my dental procedur...
3,304799,2,This place is fine for a quick stop on your wa...
4,220726,2,After remembering coming to Archi's and having...


In [7]:
bm25 = BM25()
bm25.fit(df_train_sub["text"][1:])
indices = bm25.transform(df_train_sub["text"].iloc[0], df_train_sub["text"]).argsort()[-11:][::-1]

In [15]:
bm25 = BM25()
bm25.fit(df_train_sub["text"][1:])
indices = bm25.transform("Food is great just wish it was bigger and you didnt have to call to make a reservation", df_train_sub["text"]).argsort()[-11:][::-1]

In [8]:
# print(df_train_sub["text"][1:][:11])
print(len(df_train_sub["text"][1:]))
print(len(bm25.transform(df_train_sub["text"][0], df_train_sub["text"])))

999
1000


In [18]:
# print(df_train_sub["text"][0])
print("Food is great just wish it was bigger and you didnt have to call to make a reservation")
print(indices)
for index in indices[1:4]:
    print()
    print(index, df_train_sub["text"][index])

Food is great just wish it was bigger and you didnt have to call to make a reservation
[  5 732  51 842 134 596 763 851 712 563 944]

732 works like a strip bar...... we were greeted to sit down on a table of five, after we ordered food we were rudely asked to switch tables, we didnt mind so we asked to be seated in a bigger table since our options were to move to a much smaller table.\n\nThe owner came over and was worse than his own staff, and demanded we move now since a big shot is coming...\n\nAgain we said we didnt mind we just needed a bigger table lol... The owner then aggressively told me this table was a mistake by our staff for you to sit here, this was one hour later of us sitting. He then snapped hookah from me and asked me and my group to get the f**k out.\n\nSo we did politely without an issue, the arrogance of this place left a bad taste in my mouth.\n\nBasically if you dont know the owner, dont go! or else someone who knows the owner will be taken care of instead of yo

In [10]:
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns



tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df_train_sub['text'])
normalized_tfidf = normalize(tfidf_matrix)


In [11]:
n_clusters = 7  # You can adjust this number
kmeans = KMeans(n_clusters=n_clusters, random_state=43, n_init=10)
cluster_labels = kmeans.fit_predict(normalized_tfidf)

In [12]:
# df_pca = pd.DataFrame(df_train_sb, columns=['Cluster'])
df_train_sub["Cluster"] = cluster_labels
# df_pca['Cluster'] = cluster_labels
df_train_sub.head()


Unnamed: 0,index,label,text,Cluster
0,623727,0,If you are looking for a place where you are t...,4
1,82720,2,"Grab a coffee, and window shop. People watch. ...",2
2,545814,0,I called to get a quote for my dental procedur...,0
3,304799,2,This place is fine for a quick stop on your wa...,2
4,220726,2,After remembering coming to Archi's and having...,4


In [13]:
cluster_bm25 = BM25()

In [14]:
for cluster in range(n_clusters):
    sub_df = df_train_sub[df_train_sub['Cluster'] == cluster]
    cluster_bm25.fit(df_train_sub["text"][1:])
    indices = cluster_bm25.transform(sub_df["text"].iloc[0], sub_df["text"]).argsort()[-4:][::-1]
    print("Cluster:", cluster)
    print("Query:", sub_df["text"].iloc[0])

    for index in indices[1:]:
        print(index, df_train_sub["text"][index], "\n")

Cluster: 0
Query: I called to get a quote for my dental procedures that I need to be done since I already went to my regular dentist here in California and I know what I need to be done and I just want to compare the prices of my out of pocket cost. She replied and told me to call my insurance and they'll be able to help me. Meanwhile I informed her that I actually called other dental office around that area and ask the same thing and they're able to give me an estimate cost. And her answer to that is \"that's fine and you have a good day\". No effort of any kind of customer service. I guess they don't need more patients.
11 Tore this place down after a Cosmopolitan Pool Party!\n\nMet some nice ladies and went in on everything on the menu.\n\nDropped about $60 on grub, but had to find seating in another area.\n\nThe food was good and definitely, hit the spot after a day of fun in the sun at the pool.\n\nWould definitely try again and hopefully, they can get the line situation sorted ou