In [6]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import pandas as pd
from tqdm import tqdm

In [7]:
class BM25(object):
    def __init__(self, b=0.75, k1=1.6):
        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """ Fit IDF to documents X """
        self.vectorizer.fit(X)
        y = super(TfidfVectorizer, self.vectorizer).transform(X)
        self.avdl = y.sum(1).mean()

    def transform(self, q, X):
        """ Calculate BM25 between query q and documents X """
        b, k1, avdl = self.b, self.k1, self.avdl

        # apply CountVectorizer
        X = super(TfidfVectorizer, self.vectorizer).transform(X)
        len_X = X.sum(1).A1
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing
        X = X.tocsc()[:, q.indices]
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it need to be coneverted
        # to idf(t) = log [ n / df(t) ] with minus 1
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)                                                          
        return (numer / denom).sum(1).A1

In [8]:
splits = {'train': 'yelp_review_full/train-00000-of-00001.parquet', 'test': 'yelp_review_full/test-00000-of-00001.parquet'}
df_train = pd.read_parquet("hf://datasets/Yelp/yelp_review_full/" + splits["train"])
df_test = pd.read_parquet("hf://datasets/Yelp/yelp_review_full/" + splits["test"])

In [9]:
df_train.head()

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


In [10]:
print("Train set size: ", df_train.shape[0])
print("Test set size: ", df_test.shape[0])

Train set size:  650000
Test set size:  50000


In [19]:
df_train_sub = df_train.iloc[:1000]

In [46]:
bm25 = BM25()
bm25.fit(df_train_sub["text"][1:])
indices = bm25.transform(df_train_sub["text"][0], df_train_sub["text"][1:]).argsort()[-11:][::-1]

In [49]:
print(df_train_sub["text"][1:][:11])

1     Unfortunately, the frustration of being Dr. Go...
2     Been going to Dr. Goldberg for over 10 years. ...
3     Got a letter in the mail last week that said D...
4     I don't know what Dr. Goldberg was like before...
5     Top notch doctor in a top notch practice. Can'...
6     Dr. Eric Goldberg is a fantastic doctor who ha...
7     I'm writing this review to give you a heads up...
8     Wing sauce is like water. Pretty much a lot of...
9     Decent range somewhat close to the city.  The ...
10    Owning a driving range inside the city limits ...
11    This place is absolute garbage...  Half of the...
Name: text, dtype: object


In [48]:
print(df_train_sub["text"][0])
print(indices)
for index in indices:
    print()
    print(index, df_train_sub["text"][index])

dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.
[  5   1   3   4 941   6 345   0 620 464 589]

5 Top notch doctor in a top notch practice. Can't say I am surprised when I was referred to him by another doctor who I think is wonderful and because he went to one of the best medical schools in the country. \nIt is really easy to get an appointment. There is minimal wait to be seen and his bedside manner is great.

1 Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I'v

In [43]:
print(df_train_sub["text"][0])

dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.


In [33]:
for i in range(5):
    print(df_train_sub["text"][i+10])

Owning a driving range inside the city limits is like a license to print money.  I don't think I ask much out of a driving range.  Decent mats, clean balls and accessible hours.  Hell you need even less people now with the advent of the machine that doles out the balls.  This place has none of them.  It is april and there are no grass tees yet.  BTW they opened for the season this week although it has been golfing weather for a month.  The mats look like the carpet at my 107 year old aunt Irene's house.  Worn and thread bare.  Let's talk about the hours.  This place is equipped with lights yet they only sell buckets of balls until 730.  It is still light out.  Finally lets you have the pit to hit into.  When I arrived I wasn't sure if this was a driving range or an excavation site for a mastodon or a strip mining operation.  There is no grass on the range. Just mud.  Makes it a good tool to figure out how far you actually are hitting the ball.  Oh, they are cash only also.\n\nBottom li