In [1]:
import csv
import math
import pandas as pd

from collections import Counter

In [2]:
# Read the comma-separated titles file and keep the second column of every
# row from the third row onward (rows 0 and 1 are skipped -- presumably
# header lines; confirm against the data file).
with open("data/bike-item-titles.txt") as f:
    r = csv.reader(f, delimiter=',', quotechar='"')
    docs = [row[1] for row_number, row in enumerate(r) if row_number >= 2]


In [3]:
def get_results5(qry, corpus, k1=1.5, b=0.75):
    """Score the documents in ``corpus`` against ``qry`` with Okapi BM25.

    Parameters
    ----------
    qry : str
        Query text; it is split on whitespace into terms.
    corpus : sequence of str
        Documents to rank. A document's id is its position in the sequence.
    k1 : float
        Term-frequency saturation parameter.
    b : float
        Strength of the document-length normalisation (0 disables it).

    Returns
    -------
    list of [score, doc_id]
        One entry per document whose accumulated score is strictly positive,
        in the order the documents were first scored (not sorted by score).
        Empty when the corpus is empty or no query term occurs in it.

    Notes
    -----
    The inverted index is rebuilt on every call. ``idf`` can be zero or
    negative for terms that occur in most documents, so such documents may be
    dropped by the positive-score filter.
    """
    n = len(corpus)
    if n == 0:
        # Nothing to rank; also avoids dividing by zero for the average length.
        return []

    idx = create_inverted_index(corpus)
    d = [len(x.split()) for x in corpus]   # document lengths in words
    d_avg = float(sum(d)) / n

    score = Counter()
    for term in qry.split():
        if term not in idx:
            continue
        i = idf(term, idx, n)
        for doc, freq in idx[term].items():
            f = float(freq)
            # Length normalisation: long documents need more occurrences
            # of the term to reach the same score.
            norm = 1 - b + (b * (float(d[doc]) / d_avg))
            score[doc] += i * ((f * (k1 + 1)) / (f + k1 * norm))

    # Output layout is [score, doc_id]; non-positive scores are discarded.
    return [[s, doc] for doc, s in score.items() if s > 0]

def create_inverted_index(corpus):
    """Build an inverted index for ``corpus``.

    Each document is split on whitespace. The result maps every distinct
    token to a dict of ``{doc_id: occurrences}``, where ``doc_id`` is the
    document's position in ``corpus``.
    """
    postings_by_term = {}
    for doc_id, text in enumerate(corpus):
        for token in text.split():
            # Fetch (or create) the postings dict for this token, then bump
            # the count for the current document.
            postings = postings_by_term.setdefault(token, {})
            postings[doc_id] = postings.get(doc_id, 0) + 1
    return postings_by_term

def idf(term, idx, n):
    """Return the inverse document frequency of ``term``.

    Computed as ``log(n / (1 + df))``, where ``df`` is the number of entries
    in ``idx[term]`` and ``n`` is the total number of documents. Raises
    ``KeyError`` if ``term`` is not in ``idx``.
    """
    doc_freq = len(idx[term])
    ratio = float(n) / (1 + doc_freq)
    return math.log(ratio)
    


In [4]:
from bokeh.charts import output_notebook, Scatter, show
from bokeh.io import push_notebook
from bokeh.plotting import ColumnDataSource, figure
from bokeh.models import HoverTool, ColorMapper
from bokeh.palettes import YlOrRd9

# Shows the palette's hex colours; the palette is not used further in this cell.
print(YlOrRd9)

output_notebook()

results = get_results5('mountain bike', docs, k1=1.5, b=0.75)

# Each result is [score, doc_id]: plot score against document length in
# words, and keep the document text for the hover tooltip.
x_vals = [float(x[0]) for x in results]
y_vals = [len(docs[x[1]].split()) for x in results]
d_vals = [docs[x[1]] for x in results]

hover = HoverTool(
        tooltips=[
            ("desc", "@desc"),
        ]
    )

source = ColumnDataSource(data=dict(x=x_vals,y=y_vals,desc=d_vals))
p = figure(x_axis_label="BM25 score", y_axis_label="document length (words)")
p.add_tools(hover)
# Refer to the source's columns by name rather than passing the raw lists:
# the glyph then reads its data from `source`, so later changes to the source
# are what gets drawn. Mixing raw sequences with a user-supplied source is
# rejected by newer Bokeh releases.
p.circle('x', 'y', size=10, color="orange", source=source)
show(p)


def update(qry, k1, b):
    """Re-run the query with new parameters and refresh the plotted data.

    Parameters
    ----------
    qry : str
        Query text passed to ``get_results5``.
    k1, b : float
        BM25 parameters passed to ``get_results5``.

    Side effects: replaces the columns of the module-level ``source`` and
    calls ``push_notebook()`` to send the change to the displayed plot.
    """
    results = get_results5(qry, docs, k1, b)
    # Replace all columns in one assignment. The number of results changes
    # from query to query, and updating the columns one by one would leave
    # the source with columns of different lengths in between.
    source.data = dict(
        x=[float(score) for score, _ in results],
        y=[len(docs[doc_id].split()) for _, doc_id in results],
        desc=[docs[doc_id] for _, doc_id in results],
    )
    push_notebook()
    


['#800026', '#bd0026', '#e31a1c', '#fc4e2a', '#fd8d3c', '#feb24c', '#fed976', '#ffeda0', '#ffffcc']


In [5]:
from ipywidgets import interact

# Build interactive controls for `update`: a text box for the query and
# float sliders for k1 and b, each given as (min, max, step).
interact(
    update,
    qry='mountain bike',
    k1=(0.0, 2.0, 0.05),
    b=(0.0, 1.0, 0.05),
)