In [1]:
from whoosh.qparser import *
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED,NUMERIC
from whoosh.analysis import StemmingAnalyzer,StandardAnalyzer
from whoosh import index
import os, os.path

In [14]:
class songDoc:
    """In-memory record for one song.

    Attributes:
        trackid: Track identifier given to the constructor.
        songid: Song identifier given to the constructor.
        title: Song title; empty string until ``setTitle`` is called.
        artist: Artist name; empty string until ``setArtist`` is called.
        genre: Genre value; empty string until ``setGenre`` is called.
        tempo: Tempo value; 0 until ``setTempo`` is called.
        artisthot: Initialised to 0; this class defines no setter for it.
    """

    def __init__(self, trackid, songid):
        """Store the two identifiers and reset every other field to its default."""
        self.trackid, self.songid = trackid, songid
        # Text-like fields start out empty, numeric fields start at zero.
        self.title = self.artist = self.genre = ""
        self.tempo = self.artisthot = 0

    def setArtist(self, artist):
        """Replace the stored artist."""
        self.artist = artist

    def setGenre(self, genre):
        """Replace the stored genre value."""
        self.genre = genre

    def setTempo(self, tempo):
        """Replace the stored tempo."""
        self.tempo = tempo

    def setTitle(self, title):
        """Replace the stored title."""
        self.title = title

In [28]:
import json
import glob

def loadDirectory(data_dir):
    """Load every ``*.json`` file found directly inside ``data_dir``.

    Each matching file is decoded with ``loadJason`` and the decoded value is
    passed to ``analyzeDocument``.

    Args:
        data_dir: Path of the directory to scan. A trailing path separator
            is optional.

    Returns:
        None. When ``data_dir`` does not exist a message is printed and
        nothing is loaded.
    """
    if not os.path.exists(data_dir):
        print("no json file exists")
        # Stop here: without a directory there is no pattern to glob for.
        return

    # os.path.join builds the same pattern whether or not data_dir ends
    # with a separator ("data" and "data/" both give "data/*.json").
    pattern = os.path.join(data_dir, '*.json')
    for filename in glob.glob(pattern):
        analyzeDocument(loadJason(filename))

def loadJason(filename):
    """Read the JSON file at ``filename`` and return the decoded value.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(filename, "r", encoding="utf-8") as read_file:
        return json.load(read_file)

def analyzeDocument(songs):
    """Convert decoded song records into ``songDoc`` objects and collect them.

    For each record a ``songDoc`` is built from its ``track_id`` and
    ``song_id``, filled in from ``title``, ``tempo`` (converted to float),
    ``artist_name`` and ``artist_terms`` (copied into a list), and then
    appended to the module-level ``corpus`` list.

    Args:
        songs: Iterable of mappings that provide the keys listed above.
    """
    for record in songs:
        entry = songDoc(record["track_id"], record["song_id"])
        entry.setTitle(record["title"])
        tempo_value = float(record["tempo"])
        entry.setTempo(tempo_value)
        entry.setArtist(record["artist_name"])
        terms = list(record["artist_terms"])
        entry.setGenre(terms)
        corpus.append(entry)

In [39]:
# Module-level list that analyzeDocument appends songDoc objects to; it must
# exist before loadDirectory is called.
corpus = []
# Load the *.json files under data/ into corpus.
loadDirectory('data/')

In [40]:
# Index schema with four stored fields: a float NUMERIC "tempo" and three TEXT
# fields ("author", "title", "content"), each analyzed by a StandardAnalyzer
# built with stoplist=None.
# NOTE(review): stoplist=None presumably disables stop-word removal — confirm.
schema = Schema(tempo=NUMERIC(float, stored=True),
                author=TEXT(analyzer=StandardAnalyzer(stoplist=None),stored=True),
                title=TEXT(analyzer=StandardAnalyzer(stoplist=None),stored=True),
                content=TEXT(analyzer=StandardAnalyzer(stoplist=None),stored=True))

In [64]:
# Variant of the schema in which "content" is a KEYWORD field (lowercase=True,
# commas=True, scorable=True) instead of TEXT. Unlike the other three fields it
# is not declared with stored=True.
schemaTest = Schema(tempo=NUMERIC(float, stored=True),
                author=TEXT(analyzer=StandardAnalyzer(stoplist=None),stored=True),
                title=TEXT(analyzer=StandardAnalyzer(stoplist=None),stored=True),
                content=KEYWORD(lowercase=True, commas=True,scorable=True))


In [6]:
# Create the index directory if it is missing (no error when it already exists).
os.makedirs("indexdir", exist_ok=True)
# create_in builds an index for `schema` in that directory and returns the
# Index object, so there is no need to reopen it with open_dir afterwards.
ix = index.create_in("indexdir", schema)
# Create a writer object to add documents to the index.
# NOTE(review): the later "index5" cell rebinds `ix` and `writer`; nothing
# visible here commits or cancels this writer first — confirm it is still needed.
writer = ix.writer()
# Documents can now be added to the index through `writer`.

In [65]:
# Create the index directory if it is missing (no error when it already exists).
os.makedirs("index5", exist_ok=True)
# create_in builds an index for `schemaTest` in that directory and returns the
# Index object, so there is no need to reopen it with open_dir afterwards.
ix = index.create_in("index5", schemaTest)
# Create a writer object to add documents to the index.
writer = ix.writer()
# Documents can now be added to the index through `writer`.

In [66]:
# Add one index document per song in corpus, then commit the whole batch.
try:
    for doc in corpus:
        writer.add_document(tempo=doc.tempo, author=doc.artist, title=doc.title, content=doc.genre)
except BaseException:
    # On any failure (including a kernel interrupt) discard the pending
    # changes and release the writer instead of leaving it open, then let
    # the original error propagate.
    writer.cancel()
    raise
else:
    writer.commit()

In [79]:
# Build a parser whose default search fields are "tempo" and "content" and
# whose terms are combined with OR.
parser=MultifieldParser(["tempo", "content"], schema=schemaTest, group=OrGroup)
# GtLtPlugin allows comparison syntax such as "tempo:>140" in the query string.
parser.add_plugin(GtLtPlugin()) 
# NOTE(review): in the printed query every free-text word is paired with a
# <_NullQuery> entry — presumably that word's clause for the numeric "tempo"
# field, which cannot hold it; confirm whether "tempo" should be a default field.
result=parser.parse(u"I want some hardcore music, tempo:>140")
print(result)

(<_NullQuery> OR content:i OR <_NullQuery> OR content:want OR <_NullQuery> OR content:some OR <_NullQuery> OR content:hardcore OR <_NullQuery> OR content:music OR tempo:{140.0 TO ])


In [80]:
from whoosh import scoring

# Run the parsed query with BM25F weighting and show the summary of the top 8
# hits followed by the best hit.
with ix.searcher(weighting=scoring.BM25F) as searcher:
    results=searcher.search(result, limit=8)
    print (results)
    # results[0] raises IndexError when nothing matched, so only show the
    # top hit when there is one.
    if results.is_empty():
        print("no matching documents")
    else:
        print (results[0])

<Top 8 Results for Or([<_NullQuery>, Term('content', 'i'), <_NullQuery>, Term('content', 'want'), <_NullQuery>, Term('content', 'some'), <_NullQuery>, Term('content', 'hardcore'), <_NullQuery>, Term('content', 'music'), NumericRange('tempo', 140.0, None, True, False, boost=1.0, constantscore=True)]) runtime=0.5828746376155323>
<Hit {'author': 'Xmilk', 'tempo': 146.454, 'title': 'Why'}>
