In [1]:
# !pip install fastapi uvicorn nltk rapidfuzz pickle-mixin




In [1]:
from fastapi import FastAPI
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import pickle
from rapidfuzz import process
import json
import nest_asyncio 

import uvicorn
from fastapi.middleware.cors import CORSMiddleware



In [2]:
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
#lower, remove punctuations , remove stop words , tokenize input
def filter_text(input_string , string , stop_words):
    tokenized_input = input_string.translate(str.maketrans('', '', string.punctuation)).lower().split()
    filterd_tokenized_input = [w for w in tokenized_input if not w in stop_words]
    filterd_tokenized_input = [ps.stem(lemmatizer.lemmatize(i)) for i in filterd_tokenized_input ]
    return filterd_tokenized_input


In [4]:
import glob
import os

In [5]:
#loading the bm25 model
with open(r"models\model.pkl", 'rb') as file:
    model = pickle.load(file)

# loading the flatten corpus for the fuzzy search
with open('flatten_corpus.json', 'r', encoding='utf-8') as json_file:
    flatten_corpus = json.load(json_file)

# loading the the pdfs jsons : results
with open('animes_data.json', 'r', encoding='utf-8') as json_file:
    animes_json = json.load(json_file)

In [6]:
app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # React app URL
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

In [7]:
doc_names = [anime['doc_name'] for anime in animes_json]


In [8]:
@app.get("/search")
def anime_search(query : str):


    tokenized_query = query.translate(str.maketrans('', '', string.punctuation)).lower().split()

    fuzzy_tokenized_query_list = []

    for q in tokenized_query:
        if len(q) <= 2:
            q=q
        else :
            print(process.extract(q,flatten_corpus,limit=3))
            fuzzy_query = process.extractOne(q, flatten_corpus)
            q = fuzzy_query[0] if fuzzy_query[1] > 79 else q
            
        fuzzy_tokenized_query_list.append(q)

    fuzzy_tokenized_query = ' '.join(fuzzy_tokenized_query_list)
    print(fuzzy_tokenized_query)

    fuzzy_tokenized_cleaned_query = filter_text(fuzzy_tokenized_query , string , stop_words)
    print(fuzzy_tokenized_cleaned_query)
    
    result = model.get_top_n(fuzzy_tokenized_cleaned_query , doc_names, n = 100)
    print(result)

    return {'correct_query':fuzzy_tokenized_query,'results':result}


In [None]:

nest_asyncio.apply()
uvicorn.run(app, port=8000)

INFO:     Started server process [7540]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


[('one', 100.0, 153), ('throne', 90.0, 158), ('abandoned', 90.0, 231)]
[('pieces', 90.0, 6587), ('piece', 88.88888888888889, 2366), ('piccolo', 77.14285714285715, 2634)]
one pieces
['one', 'piec']
[{'doc': 'anime262.pdf', 'score': 6.867176974509181}, {'doc': 'anime378.pdf', 'score': 6.349295862759902}, {'doc': 'anime394.pdf', 'score': 6.2660444123825885}, {'doc': 'anime125.pdf', 'score': 5.8654337318737095}, {'doc': 'anime151.pdf', 'score': 4.372291802062692}, {'doc': 'anime209.pdf', 'score': 3.9461168031520586}, {'doc': 'anime66.pdf', 'score': 3.7556304376401712}, {'doc': 'anime344.pdf', 'score': 3.4798839316152295}, {'doc': 'anime89.pdf', 'score': 3.3688617540258785}, {'doc': 'anime363.pdf', 'score': 3.2872902326076545}, {'doc': 'anime269.pdf', 'score': 3.264704653685462}, {'doc': 'anime358.pdf', 'score': 3.2314022591948017}]
INFO:     127.0.0.1:49760 - "GET /search?query=one%20piec HTTP/1.1" 200 OK
[('one', 100.0, 153), ('throne', 90.0, 158), ('abandoned', 90.0, 231)]
[('pieces', 10