In [21]:
import re
import spacy
import pandas as pd
import pymupdf
from sentence_transformers import SentenceTransformer

In [5]:
nlp = spacy.blank('en')

nlp.add_pipe('sentencizer')

nlp.pipe_names

['sentencizer']

In [6]:
doc = pymupdf.open("Nigeria Study_1.pdf")  # or pymupdf.Document(filename)

In [11]:
def extract_doc(doc):
    extracted_doc = []
    for page in doc.pages(13):
        if page != "":
            page = re.sub("-", "", page.get_text())
            extracted_doc.append("".join(page))
    extracted_doc = " ".join(extracted_doc)
    extracted_doc = re.sub('\n', ' ', extracted_doc)
    extracted_doc = re.sub('\'', '', extracted_doc)
    return extracted_doc

In [15]:
texts = extract_doc(doc)

In [18]:
def sentencize(texts):
    content = []
    doc = nlp(texts)
    for sentence in doc.sents:
        content.append(f'{sentence.text}')
    return content

In [19]:
corpus = sentencize(texts)

In [40]:
corpus

['Preface Like its predecessor, this study is an attempt to treat in a con cise and objective manner the dominant historical, social, politi cal, economic, and military aspects of contemporary Nigerian society.',
 'Sources of information included scholarly journals and monographs, official reports of government and international or ganizations, newspapers, and numerous periodicals.',
 'Chapter bib liographies appear at the end of the book; brief comments on some of the more valuable sources suggested as possible further reading appear at the end of each chapter.',
 'Measurements are given in the metric system; a conversion table is provided to assist those read ers who are unfamiliar with metric measurements (see table 1, Ap pendix).',
 'A glossary is also included.',
 'Placenames generally have been spelled in accordance with those established by the United States Board on Geographic Names and the Permanent Committee on Geographic Names for British Offi cial Use, known as the BGN/PCGN

In [22]:
def embedding(text):
    model_name = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
    model = SentenceTransformer(model_name)
    return model.encode(text, normalize_embeddings=True)

In [23]:
sentence_embedding = embedding(corpus)

In [25]:
sentence_embedding

array([[-0.0700842 ,  0.07069567, -0.04584361, ...,  0.03450798,
         0.02135629, -0.00060679],
       [-0.01845901,  0.03976664, -0.05933154, ..., -0.06205835,
         0.04273145,  0.03247267],
       [-0.00029837, -0.02355087, -0.03180637, ...,  0.0383227 ,
        -0.04792186,  0.01787745],
       ...,
       [-0.05692233,  0.06067333, -0.04932901, ..., -0.02590836,
         0.03329085,  0.0127798 ],
       [-0.01498537,  0.05015457, -0.03913689, ..., -0.04277162,
         0.09231941,  0.0032074 ],
       [ 0.09971478,  0.11400615, -0.02387581, ..., -0.02298126,
         0.0026984 , -0.02640013]], dtype=float32)

In [26]:
dimension = sentence_embedding.shape[1]

dimension

384

In [27]:
import faiss

In [28]:
index = faiss.IndexFlatIP(dimension)

In [29]:
index.add(sentence_embedding)

In [30]:
faiss.write_index(index, "corpus.index")

print("Index saved successfully")

Index saved successfully


In [31]:
index = faiss.read_index('corpus.index')

In [32]:
index.ntotal

1670

In [33]:
index.is_trained

True

In [34]:
query = ['How many ethnic group does the country contained?']

In [35]:
query_embedding = embedding(query)

In [36]:
query_embedding

array([[ 1.78052083e-01,  9.32908505e-02, -1.27886888e-02,
         6.03084303e-02, -1.33875720e-02, -9.03379172e-03,
         5.66778257e-02, -1.38845101e-01, -1.84935071e-02,
         5.19247465e-02,  6.61651716e-02, -1.05273828e-01,
         8.94299056e-03, -8.05738419e-02, -6.71282411e-04,
         2.87527572e-02, -1.18928596e-01, -5.41677251e-02,
        -1.21114971e-02, -8.47883895e-02, -3.11741140e-02,
        -1.13685727e-01,  6.71629757e-02, -2.55119037e-02,
        -5.11649176e-02, -6.47216737e-02,  7.19656795e-02,
        -1.64644662e-02,  6.65255487e-02,  1.59121007e-02,
        -5.29159270e-02,  4.20882776e-02,  5.95598854e-02,
         9.75668579e-02, -4.78211083e-02, -9.96096507e-02,
        -4.11329046e-03, -3.81095707e-02,  3.40205356e-02,
         4.83734999e-03,  3.92682627e-02,  4.67146821e-02,
         8.37283656e-02,  1.27787516e-02,  5.31644784e-02,
         5.74128442e-02, -1.95813701e-02,  1.90446265e-02,
        -4.83482368e-02,  5.48775271e-02,  3.98638397e-0

In [37]:
%%time
k = 2
scores, indices = index.search(query_embedding, k)

CPU times: user 936 μs, sys: 600 μs, total: 1.54 ms
Wall time: 805 μs


In [38]:
scores, indices

(array([[0.76497084, 0.69002485]], dtype=float32), array([[171,  28]]))

In [41]:
for i in range(k):
    print(corpus[indices[0][i]])

The country contained between 250 and 400 ethnic groups (depending on the way they were defined), speaking about 400 lan guages.
Ethnic Groups: 250 to 400 or more recognized groups, many divid ed into subgroups of considerable social and political importance.
