In [1]:
import os
import re

import faiss
import spacy
from sentence_transformers import SentenceTransformer

In [2]:
# Load the spaCy pipeline once at module level; preprocess() calls it to split
# the corpus text into sentences.
nlp = spacy.load('en_core_web_sm')

In [3]:
def read_file(file_path, encoding="utf-8"):
    """Read a text file and return its lines.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII characters (the corpus contains typographic
            apostrophes and en dashes) decode the same way on every platform
            instead of depending on the locale's default encoding.

    Returns:
        list[str]: The lines in file order, each keeping its line terminator.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid for ``encoding``.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.readlines()

In [12]:
# Relative path of the raw text corpus that gets split, embedded and indexed below.
file_path = 'corpus/nigeria'

In [13]:
# At this stage `corpus` is the list of raw lines returned by read_file().
corpus = read_file(file_path)

In [14]:
# Glue the lines back into one string; the lines still carry their newline
# characters, so the original text is reproduced.
# NOTE(review): `corpus` is rebound from a list of lines to a str here. Running
# this cell again after the preprocess cell would concatenate the sentences
# with no separator - re-run from the read_file cell instead.
corpus = "".join(corpus)

corpus

"Nigeria, the most populous country in Africa, has a rich and diverse history shaped by ancient civilizations, colonial rule, and its path to independence. The country, located in West Africa, is home to over 250 ethnic groups and has experienced significant historical events that have influenced its development. From the early kingdoms to modern governance, Nigeria’s history is a fascinating narrative of resilience, struggle, and growth.\n\nAncient Kingdoms and Civilizations\nLong before European contact, several powerful kingdoms and civilizations thrived in what is now Nigeria. The Nok civilization (1000 BC–300 AD) is one of the earliest known societies, famous for its advanced terracotta sculptures. Later, the Yoruba Kingdoms of Ife and Oyo, as well as the Benin Empire, became dominant in the southwestern region. In the north, the Kanem-Bornu Empire and the Hausa city-states developed as powerful trading hubs, engaging in commerce with North Africa and the Arab world.\n\nThe Influe

In [15]:
def preprocess(text):
    """Split raw text into a list of single-line sentence strings.

    The text is run through the module-level spaCy pipeline ``nlp`` and one
    string is produced per sentence it reports.

    Args:
        text: The raw text to split (a single string).

    Returns:
        list[str]: One entry per sentence, with runs of newlines replaced by a
        single space and no leading/trailing whitespace. Segments that contain
        only whitespace are left out so they are never embedded or indexed.
    """
    sentences = []
    for span in nlp(text).sents:
        # A newline run at the edge of a sentence turns into a stray blank once
        # replaced by a space; strip() removes it.
        sentence = re.sub("\n+", " ", span.text).strip()
        if sentence:
            sentences.append(sentence)
    return sentences

In [16]:
# NOTE(review): `corpus` is rebound again, from the full-text str to the list of
# sentences. The cell is therefore order-dependent: running it a second time
# would hand preprocess() a list instead of a string.
corpus = preprocess(corpus)

In [17]:
corpus

['Nigeria, the most populous country in Africa, has a rich and diverse history shaped by ancient civilizations, colonial rule, and its path to independence.',
 'The country, located in West Africa, is home to over 250 ethnic groups and has experienced significant historical events that have influenced its development.',
 'From the early kingdoms to modern governance, Nigeria’s history is a fascinating narrative of resilience, struggle, and growth. ',
 'Ancient Kingdoms and Civilizations Long before European contact, several powerful kingdoms and civilizations thrived in what is now Nigeria.',
 'The Nok civilization (1000 BC–300 AD) is one of the earliest known societies, famous for its advanced terracotta sculptures.',
 'Later, the Yoruba Kingdoms of Ife and Oyo, as well as the Benin Empire, became dominant in the southwestern region.',
 'In the north, the Kanem-Bornu Empire and the Hausa city-states developed as powerful trading hubs, engaging in commerce with North Africa and the Ara

In [18]:
len(corpus)

41

In [19]:
# Name of the pretrained sentence-transformers checkpoint used for both the
# corpus sentences and the query.
checkpoint = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"

# Loaded once at module level; embedding() reads this global.
model = SentenceTransformer(checkpoint)

In [20]:
def embedding(corpus: list):
    """Encode a list of texts with the module-level ``model``.

    Args:
        corpus: The strings to encode.

    Returns:
        The result of ``model.encode`` called with ``normalize_embeddings=True``.
        In this notebook that is a float32 array with one row per input string.
    """
    encode_options = {"normalize_embeddings": True}
    return model.encode(corpus, **encode_options)

In [21]:
# One embedding row per sentence; row i belongs to corpus[i], which the lookup
# after the search relies on.
corpus_embedding = embedding(corpus)

In [28]:
corpus_embedding

array([[ 0.01632937, -0.02142715, -0.06251978, ...,  0.04031044,
        -0.01691707, -0.02999892],
       [ 0.12324904,  0.10927784, -0.045377  , ...,  0.04659493,
        -0.04074575, -0.02324787],
       [-0.02444647,  0.05335668, -0.04062165, ...,  0.02408938,
         0.00773834, -0.02998484],
       ...,
       [ 0.00820045,  0.10998163, -0.03618674, ..., -0.00829757,
        -0.03384005,  0.00085488],
       [-0.02651662,  0.0481002 ,  0.0410602 , ..., -0.00291672,
         0.00715947, -0.01870273],
       [ 0.0101514 , -0.00267725,  0.01663405, ..., -0.02849343,
         0.0103144 ,  0.01494583]], dtype=float32)

In [29]:
# Vector dimensionality (second axis of the embedding matrix); used to size the
# FAISS index.
embedding_size = corpus_embedding.shape[1]

embedding_size

384

In [30]:
# Inner-product index over vectors of size `embedding_size`. embedding() asks
# for normalized vectors, so the inner-product score presumably equals cosine
# similarity - TODO confirm against the model documentation.
index = faiss.IndexFlatIP(embedding_size)

In [31]:
# Vectors are added in corpus order; the retrieval cell uses the ids returned by
# the search directly as positions in `corpus`.
index.add(corpus_embedding)

In [32]:
index.ntotal

41

In [33]:
# write / save index embedding to file
# NOTE: the "embeddig" spelling is kept on purpose - the read_index cell loads
# this exact path.
index_path = "index/nigeria_history_embeddig.index"
# The target folder is not created by the write itself, so make sure it exists;
# otherwise a fresh checkout fails on this cell.
os.makedirs(os.path.dirname(index_path), exist_ok=True)
faiss.write_index(index, index_path)

print("✅ FAISS index saved successfully!")

✅ FAISS index saved successfully!


In [34]:
# Reload the index from disk, rebinding `index`, so the search below runs
# against the persisted copy. The path must match the one used in the write
# cell (including its "embeddig" spelling).
index = faiss.read_index("index/nigeria_history_embeddig.index")

In [35]:
index.ntotal

41

In [43]:
# A one-element list, because embedding() takes a list of texts; the encoded
# query is then a single-row matrix.
query = ["In the 19th century what led to a shift in European interests toward colonization and economic exploitation of Nigeria’s natural resources?"]

In [44]:
# Encode the query through the same embedding() helper as the corpus so both
# sides use the same model and normalization.
query_embedding = embedding(query)

In [45]:
query_embedding.shape

(1, 384)

In [46]:
k = 1 # number of nearest sentences to retrieve for the query

In [47]:
# Row 0 of each result belongs to the single query: scores[0][j] is the
# similarity of the j-th hit and indices[0][j] its position in the index.
scores, indices = index.search(query_embedding, k)

In [48]:
# Map each hit of the (single) query back to its sentence. FAISS fills the id
# row with -1 when it finds fewer neighbours than requested, and corpus[-1]
# would silently return the last sentence, so those placeholder ids are dropped.
answers = [corpus[idx] for idx in indices[0] if idx != -1]

In [49]:
" ".join(answers)

'The abolition of the slave trade in the 19th century led to a shift in European interests toward colonization and economic exploitation of Nigeria’s natural resources. '