#Dependencies and Imports

In [1]:
%%capture
!pip install --upgrade pip
!pip install farm-haystack[colab,faiss]
!pip install gdown

In [2]:
from haystack.nodes import EmbeddingRetriever, SentenceTransformersRanker, PreProcessor
from sentence_transformers import SentenceTransformer, util
from haystack.document_stores import FAISSDocumentStore
import logging

# Data Preparation

In [5]:
%%capture
# Option A (pre-built store): download and unpack the zipped FAISS database files.
# Source: https://drive.google.com/file/d/1su39fmAA_lBRfpHu55BRDlSwv3ow-gsx/view?usp=sharing
# NOTE(review): the zip cell at the end of this notebook stores entries under content/faiss/,
# so unpacking with "-d faiss/" may not produce ./faiss/faiss_index_file as the load cell expects — confirm.
# !gdown 1su39fmAA_lBRfpHu55BRDlSwv3ow-gsx #Download wiki data database files for generative from google drive https://drive.google.com/file/d/1su39fmAA_lBRfpHu55BRDlSwv3ow-gsx/view?usp=sharing
# !unzip faiss.zip -d faiss/ #unzip the downloaded zipped data 

# Option B (raw data): in case you don't want to use the database files that already contain
# preprocessed and embedded data, download and unpack the raw Wikipedia data instead.
# NOTE(review): this gdown ID is identical to the one used for faiss.zip above, yet the next
# line unzips wiki_data.zip — confirm which archive this ID actually fetches.
!gdown 1su39fmAA_lBRfpHu55BRDlSwv3ow-gsx
!unzip wiki_data.zip -d ./

The cell below was used the first time to preprocess the raw Wikipedia data and then to create the document store and fill it with the preprocessed documents.<br>
You don't have to run it if you are using the database files downloaded in the cell above. If you are working from the raw data instead, run the cell below.

In [None]:
# Run this cell only if you are using the raw Wikipedia data, i.e. not the pre-built database files.
import json
import os

# Read the raw dump. The encoding is explicit so the result does not depend on the
# platform default; the with-block closes the file, so no explicit close() is needed.
with open('./nlp_data (1).txt', 'r', encoding='utf-8') as wikifile:
  wiki_data = json.load(wikifile)

# One document per article, named "Article <position in the dump>".
docs = [
    {
        'content' : article,
        'meta' : {'name' : 'Article ' + str(index)}
    }
    for index, article in enumerate(wiki_data)
]

# Clean the articles and split them into chunks of up to 384 words without cutting sentences.
# NOTE(review): the "/n" variants in remove_substrings look like they were meant to be "\n"
# escapes — confirm; they are left untouched so the produced chunks stay the same.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    remove_substrings=["/n", "/n/n", "/n/n/n", "/n/n/n/n", "==>", "==\n\n\n==", "==\n", "\n\n\n==", "==", "/u", "=u", "u=", "/u==", "/==u", "==u"],
    split_by="word",
    split_length=384,
    split_respect_sentence_boundary=True,
)
docs2 = preprocessor.process(docs)

# Only show haystack messages of level ERROR and above from here on.
logging.getLogger("haystack").setLevel(logging.ERROR)

# If you don't already have the faiss database and index files for your data then run this code.
# exist_ok=True keeps the cell re-runnable when the directory is already there.
os.makedirs('faiss', exist_ok=True)
document_store = FAISSDocumentStore(sql_url='sqlite:///faiss/faiss_document_store.db', faiss_index_factory_str="Flat")
document_store.write_documents(docs2)

If you are using the database files, there is no need to run the cell above; just run the cell below to load the document store from the already stored database files that we downloaded from Google Drive.

In [None]:
# Otherwise just load the document store from the already stored database files:
# the FAISS index file and its JSON config under ./faiss/.
document_store = FAISSDocumentStore.load(index_path="./faiss/faiss_index_file", config_path="./faiss/faiss_index_file.json")

# Modeling

Initialize the retriever model and use it to create embedding vectors for our data.<br>
This model will be responsible for retrieving k-relevant documents to our query from our database

Run the commented-out part only if you are using the raw data and not the database files. Note that it will take time, since it creates the embeddings from scratch (approx. 1 hour 50 minutes).

In [None]:
# Embedding-based retriever bound to the document store; it embeds text with the
# sentence-transformers model named below.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    model_format="sentence_transformers",
)

# Uncomment and run the two lines below to create the embeddings, in case you created a new
# document store instead of loading the already stored database files. The second line saves
# the store under ./faiss/faiss_index_file, the path the load cell reads from.
# document_store.update_embeddings(retriever)
# document_store.save('./faiss/faiss_index_file')

In [None]:
# Sentence-embedding model used by get_my_answer to score candidate passages against the query
# (a different model from the one the retriever uses).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

def _split_sentences(text):
  """Split ``text`` into consecutive sentence strings.

  A '.' closes a sentence unless the character right before it is numeric, so
  decimals such as "2.3" stay in one piece (a sentence that ends in a number,
  e.g. "... in 1940.", is therefore merged with the following one). Whatever
  remains after the last split point is kept as a final sentence, so no text
  is dropped.
  """
  sentences = []
  start = 0
  for index, char in enumerate(text):
    if char == '.' and not (index > 0 and text[index - 1].isnumeric()):
      sentences.append(text[start:index + 1])
      start = index + 1
  if start < len(text):
    # Trailing text without a closing '.' (or ending in "<digit>.") still counts.
    sentences.append(text[start:])
  return sentences


def _build_paragraphs(sentences, max_sentence_chars=384, max_paragraph_chars=1000):
  """Turn every sentence into one candidate passage, adding neighbouring context.

  For a sentence that has a successor, the passage is previous + current + next
  when the sentence is at most ``max_sentence_chars`` long and the window is at
  most ``max_paragraph_chars`` long. Otherwise a sentence of at least
  ``max_sentence_chars`` is joined with its predecessor when that fits, and in
  every remaining case the sentence stands alone. A missing neighbour is never
  substituted by another sentence.
  """
  paragraphs = []
  last_index = len(sentences) - 1
  for index, current in enumerate(sentences):
    previous = sentences[index - 1] if index > 0 else None
    following = sentences[index + 1] if index < last_index else None

    if following is not None:
      window = (previous or '') + current + following
      if len(current) <= max_sentence_chars and len(window) <= max_paragraph_chars:
        paragraphs.append(window)
        continue

    joins_previous = (
        previous is not None
        and len(current) >= max_sentence_chars
        and len(previous) + len(current) <= max_paragraph_chars
    )
    paragraphs.append(previous + current if joins_previous else current)
  return paragraphs


def get_my_answer(query, documents):
  """Return the passage from ``documents`` that best matches ``query``.

  Args:
    query: The question or search phrase, as a string.
    documents: Iterable of objects exposing the text in a ``content`` attribute
      (here: the documents returned by the retriever).

  Returns:
    The text of the highest-scoring candidate passage (the first one on ties),
    or ``None`` when the documents yield no passage at all.

  Each document is split into sentences, every sentence is expanded into a
  passage with neighbouring sentences, and all passages are scored against the
  query with the dot product of their embeddings from the global ``model``.
  """
  paragraphs = []
  for doc in documents:
    paragraphs.extend(_build_paragraphs(_split_sentences(doc.content)))

  if not paragraphs:
    return None

  # Encode the query once and all passages in one batch instead of once per passage.
  query_embedding = model.encode(query)
  paragraph_embeddings = model.encode(paragraphs)
  scores = util.dot_score(query_embedding, paragraph_embeddings)[0].tolist()

  # max() keeps the first of equally scored passages and also works when every
  # score is zero or negative, so a result is always defined.
  best_index = max(range(len(paragraphs)), key=scores.__getitem__)
  return paragraphs[best_index]

#Query

In [28]:
import pprint
# Pretty printer used by the query cells to display the answer text.
pp = pprint.PrettyPrinter(indent=4)

In [29]:
# Retrieve the 15 best-matching documents for the query, pick the best passage
# with get_my_answer and pretty-print it.
query = "Ming dynasty" 
relevant_docs = retriever.retrieve(query=query, top_k=15)
answer = get_my_answer(query, relevant_docs)
print("Question : ",query, "\nAnswer : ")
pp.pprint(answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question :  Ming dynasty 
Answer : 
('The Ming dynasty (23 January 1368 – 25 April 1644), officially the Great '
 'Ming, founded by the peasant rebel leader Zhu Yuanzhang, known as the Hongwu '
 'Emperor, was an imperial dynasty of China. It was the successor to the Yuan '
 'dynasty and the predecessor of the short-lived Shun dynasty, which was in '
 'turn succeeded by the Qing dynasty.')


In [31]:
# Retrieve the 15 best-matching documents for the query, pick the best passage
# with get_my_answer and pretty-print it.
query = "What is Pakistan" 
relevant_docs = retriever.retrieve(query=query, top_k=15)
answer = get_my_answer(query, relevant_docs)
print("Question : ",query, "\nAnswer : ")
pp.pprint(answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question :  What is Pakistan 
Answer : 
('Pakistan (Urdu: پَاکِسْتَان [ˈpaːkɪstaːn]), officially the Islamic Republic '
 'of Pakistan (اِسْلامی\n'
 "جَمْہُورِیَہ پَاکِسْتَان), is a country in South Asia. It is the world's "
 'fifth-most populous country, with a population of over 249 million people, '
 "and has the world's second-largest Muslim population, just behind Indonesia.")


In [27]:
# Retrieve the 15 best-matching documents for the query, pick the best passage
# with get_my_answer and pretty-print it.
query = "Who was the first president of USA" 
relevant_docs = retriever.retrieve(query=query, top_k=15)
answer = get_my_answer(query, relevant_docs)
print("Question : ",query, "\nAnswer : ")
pp.pprint(answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question :  Who was the first president of USA 
Answer : 
('George Washington (February 22, 1732 – December 14, 1799) was an American '
 'military officer, statesman, and Founding Father who served as the first '
 'president of the United States from 1789 to 1797. Appointed by the '
 'Continental Congress as commander of the Continental Army, Washington led '
 'Patriot forces to victory in the American Revolutionary War and served as '
 'president of the Constitutional Convention of 1787, which created and '
 'ratified the Constitution of the United States and the American federal '
 'government.')


In [32]:
# Retrieve the 15 best-matching documents for the query, pick the best passage
# with get_my_answer and pretty-print it.
query = "How many people died in world war 2?" 
relevant_docs = retriever.retrieve(query=query, top_k=15)
answer = get_my_answer(query, relevant_docs)
print("Question : ",query, "\nAnswer : ")
pp.pprint(answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question :  How many people died in world war 2? 
Answer : 
('World War II was the deadliest military conflict in history. An estimated '
 'total of 70–85 million people perished, or about 3% of the 2.3 billion '
 '(est.) people on Earth in 1940. Deaths directly caused by the war (including '
 'military and civilian fatalities) are estimated at 50–56 million, with an '
 'additional estimated 19–28 million deaths from war-related disease and '
 'famine.')


In [34]:
# Retrieve the 15 best-matching documents for the query, pick the best passage
# with get_my_answer and pretty-print it.
query = "When did roman empire end?" 
relevant_docs = retriever.retrieve(query=query, top_k=15)
answer = get_my_answer(query, relevant_docs)
print("Question : ",query, "\nAnswer : ")
pp.pprint(answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question :  When did roman empire end? 
Answer : 
('= Fall of the Empire =\n'
 'By convention, the Western Roman Empire is deemed to have ended on 4 '
 'September 476, when Odoacer deposed Romulus Augustus, but the historical '
 'record calls this determination into question. Indeed, the deposition of '
 'Romulus Augustus received very little attention in contemporary times.')


In [36]:
# Retrieve the 15 best-matching documents for the query, pick the best passage
# with get_my_answer and pretty-print it.
query = "When was pakistan founded?" 
relevant_docs = retriever.retrieve(query=query, top_k=15)
answer = get_my_answer(query, relevant_docs)
print("Question : ",query, "\nAnswer : ")
pp.pprint(answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question :  When was pakistan founded? 
Answer : 
('The Dominion of Pakistan (later the Islamic Republic of Pakistan) was '
 'founded in 1947 as a result of the independence of India from  British rule, '
 'when India was simultaneously partitioned to create Pakistan (in two '
 'non-contiguous halves called East Pakistan & West Pakistan). East Pakistan '
 'seceded in 1971 as a result of the Language Movement followed by the '
 'Bangladesh War of Independence, and West Pakistan has continued the '
 'Pakistani national identity since.')


In [37]:
# Retrieve the 15 best-matching documents for the query, pick the best passage
# with get_my_answer and pretty-print it.
query = "What is korea divided into?" 
relevant_docs = retriever.retrieve(query=query, top_k=15)
answer = get_my_answer(query, relevant_docs)
print("Question : ",query, "\nAnswer : ")
pp.pprint(answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question :  What is korea divided into? 
Answer : 
('Korea (Korean: 한국, Hanguk or 조선, Joseon) is a peninsular region in East '
 'Asia. Since 1945, it has been divided at or near the 38th parallel, with '
 "North Korea (Democratic People's Republic of Korea) comprising its northern "
 'half and South Korea (Republic of Korea) comprising its southern half.')


In [38]:
# Retrieve the 15 best-matching documents for the query, pick the best passage
# with get_my_answer and pretty-print it.
query = "Korean war" 
relevant_docs = retriever.retrieve(query=query, top_k=15)
answer = get_my_answer(query, relevant_docs)
print("Question : ",query, "\nAnswer : ")
pp.pprint(answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question :  Korean war 
Answer : 
('The Korean War (also known by other names) was fought between North Korea '
 'and South Korea from 1950 to 1953. The war began on 25 June 1950 when North '
 'Korea invaded South Korea following clashes along the border and rebellions '
 'in South Korea. North Korea was supported by China and the Soviet Union '
 'while South Korea was supported by the United States and allied countries.')


In [42]:
# Retrieve the 15 best-matching documents for the query, pick the best passage
# with get_my_answer and pretty-print it.
query = "Alexander the Great" 
relevant_docs = retriever.retrieve(query=query, top_k=15)
answer = get_my_answer(query, relevant_docs)
print("Question : ",query, "\nAnswer : ")
pp.pprint(answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question :  Alexander the Great 
Answer : 
('Alexander III of Macedon (Ancient Greek: Ἀλέξανδρος, romanized: Alexandros; '
 '20/21 July 356 BC – 10/11 June 323 BC), commonly known as Alexander the '
 'Great, was a king of the ancient Greek kingdom of Macedon. He succeeded his '
 'father Philip II to the throne in 336 BC at the age of 20, and spent most of '
 'his ruling years conducting a lengthy military campaign throughout Western '
 'Asia and Egypt.')


In [53]:
# Retrieve the 15 best-matching documents for the query, pick the best passage
# with get_my_answer and pretty-print it.
query = "Mughal Empire" 
relevant_docs = retriever.retrieve(query=query, top_k=15)
answer = get_my_answer(query, relevant_docs)
print("Question : ",query, "\nAnswer : ")
pp.pprint(answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question :  Mughal Empire 
Answer : 
('The Mughal Empire was an early-modern Islamic empire that controlled much of '
 'South Asia between the 16th and 19th centuries. For some two hundred years, '
 'the empire stretched from the outer fringes of the Indus river basin in the '
 'west, northern Afghanistan in the northwest, and Kashmir in the north, to '
 'the highlands of present-day Assam and Bangladesh in the east, and the '
 'uplands of the Deccan Plateau in South India.')


In [54]:
# Retrieve the 15 best-matching documents for the query, pick the best passage
# with get_my_answer and pretty-print it.
query = "How did Ottoman empire end?" 
relevant_docs = retriever.retrieve(query=query, top_k=15)
answer = get_my_answer(query, relevant_docs)
print("Question : ",query, "\nAnswer : ")
pp.pprint(answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question :  How did Ottoman empire end? 
Answer : 
('The empire came to an end in the aftermath of its defeat in World War I, '
 'when its remaining territory was partitioned by the Allies. The sultanate '
 'was officially abolished by the Government of the Turkish Grand National '
 'Assembly in Ankara on 1 November 1922 following the Turkish War of '
 'Independence. Throughout its more than 600 years of existence, the Ottoman '
 'Empire has left a profound legacy in the Middle East and Southeast Europe, '
 'as can be seen in the customs, culture, and cuisine of the various countries '
 'that were once part of its realm.')


#Saving embedding files to drive

The cells below were used to zip the files and copy them from Colab to Google Drive, so that we don't have to create the embeddings every time and can simply reuse these embedding files from Google Drive.

In [14]:
# Recursively archive the /content/faiss/ directory into /content/faiss.zip.
# NOTE(review): entries are stored with the content/faiss/ prefix (see the listing printed
# below), so unpacking the archive recreates that nested path — confirm this matches the
# unzip step and the load paths used earlier in the notebook.
!zip -r /content/faiss.zip /content/faiss/

  adding: content/faiss/ (stored 0%)
  adding: content/faiss/faiss_index_file (deflated 7%)
  adding: content/faiss/faiss_index_file.json (deflated 15%)
  adding: content/faiss/faiss_document_store.db (deflated 73%)


In [15]:
import shutil
# Copy the archive to Google Drive. The source is the absolute path the zip cell wrote to,
# so the copy no longer depends on the current working directory being /content.
# NOTE(review): assumes Google Drive is already mounted at /content/drive and the target
# folder exists — no mount cell is visible in this notebook.
shutil.copy("/content/faiss.zip", "/content/drive/MyDrive/ColabData/nlp_project/faiss.zip")

'/content/drive/MyDrive/ColabData/nlp_project/faiss.zip'