In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [3]:
paragraph = pd.read_excel('/content/paragraphs.xlsx')

Preprocessing the data

In [4]:
paragraph.columns = ['index','word','para']
paragraph

Unnamed: 0,index,word,para
0,25912.0,Zhejiang,"Zhejiang (help·info), formerly romanized as C..."
1,25913.0,Zhejiang,"Zhejiang (help·info), formerly romanized as C..."
2,25914.0,Zhejiang,"Zhejiang (help·info), formerly romanized as C..."
3,25915.0,Zhejiang,"Zhejiang (help·info), formerly romanized as C..."
4,25916.0,Zhejiang,"Zhejiang (help·info), formerly romanized as C..."
...,...,...,...
75050,7437.0,2008_Summer_Olympics_torch_relay,Two additional teams of 40 attendants each wil...
75051,1059.0,,Although the two displayed great respect and a...
75052,3509.0,,"On August 24, 1673, Dutch captain Anthonio Col..."
75053,78743.0,,Eton has a long list of distinguished former p...


In [5]:
list_of_words = paragraph['word'].unique()

In [6]:
list_of_words

array(['Zhejiang', 'YouTube', 'Yale_University', 'Xbox_360', 'Wood',
       'Windows_8', 'Westminster_Abbey', 'War_on_Terror', 'Virgil',
       'Videoconferencing', 'Valencia', 'Vacuum', 'USB', 'Uranium',
       'University_of_Notre_Dame', 'University_of_Kansas', 'University',
       'Universal_Studios', 'United_States_presidential_election,_2004',
       'United_States_Army', 'United_States_Air_Force',
       'United_Nations_Population_Fund', 'Unicode', 'Umayyad_Caliphate',
       'Tuvalu', 'Turner_Classic_Movies', 'Tucson,_Arizona',
       'Tristan_da_Cunha', 'Treaty', 'Translation',
       'To_Kill_a_Mockingbird', 'Time', 'Thuringia', 'Theme',
       'The_Sun_(United_Kingdom)',
       'The_Legend_of_Zelda:_Twilight_Princess', 'The_Bronx', 'The_Blitz',
       'Textual_criticism', 'Tennessee', 'Tajikistan', 'Szlachta',
       'Symbiosis', 'Switzerland', 'Swaziland', 'Supreme_court',
       'Super_Nintendo_Entertainment_System', 'Sumer', 'Strasbourg',
       'Steven_Spielberg', 'States

In [7]:
# Function that removes special characters from the text ie all characters except numbers and alphabets.
def replace_special_characters_with_space(x):
    return re.sub(r'[^a-zA-Z0-9]', ' ', str(x))

# Applying the function to the word column
paragraph['word'] = paragraph['word'].map(replace_special_characters_with_space)

In [8]:
replace_special_characters_with_space("New_York_City") # Testing the function

'New York City'

In [9]:
paragraph[:][paragraph['para'].isna() + paragraph['word'].isna()] # Checking for missing values

Unnamed: 0,index,word,para
27611,3523.0,New York City,
28030,4125.0,New York City,
33688,130301.0,Matter,
38083,4800.0,Kanye West,
50494,1388.0,Fr d ric Chopin,


In [10]:
paragraph_cleaned = paragraph.dropna()

In [11]:
paragraph_cleaned[:][paragraph_cleaned['para'].isna() + paragraph_cleaned['word'].isna()]

Unnamed: 0,index,word,para


In [12]:
paragraph_cleaned['word'].value_counts()

word
Queen Victoria                                    607
New York City                                     560
American Idol                                     525
Beyonc                                            523
Fr d ric Chopin                                   488
                                                 ... 
Animal                                             53
Letter case                                        42
Race and ethnicity in the United States Census     38
Pitch  music                                       22
nan                                                 4
Name: count, Length: 362, dtype: int64

In [13]:
paragraph_cleaned.loc[:,'word'] = paragraph_cleaned['word'].str.lower() # converting all words to lower case

In [14]:
paragraph_cleaned['word'].value_counts()

word
queen victoria                                    607
new york city                                     560
american idol                                     525
beyonc                                            523
fr d ric chopin                                   488
                                                 ... 
animal                                             53
letter case                                        42
race and ethnicity in the united states census     38
pitch  music                                       22
nan                                                 4
Name: count, Length: 362, dtype: int64

In [17]:
vectorizer = TfidfVectorizer( stop_words='english') # Creating a TfidfVectorizer object

In [18]:
document = paragraph_cleaned['para'].tolist() # Converting the paragraph column to a list

In [20]:
vectorizer.fit(document)

In [21]:
tfidf_vectors = vectorizer.transform(document) # Transforming the paragraphs to vectors

In [22]:
tfidf_vectors_word = vectorizer.transform(paragraph_cleaned['word']) # Transforming the words to vectors

In [23]:
feature_names = vectorizer.get_feature_names_out() # Getting the feature names

In [24]:
feature_names

array(['00', '000', '0000', ..., 'ﬁnding', 'ﬁve', '𐀞𐀊𐀍𐀚'], dtype=object)

Defining the working of Retriever

In [164]:
def Retriever(query):
    transformed_query = vectorizer.transform(query)
    transformed_query_array = transformed_query.toarray().squeeze()
    sorted_transformed_query = np.sort(transformed_query_array)
    similarity = list()
    indexes = list()
    x = -1
    while sorted_transformed_query[x] != 0:
        word = feature_names[np.where(transformed_query_array==sorted_transformed_query[x])[0][0]]

        index= paragraph_cleaned.index[paragraph_cleaned['word'].str.contains(word)].tolist()
        indexes.extend(index)
        for y in index:
            cosine_sim = cosine_similarity(tfidf_vectors[y], transformed_query)
            similarity.append(float(cosine_sim))

        x -=1
    return similarity,indexes


Function that will preprocess the input according to T5 transformer

In [166]:
def preprocess_context_question(context, question):
  # Concatenate retrieved passages and question with a delimiter (e.g., "[SEP]")
  combined_text = context + " [SEP] " + question
  # Tokenize the text using the T5 tokenizer
  input_ids = tokenizer.encode(combined_text, return_tensors="pt")
  return input_ids


In [178]:
# Takes question as the input , searches the context in paragraph_clean and formats the output according to input of T5 transformer
# Returns none if the question cannot be answered
def Query(question):
  similarity,indexes = Retriever([question])
  dict1 = {'similarity':similarity,'indexes':indexes}
  SimIndex = pd.DataFrame(dict1)
  SimIndex.drop_duplicates(inplace=True)
  SimIndex.sort_values(by = 'similarity',inplace= True,ascending =False)
  if len(SimIndex['indexes'])<5:
      return None
  index_of_context = list(SimIndex['indexes'][:5])
  context  = ""
  for x in index_of_context:
      context+=paragraph_cleaned['para'][x]
  print("Context: ",context)
  return preprocess_context_question(context, question)

In [176]:
# Load pre-trained T5 model and tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [183]:
def answer_using_t5(question):

  # Preprocess context and question
  input_ids = Query(question)
  if input_ids ==None:
    return "Sorry, cannot give answer, insufficient information"
  # Generate the answer using the model
  output = model.generate(input_ids)

  # Decode the generated answer tokens back to text
  answer = tokenizer.decode(output[0], skip_special_tokens=True)
  return answer
question = 'What is the native language of punjab'
answer = answer_using_t5(question)
print("Question:", question)
print("Answer:", answer) # Answer is "Sorry, cannot give answer, insufficient information" if the model cannot answer the question.

  similarity.append(float(cosine_sim))


Context:  The major and native language spoken in the Punjab is Punjabi (which is written in a Shahmukhi script in Pakistan) and Punjabis comprise the largest ethnic group in country. Punjabi is the provincial language of Punjab. There is not a single district in the province where Punjabi language is mother-tongue of less than 89% of population. The language is not given any official recognition in the Constitution of Pakistan at the national level. Punjabis themselves are a heterogeneous group comprising different tribes, clans (Urdu: برادری‎) and communities. In Pakistani Punjab these tribes have more to do with traditional occupations such as blacksmiths or artisans as opposed to rigid social stratifications. Punjabi dialects spoken in the province include Majhi (Standard), Saraiki and Hindko. Saraiki is mostly spoken in south Punjab, and Pashto, spoken in some parts of north west Punjab, especially in Attock District and Mianwali District.The Government of Punjab is a provincial g