## imports

In [None]:
import nltk
import gensim
import numpy as np
import wikipediaapi
import os, sys, io, re
import gensim.downloader as api
from gensim.models.keyedvectors import KeyedVectors
nltk.download('punkt')

# **Loading Data**

In [5]:
# Wikipedia page titles to index: four titles for each of five domains.
documents_titles = [
  # football clubs
  "FC Barcelona", "real madrid", "Manchester United", "Arsenal",
  # operating systems
  "windows", "linux", "Android operating system", "macos",
  # actors
  "Tom Hanks", "Robert De Niro", "Tom Cruise", "Keanu Reeves",
  # car brands
  "mercedes benz", "BMW", "lamborghini", "ferrari",
  # diseases
  "allergies", "Conjunctivitis", "covid-19", "Influenza",
]

In [6]:
# Fetch the English Wikipedia page for every title and keep an excerpt of its text.
wiki = wikipediaapi.Wikipedia('en')

documents = []  # one excerpt per entry of documents_titles, in the same order
for title in documents_titles:
  page = wiki.page(title)
  # keep only the first 3500 characters of the page text (roughly 570 words)
  documents.append(page.text[:3500])


## Functions

In [7]:
def normalization(document):
  '''
  Normalize a document: lower-case the text, delete digits and punctuation,
  and trim leading/trailing whitespace.

  Whitespace inside the text is left as is, and underscores are kept
  because the punctuation pattern treats them as word characters.

  Parameters:
        (document): The text we want to normalize.
  Returns:
        str:  the normalized text
  '''
  text = document.lower()

  # Apply the removals in order: runs of digits first, then every character
  # that is neither a word character nor whitespace.
  for pattern in (r'\d+', r'[^\w\s]'):
    text = re.sub(pattern, '', text)

  return text.strip()

In [8]:
def word_embedding(document, model):
  '''
  Create a word embedding for each known word in a document.

  The document is normalized with normalization(), tokenized with
  nltk.word_tokenize, and every token present in the model is looked up.
  Tokens the model does not know are skipped, so the result may be shorter
  than the token list (and may be empty).

  Parameters:
        (document): The document whose words we want to embed.
        (model): Pretrained model used to get the embeddings; it must support
                 `word in model` and `model[word]` (e.g. gensim KeyedVectors).
  Returns:
        list: the embedding vectors of the in-vocabulary words, in document order.
  '''
  normalized_document = normalization(document)
  tokens = nltk.word_tokenize(normalized_document)

  # Membership is tested on the model itself rather than on `model.vocab`:
  # the `vocab` attribute was removed from KeyedVectors in gensim 4.0, while
  # `word in model` works in both gensim 3.x and 4.x.
  return [model[word] for word in tokens if word in model]

In [9]:
def document_embedding(words_embeddings):
  '''
  Create an embedding representation for a document
  by averaging the word embedding vectors of the document.

  Parameters:
        (words_embeddings): a non-empty sequence of equal-length word embedding
                            vectors, one per word of the document.
  Returns:
        list: a list of Python floats, the element-wise mean of the word vectors.
  Raises:
        ValueError: if words_embeddings is empty (no vector to average).
  '''
  # len() is used instead of truthiness so a NumPy array argument is accepted too.
  if len(words_embeddings) == 0:
    raise ValueError("cannot build a document embedding from zero word embeddings")

  # Stack the vectors as rows and average down the columns in float64.
  matrix = np.asarray(words_embeddings, dtype=np.float64)

  # tolist() yields plain Python floats, so str() of the result stays a simple
  # "[0.1, 0.2, ...]" text regardless of how NumPy prints its own scalar types.
  return matrix.mean(axis=0).tolist()

In [36]:
def searching(user_input, documents, model, threshold=1.85):
  '''
  Performs the search operation for the user's search sentence by generating
  its embedding and then calculating the euclidean distance between the
  sentence and every saved document embedding.

  The document embeddings are read from the files "document <n> .txt"
  (n = 1..len(documents)) in the current working directory; each file is
  expected to hold the text form of a list of floats, e.g. "[0.1, 0.2]".
  Every document whose distance is below the threshold is printed, closest
  first, as: rank, first 500 characters of the document, blank line.

  Parameters:
        (user_input): The search sentence that we want to perform the search operation for.
        (documents): All the documents that we have in our search engine.
        (model): Pretrained model used to get the embedding of the user sentence.
        (threshold): Maximum euclidean distance for a document to be shown (default 1.85).
  Returns:
        None. Results are printed; nothing is printed when no word of the
        sentence is known to the model or no document is close enough.
  '''
  query_words = word_embedding(user_input, model)
  if len(query_words) == 0:
    # None of the query words has an embedding, so there is nothing to compare.
    return
  input_embedding = np.array(document_embedding(query_words))  # embedding of the user sentence

  # Read each saved document embedding (stored as text), convert it back to a
  # vector of floats, and measure its euclidean distance to the sentence embedding.
  distances = []
  for file_number in range(1, len(documents) + 1):
    with open("document %s .txt" % (str(file_number)), "r") as f:  # closed even if parsing fails
      string = f.read()
    string_list = string.strip('][').split(', ')
    doc_vector = np.array([float(element) for element in string_list])  # The embedding vector of the document
    distances.append(np.linalg.norm(doc_vector - input_embedding))

  # Rank document indices by distance without mutating `distances`, so every
  # index keeps pointing at its own document. The stable sort keeps equal
  # distances in document order.
  ranking = np.argsort(distances, kind="stable")

  for rank, doc_index in enumerate(ranking):
    if not distances[doc_index] < threshold:
      break  # distances are ascending: nothing further can qualify
    print(rank, documents[doc_index][:500], "\n")  # Display the search results

# **Download The Model**

In [None]:
# Show the names of all pretrained models listed by the gensim downloader
print(list(api.info()['models']))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [None]:
# Changing the download directory.
# The downloader's directory global is spelled `base_dir` in older gensim
# releases and `BASE_DIR` in newer ones (which also honour the GENSIM_DATA_DIR
# environment variable) - NOTE(review): confirm against the installed version.
# Setting only the lowercase name is silently ignored by the newer releases,
# so both are set here.
download_dir = "/content/drive/MyDrive"
gensim.downloader.base_dir = download_dir
gensim.downloader.BASE_DIR = download_dir
gensim.downloader.base_dir

'/content/drive/MyDrive'

In [None]:
# Download the pretrained word2vec model through the gensim downloader.
# return_path=True is passed, so `model` is expected to be the path of the
# downloaded file rather than loaded vectors; the vectors are loaded separately.
model = api.load('word2vec-google-news-300', return_path=True)



In [11]:
# Load the pretrained word vectors from the downloaded file (binary word2vec format)
model1 = KeyedVectors.load_word2vec_format(
  '/content/drive/MyDrive/word2vec-google-news-300/word2vec-google-news-300.gz',
  binary=True,
)

# **Training**

## Extracting words & documents embeddings 

In [12]:
# Create the final embedding representation of each document and save it in a
# file named "document <n> .txt" (n starts at 1), which searching() reads back.
for counter, document in enumerate(documents, start=1):
  words_embeddings = word_embedding(document, model1)   # embedding vector of each known word in the document
  doc_embedding = document_embedding(words_embeddings)  # document representation: average of the word vectors

  # Convert to plain Python floats before taking the text form: the reader
  # parses each entry with float(), and NumPy scalar types may be printed in a
  # form float() cannot parse (e.g. "np.float64(0.5)" with NumPy 2).
  serialized = str([float(value) for value in doc_embedding])

  # The context manager closes the file even if the write fails.
  with io.open("document %s .txt" % (str(counter)), "w") as f:
    f.write(serialized)


# **Testing**

## Performing search operations for different inputs in different domains

In [37]:
# First demo query (mentions a football club). searching() prints the saved
# documents whose embedding lies within its distance threshold of the query.
user_input1 = "Mohamed salah's goal in Manchester united"
print("______First search results______")
searching(user_input1, documents, model1)

______First search results______
0 Manchester United Football Club is a professional football club based in Old Trafford, Greater Manchester, England, that competes in the Premier League, the top flight of English football. Nicknamed the Red Devils, the club was founded as Newton Heath LYR Football Club in 1878, but changed its name to Manchester United in 1902. The club moved from Newton Heath to its current stadium, Old Trafford, in 1910.
Manchester United have won the joint-record number of trophies in English club football, i 

1 Real Madrid Club de Fútbol (Spanish pronunciation: [reˈal maˈðɾið ˈkluβ ðe ˈfuðβol] (listen), meaning Royal Madrid Football Club), commonly referred to as Real Madrid or simply Real, is a Spanish professional football club based in Madrid.
Founded on 6 March 1902 as Madrid Football Club, the club has traditionally worn a white home kit since inception. The honorific title real is Spanish for "royal" and was bestowed to the club by King Alfonso XIII in 1920

In [38]:
# Second demo query (a movie title). searching() prints the saved documents
# whose embedding lies within its distance threshold of the query.
user_input2 = "John Wick movie trailer"
print("______Second search results______")
searching(user_input2, documents, model1)

______Second search results______
0 Keanu Charles Reeves ( kee-AH-noo; born September 2, 1964) is a Canadian actor. Born in Beirut and raised in Toronto, Reeves began acting in theatre productions and in television films before making his feature film debut in Youngblood (1986). He had his breakthrough role in the science fiction comedy Bill & Ted's Excellent Adventure (1989), and he reprised his role in its sequels. He gained praise for playing a hustler in the independent drama My Own Private Idaho (1991), and established himsel 

1 Robert Anthony De Niro Jr. ( də NEER-oh, Italian: [de ˈniːro]; born August 17, 1943) is an American actor, producer, and director. He is particularly known for his nine collaborations with filmmaker Martin Scorsese, and is the recipient of various accolades, including two Academy Awards, a Golden Globe Award, the Cecil B. DeMille Award, and a Screen Actors Guild Life Achievement Award. In 2009, De Niro received the Kennedy Center Honor, and received a Pre

In [39]:
# Third demo query (mentions a disease). normalization() strips the digits and
# the hyphen, so "covid-19" is looked up as the single token "covid".
user_input3 = "covid-19 cases number"
print("______Third search results______")
searching(user_input3, documents, model1)

______Third search results______
0 Influenza, commonly known as "the flu", is an infectious disease caused by influenza viruses. Symptoms range from mild to severe and often include fever, runny nose, sore throat, muscle pain, headache, coughing, and fatigue. These symptoms begin from one to four days after exposure to the virus (typically two days) and last for about 2–8 days. Diarrhea and vomiting can occur, particularly in children. Influenza may progress to pneumonia, which can be caused by the virus or by a subsequent bacter 

1 Coronavirus disease 2019 (COVID-19) is a contagious disease caused by a virus, the severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). The first known case was identified in Wuhan, China, in December 2019. The disease has since spread worldwide, leading to the ongoing COVID-19 pandemic.Symptoms of COVID‑19 are variable, but often include fever, cough, headache, fatigue, breathing difficulties, loss of smell, and loss of taste. Symptoms may begin 