# **Step-1: Build some functions for cleaning the input text file.**

## Import and install all dependencies

In [1]:
\import numpy as np
import pandas as pd
import string
import sys  
!{sys.executable} -m pip install contractions
import contractions # Used for splitting words like this --> (isn't -> is not,  I'll -> I will ... etc)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.72-py2.py3-none-any.whl (8.3 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 41.2 MB/s 
[?25hCollecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 56.8 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.72 pyahocorasick-1.4.4 textsearch-0.0.24


In [2]:
!pip install git+https://github.com/boudinfl/pke.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/boudinfl/pke.git
  Cloning https://github.com/boudinfl/pke.git to /tmp/pip-req-build-hercre_5
  Running command git clone -q https://github.com/boudinfl/pke.git /tmp/pip-req-build-hercre_5
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting unidecode
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 30.7 MB/s 
Building wheels for collected packages: pke, sklearn
  Building wheel for pke (setup.py) ... [?25l[?25hdone
  Created wheel for pke: filename=pke-2.0.0-py3-none-any.whl size=6160276 sha256=274c6071c8882680a04e58e7f86836e656af7b0a6d92a2096b5a8b857b580445
  Stored in directory: /tmp/pip-ephem-wheel-cache-x4as64nr/wheels/fa/b3/09/612ee93bf3ee4164bcd5783e742942cdfc892a86039d3e0a33
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filenam

In [3]:
# download the english model
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 12.5 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Prepare the cleaning functions

In [4]:
def convert_file_to_list_of_string(file_path):
  """ This function return a text file as list of lines.

  Input: The path of the text file. 
  """
  text_list = []
  with open(file_path) as file:
    for line in file:
      text_list.append(line)
  return text_list


def remove_blanks(text_list):
  """ This function return a text file as a list of
  cleaned file from any blank lines

  Input: List of lines which the text file consists of.  
  """
  # remove new lines
  for i in range(len(text_list)):
    text_list[i] = text_list[i].strip()
        
  # remove any new blank line appears
  for line in text_list:
      if line == '' :
          text_list.remove(line)
        
  return text_list



def remove_puncs(text_list):
  """ This function return a text file as a list of
  cleaned file from any punctuations expect these (".,')
  
  Input: List of lines which the text file consists of.
  """
  # remove un-needed punctuations
  puncs = string.punctuation
  for i in range(len(text_list)):
      for c in text_list[i] :
          if (c in puncs) and (c not in ".,'"):
              text_list[i] = text_list[i].replace(c, '')
  return text_list


def remove_nums(text_list):
    import re
    for i in range(len(text_list)):
        text_list[i] = re.sub(r"[0-9]", "", text_list[i])
    return text_list


def remove_arabic_chars(text_list):
    import re
    for i in range(len(text_list)):
        text_list[i] = re.sub(r"[أ-ي]", "", text_list[i])
    return text_list


def email_remover(text_list):
    for line in text_list:
        if ('.com' in line) or ('yahoo' in line) or ('gmail' in line) or ('hotmail' in line):
            text_list.remove(line)
    return text_list


def get_cleaned_textfile(text_list):
    return "".join(text_list)


def contraction(text):
    return contractions.fix(text)

## Cleaning pipeline (Combine all the prevoius cleaning functions in one another function **`cleaning_text_pipeline`**

In [5]:
def cleaning_text_pipeline():
  file_path = input("Enter the file path please: ")

  # Get a list of the file lines where each elemnt in this list is a line from original text file.
  convert_file_to_list_of_string(file_path)

  # Remove any blank lines (new lines) from the text file.
  text_list = convert_file_to_list_of_string(file_path)
  remove_blanks(text_list)

  # Remove the un-needed punctuations from the text file.
  text_list = remove_blanks(text_list)
  remove_puncs(text_list)

  # Remove the numeric alphabet from our text file.
  text_list = remove_puncs(text_list)
  remove_nums(text_list)

  # Remove any email from the text file.
  text_list = remove_nums(text_list)
  email_remover(text_list)

  # Remove all Arabic characters from the text file
  text_list = email_remover(text_list)
  remove_arabic_chars(text_list)

  # Get your cleand text file in the end ^_^.
  text_list = remove_arabic_chars(text_list)
  text = get_cleaned_textfile(text_list)

  # Apply contractions on the final cleaned text
  cleaned_text = contraction(text)

  return cleaned_text

## Call the pipeline function to apply the cleaning functions on the text file.



### First make sure that you in the right directory so you can access the file you want.

In [1]:
!ls

 nlp.txt   sample_data	'Skill_BOK (1).txt'


In [9]:
cleaned_text = cleaning_text_pipeline()
cleaned_text

Enter the file path please: nlp.txt




## Save the new clead text as `cleaned_file.txt`

In [10]:
with open("cleaned_file.txt", "w") as text_file:
  text_file.write(cleaned_text)

# **Step-2: Key-Phrase Extraction using PKE**

## Import and download all the dependencies

In [11]:
import pke

In [13]:
! pip install git+https://github.com/boudinfl/pke.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/boudinfl/pke.git
  Cloning https://github.com/boudinfl/pke.git to /tmp/pip-req-build-5hbtk2hf
  Running command git clone -q https://github.com/boudinfl/pke.git /tmp/pip-req-build-5hbtk2hf


In [14]:
# download the english model
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 32.6 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Graph-based models (Unsupervised)
- TopicRank
- PositionRank
- TextRank
- SingleRank

In [15]:
# This dictionary to save the outputs of each model (Model name as a key,  Model result(output) as a value of that key)
models_result = {}

In [16]:
# initialize keyphrase extraction model, here TopicRank
extractor = pke.unsupervised.TopicRank()

# load the content of the document, here document is expected to be a simple 
# test string and preprocessing is carried out using spacy
extractor.load_document(input=cleaned_text, language='en')

# keyphrase candidate selection, in the case of TopicRank: sequences of nouns
# and adjectives (i.e. `(Noun|Adj)*`)
extractor.candidate_selection()

# candidate weighting, in the case of TopicRank: using a random walk algorithm
extractor.candidate_weighting()

# N-best selection, keyphrases contains the 10 highest scored candidates as
# (keyphrase, score) tuples
TopicRank_keyphrases = extractor.get_n_best(n=10)

print(TopicRank_keyphrases)

models_result["TopicRank"] = TopicRank_keyphrases

[('digital images', 0.04599044170364394), ('computer vision', 0.04567273712019357), ('processing', 0.01951920555329544), ('highdimensional data', 0.018359624498325972), ('object detection', 0.01725226633436607), ('example', 0.01584093629375564), ('interdisciplinary scientific field', 0.012597091916390666), ('symbolic information', 0.012391077888434228), ('multiple cameras', 0.01108270825711632), ('applications', 0.010476031070117561)]


In [17]:
# initialize keyphrase extraction model, here PositionRank
extractor = pke.unsupervised.PositionRank()

# load the content of the document, here document is expected to be a simple 
# test string and preprocessing is carried out using spacy
extractor.load_document(input=cleaned_text, language='en')

# keyphrase candidate selection, in the case of TopicRank: sequences of nouns
# and adjectives (i.e. `(Noun|Adj)*`)
extractor.candidate_selection()

# candidate weighting, in the case of TopicRank: using a random walk algorithm
extractor.candidate_weighting()

# N-best selection, keyphrases contains the 10 highest scored candidates as
# (keyphrase, score) tuples
PositionRank_keyphrases = extractor.get_n_best(n=10)

print(PositionRank_keyphrases)

models_result["PositionRank"] = PositionRank_keyphrases

[('computer vision systems', 0.14429127559916083), ('computer vision method', 0.1370419878306412), ('computer vision tasks', 0.13694332318675898), ('medical computer vision', 0.13293092518049246), ('computer vision research', 0.13186836539204275), ('computer vision algorithms', 0.13099006576023844), ('computer stereo vision', 0.1289175754129693), ('general computer vision', 0.1286285238605553), ('computer vision', 0.12740220823237713), ('computer system', 0.0887109424368903)]


In [18]:
# initialize keyphrase extraction model, here TextRank
extractor = pke.unsupervised.TextRank()

# load the content of the document, here document is expected to be a simple 
# test string and preprocessing is carried out using spacy
extractor.load_document(input=cleaned_text, language='en')

# keyphrase candidate selection, in the case of TopicRank: sequences of nouns
# and adjectives (i.e. `(Noun|Adj)*`)
extractor.candidate_selection()

# candidate weighting, in the case of TopicRank: using a random walk algorithm
extractor.candidate_weighting()

# N-best selection, keyphrases contains the 10 highest scored candidates as
# (keyphrase, score) tuples
TextRank_keyphrases = extractor.get_n_best(n=10)

print(TextRank_keyphrases)


models_result["TextRank"] = TextRank_keyphrases

[('multiple d images', 0.045881541578231363), ('medical image processing', 0.04353947866784049), ('ordinary d image', 0.042557243015399204), ('digital image processing', 0.04181004937980934), ('d images', 0.04172101062042049), ('image formation process', 0.040159868436554974), ('image processing', 0.03880691204805195), ('image primitives such', 0.038155026705818135), ('image coordinate system', 0.03736858833681188), ('most imaging systems', 0.036816551088855516)]


In [19]:
# initialize keyphrase extraction model, here SingleRank
extractor = pke.unsupervised.SingleRank()

# load the content of the document, here document is expected to be a simple 
# test string and preprocessing is carried out using spacy
extractor.load_document(input=cleaned_text, language='en')

# keyphrase candidate selection, in the case of TopicRank: sequences of nouns
# and adjectives (i.e. `(Noun|Adj)*`)
extractor.candidate_selection()

# candidate weighting, in the case of TopicRank: using a random walk algorithm
extractor.candidate_weighting()

# N-best selection, keyphrases contains the 10 highest scored candidates as
# (keyphrase, score) tuples
SingleRank_keyphrases = extractor.get_n_best(n=10)

print(SingleRank_keyphrases)


models_result["SingleRank"] = SingleRank_keyphrases

[('many computer vision systems', 0.07162877502958037), ('several benchmark computer vision data sets', 0.0705102595229353), ('most computer vision systems', 0.06736149274866218), ('computer vision systems', 0.06591552663020925), ('medical image processing', 0.06182906179559852), ('typical computer vision tasks', 0.0605376525459545), ('computer vision method', 0.060504340852790384), ('digital image processing', 0.06023714628563341), ('computer vision tasks', 0.059286676637096955), ('image formation process', 0.05818507948557325)]


# Statistical models
- FirstPhrases
- TfIdf

In [20]:
extractor = pke.unsupervised.FirstPhrases()

# load the content of the document, here document is expected to be a simple 
# test string and preprocessing is carried out using spacy
extractor.load_document(input=cleaned_text, language='en')

# keyphrase candidate selection, in the case of TopicRank: sequences of nouns
# and adjectives (i.e. `(Noun|Adj)*`)
extractor.candidate_selection()

# candidate weighting, in the case of TopicRank: using a random walk algorithm
extractor.candidate_weighting()

# N-best selection, keyphrases contains the 10 highest scored candidates as
# (keyphrase, score) tuples
FirstPhrases_keyphrases = extractor.get_n_best(n=10)

print(FirstPhrases_keyphrases)

models_result["FirstPhrases"] = FirstPhrases_keyphrases

[('computer vision', 0), ('interdisciplinary scientific field', -4), ('computers', -11), ('highlevel understanding', -14), ('digital images', -17), ('videos', -20), ('perspective', -24), ('engineering', -26), ('automate tasks', -33), ('human visual system', -37)]


In [21]:
extractor = pke.unsupervised.TfIdf()

# load the content of the document, here document is expected to be a simple 
# test string and preprocessing is carried out using spacy
extractor.load_document(input=cleaned_text, language='en')

# keyphrase candidate selection, in the case of TopicRank: sequences of nouns
# and adjectives (i.e. `(Noun|Adj)*`)
extractor.candidate_selection()

# candidate weighting, in the case of TopicRank: using a random walk algorithm
extractor.candidate_weighting()

# N-best selection, keyphrases contains the 10 highest scored candidates as
# (keyphrase, score) tuples
TfIdf_keyphrases = extractor.get_n_best(n=10)

print(TfIdf_keyphrases)

models_result["TfIdf"] = TfIdf_keyphrases



[('computer vision', 430.7945454008961), ('images', 348.44895840452176), ('vision', 332.31411676475716), ('image data', 105.05845453025388), ('scene', 93.23836362026881), ('vision systems', 78.97899999016428), ('machine vision', 71.79909090014934), ('cameras', 68.16972950573458), ('robots', 56.13977724001671), ('vehicle', 52.12979315144409)]


## **From the previous models outputs i notice that the best results are --> `TopicRank` and `TFIDF`**

## Let's retreive the best keywords using count function

In [22]:
models_result

{'TopicRank': [('digital images', 0.04599044170364394),
  ('computer vision', 0.04567273712019357),
  ('processing', 0.01951920555329544),
  ('highdimensional data', 0.018359624498325972),
  ('object detection', 0.01725226633436607),
  ('example', 0.01584093629375564),
  ('interdisciplinary scientific field', 0.012597091916390666),
  ('symbolic information', 0.012391077888434228),
  ('multiple cameras', 0.01108270825711632),
  ('applications', 0.010476031070117561)],
 'PositionRank': [('computer vision systems', 0.14429127559916083),
  ('computer vision method', 0.1370419878306412),
  ('computer vision tasks', 0.13694332318675898),
  ('medical computer vision', 0.13293092518049246),
  ('computer vision research', 0.13186836539204275),
  ('computer vision algorithms', 0.13099006576023844),
  ('computer stereo vision', 0.1289175754129693),
  ('general computer vision', 0.1286285238605553),
  ('computer vision', 0.12740220823237713),
  ('computer system', 0.0887109424368903)],
 'TextRank'

### Make a list of all the models output keywords by iterating over the previous dictionary `models_results`

In [23]:
res_list = []
for model, res in models_result.items():
  for phrase in res:
    res_list.append(phrase[0]) # Take the first item only (Keyword)

## Each model results was **10** keyward. And I used **6** models so the length of the keywords list which i named `res_list` must be (6*10) = 60 keywords.

In [24]:
len(res_list)

60

## Build a dictionary that contains:
- Key: Each keyword outs from each model resluts.
- Value: Number of appears of it in all models results.

In [25]:
counts = {}
for result in res_list:
  counts[result] = res_list.count(result)

counts

{'digital images': 2,
 'computer vision': 4,
 'processing': 1,
 'highdimensional data': 1,
 'object detection': 1,
 'example': 1,
 'interdisciplinary scientific field': 2,
 'symbolic information': 1,
 'multiple cameras': 1,
 'applications': 1,
 'computer vision systems': 2,
 'computer vision method': 2,
 'computer vision tasks': 2,
 'medical computer vision': 1,
 'computer vision research': 1,
 'computer vision algorithms': 1,
 'computer stereo vision': 1,
 'general computer vision': 1,
 'computer system': 1,
 'multiple d images': 1,
 'medical image processing': 2,
 'ordinary d image': 1,
 'digital image processing': 2,
 'd images': 1,
 'image formation process': 2,
 'image processing': 1,
 'image primitives such': 1,
 'image coordinate system': 1,
 'most imaging systems': 1,
 'many computer vision systems': 1,
 'several benchmark computer vision data sets': 1,
 'most computer vision systems': 1,
 'typical computer vision tasks': 1,
 'computers': 1,
 'highlevel understanding': 1,
 'vid

## Filter the best keywords only which appears more than the others (>=2)

In [26]:
high_counts = {}
for (key, value) in counts.items():
  if value >= 2 :
    high_counts[key] = value
high_counts

{'digital images': 2,
 'computer vision': 4,
 'interdisciplinary scientific field': 2,
 'computer vision systems': 2,
 'computer vision method': 2,
 'computer vision tasks': 2,
 'medical image processing': 2,
 'digital image processing': 2,
 'image formation process': 2}



---



---



---



---



# **Step-3: Retrieve all the sentences that each keyword appeares in to build a `context` for each keyword**

## Import and download all the dependencies

In [27]:
! pip install flashtext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flashtext
  Downloading flashtext-2.7.tar.gz (14 kB)
Building wheels for collected packages: flashtext
  Building wheel for flashtext (setup.py) ... [?25l[?25hdone
  Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9309 sha256=a710bf3a25a2e5665ede275788c21a437a5e168547b82041f4b7f60346f501c3
  Stored in directory: /root/.cache/pip/wheels/cb/19/58/4e8fdd0009a7f89dbce3c18fff2e0d0fa201d5cdfd16f113b7
Successfully built flashtext
Installing collected packages: flashtext
Successfully installed flashtext-2.7


In [28]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Mapping each keyword with its sentences

In [29]:
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor

def tokenize_sentences(text):
    sentences = [sent_tokenize(text)]
    sentences = [y for x in sentences for y in x]
    # Remove any short sentences less than 20 letters.
    sentences = [sentence.strip() for sentence in sentences if len(sentence) > 20]
    return sentences

def get_sentences_for_keyword(keywords, sentences):
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        for key in keywords_found:
            keyword_sentences[key].append(sentence)

    for key in keyword_sentences.keys():
        values = keyword_sentences[key]
        values = sorted(values, key=len, reverse=True)
        keyword_sentences[key] = values
    return keyword_sentences

sentences = tokenize_sentences(cleaned_text)
filtered_keys = counts.keys()
keyword_sentence_mapping = get_sentences_for_keyword(filtered_keys, sentences)
        
print (keyword_sentence_mapping)

{'digital images': ['The technological discipline of computer vision seeks to apply its theories and models to the construction of computer vision systems.Subdomains of computer vision include scene reconstruction, object detection, event detection, video tracking, object recognition, D pose estimation, learning, indexing, motion estimation, visual servoing, D scene modeling, and image restoration.Computer vision is an interdisciplinary field that deals with how computers can be made to gain highlevel understanding from digital images or videos.', 'From the perspective of engineering, it seeks to understand and automate tasks that the human visual system can do.Computer vision tasks include methods for acquiring, processing, analyzing and understanding digital images, and extraction of highdimensional data from the real world in order to produce numerical or symbolic information, e.g.', 'Some examples of typical computer vision tasks are presented below.Computer vision tasks include me

## **Check**: Visualize the output sentences of the keyword **`computer vision`**

In [30]:
keyword_sentence_mapping['computer vision']

['The content can be specified in different ways, for example in terms of similarity relative a target image give me all images similar to image X by utilizing reverse image search techniques, or in terms of highlevel search criteria given as text input give me all images which contain many houses, are taken during winter, and have no cars in them.Computer vision for people counter purposes in public places, malls, shopping centresPose estimation – estimating the position or orientation of a specific object relative to the camera.',
 'The technological discipline of computer vision seeks to apply its theories and models to the construction of computer vision systems.Subdomains of computer vision include scene reconstruction, object detection, event detection, video tracking, object recognition, D pose estimation, learning, indexing, motion estimation, visual servoing, D scene modeling, and image restoration.Computer vision is an interdisciplinary field that deals with how computers can



---



---



---



---



# **Step-4: Question Generator example**

## Import and download all dependencies

In [16]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

In [17]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

In [18]:
!git clone https://github.com/amontgomerie/question_generator/

fatal: destination path 'question_generator' already exists and is not an empty directory.


In [19]:
%cd question_generator/
%load questiongenerator.py
from questiongenerator import QuestionGenerator
from questiongenerator import print_qa

/content/question_generator


In [20]:
import torch

In [21]:
qg = QuestionGenerator()

Downloading tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/850M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/413M [00:00<?, ?B/s]

In [None]:
with open('articles/indian_matchmaking.txt', 'r') as a:
    article = a.read()

In [None]:
qa_list = qg.generate(
    article, 
    num_questions=10, 
    answer_style='all'
)
print_qa(qa_list)

Generating questions...





Evaluating QA pairs...

1) Q: What would have been offended if Sima Aunty was woke?
   A: In fact, I would have been offended if Sima Aunty was woke and spoke about choice, body positivity and clean energy during matchmaking.

2) Q: How many websites have joined the hunt?
   A: Over the years, thousands of professional matchmakers and hundreds of matrimonial websites have joined the hunt.

3) Q: What does she say about women?
   A: She also regularly comments on their appearance, including one instance where she describes a woman as "not photogenic".

4) Q: How many items of clothing does she have?
   A: Ms Taparia, who's in her 50s and like a genial "aunty" to her clients, takes us through living rooms that resemble lobbies of posh hotels and custom-made closets filled with dozens of shoes and hundreds of items of clothing.

5) Q: What is the role of the family priests in Indian Matchmaking?
   A: Traditionally, matchmaking has been the job of family priests, relatives and neighbourho

# **Instead of applying all the previous steps on text file that manually collected, Now we will scrap a `Wikipedia` article and apply all the prevoius steps again on it.**

## Use Wikipedia module in python to scrap some text

In [7]:
!pip install wikipedia-api

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
import wikipediaapi

## Search on Wikipedia for "Machine Learning" page title, Then scrap its text

In [9]:
wiki_wiki = wikipediaapi.Wikipedia('en')
page_py = wiki_wiki.page('Machine learning')

In [10]:
new_txt = page_py.text

%cd ..

/content


In [11]:
!ls

cleaned_file.txt  nlp.txt	       question_generator
ml.txt		  question_generation  sample_data


### Save the scrapped page text as 'ml.txt'

In [12]:
with open("ml.txt", "w") as text_file:
  text_file.write(new_txt)

## **Step-1 again: Build some functions for cleaning the input text file**

In [2]:
def convert_file_to_list_of_string(file_path):
  """ This function return a text file as list of lines.

  Input: The path of the text file. 
  """
  text_list = []
  with open(file_path) as file:
    for line in file:
      text_list.append(line)
  return text_list


def remove_blanks(text_list):
  """ This function return a text file as a list of
  cleaned file from any blank lines

  Input: List of lines which the text file consists of.  
  """
  # remove new lines
  for i in range(len(text_list)):
    text_list[i] = text_list[i].strip()
        
  # remove any new blank line appears
  for line in text_list:
      if line == '' :
          text_list.remove(line)
        
  return text_list



def remove_puncs(text_list):
  """ This function return a text file as a list of
  cleaned file from any punctuations expect these (".,')
  
  Input: List of lines which the text file consists of.
  """
  # remove un-needed punctuations
  import string
  puncs = string.punctuation
  for i in range(len(text_list)):
      for c in text_list[i] :
          if (c in puncs) and (c not in ".,'"):
              text_list[i] = text_list[i].replace(c, '')
  return text_list


def remove_nums(text_list):
    import re
    for i in range(len(text_list)):
        text_list[i] = re.sub(r"[0-9]", "", text_list[i])
    return text_list


def remove_arabic_chars(text_list):
    import re
    for i in range(len(text_list)):
        text_list[i] = re.sub(r"[أ-ي]", "", text_list[i])
    return text_list


def email_remover(text_list):
    for line in text_list:
        if ('.com' in line) or ('yahoo' in line) or ('gmail' in line) or ('hotmail' in line):
            text_list.remove(line)
    return text_list


def get_cleaned_textfile(text_list):
    return "".join(text_list)


def contraction(text):
  import contractions
  return contractions.fix(text)

## Use the previous cleaning_text_pipeline function that call all the cleaning functions together.

In [3]:
def cleaning_text_pipeline():
  file_path = input("Enter the file path please: ")

  # Get a list of the file lines where each elemnt in this list is a line from original text file.
  convert_file_to_list_of_string(file_path)

  # Remove any blank lines (new lines) from the text file.
  text_list = convert_file_to_list_of_string(file_path)
  remove_blanks(text_list)

  # Remove the un-needed punctuations from the text file.
  text_list = remove_blanks(text_list)
  remove_puncs(text_list)

  # Remove the numeric alphabet from our text file.
  text_list = remove_puncs(text_list)
  remove_nums(text_list)

  # Remove any email from the text file.
  text_list = remove_nums(text_list)
  email_remover(text_list)

  # Remove all Arabic characters from the text file
  text_list = email_remover(text_list)
  remove_arabic_chars(text_list)

  # Get your cleand text file in the end ^_^.
  text_list = remove_arabic_chars(text_list)
  text = get_cleaned_textfile(text_list)

  # Apply contractions on the final cleaned text
  cleaned_text = contraction(text)

  return cleaned_text

In [5]:
cleand_txt = cleaning_text_pipeline()

Enter the file path please: nlp.txt


In [6]:
cleand_txt



## **Step-2 again: Key-Phrase Extraction using PKE**

## But here I'll use **`TopicRank**` and **`TfIdf`** only.

In [7]:
import pke

### Retrieve the best keywords and its appearence sebtences again

In [8]:
models_result = {}

In [9]:
# initialize keyphrase extraction model, here TopicRank
extractor = pke.unsupervised.TopicRank()

# load the content of the document, here document is expected to be a simple 
# test string and preprocessing is carried out using spacy
extractor.load_document(input=cleand_txt, language='en')

# keyphrase candidate selection, in the case of TopicRank: sequences of nouns
# and adjectives (i.e. `(Noun|Adj)*`)
extractor.candidate_selection()

# candidate weighting, in the case of TopicRank: using a random walk algorithm
extractor.candidate_weighting()

# N-best selection, keyphrases contains the 10 highest scored candidates as
# (keyphrase, score) tuples
TopicRank_keyphrases = extractor.get_n_best(n=10)

print(TopicRank_keyphrases)

models_result["TopicRank"] = TopicRank_keyphrases

models_result

[('digital images', 0.04599044170364394), ('computer vision', 0.04567273712019357), ('processing', 0.01951920555329544), ('highdimensional data', 0.018359624498325972), ('object detection', 0.01725226633436607), ('example', 0.01584093629375564), ('interdisciplinary scientific field', 0.012597091916390666), ('symbolic information', 0.012391077888434228), ('multiple cameras', 0.01108270825711632), ('applications', 0.010476031070117561)]


{'TopicRank': [('digital images', 0.04599044170364394),
  ('computer vision', 0.04567273712019357),
  ('processing', 0.01951920555329544),
  ('highdimensional data', 0.018359624498325972),
  ('object detection', 0.01725226633436607),
  ('example', 0.01584093629375564),
  ('interdisciplinary scientific field', 0.012597091916390666),
  ('symbolic information', 0.012391077888434228),
  ('multiple cameras', 0.01108270825711632),
  ('applications', 0.010476031070117561)]}

In [10]:
extractor = pke.unsupervised.TfIdf()

# load the content of the document, here document is expected to be a simple 
# test string and preprocessing is carried out using spacy
extractor.load_document(input=cleand_txt, language='en')

# keyphrase candidate selection, in the case of TopicRank: sequences of nouns
# and adjectives (i.e. `(Noun|Adj)*`)
extractor.candidate_selection()

# candidate weighting, in the case of TopicRank: using a random walk algorithm
extractor.candidate_weighting()

# N-best selection, keyphrases contains the 10 highest scored candidates as
# (keyphrase, score) tuples
TfIdf_keyphrases = extractor.get_n_best(n=10)

print(TfIdf_keyphrases)

models_result["TfIdf"] = TfIdf_keyphrases

models_result



[('computer vision', 430.7945454008961), ('images', 348.44895840452176), ('vision', 332.31411676475716), ('image data', 105.05845453025388), ('scene', 93.23836362026881), ('vision systems', 78.97899999016428), ('machine vision', 71.79909090014934), ('cameras', 68.16972950573458), ('robots', 56.13977724001671), ('vehicle', 52.12979315144409)]


{'TopicRank': [('digital images', 0.04599044170364394),
  ('computer vision', 0.04567273712019357),
  ('processing', 0.01951920555329544),
  ('highdimensional data', 0.018359624498325972),
  ('object detection', 0.01725226633436607),
  ('example', 0.01584093629375564),
  ('interdisciplinary scientific field', 0.012597091916390666),
  ('symbolic information', 0.012391077888434228),
  ('multiple cameras', 0.01108270825711632),
  ('applications', 0.010476031070117561)],
 'TfIdf': [('computer vision', 430.7945454008961),
  ('images', 348.44895840452176),
  ('vision', 332.31411676475716),
  ('image data', 105.05845453025388),
  ('scene', 93.23836362026881),
  ('vision systems', 78.97899999016428),
  ('machine vision', 71.79909090014934),
  ('cameras', 68.16972950573458),
  ('robots', 56.13977724001671),
  ('vehicle', 52.12979315144409)]}

In [11]:
res_list = []
for model, res in models_result.items():
  for phrase in res:
    res_list.append(phrase[0])

len(res_list)

20

In [12]:
counts = {}
for result in res_list:
  counts[result] = res_list.count(result)

counts

{'digital images': 1,
 'computer vision': 2,
 'processing': 1,
 'highdimensional data': 1,
 'object detection': 1,
 'example': 1,
 'interdisciplinary scientific field': 1,
 'symbolic information': 1,
 'multiple cameras': 1,
 'applications': 1,
 'images': 1,
 'vision': 1,
 'image data': 1,
 'scene': 1,
 'vision systems': 1,
 'machine vision': 1,
 'cameras': 1,
 'robots': 1,
 'vehicle': 1}

In [13]:
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor

def tokenize_sentences(text):
    sentences = [sent_tokenize(text)]
    sentences = [y for x in sentences for y in x]
    # Remove any short sentences less than 20 letters.
    sentences = [sentence.strip() for sentence in sentences if len(sentence) > 20]
    return sentences

def get_sentences_for_keyword(keywords, sentences):
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        for key in keywords_found:
            keyword_sentences[key].append(sentence)

    for key in keyword_sentences.keys():
        values = keyword_sentences[key]
        values = sorted(values, key=len, reverse=True)
        keyword_sentences[key] = values
    return keyword_sentences

sentences = tokenize_sentences(cleand_txt)
filtered_keys = counts.keys()
keyword_sentence_mapping = get_sentences_for_keyword(filtered_keys, sentences)
        
print (keyword_sentence_mapping)

{'digital images': ['The technological discipline of computer vision seeks to apply its theories and models to the construction of computer vision systems.Subdomains of computer vision include scene reconstruction, object detection, event detection, video tracking, object recognition, D pose estimation, learning, indexing, motion estimation, visual servoing, D scene modeling, and image restoration.Computer vision is an interdisciplinary field that deals with how computers can be made to gain highlevel understanding from digital images or videos.', 'From the perspective of engineering, it seeks to understand and automate tasks that the human visual system can do.Computer vision tasks include methods for acquiring, processing, analyzing and understanding digital images, and extraction of highdimensional data from the real world in order to produce numerical or symbolic information, e.g.', 'Some examples of typical computer vision tasks are presented below.Computer vision tasks include me

## This **`description_of_each_keyword`** will contain each keyword sentences join with each other to become a text(string) of each keyword sentences.

In [14]:
description_of_each_keyword = []

## **Step-3: Generate Questions**

In [22]:
questions = []

for key in keyword_sentence_mapping:
  sentences_list = keyword_sentence_mapping[key] # save each list of sentences for each keyword.
  article = "".join(sentences_list) # article --> contains all the sentences joined with each other to creat a context.
  description_of_each_keyword.append(article)
  article_text = "<answer>" + key + "<x\context>" + article  # prepare input for the question generation model.
  qa_list = qg.generate(article_text, num_questions=5, answer_style='all')
  questions.append(qa_list) # append all questions generated in (questions) list.

Generating questions...





Evaluating QA pairs...



Token indices sequence length is longer than the specified maximum sequence length for this model (3515 > 512). Running this sequence through the model will result in indexing errors


Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

('\nWas only able to generate 3 questions.', 'For more questions, please input a longer text.')
Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

('\nWas only able to generate 1 questions.', 'For more questions, please input a longer text.')
Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

('\nWas only able to generate 2 questions.', 'For more questions, please input a longer text.')
Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

('\nWas only able to gener

In [23]:
ques_per_keyword = [] # list of lists (each keyword unique questions)
for keyword_ques_list in questions:
  lst = []
  for dict_quest_and_ans in keyword_ques_list:
    lst.append(dict_quest_and_ans["question"])
  s = set(lst) # Remove any duplicated questions.
  unique_list = list(s)
  ques_per_keyword.append(unique_list)
  lst.clear()

## Now take only the first question for each context so you can build a DataFrame

In [24]:
first_question = []
for q_lst in ques_per_keyword:
  first_question.append(q_lst[0])

# Make a DataFrame 

In [25]:
# import pandas library
import pandas as pd
  
# dictionary with list object in values
details = {
    'Keyword' : res_list[:19],
    'Context' : description_of_each_keyword[:19],
    'Question' : first_question,
}
  
# creating a Dataframe object 
df = pd.DataFrame(details)

In [26]:
df.head()

Unnamed: 0,Keyword,Context,Question
0,digital images,The technological discipline of computer visio...,What are the types of computer vision tasks?
1,computer vision,The technological discipline of computer visio...,What is the definition of computer vision?
2,processing,Recent advances in deep learning has enabled r...,What is the role of the input to a device for ...
3,highdimensional data,Recent advances in deep learning has enabled r...,What is the definition of computer vision?
4,object detection,"From the perspective of engineering, it seeks ...",What is the definition of computer vision?


In [28]:
df.to_csv(r'/content/context_ques.csv')



---



---



---



---




# **Question Answering**

## **Haystack Model**

### Import and download all dependencies

In [57]:
# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

# Install the latest main of Haystack
!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-22.2.2-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 31.7 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting farm-haystack[colab]
  Cloning https://github.com/deepset-ai/haystack.git to /tmp/pip-install-hy200r6h/farm-haystack_bed0fb3adff9460eb248972712eb6abd
  Running command git clone --filter=blob:none --quiet https://github.com/deepset-ai/haystack.git /tmp/pip-install-hy200r6h/farm-haystack_bed0fb3adff9460eb248972712eb6abd
  Resolved https://github.com/deepset-ai/haystack.git to commit 938e6fda5b686ec49c52cb23f786a74d9321e048
  Installing build d

In [29]:
# Option 2: In Colab / No Docker environments: Start Elasticsearch from source
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT

es_server = Popen(
    ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1)  # as daemon
)
# wait until ES has started
! sleep 30

In [30]:
from haystack.document_stores import ElasticsearchDocumentStore
docs = [{"content": cleand_txt}]

# Initialize document store and write in the documents
document_store = ElasticsearchDocumentStore()
document_store.write_documents(docs)

In [31]:
# An in-memory TfidfRetriever based on Pandas dataframes
from haystack.nodes import TfidfRetriever

retriever = TfidfRetriever(document_store=document_store)

In [32]:
from haystack.nodes import FARMReader


# Load a  local model or any of the QA models on
# Hugging Face's model hub (https://huggingface.co/models)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

Downloading config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/473M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [33]:
from haystack.pipelines import ExtractiveQAPipeline

pipe = ExtractiveQAPipeline(reader, retriever)

### Haystack model output

In [38]:
# You can configure how many candidates the reader and retriever shall return
# The higher top_k for retriever, the better (but also the slower) your answers.
import pandas
data = pd.read_csv("context_ques.csv")
for question in data["Question"]:
  prediction = pipe.run(query= question, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}})
  print(prediction)

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.47 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.59 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.56 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.58 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.58 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.56 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.58 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.56 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.57 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.54 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.53 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.56 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.56 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.55 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.54 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.52 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.54 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.52 Batches/s]




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.53 Batches/s]






# This NLP Pipeline should be after T5-HuggingFace but due to restart run_time i moved it at the end of this Notebook

# Using Ready nlp Pipeline

## Import and download all the dependencies

In [60]:
!pip install -U transformers==3.0.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==3.0.0
  Downloading transformers-3.0.0-py3-none-any.whl (754 kB)
[K     |████████████████████████████████| 754 kB 31.3 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 58.4 MB/s 
Collecting tokenizers==0.8.0-rc4
  Downloading tokenizers-0.8.0rc4-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 65.2 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=6c954bd99c8a1273874e2e4258e52c488fefab27895fa2ce78eb2e534f1e435c
  Stored in directory: /root/.cache/pip/wheels/87/39/dd/a83eeef36d0bf98e7a4d1933a4ad2d660295a40613079bafc9
Successfully built sacremoses
Installing collected packages: 

In [55]:
!python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [56]:
!git clone https://github.com/patil-suraj/question_generation.git

Cloning into 'question_generation'...
remote: Enumerating objects: 268, done.[K
Receiving objects:   0% (1/268)   Receiving objects:   1% (3/268)   Receiving objects:   2% (6/268)   Receiving objects:   3% (9/268)   Receiving objects:   4% (11/268)   Receiving objects:   5% (14/268)   Receiving objects:   6% (17/268)   Receiving objects:   7% (19/268)   Receiving objects:   8% (22/268)   Receiving objects:   9% (25/268)   Receiving objects:  10% (27/268)   Receiving objects:  11% (30/268)   Receiving objects:  12% (33/268)   Receiving objects:  13% (35/268)   Receiving objects:  14% (38/268)   Receiving objects:  15% (41/268)   Receiving objects:  16% (43/268)   Receiving objects:  17% (46/268)   Receiving objects:  18% (49/268)   Receiving objects:  19% (51/268)   Receiving objects:  20% (54/268)   Receiving objects:  21% (57/268)   Receiving objects:  22% (59/268)   Receiving objects:  23% (62/268)   Receiving objects:  24% (65/268)   Receiving objects:  25%

In [57]:
%cd question_generation

/content/question_generator/question_generation


In [58]:
from pipelines import pipeline

In [59]:
nlp = pipeline("question-generation")

In [74]:
description_of_each_keyword[5]

'The technological discipline of computer vision seeks to apply its theories and models to the construction of computer vision systems.Subdomains of computer vision include scene reconstruction, object detection, event detection, video tracking, object recognition, D pose estimation, learning, indexing, motion estimation, visual servoing, D scene modeling, and image restoration.Computer vision is an interdisciplinary field that deals with how computers can be made to gain highlevel understanding from digital images or videos.'

### I use try & except to handle any error due to this nlp pipe line may can not generate questions so the program will stop ! SO i should handle this error.

In [79]:
for i in range(len(description_of_each_keyword)):
  try:
    print(nlp(description_of_each_keyword[i]))
    print()
    print("##################################################################################")
    print()
  except:
    print("Can Not Generate Questions From This Text Sentence !!!")
    print()
    print("##################################################################################")
    print()

Can Not Generate Questions From This Text Sentence !!!

##################################################################################

Can Not Generate Questions From This Text Sentence !!!

##################################################################################

[]

##################################################################################

[{'answer': 'deep learning', 'question': 'What has enabled researchers to build models that are able to generate and reconstruct D shapes from single or multiview depth maps or silhouettes seamlessly and efficiently?'}]

##################################################################################

[{'answer': 'engineering', 'question': 'From what perspective does a computer vision task seek to understand and automate tasks that the human visual system can do?'}]

##################################################################################

[{'answer': 'computer vision', 'question': 'What technological discipline 