#### Import libraries

In [2]:
import re
import random
import string

import nltk
import requests
import bs4 as bs

#### Creating the Corpus

In [3]:
response = requests.get('https://en.wikipedia.org/wiki/Pythagoras')
raw_html = response.text

article_html = bs.BeautifulSoup(raw_html, 'html')
article_paragraphs = article_html.find_all('p')

article_text = ''.join(p.text for p in article_paragraphs).lower()

In [4]:
print(article_text[:50])


pythagoras of samos[a] (c. 570 – c. 495 bc)[b] wa


#### Text Preprocessing


In [5]:
article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
article_text = re.sub(r'\s+', ' ', article_text)
print(article_text[:50])


 pythagoras of samos[a] (c. 570 – c. 495 bc)[b] wa


#### Divide to sentence


In [6]:
article_sentence = nltk.sent_tokenize(article_text)

for sentence in article_sentence[:3]:
    print(sentence)

 pythagoras of samos[a] (c. 570 – c. 495 bc)[b] was an ancient ionian greek philosopher and the eponymous founder of pythagoreanism.
his political and religious teachings were well known in magna graecia and influenced the philosophies of plato, aristotle, and, through them, western philosophy.
knowledge of his life is clouded by legend, but he appears to have been the son of mnesarchus, a gem-engraver on the island of samos.


#### Divide to words

In [7]:
article_words = nltk.word_tokenize(article_text)
print(article_words[:3])

['pythagoras', 'of', 'samos']


#### Lemmatization


In [8]:
wnlemmatizer = nltk.stem.WordNetLemmatizer()


def perform_lemmatization(tokens):
    return [wnlemmatizer.lemmatize(token) for token in tokens]


punctuation_removal = {
    ord(punctuation): None
    for punctuation in string.punctuation
}


def get_processed_text(document: str):
    return perform_lemmatization(nltk.word_tokenize(document.lower().translate(punctuation_removal)))


In [9]:
print(punctuation_removal)


{33: None, 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: None, 46: None, 47: None, 58: None, 59: None, 60: None, 61: None, 62: None, 63: None, 64: None, 91: None, 92: None, 93: None, 94: None, 95: None, 96: None, 123: None, 124: None, 125: None, 126: None}


#### Responding to Greetings


In [10]:
greeting_inputs =(
    'hey',
    'good morning',
    'good evening',
    'morning',
    'evening',
    'hi',
    'whatsup',
)

greeting_responses = [
    'hey',
    'hey hows you?',
    '*nods*',
    'hello, how you doing',
    'hello, how you doing',
    'hello',
    'you are welcome',
]

def generate_greeting_response(greeting: str) -> str:
    for token in greeting.split():
        if token.lower() in greeting_inputs:
            return random.choice(greeting_responses)


#### Responding to User Queries


In [11]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


In [12]:
def generate_response(user_input: str) -> str:
    sentences = [*article_sentence, user_input]

    word_vectorizer = TfidfVectorizer(
        tokenizer=get_processed_text,
        stop_words='english'
    )

    all_word_vectors = word_vectorizer.fit_transform(sentences)
    similarly_vector_values = cosine_similarity(all_word_vectors[-1], all_word_vectors)
    similar_sentence_number = similarly_vector_values.argsort()[0][-2]

    matched_vector = similarly_vector_values.flatten()
    matched_vector.sort()
    vector_matched = matched_vector[-2]

    if vector_matched == 0:
        return "I'm sorry, I could not understand you."
    else:
        return article_sentence[similar_sentence_number]




In [13]:
generate_response('Who Pythagoras was?')



"diogenes laërtius states that the same theano was pythagoras's pupil and that pythagoras's wife theano was her daughter."