In [None]:
from sklearn.feature_extraction.text import CountVectorizer
X = ("Computers can analyze text",
 "They do it using vectors and matrices",
 "Computers can process massive amounts of text data")
vectorizer=CountVectorizer(stop_words='english')
X_vectorized=vectorizer.fit_transform(X)

In [None]:
print(vectorizer.vocabulary_)


{'computers': 2, 'analyze': 1, 'text': 7, 'using': 8, 'vectors': 9, 'matrices': 5, 'process': 6, 'massive': 4, 'amounts': 0, 'data': 3}


In [None]:
print(X_vectorized.todense())

[[0 1 1 0 0 0 0 1 0 0]
 [0 0 0 0 0 1 0 0 1 1]
 [1 0 1 1 1 0 1 1 0 0]]


Exploring the bag of words architecture

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import numpy as np


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
sentences = ["We are reading about Natural Language Processing Here",
"Natural Language Processing making computers comprehend language data",
"The field of Natural Language Processing is evolving everyday"]

In [None]:
corpus=pd.Series(sentences)
corpus


0    We are reading about Natural Language Processi...
1    Natural Language Processing making computers c...
2    The field of Natural Language Processing is ev...
dtype: object

In [None]:
def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)

    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process

    Output : Returns the cleaned text corpus

    '''
    cleaned_corpus = pd.Series()
    for row in corpus:
        qs = []
        for word in row.split():
            if word not in keep_list:
                p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
                p1 = p1.lower()
                qs.append(p1)
            else : qs.append(word)
        cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
    return cleaned_corpus

In [None]:
def stopwords_removal(corpus):
    wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
    stop = set(stopwords.words('english'))
    for word in wh_words:
        stop.remove(word)
    corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    return corpus

In [None]:
def lemmatize(corpus):
    lem = WordNetLemmatizer()
    corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    return corpus

In [None]:
def stem(corpus, stem_type = None):
    if stem_type=='snowball':
      stemmer=SnowballStemmer(lanaguage='english')
      corpus=[[stemmer.stem(x) for x in x ] for x in corpus]
    else:
      stemmer=PorterStemmer()
      corpus=[[stemmer.stem(x) for x in x] for x in corpus]


In [None]:
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)

    Input :
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer

    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together

    Output : Returns the processed text corpus

    '''

    if cleaning == True:
        corpus = text_clean(corpus, keep_list)

    if remove_stopwords == True:
        corpus = stopwords_removal(corpus)
    else :
        corpus = [[x for x in x.split()] for x in corpus]

    if lemmatization == True:
        corpus = lemmatize(corpus)


    if stemming == True:
        corpus = stem(corpus, stem_type)

    corpus = [' '.join(x) for x in corpus]

    return corpus

In [None]:
common_dot_words = ['U.S.', 'Mr.', 'Mrs.', 'D.C.']

In [None]:
# Preprocessing with Lemmatization here
preprocessed_corpus = preprocess(corpus, keep_list = common_dot_words, stemming = False, stem_type = None,
                                lemmatization = True, remove_stopwords = True)
preprocessed_corpus

  cleaned_corpus = pd.Series()
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))


['read natural language process',
 'natural language process make computers comprehend language data',
 'field natural language process evolve everyday']

Building the vocabulary


In [None]:
set_of_words = set()
for sentence in preprocessed_corpus:
    for word in sentence.split():
        set_of_words.add(word)
vocab = list(set_of_words)
print(vocab)

['language', 'natural', 'computers', 'read', 'process', 'everyday', 'field', 'evolve', 'comprehend', 'data', 'make']


In [None]:
position = {}
for i, token in enumerate(vocab):
    position[token] = i
print(position)

{'language': 0, 'natural': 1, 'computers': 2, 'read': 3, 'process': 4, 'everyday': 5, 'field': 6, 'evolve': 7, 'comprehend': 8, 'data': 9, 'make': 10}


Creating a bow matrix

In [None]:
bow_matrix = np.zeros((len(preprocessed_corpus), len(vocab)))

In [None]:
for i, preprocessed_sentence in enumerate(preprocessed_corpus):
    for token in preprocessed_sentence.split():
        bow_matrix[i][position[token]] = bow_matrix[i][position[token]] + 1

In [None]:
bow_matrix

array([[1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0.],
       [2., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1.],
       [1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 0.]])

Using Countvectorizer for all the above process

In [None]:
vectorizer=CountVectorizer()
bow_matrix=vectorizer.fit_transform(preprocessed_corpus)

In [None]:
bow_matrix

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [None]:
print(vectorizer.get_feature_names_out())
bow_matrix.toarray()


['comprehend' 'computers' 'data' 'everyday' 'evolve' 'field' 'language'
 'make' 'natural' 'process' 'read']


array([[0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1],
       [1, 1, 1, 0, 0, 0, 2, 1, 1, 1, 0],
       [0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0]])

In [None]:
vectorizer_ngram_range = CountVectorizer(analyzer='word',ngram_range=(1,3))
bow_matrix_ngram =vectorizer_ngram_range.fit_transform(preprocessed_corpus)
print(vectorizer_ngram_range.get_feature_names_out())
print(bow_matrix_ngram.toarray())

['comprehend' 'comprehend language' 'comprehend language data' 'computers'
 'computers comprehend' 'computers comprehend language' 'data' 'everyday'
 'evolve' 'evolve everyday' 'field' 'field natural'
 'field natural language' 'language' 'language data' 'language process'
 'language process evolve' 'language process make' 'make' 'make computers'
 'make computers comprehend' 'natural' 'natural language'
 'natural language process' 'process' 'process evolve'
 'process evolve everyday' 'process make' 'process make computers' 'read'
 'read natural' 'read natural language']
[[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1]
 [1 1 1 1 1 1 1 0 0 0 0 0 0 2 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 0 0 0]
 [0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0]]


Max features in countvectorizer helps us to keep a cap on the number of features that can be used. Keep in mind that increasing the number of dimension can lead to overfitting which is also known as the curse of dimensionality

In [None]:
vectorizer_ngram_range = CountVectorizer(analyzer='word',ngram_range=(1,3),max_features=6)
bow_matrix_ngram =vectorizer_ngram_range.fit_transform(preprocessed_corpus)
print(vectorizer_ngram_range.get_feature_names_out())
print(bow_matrix_ngram.toarray())

['language' 'language process' 'natural' 'natural language'
 'natural language process' 'process']
[[1 1 1 1 1 1]
 [2 1 1 1 1 1]
 [1 1 1 1 1 1]]


Using max_df and min_fdf which will ignore the number of words whose frequency is higher then the max_df as well has ignore the words that occurs in lesser amount then min_df


Issue with the bag of words approach


*   OOV problem
*   Cannot capture the semantic meaning behind the sentences



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(preprocessed_corpus)


In [None]:
print(vectorizer.get_feature_names_out())
print(tf_idf_matrix.toarray())
print("\nThe shape of the TF-IDF matrix is: ", tf_idf_matrix.shape)


['comprehend' 'computers' 'data' 'everyday' 'evolve' 'field' 'language'
 'make' 'natural' 'process' 'read']
[[0.         0.         0.         0.         0.         0.
  0.41285857 0.         0.41285857 0.41285857 0.69903033]
 [0.40512186 0.40512186 0.40512186 0.         0.         0.
  0.478543   0.40512186 0.2392715  0.2392715  0.        ]
 [0.         0.         0.         0.49711994 0.49711994 0.49711994
  0.29360705 0.         0.29360705 0.29360705 0.        ]]

The shape of the TF-IDF matrix is:  (3, 11)


Above code demonstrate the affect using L1 norm and L2 norm. The sum of all the mean absolute value of the vectors for that particular document should be 1 likewise the sum of all the means square value of the vector for the document should be 1 in L2 norm. By default tfidf uses L2 norm. While there is no difference in the vectorization process the value assigned to those tokens would be different for those two methods

While tf-idf is computationally fast and provides a way to assign weights to the words that are few amount of times it still fails to address following issues:


*   Slow for large vocabulary
*   cannot represent the semantics meaning of the sentences



**Cosine similarity**
If two words have similar meaning then their magnitude and direction shoulde be similar. We can calculate consine similarity which gives values from -1 to +1
+1 indicates that the varibales are prefectly similar and -1 indicates they are completely opposite.

In [None]:
def cosine_similarity(vector1, vector2):
  vector1=np.array(vector1)
  vector2=np.array(vector2)
  return np.dot(vector1, vector2) / (np.sqrt(np.sum(vector1**2)) * np.sqrt(np.sum(vector2**2)))


In [None]:
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(preprocessed_corpus)

In [None]:
print(vectorizer.get_feature_names_out())
print(bow_matrix.toarray())

['comprehend' 'computers' 'data' 'everyday' 'evolve' 'field' 'language'
 'make' 'natural' 'process' 'read']
[[0 0 0 0 0 0 1 0 1 1 1]
 [1 1 1 0 0 0 2 1 1 1 0]
 [0 0 0 1 1 1 1 0 1 1 0]]


In [None]:
for i in range(bow_matrix.shape[0]):
    for j in range(i + 1, bow_matrix.shape[0]):
        print("The cosine similarity between the documents ", i, "and", j, "is: ",
              cosine_similarity(bow_matrix.toarray()[i], bow_matrix.toarray()[j]))

The cosine similarity between the documents  0 and 1 is:  0.6324555320336759
The cosine similarity between the documents  0 and 2 is:  0.6123724356957946
The cosine similarity between the documents  1 and 2 is:  0.5163977794943223


One hot vectors
Represents categorical values into numeric values. Assigns 1 to the position of the token and zero elsewherer

In [None]:
sentence = ["We are reading about Natural Language Processing Here"]
corpus2=pd.Series(sentence)
corpus2

0    We are reading about Natural Language Processi...
dtype: object

In [None]:
# Preprocessing with Lemmatization here
preprocessed_corpus = preprocess(corpus, keep_list = [], stemming =False, stem_type = None,lemmatization = True, remove_stopwords =True)
preprocessed_corpus


  cleaned_corpus = pd.Series()
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))


['read natural language process',
 'natural language process make computers comprehend language data',
 'field natural language process evolve everyday']

In [None]:
set_of_words = set()
for word in preprocessed_corpus[0].split():
    set_of_words.add(word)
vocab = list(set_of_words)
print(vocab)

['process', 'read', 'language', 'natural']


In [None]:
preprocessed_corpus

['read natural language process',
 'natural language process make computers comprehend language data',
 'field natural language process evolve everyday']

In [None]:
preprocessed_corpus[0]

'read natural language process'

In [None]:
position = {}
for i, token in enumerate(vocab):
    position[token] = i
print(position)

{'process': 0, 'read': 1, 'language': 2, 'natural': 3}


In [None]:
one_hot_matrix = np.zeros((len(preprocessed_corpus[0].split()), len(vocab)))
one_hot_matrix.shape

(4, 4)

In [None]:
one_hot_matrix

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [None]:
for i, token in enumerate(preprocessed_corpus[0].split()):
    one_hot_matrix[i][position[token]] = 1
    print(position[token])


1
3
2
0


In [None]:
one_hot_matrix

array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.]])

Building a basic chatbot

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [44]:
!unzip /content/drive/MyDrive/MiniProjectdata/QA.zip

Archive:  /content/drive/MyDrive/MiniProjectdata/QA.zip
replace qa_Electronics.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: qa_Electronics.json     


Reading a json files and extracting the questions and answers


In [55]:
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


#loading questions and answers in separate lists
import ast
questions = []
answers = []
with open('qa_Electronics.json','r') as f:
    for line in f:
        data = ast.literal_eval(line)
        questions.append(data['question'].lower())
        answers.append(data['answer'].lower())

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(questions)
tfidf = TfidfTransformer() #by default applies "l2" normalization
X_tfidf = tfidf.fit_transform(X_vec)

we need to caculate teh consine similarity between teh tfidf vectors each row with the new questions and if the new questions and the vectorized question on teh tfidf vecotr then we can provide a answer and if the questions similarity are very low, lower then the certain threshold then we will descard the questions

In [57]:
def conversation(im):
    global tfidf, answers, X_tfidf
    Y_vec = vectorizer.transform(im)
    Y_tfidf = tfidf.fit_transform(Y_vec)
    cos_sim = np.rad2deg(np.arccos(max(cosine_similarity(Y_tfidf, X_tfidf)[0])))
    if cos_sim > 60 :
        return "sorry, I did not quite understand that"
    else:
        return answers[np.argmax(cosine_similarity(Y_tfidf, X_tfidf)[0])]



On the above consine_similarity return the array or spare representation of array
Argmax retrivese the index that contanins the maximum values

In [58]:
def main():
    usr = input("Please enter your username: ")
    print("support: Hi, welcome to Q&A support. How can I help you?")
    while True:
        im = input("{}: ".format(usr))
        if im.lower() == 'bye':
            print("Q&A support: bye!")
            break
        else:
            print("Q&A support: "+conversation([im]))

In [59]:
main()

Please enter your username: Angel
support: Hi, welcome to Q&A support. How can I help you?
Angel: what's my phone battery
Q&A support: this battery is intended for backup of the phone.
Angel: Does it have bluetooth?
Q&A support: no
Angel: bye
Q&A support: bye!
