In [6]:
# notebook imports

In [5]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aayush/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/aayush/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [219]:
from sklearn.feature_extraction.text import CountVectorizer
X = ("Computers can analyze text computers",
       "They do it using vectors and matrices",
       "Computers can process massive amounts of text data")
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(X)
print(vectorizer.vocabulary_)
print(X_vec.todense())

{'computers': 2, 'analyze': 1, 'text': 7, 'using': 8, 'vectors': 9, 'matrices': 5, 'process': 6, 'massive': 4, 'amounts': 0, 'data': 3}
[[0 1 2 0 0 0 0 1 0 0]
 [0 0 0 0 0 1 0 0 1 1]
 [1 0 1 1 1 0 1 1 0 0]]


In [220]:
# Once text data is converted into a matrix, we can apply any matrix operation to it 

### Exploring Bag of Words architecture

In [221]:
# uses frequency of words present in the document

In [222]:
# make a list of sentences

In [223]:
sentences = ["We are reading about Natural Language Processing Here",
            "Natural Language Processing making computers comprehend language data",
            "The field of Natural Language Processing is evolving everyday"]

In [224]:
# create series for the list

In [225]:
corpus = pd.Series(sentences)
corpus

0    We are reading about Natural Language Processi...
1    Natural Language Processing making computers c...
2    The field of Natural Language Processing is ev...
dtype: object

In [226]:
corpus[0].split()

['We', 'are', 'reading', 'about', 'Natural', 'Language', 'Processing', 'Here']

# data preprocessing

In [227]:
def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)
    
    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process
    
    Output : Returns the cleaned text corpus
    
    '''
    cleaned_corpus = pd.Series()
    for row in corpus:
        qs = []
        for word in row.split():
            if word not in keep_list:
                p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
                p1 = p1.lower()
                qs.append(p1)
            else : qs.append(word)
        cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
    return cleaned_corpus

In [228]:
def stopwords_removal(corpus):
    wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
    stop = set(stopwords.words('english'))
    for word in wh_words:
        stop.remove(word) # removing wh words from the set of stopwords
    corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    return corpus

In [229]:
stopwords_removal(corpus)

[['We', 'reading', 'Natural', 'Language', 'Processing', 'Here'],
 ['Natural',
  'Language',
  'Processing',
  'making',
  'computers',
  'comprehend',
  'language',
  'data'],
 ['The', 'field', 'Natural', 'Language', 'Processing', 'evolving', 'everyday']]

In [230]:
def lemmatize(corpus):
    lem = WordNetLemmatizer()
    corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    return corpus

In [231]:
def stem(corpus, stem_type = None):
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language = 'english')
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    else :
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    return corpus

In [232]:
from nltk.util import ngrams

In [233]:
def n_gram(corpus,n):
    corpus = x.split() for x in corpus
    ng = list([ngrams(x,n) for x in x] for x in corpus)
    corpus= [" ".join(token) for token in ng]
    return corpus
    

SyntaxError: invalid syntax (<ipython-input-233-b92b935b602c>, line 2)

In [234]:
# n_gram(corpus,2)

In [273]:
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    if remove_stopwords == True:
        corpus = stopwords_removal(corpus)
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        corpus = lemmatize(corpus)
        
        
    if stemming == True:
        corpus = stem(corpus, stem_type)
    
#     if n_gram == True:
#         corpus = n_gram(corpus,n)
    
    corpus = [' '.join(x) for x in corpus]        

    return corpus

In [236]:
common_dot_words = ['U.S.', 'Mr.', 'Mrs.', 'D.C.']

In [237]:
text_clean(corpus,common_dot_words)

  cleaned_corpus = pd.Series()


0    we are reading about natural language processi...
0    natural language processing making computers c...
0    the field of natural language processing is ev...
dtype: object

In [238]:
# Preprocessing with Lemmatization here
preprocessed_corpus = preprocess(corpus, keep_list = common_dot_words, stemming = False, stem_type = None,
                                lemmatization = True, remove_stopwords = True)
preprocessed_corpus

  cleaned_corpus = pd.Series()


TypeError: 'bool' object is not callable

In [239]:
def get_part_of_speech_tags(token):
    tag_dict = {"J": wordnet.ADJ,
                   "N": wordnet.NOUN,
                   "V": wordnet.VERB,
                   "R": wordnet.ADV}
    tag = nltk.pos_tag([token])[0][1][0].upper() # taking the first letter from the POS 
    return tag_dict.get(tag, wordnet.NOUN) # present in the dict then corssesponding wordner else noun

In [240]:
# import nltk
# from nltk.corpus import wordnet

In [241]:
# s="We are putting our effort to enhance our understanding of Lemmatization"

In [242]:
# token_list = s.split()

In [243]:
# [get_part_of_speech_tags(token) for token in token_list]

In [244]:
# tag_dict = {"J": wordnet.ADJ,
#                    "N": wordnet.NOUN,
#                    "V": wordnet.VERB,
#                    "R": wordnet.ADV}

In [245]:
# [nltk.pos_tag([token])[0][1][0].upper() for token in token_list]

In [246]:
# [nltk.pos_tag([token]) for token in token_list]

In [247]:
# tag_dict.get('T',wordnet.NOUN)

### Building the vocabulary

In [248]:
set_of_words = set()
for sentence in preprocessed_corpus: 
    for word in sentence.split():
        set_of_words.add(word) # for each unique word, place it in the set
vocab = list(set_of_words) # make the list out of the words
print(vocab)

['make', 'read', 'data', 'everyday', 'language', 'evolve', 'computers', 'natural', 'comprehend', 'process', 'field']


In [249]:
# giving the postion for each unique words in the vocabulary
position = {}
for i, token in enumerate(vocab):
    position[token] = i
print(position)

{'make': 0, 'read': 1, 'data': 2, 'everyday': 3, 'language': 4, 'evolve': 5, 'computers': 6, 'natural': 7, 'comprehend': 8, 'process': 9, 'field': 10}


In [250]:
# creating a matrix to hold bag of words representation

In [251]:
bow_matrix = np.zeros((len(preprocessed_corpus), len(vocab))) # no of docs x total no of unique words in vocab

In [252]:
bow_matrix

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [253]:
for i, preprocessed_sentence in enumerate(preprocessed_corpus):
    for token in preprocessed_sentence.split():   
        bow_matrix[i][position[token]] = bow_matrix[i][position[token]] + 1


In [254]:
vocab

['make',
 'read',
 'data',
 'everyday',
 'language',
 'evolve',
 'computers',
 'natural',
 'comprehend',
 'process',
 'field']

In [255]:
bow_matrix

array([[0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0.],
       [1., 0., 1., 0., 2., 0., 1., 1., 1., 1., 0.],
       [0., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1.]])

In [154]:
for x in bow_matrix:
    for y in x:
        print(y)

0.0
1.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
2.0
0.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
1.0
0.0
1.0
1.0


In [157]:
pd.DataFrame(bow_matrix, columns=vocab)

Unnamed: 0,make,read,data,everyday,language,evolve,computers,natural,comprehend,process,field
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0
2,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0


In [66]:
# this is how BOW apprach works

In [257]:
from sklearn.feature_extraction.text import CountVectorizer


In [258]:
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(preprocessed_corpus)

In [259]:
print(vectorizer.get_feature_names())
print(bow_matrix.toarray())

['comprehend', 'computers', 'data', 'everyday', 'evolve', 'field', 'language', 'make', 'natural', 'process', 'read']
[[0 0 0 0 0 0 1 0 1 1 1]
 [1 1 1 0 0 0 2 1 1 1 0]
 [0 0 0 1 1 1 1 0 1 1 0]]


In [158]:
# using n-gram

In [159]:
vectorizer_ngram_range = CountVectorizer(analyzer='word', ngram_range=(1,3))
bow_matrix_ngram = vectorizer_ngram_range.fit_transform(preprocessed_corpus)

In [161]:
print(vectorizer_ngram_range.get_feature_names())
print(bow_matrix_ngram.toarray())

['comprehend', 'comprehend language', 'comprehend language data', 'computers', 'computers comprehend', 'computers comprehend language', 'data', 'everyday', 'evolve', 'evolve everyday', 'field', 'field natural', 'field natural language', 'language', 'language data', 'language process', 'language process evolve', 'language process make', 'make', 'make computers', 'make computers comprehend', 'natural', 'natural language', 'natural language process', 'process', 'process evolve', 'process evolve everyday', 'process make', 'process make computers', 'read', 'read natural', 'read natural language']
[[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1]
 [1 1 1 1 1 1 1 0 0 0 0 0 0 2 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 0 0 0]
 [0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0]]


In [163]:
# very big dimensionality does not convert into a very good model; rather, 
# it can hamper the model's inference ability. This is referred to as the curse of dimensionality
# and it can potentially lead to a condition called overfitting, 

In [164]:
#  only six of the most frequently occurring n-grams among unigrams, bigrams, or trigrams in the corpus were 
# selected since the value of the max_features attribute was set to 6.

In [179]:
vectorizer_max_features = CountVectorizer(analyzer='word', ngram_range=(1,3), max_df = 3, min_df = 2)
bow_matrix_max_features = vectorizer_max_features.fit_transform(preprocessed_corpus)

In [180]:
print(vectorizer_max_features.get_feature_names())
print(bow_matrix_max_features.toarray())

['language', 'language process', 'natural', 'natural language', 'natural language process', 'process']
[[1 1 1 1 1 1]
 [2 1 1 1 1 1]
 [1 1 1 1 1 1]]


In [187]:
preprocessed_corpus

['read natural language process',
 'natural language process make computers comprehend language data',
 'field natural language process evolve everyday']

### TF-IDF

In [181]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [182]:
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(preprocessed_corpus)

In [184]:
print(vectorizer.get_feature_names())

['comprehend', 'computers', 'data', 'everyday', 'evolve', 'field', 'language', 'make', 'natural', 'process', 'read']


In [185]:
print(tf_idf_matrix.toarray())

[[0.         0.         0.         0.         0.         0.
  0.41285857 0.         0.41285857 0.41285857 0.69903033]
 [0.40512186 0.40512186 0.40512186 0.         0.         0.
  0.478543   0.40512186 0.2392715  0.2392715  0.        ]
 [0.         0.         0.         0.49711994 0.49711994 0.49711994
  0.29360705 0.         0.29360705 0.29360705 0.        ]]


In [186]:
print('the shape of tf-idf matrix is: ',tf_idf_matrix.shape)

the shape of tf-idf matrix is:  (3, 11)


In [188]:
# for the word natural which occurs once in every document 
# the TF-IDF weight for the term is different across the documents because even though the IDF would remain the
#same across the documents
# for natural, the TF would change since the size of each document is different and the TF component gets normalized 
# based on that

In [189]:
vectorizer = TfidfVectorizer(norm='l1')
tf_idf_matrix = vectorizer.fit_transform(preprocessed_corpus)

In [190]:
print(vectorizer.get_feature_names())
print(tf_idf_matrix.toarray())

['comprehend', 'computers', 'data', 'everyday', 'evolve', 'field', 'language', 'make', 'natural', 'process', 'read']
[[0.         0.         0.         0.         0.         0.
  0.21307663 0.         0.21307663 0.21307663 0.3607701 ]
 [0.1571718  0.1571718  0.1571718  0.         0.         0.
  0.1856564  0.1571718  0.0928282  0.0928282  0.        ]
 [0.         0.         0.         0.2095624  0.2095624  0.2095624
  0.12377093 0.         0.12377093 0.12377093 0.        ]]


In [191]:
print('the shape of tf-idf matrix is: ',tf_idf_matrix.shape)

the shape of tf-idf matrix is:  (3, 11)


In [192]:
# for l2 norm -- sum of squares of the vector elements is equal to 1.
# for l1 norm -- sum of absolute values of the vector elements is 1 with the l1 norm

In [194]:
# n-grams and max features with tfidf vectorizer

In [195]:
vectorizer_n_gram_max_features = TfidfVectorizer(norm="l2", analyzer='word', ngram_range=(1,3), max_features = 6)
tf_idf_matrix_n_gram_max_features = vectorizer_n_gram_max_features.fit_transform(preprocessed_corpus)


In [196]:
print(vectorizer_n_gram_max_features.get_feature_names())
print(tf_idf_matrix_n_gram_max_features.toarray())
print("\nThe shape of the TF-IDF matrix is: ", tf_idf_matrix_n_gram_max_features.shape)

['language', 'language process', 'natural', 'natural language', 'natural language process', 'process']
[[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]
 [0.66666667 0.33333333 0.33333333 0.33333333 0.33333333 0.33333333]
 [0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]

The shape of the TF-IDF matrix is:  (3, 6)


In [198]:
#  took the top six features among unigrams, bigrams, and trigrams, and used them to represent the TF-IDF vectors.

In [199]:
# advantages of tf-idf:
# computationally fast 
# scales weight of less frequently occuring terms

In [201]:
# disadvantages:
# does not take into account things such as the co-occurrence of terms, semantics, the context associated with terms,
# and the position of a term in a document.
# depends on vocabulary size, really  slow if large vocab size

### Similarity calculation between document vectors

In [202]:
# If the words being used in two documents are similar, it indicates that the documents are similar as well.

####  cosine similarity

In [203]:
# implementation

In [208]:
def cosine_similarity(vec1, vec2):
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    return (np.dot(vec1,vec2)/(np.sqrt(np.sum(vec1**2))*np.sqrt(np.sum(vec2**2))))

In [264]:
# pd.options.display.max_columns=None

In [263]:
corpus

0    We are reading about Natural Language Processi...
1    Natural Language Processing making computers c...
2    The field of Natural Language Processing is ev...
dtype: object

In [209]:
# cosine similariy on vecotrs developed using Countvectorizer

In [211]:
bow_matrix

array([[0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0.],
       [1., 0., 1., 0., 2., 0., 1., 1., 1., 1., 0.],
       [0., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1.]])

In [260]:
for i in range(bow_matrix.shape[0]):
    for j in range(i + 1, bow_matrix.shape[0]):
        print("The cosine similarity between the documents ", i, "and", j, "is: ",cosine_similarity(bow_matrix.toarray()[i], bow_matrix.toarray()[j]))

The cosine similarity between the documents  0 and 1 is:  0.6324555320336759
The cosine similarity between the documents  0 and 2 is:  0.6123724356957946
The cosine similarity between the documents  1 and 2 is:  0.5163977794943223


In [265]:
# document 0 and document 1 are the closest or most similar, while document 1 and document 2 are the farthest 
# or least similar:

In [213]:
tf_idf_matrix

<3x11 sparse matrix of type '<class 'numpy.float64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [212]:
for i in range(bow_matrix.shape[0]):
    for j in range(i + 1, tf_idf_matrix.shape[0]):
        print("The cosine similarity between the documents ", i, "and", j, "is: ",cosine_similarity(tf_idf_matrix.toarray()[i], tf_idf_matrix.toarray()[j]))

The cosine similarity between the documents  0 and 1 is:  0.39514115766749114
The cosine similarity between the documents  0 and 2 is:  0.3636545567376186
The cosine similarity between the documents  1 and 2 is:  0.2810071916500233


In [266]:
# though magnitude differs the order is same as comapred to vectors generated by bow approach

In [267]:
# the cosine similarity is actually helping to measure BoW overlap across documents

### One-hot Vectorization

In [268]:
# all values are 0 except the one where the token is present, and this entry is marked 1. 
# As you may have guessed, these are binary vectors.

In [269]:
sentence = ['We are reading about Natural Language Processing here']

In [271]:
corpus = pd.Series(sentence)
corpus

0    We are reading about Natural Language Processi...
dtype: object

In [274]:
# preprocessing with Lemmatization 

preprocessed_corpus = preprocess(corpus, keep_list=[], stemming=False, stem_type=None, lemmatization=True,remove_stopwords=True )

  cleaned_corpus = pd.Series()


In [275]:
preprocessed_corpus

['read natural language process']

In [276]:
preprocessed_corpus[0]

'read natural language process'

In [277]:
set_of_words = set()
for word in preprocessed_corpus[0].split():
    set_of_words.add(word)
vocab = list(set_of_words)
print(vocab)

['language', 'read', 'process', 'natural']


In [285]:
position ={}
for i,token in enumerate(vocab):
    position[token]=i
print(position)

{'language': 0, 'read': 1, 'process': 2, 'natural': 3}


In [286]:
one_hot_matrix = np.zeros((len(preprocessed_corpus[0].split()),len(vocab)))
one_hot_matrix.shape

(4, 4)

In [287]:
# 4 rows, one for each word/token in preprocessed_corpus
# 4 columns, one for each unique words in the vocab

In [288]:
preprocessed_corpus[0].split()

['read', 'natural', 'language', 'process']

In [289]:
# position[read]

In [290]:
for i,token in enumerate(preprocessed_corpus[0].split()):
    one_hot_matrix[i][position[token]] = 1

In [291]:
one_hot_matrix

array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.]])

### Building a Basic Chatbot