In [6]:
# notebook imports

In [5]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aayush/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/aayush/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
X = ("Computers can analyze text computers",
       "They do it using vectors and matrices",
       "Computers can process massive amounts of text data")
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(X)
print(vectorizer.vocabulary_)
print(X_vec.todense())

{'computers': 2, 'analyze': 1, 'text': 7, 'using': 8, 'vectors': 9, 'matrices': 5, 'process': 6, 'massive': 4, 'amounts': 0, 'data': 3}
[[0 1 2 0 0 0 0 1 0 0]
 [0 0 0 0 0 1 0 0 1 1]
 [1 0 1 1 1 0 1 1 0 0]]


In [3]:
# Once text data is converted into a matrix, we can apply any matrix operation to it 

### Exploring Bag of Words architecture

In [4]:
# uses frequency of words present in the document

In [7]:
# make a list of sentences

In [8]:
sentences = ["We are reading about Natural Language Processing Here",
            "Natural Language Processing making computers comprehend language data",
            "The field of Natural Language Processing is evolving everyday"]

In [9]:
# create series for the list

In [10]:
corpus = pd.Series(sentences)
corpus

0    We are reading about Natural Language Processi...
1    Natural Language Processing making computers c...
2    The field of Natural Language Processing is ev...
dtype: object

In [13]:
corpus[0].split()

['We', 'are', 'reading', 'about', 'Natural', 'Language', 'Processing', 'Here']

# data preprocessing

In [14]:
def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)
    
    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process
    
    Output : Returns the cleaned text corpus
    
    '''
    cleaned_corpus = pd.Series()
    for row in corpus:
        qs = []
        for word in row.split():
            if word not in keep_list:
                p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
                p1 = p1.lower()
                qs.append(p1)
            else : qs.append(word)
        cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
    return cleaned_corpus

In [17]:
def stopwords_removal(corpus):
    wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
    stop = set(stopwords.words('english'))
    for word in wh_words:
        stop.remove(word) # removing wh words from the set of stopwords
    corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    return corpus

In [16]:
stopwords_removal(corpus)

[['We', 'reading', 'Natural', 'Language', 'Processing', 'Here'],
 ['Natural',
  'Language',
  'Processing',
  'making',
  'computers',
  'comprehend',
  'language',
  'data'],
 ['The', 'field', 'Natural', 'Language', 'Processing', 'evolving', 'everyday']]

In [18]:
def lemmatize(corpus):
    lem = WordNetLemmatizer()
    corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    return corpus

In [19]:
def stem(corpus, stem_type = None):
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language = 'english')
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    else :
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    return corpus

In [84]:
from nltk.util import ngrams

In [97]:
def n_gram(corpus,n):
    tokens = [x.split() for x in corpus]
    ng = [ngrams(token,n) for token in tokens ]
    corpus= [" ".join(token) for token in ng]
    return corpus
    

In [112]:
tokens = [x.split() for x in corpus]
[token[:] for token in tokens]

[['We',
  'are',
  'reading',
  'about',
  'Natural',
  'Language',
  'Processing',
  'Here'],
 ['Natural',
  'Language',
  'Processing',
  'making',
  'computers',
  'comprehend',
  'language',
  'data'],
 ['The',
  'field',
  'of',
  'Natural',
  'Language',
  'Processing',
  'is',
  'evolving',
  'everyday']]

In [104]:
cor = n_gram(corpus,2)

TypeError: sequence item 0: expected str instance, tuple found

In [79]:
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True,n_gram= True, n=1):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    if remove_stopwords == True:
        corpus = stopwords_removal(corpus)
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        corpus = lemmatize(corpus)
        
        
    if stemming == True:
        corpus = stem(corpus, stem_type)
    
    if n_gram == True:
        corpus = n_gram(corpus,n)
    
    corpus = [' '.join(x) for x in corpus]        

    return corpus

In [80]:
common_dot_words = ['U.S.', 'Mr.', 'Mrs.', 'D.C.']

In [81]:
text_clean(corpus,common_dot_words)

  cleaned_corpus = pd.Series()


0    we are reading about natural language processi...
0    natural language processing making computers c...
0    the field of natural language processing is ev...
dtype: object

In [82]:
# Preprocessing with Lemmatization here
preprocessed_corpus = preprocess(corpus, keep_list = common_dot_words, stemming = False, stem_type = None,
                                lemmatization = True, remove_stopwords = True)
preprocessed_corpus

  cleaned_corpus = pd.Series()


TypeError: 'bool' object is not callable

In [50]:
def get_part_of_speech_tags(token):
    tag_dict = {"J": wordnet.ADJ,
                   "N": wordnet.NOUN,
                   "V": wordnet.VERB,
                   "R": wordnet.ADV}
    tag = nltk.pos_tag([token])[0][1][0].upper() # taking the first letter from the POS 
    return tag_dict.get(tag, wordnet.NOUN) # present in the dict then corssesponding wordner else noun

In [53]:
# import nltk
# from nltk.corpus import wordnet

In [54]:
# s="We are putting our effort to enhance our understanding of Lemmatization"

In [55]:
# token_list = s.split()

In [56]:
# [get_part_of_speech_tags(token) for token in token_list]

In [57]:
# tag_dict = {"J": wordnet.ADJ,
#                    "N": wordnet.NOUN,
#                    "V": wordnet.VERB,
#                    "R": wordnet.ADV}

In [58]:
# [nltk.pos_tag([token])[0][1][0].upper() for token in token_list]

In [52]:
# [nltk.pos_tag([token]) for token in token_list]

In [51]:
# tag_dict.get('T',wordnet.NOUN)

### Building the vocabulary

In [59]:
set_of_words = set()
for sentence in preprocessed_corpus: 
    for word in sentence.split():
        set_of_words.add(word) # for each unique word, place it in the set
vocab = list(set_of_words) # make the list out of the words
print(vocab)

['make', 'read', 'data', 'everyday', 'language', 'evolve', 'computers', 'natural', 'comprehend', 'process', 'field']


In [60]:
# giving the postion for each unique words in the vocabulary
position = {}
for i, token in enumerate(vocab):
    position[token] = i
print(position)

{'make': 0, 'read': 1, 'data': 2, 'everyday': 3, 'language': 4, 'evolve': 5, 'computers': 6, 'natural': 7, 'comprehend': 8, 'process': 9, 'field': 10}


In [61]:
# creating a matrix to hold bag of words representation

In [62]:
bow_matrix = np.zeros((len(preprocessed_corpus), len(vocab))) # no of docs x total no of unique words in vocab

In [63]:
bow_matrix

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [64]:
for i, preprocessed_sentence in enumerate(preprocessed_corpus):
    for token in preprocessed_sentence.split():   
        bow_matrix[i][position[token]] = bow_matrix[i][position[token]] + 1


In [65]:
bow_matrix

array([[0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0.],
       [1., 0., 1., 0., 2., 0., 1., 1., 1., 1., 0.],
       [0., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1.]])

In [66]:
# this is how BOW apprach works