# Introduction to Natural Language Processing

### Install NLTK
   - pip install nltk

In [1]:
import nltk

In [2]:
 #   nltk.download()

In [3]:
# corpus is a large collection of text
# from nltk.corpus import brown

In [4]:
# nltk.download('brown')
from nltk.corpus import brown

In [5]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [6]:
# to get sentences from the corpus
data = brown.sents(categories='adventure')

In [7]:
# data
print(len(data))
data[1]

4637


['He', 'was', 'well', 'rid', 'of', 'her', '.']

In [8]:
' '.join(data[1])

'He was well rid of her .'

# Bag of words Pipeline
- convert any given sentence into a list of numbers

 - Get the data/corpus
 - Tokenization, Stopword Removal
 - Stemming/Lemmatization
 - Building a vocab
 - vectorization
 - classification

# Tokenization & Stopword Removal

In [9]:
document = """It was a very pleasant day. The weather The weather was cool and there were light showers.
              I went to the market to buy some frutes."""
sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [10]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [11]:
sents = sent_tokenize(document)
print(sents)

['It was a very pleasant day.', 'The weather The weather was cool and there were light showers.', 'I went to the market to buy some frutes.']


In [12]:
sents[0]

'It was a very pleasant day.'

In [13]:
sentence.split()

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'prateek@cb.com']

In [14]:
words = word_tokenize(sentence)
print(words)

['Send', 'all', 'the', '50', 'documents', 'related', 'to', 'chapters', '1,2,3', 'at', 'prateek', '@', 'cb.com']


In [15]:
# stop words
from nltk.corpus import stopwords

In [16]:
sw = set(stopwords.words('english'))

In [17]:
print(sw)

{'wouldn', "it's", "shouldn't", 'only', 'or', 'was', 'yourselves', 'myself', 'as', 'these', 'wasn', 'where', 'you', 'nor', "wouldn't", 'have', 'now', 'needn', "hadn't", 'y', 'how', 'don', 'again', "haven't", 'o', 'very', 'while', 'theirs', 'down', 'this', 'of', 'for', "you're", 'all', 'most', 'weren', "mightn't", 'ourselves', 'be', "aren't", 'then', 'is', "you'll", 'it', 'when', 'his', 'isn', 'into', "won't", 'against', "that'll", 'hadn', "doesn't", 'a', 's', "don't", 'yourself', 'your', 'no', 'below', "hasn't", 'himself', 'hers', 'themselves', 'more', 'on', 'having', 'same', 'can', 'above', 'him', 'why', 'through', 'he', 'whom', 'which', 'so', 't', 'ours', 'am', 'her', 'i', "should've", 'own', 'other', 'those', 're', 'we', 'has', 'because', 'and', 'in', 'won', 'ain', 'that', 'over', 'didn', 'should', 'mustn', 'been', 'shouldn', 'm', 'too', "needn't", 'are', 'there', 'not', 'an', 'she', 'to', "she's", 'aren', 'mightn', 'after', 'with', 'from', "you'd", 'herself', 'once', 'doesn', "shan

In [18]:
def remove_stopwords(text,stopwords,ignore_case=False):
    """Return the words of ``text`` that are not stopwords, in original order.

    Args:
        text: Iterable of word tokens (for example the result of
            ``str.split()``).
        stopwords: Collection of words to drop. Note that this parameter
            shadows the ``stopwords`` module name inside the function only.
        ignore_case: When True, tokens and stopwords are compared in
            lowercase, so a capitalised token such as ``'I'`` matches the
            stopword ``'i'``. Defaults to False, which keeps the exact,
            case-sensitive comparison.

    Returns:
        A new list containing the surviving tokens, unchanged; ``text`` itself
        is not modified.
    """
    if ignore_case:
        # Lowercase the stopwords once up front instead of once per token.
        lowered = {s.lower() for s in stopwords}
        return [w for w in text if w.lower() not in lowered]
    return [w for w in text if w not in stopwords]

In [19]:
# Split the sentence on whitespace and filter it against the stopword set `sw`.
# In the result, 'I' is kept because the stopwords printed above are lowercase
# ('i', not 'I') and the comparison is case-sensitive; 'much.' keeps its full
# stop because str.split() does not separate punctuation from words.
text = "I am not bothered about her very much.".split() # send as a list of words
useful_text = remove_stopwords(text,sw)
print(useful_text)

['I', 'bothered', 'much.']


#### Tokenizing using Regular Expressions (regex)

In [20]:
sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [21]:
from nltk.tokenize import RegexpTokenizer

In [22]:
# Keep only runs of ASCII letters and '@' as tokens; every other character
# (digits, commas, spaces, '.') acts as a separator. That is why the numbers
# disappear and the e-mail address is split at the dot in the output below.
tokenizer = RegexpTokenizer('[a-zA-Z@]+')
Useful_text = tokenizer.tokenize(sentence)

In [23]:
Useful_text # all the numbers there were removed

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapters',
 'at',
 'prateek@cb',
 'com']

# Stemming
- Process that transforms particular words (verbs, plurals) into their root form
- Preserve the semantics of the sentence without increasing the number of unique tokens
- Example - jumps,jumping,jumped,jump --> jump

In [24]:
text = """Foxes love to make jumps.The quick brown fox was seen jumpung over the lovely dog
          from a 6ft feet high wall"""

In [25]:
# Snowball Stemmer, Porter Stemmer, Lancaster Stemmer
from nltk.stem.snowball import SnowballStemmer,PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [26]:
ps = PorterStemmer()

In [27]:
ps.stem('jumping')

'jump'

In [28]:
ps.stem('jumps')

'jump'

In [29]:
ps.stem('loving')

'love'

In [30]:
# Snowball Stemmer (Multilingual)
ss = SnowballStemmer('english')

In [31]:
ss.stem('lovely')

'love'

In [32]:
# Lemmatization
# Download the 'wordnet' corpus (the call reports "already up-to-date" when it
# is present) and create the WordNet lemmatizer used in the next cells.
import nltk
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\H3RMIT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
# The word comes back unchanged here: lemmatize() defaults to pos='n' (noun).
# NOTE(review): passing pos='v' should reduce it to 'jump' — confirm on re-run.
wn.lemmatize('jumping')

'jumping'

In [34]:
dir(wn) # available functions inside the wordnet object

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 'lemmatize',
 'unicode_repr']

# Building a vocab & Vectorization

In [35]:
# sample corpus contains 4 documents, each document can have 1 or more sentences
corpus = [
          'Indian cricket team will win world cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka',
          'We will win next Lok Shabha Elections, says confident Indian PM',
          'The nobel laurate won the hearts of the people.',
          'The movie Raazi is an exciting Indian Spy thriller based upon areal story'
        ]

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
cv = CountVectorizer()

In [38]:
# Learn the vocabulary from `corpus` and build the document-term count matrix
# in one step; the result is a sparse matrix (see the repr in the next cell).
vectorized_corpus = cv.fit_transform(corpus)

In [39]:
vectorized_corpus

<4x41 sparse matrix of type '<class 'numpy.int64'>'
	with 47 stored elements in Compressed Sparse Row format>

In [40]:
# Convert the sparse count matrix to a dense NumPy array for inspection.
# The guard keeps this cell idempotent: once the name already refers to a dense
# array (which has no .toarray()), re-running the cell is a no-op instead of
# raising AttributeError.
if hasattr(vectorized_corpus, "toarray"):
    vectorized_corpus = vectorized_corpus.toarray()

In [41]:
vectorized_corpus

array([[0, 0, 1, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 1, 0, 2],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

In [42]:
vectorized_corpus[1]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
      dtype=int64)

In [43]:
cv.vocabulary_  # mapping of words

{'indian': 13,
 'cricket': 7,
 'team': 31,
 'will': 37,
 'win': 38,
 'world': 40,
 'cup': 8,
 'says': 26,
 'capt': 5,
 'virat': 35,
 'kohli': 15,
 'be': 4,
 'held': 12,
 'at': 2,
 'sri': 29,
 'lanka': 16,
 'we': 36,
 'next': 20,
 'lok': 18,
 'shabha': 27,
 'elections': 9,
 'confident': 6,
 'pm': 24,
 'the': 32,
 'nobel': 21,
 'laurate': 17,
 'won': 39,
 'hearts': 11,
 'of': 22,
 'people': 23,
 'movie': 19,
 'raazi': 25,
 'is': 14,
 'an': 0,
 'exciting': 10,
 'spy': 28,
 'thriller': 33,
 'based': 3,
 'upon': 34,
 'areal': 1,
 'story': 30}

In [44]:
len(cv.vocabulary_.keys())

41

In [45]:
# Reverse Mapping
numbers = vectorized_corpus[2]
numbers

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0],
      dtype=int64)

In [48]:
# inverse_transform expects a 2-D (n_samples, n_features) input, so the single
# document vector is reshaped into a one-row matrix; passing the bare 1-D array
# is rejected by input validation in current scikit-learn releases.
# The recovered words are jumbled up (not in sentence order) because a
# bag-of-words model does not keep word order.
s = cv.inverse_transform(numbers.reshape(1, -1))
print(s)

[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
      dtype='<U9')]


### More ways to use features
- Unigrams - every word as feature
- Bigrams - two consecutive words can be used as a feature
- Trigrams
- n-grams
- TF-IDF Normalisation

In [49]:
sent_1 = ["this is a good movie"]
sent_2 = ["this is not good movie"]

In [54]:
# ngram_range is (1, 1) by default; (2, 2) makes bigram features instead.
# This will increase the size of the array, but the effect of negation
# (e.g. "not good") will be captured.
# Note: this rebinds `cv`, replacing the vectorizer fitted on the
# four-document corpus earlier in the notebook.
cv = CountVectorizer(ngram_range=(2,2))

In [55]:
docs = [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[1, 1, 0, 0, 1],
       [1, 0, 1, 1, 1]], dtype=int64)

In [56]:
cv.vocabulary_

{'this is': 4, 'is good': 1, 'good movie': 0, 'is not': 2, 'not good': 3}

### TF-IDF Normalisation(Term Frequency,Inverse Document Frequency)
- Avoid features that occur very often, because they carry less information
- Information decreases as the number of occurrences increases across different types of documents
- So we define another term, the term-document frequency (TF-IDF), which associates a weight with every term in a document

In [57]:
sent_1 = 'this is a good movie'
sent_2 = 'this was a good movie'
sent_3 = 'this is not good movie'

In [None]:
# Rebind `corpus` to the three short sentences defined in the previous cell.
# NOTE(review): this cell has no execution count, and the TF-IDF matrix and
# vocabulary recorded further down have 41 features taken from the earlier
# four-document corpus — it looks like the vectorizer was fitted before this
# cell was run. Run this cell first, then re-execute the cells below.
corpus = [sent_1,sent_2,sent_3]

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [59]:
tfidf = TfidfVectorizer()

In [65]:
vc = tfidf.fit_transform(corpus).toarray()
print(vc)

[[0.         0.         0.21254013 0.         0.21254013 0.21254013
  0.         0.21254013 0.42508027 0.         0.         0.
  0.21254013 0.13566161 0.         0.21254013 0.21254013 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.16756908 0.         0.         0.21254013
  0.         0.21254013 0.         0.         0.         0.21254013
  0.         0.33513816 0.16756908 0.         0.42508027]
 [0.         0.         0.         0.         0.         0.
  0.32840433 0.         0.         0.32840433 0.         0.
  0.         0.20961623 0.         0.         0.         0.
  0.32840433 0.         0.32840433 0.         0.         0.
  0.32840433 0.         0.25891775 0.32840433 0.         0.
  0.         0.         0.         0.         0.         0.
  0.32840433 0.25891775 0.25891775 0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.29368184
  0.        

In [66]:
tfidf.vocabulary_

{'indian': 13,
 'cricket': 7,
 'team': 31,
 'will': 37,
 'win': 38,
 'world': 40,
 'cup': 8,
 'says': 26,
 'capt': 5,
 'virat': 35,
 'kohli': 15,
 'be': 4,
 'held': 12,
 'at': 2,
 'sri': 29,
 'lanka': 16,
 'we': 36,
 'next': 20,
 'lok': 18,
 'shabha': 27,
 'elections': 9,
 'confident': 6,
 'pm': 24,
 'the': 32,
 'nobel': 21,
 'laurate': 17,
 'won': 39,
 'hearts': 11,
 'of': 22,
 'people': 23,
 'movie': 19,
 'raazi': 25,
 'is': 14,
 'an': 0,
 'exciting': 10,
 'spy': 28,
 'thriller': 33,
 'based': 3,
 'upon': 34,
 'areal': 1,
 'story': 30}