## Extract NOUN Phrases from text

In [6]:
#Import Libraries
import nltk
from textblob import TextBlob

### Optional

In [13]:
nltk.download('brown')
nltk.download('punkt')


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Extract noun

In [3]:
blob = TextBlob("Rahul is a great Machine Learning Engineer. He is specialized in Natural Language Processing.")

In [15]:
blob.noun_phrases


WordList(['rahul', 'machine learning engineer', 'language processing'])

## Similarities between words

- **Cosine similarity** -> cosine angle between two vectors.
- **Jaccard similarity**
- **Levenshtein distance**: Minimal number of 
insertions, deletions, and replacements required for 
transforming string “a” into string “b.”
- **Hamming distance**: Number of positions with the 
same symbol in both strings. But it can be defined 
only for strings with equal length.


### Cosine Similarity

In [23]:
documents = ( 
"I like Apple",
"I am exploring Apple devices",
"I am a beginner in Apple development", 
"I want to work for Apple", 
"I like Apple products"
)

In [16]:
#import libraries
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
tfid_vectorizer = TfidfVectorizer()
tfid_matrix = tfid_vectorizer.fit_transform(documents)

In [25]:
tfid_matrix.shape

(5, 13)

In [26]:
# similarity of 1st sentence with rest of the sentences
cosine_similarity(tfid_matrix[0:1], tfid_matrix)

array([[1.        , 0.14284054, 0.12305308, 0.11786255, 0.68374784]])

### Phonetic matchine

It matches two works and create alphanumeric string as encode version of the word. It is great for matching relavant names.

In [27]:
!pip install fuzzy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fuzzy
  Downloading Fuzzy-1.2.2.tar.gz (14 kB)
Building wheels for collected packages: fuzzy
  Building wheel for fuzzy (setup.py) ... [?25l[?25hdone
  Created wheel for fuzzy: filename=Fuzzy-1.2.2-cp37-cp37m-linux_x86_64.whl size=164017 sha256=6bd1982259720c570178b4cf52dffbd2e3fc99c7fc6ea8de07a5a93b3fcfbddf
  Stored in directory: /root/.cache/pip/wheels/c8/52/8a/bb2d05fbf343752a8546682cb5b2d775cc0d1f27f6c43f95dd
Successfully built fuzzy
Installing collected packages: fuzzy
Successfully installed fuzzy-1.2.2


In [28]:
import fuzzy

In [None]:
soundex = fuzzy.Soundex(5)
soundex("natural")

## Part Of Speech tagging

- Rule based
- Stochastic based

In [34]:
text = "I love programming. I love building softwares."

In [36]:
# import libraries
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [37]:
# Tokenizing the text
tokens = sent_tokenize(text)

In [38]:
tokens

['I love programming.', 'I love building softwares.']

In [40]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [41]:
# Tagging all the tokens
for token in tokens:
  words = nltk.word_tokenize(token)
  words = [w for w in words if not w in stop_words]
  # POS tagger
  tags = nltk.pos_tag(words)

tags

[('I', 'PRP'),
 ('love', 'VBP'),
 ('building', 'VBG'),
 ('softwares', 'NNS'),
 ('.', '.')]

**All tags**

- CC coordinating conjunction
- CD cardinal digit
- DT determiner
- EX existential there (like: “there is” ... think of it like 
“there exists”)
- FW foreign word
- IN preposition/subordinating conjunction
- JJ adjective ‘big’
- JJR adjective, comparative ‘bigger’
- JJS adjective, superlative ‘biggest’
- LS list marker 1)
- MD modal could, will
- NN noun, singular ‘desk’
- NNS noun plural ‘desks’107
- NNP proper noun, singular ‘Harrison’
- NNPS proper noun, plural ‘Americans’
- PDT predeterminer ‘all the kids’
- POS possessive ending parent’s
- PRP personal pronoun I, he, she
- PRP$ possessive pronoun my, his, hers

- RB adverb very, silently
- RBR adverb, comparative better
- RBS adverb, superlative best
- RP particle give up
- TO to go ‘to’ the store
- UH interjection
- VB verb, base form take
- VBD verb, past tense took
- VBG verb, gerund/present participle taking
- VBN verb, past participle taken
- VBP verb, sing. present, non-3d take
- VBZ verb, 3rd person sing. present takes
- WDT wh-determiner which
- WP wh-pronoun who, what
- WP$ possessive wh-pronoun whose
- WRB wh-adverb where, when


## Extract Entities from Text

Identify and extract entities from text, also called **Named Entity Recognition**.

In [42]:
text = "Rahul is a very good footballer. He wants to play for his country."

In [43]:
# import libraries
import nltk
from nltk import ne_chunk
from nltk import word_tokenize

In [47]:
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [48]:
# NER
ne_chunk(nltk.pos_tag(word_tokenize(text)), binary=False)

ModuleNotFoundError: ignored

Tree('S', [Tree('GPE', [('Rahul', 'NNP')]), ('is', 'VBZ'), ('a', 'DT'), ('very', 'RB'), ('good', 'JJ'), ('footballer', 'NN'), ('.', '.'), ('He', 'PRP'), ('wants', 'VBZ'), ('to', 'TO'), ('play', 'VB'), ('for', 'IN'), ('his', 'PRP$'), ('country', 'NN'), ('.', '.')])

## Extract topics from text

This is also known as topic modelling.

In [59]:
text1 = "I am learning Natural Language Processing. And one day, I want to become a Machine Learning Eningeer"
text2 = "Along with Machine Learning, I am also learning Pandas and Numpy and other data science stuffs"

docs = [text1, text2]
docs

['I am learning Natural Language Processing. And one day, I want to become a Machine Learning Eningeer',
 'Along with Machine Learning, I am also learning Pandas and Numpy and other data science stuffs']

In [62]:
# import libraries
!pip install genism
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement genism (from versions: none)[0m
[31mERROR: No matching distribution found for genism[0m


In [68]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [69]:
# preprocessing
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
  stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
  punc_free = "".join(ch for ch in stop_free if ch not in exclude)
  normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())  
  return normalized

docs_clean = [clean(doc).split() for doc in docs] 
docs_clean

[['learning',
  'natural',
  'language',
  'processing',
  'one',
  'day',
  'want',
  'become',
  'machine',
  'learning',
  'eningeer'],
 ['along',
  'machine',
  'learning',
  'also',
  'learning',
  'panda',
  'numpy',
  'data',
  'science',
  'stuff']]

In [70]:
import genism
from genism import corpora

# create term dictionary of corpus
# where the unique terms are assigned an index
dicts = corpora.Dictionary(doc_clean)

# Convert list of docs into doc term
# matrix using dictionary prepared above.
doc_term_matrix = [dicts.doc2bow(doc) for doc in doc_clean]
doc_term_matrix

ModuleNotFoundError: ignored

In [None]:
# Creating the LDA model
Lda = gensim.models.ldamodel.LdaModel
# Running and Training LDA model on the document term matrix 
# for 3 topics.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dicts, passes=50)
# Results
print(ldamodel.print_topics())