# NLTK PROCESS

## Dependencies

In [None]:
import re
import os 
import sys
from urllib.request import urlopen
import requests
import pprint


In [None]:
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from bs4 import BeautifulSoup
import pandas as pd
import seaborn as sns


In [None]:
# NOTE: this import rebinds the name `print` to rich's implementation for
# every cell that runs after this one.
from rich import print
from rich.console import Console
# Shared console object used by later cells for styled output.
console=Console(record=True)

In [None]:
# Quick check of the console: re.match only matches at the start of the
# string, so the pattern 'Jim' matches the beginning of 'JimDowd'.
console.print(re.match('Jim','JimDowd'),style="bold black on white")

## Get Data

### from file

In [None]:
# Read the whole sample document into memory. The encoding is passed
# explicitly so the decoded text does not depend on the platform's default
# locale encoding.
# NOTE(review): assumes ../data/t1.txt is UTF-8 encoded -- confirm.
with open("../data/t1.txt", encoding="utf-8") as f:
    text = f.read()
text

### from url

In [None]:
# url = "https://www.hq.nasa.gov/alsj/LM03_Apollo_Spacecraft_AS1-6.pdf"
# # html = urlopen(url).read()

# html= requests.get(url)


# text = BeautifulSoup(html.text, 'html.parser').get_text()
# text[:200]

## Text Pre-processing

### Sentence tokenizing:

In [None]:
# Split the raw text into a list of sentences.
sentences = sent_tokenize(text)
console.print(f'Number of sentences:{len(sentences)}')

# Show every sentence through the styled console.
console.print(sentences, style="bold blue on white")

### Word tokenizing

In [None]:
# Split the raw text into word-level tokens.
words = word_tokenize(text)
console.print(f'Number of words: {len(words)}')

# Show the full token list.
print(words)

### Find the frequency distribution

In [None]:
from nltk import FreqDist

# Count occurrences of every token produced by word_tokenize; nothing has
# been filtered out yet at this point.
fdist = FreqDist(words)

# The ten most frequent tokens.
fdist.most_common(10)

### Plot the frequency graph

In [None]:
import matplotlib.pyplot as plt

# Plot the 10 most frequent samples of the current frequency distribution.
fdist.plot(10)

### Remove punctuation marks

In [None]:

# Keep only tokens made up entirely of alphabetic characters. This drops
# punctuation tokens, and also any token containing digits or symbols.
words_no_punc = [w for w in words if w.isalpha()]
print(words_no_punc[:50])
print(len(words_no_punc))

### Plotting graph without punctuation marks

In [None]:
# Rebuild the frequency distribution from the alphabetic-only tokens.
fdist = FreqDist(words_no_punc)
fdist.most_common(10)

In [None]:
# Plot the 10 most frequent samples of the current frequency distribution.
fdist.plot(10)

### List of stopwords

In [None]:
from nltk.corpus import stopwords

# Load the English stopword list from the NLTK stopwords corpus.
list_of_stopwords = stopwords.words("english")

print(list_of_stopwords)

### Removing stopwords

In [None]:
# NLTK's English stopword list is lower-case, so each token is compared in
# lower-case form; otherwise capitalised stopwords such as "The" would slip
# through the filter. A set gives constant-time membership tests instead of
# scanning the whole list for every token.
stopword_set = set(list_of_stopwords)

# Tokens keep their original casing in the result.
clean_words = [w for w in words_no_punc if w.lower() not in stopword_set]

print(clean_words)

print(len(clean_words))



### Final frequency distribution

In [None]:
# Frequency distribution over the tokens left after stopword removal.
fdist = FreqDist(clean_words)

fdist.most_common(10)

In [None]:
# Plot the 10 most frequent samples of the current frequency distribution.
fdist.plot(10)

## Word Cloud

In [None]:
from wordcloud import WordCloud

# Build a word cloud directly from the raw text.
wordcloud = WordCloud().generate(text)

# Render the cloud as an image on a 12x12 figure with the axes hidden.
plt.figure(figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Join the filtered tokens into one space-separated string.
clean_words_joined = ' '.join(clean_words)

In [None]:
# Build a second word cloud, this time from the filtered tokens only.
wordcloud = WordCloud().generate(clean_words_joined)

# Render the cloud as an image on a 12x12 figure with the axes hidden.
plt.figure(figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

## Stemming

In [None]:
from nltk.stem import PorterStemmer

porter = PorterStemmer()

# Small hand-picked vocabulary, reused by the lemmatization and POS-tagging
# cells further down.
word_list = [
    'Programming', 'Programmers', 'Programmable', 'orbiter', 'electrical',
    'studies', 'leaves', 'plays', 'am', 'is', 'were',
]

# Apply the Porter stemmer to each word in turn.
stemmed_words = list(map(porter.stem, word_list))

print(stemmed_words)

## Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Part-of-speech codes passed to the lemmatizer, one pass over word_list
# per code.
pos_list = ['v', 'n', 'a', 'r']
print(f'word_list--->{word_list}')
for i in pos_list:
    # Lemmatize every word as if it had part of speech `i`.
    lemmatized_words = [
        lemmatizer.lemmatize(token, pos=i) for token in word_list
    ]

    print(f' POS:{i}\n{lemmatized_words}')

## Part of Speech Tagging (PoS tagging)

In [None]:
# Tag each entry of word_list with a part-of-speech label.
tag = nltk.pos_tag(word_list)
print(tag)

In [None]:
# NOTE(review): this first assignment is overwritten by the second `sentence`
# assignment below before it is ever used; it only serves as an alternative
# sample to switch to by hand.
sentence='The three-person EO-3 crew docked with Salyut 7 on 9 February, 1984, and entered the darkened station carrying flashlights.'

# sentence="""Oversee and direct daily test activities at the Cameron Technology Development Center. 
# Lead test lab projects for Drilling and Production systems, R&D testing, product development, product design qualification, performance verification, 
# and product testing/demonstrations. Manage cross-functional team of engineers and technicians for 200+ annual test plans."""

# This is the sample that is actually tokenized and tagged.
sentence="""The LM while in powered descent is a cross between a helicopter and a spacecraft.
Jim Dowd will pilot the spacecraft."""

# Tokenize first; nltk.pos_tag is given the token list, not the raw string.
tokenized_words=word_tokenize(sentence)

# tagged_words is reused by the chunking and chinking cells below.
tagged_words=nltk.pos_tag(tokenized_words)

tagged_words

## Chunking

In [None]:
# Chunk rule: an NP is an optional determiner, any number of adjectives,
# then a singular noun.
grammar = "NP : {<DT>?<JJ>*<NN>} "

parser = nltk.RegexpParser(grammar)

# Apply the rule to the tagged sentence; the result is a parse tree.
output = parser.parse(tagged_words)

print(output)

In [None]:
# output.draw()

## Chinking

In [None]:
# First rule chunks every tag sequence into an NP; the second rule (the
# inverted braces) then removes runs of adjectives from those chunks.
grammar = r"""NP : {<.*>+}
}<JJ>+{"""
parser = nltk.RegexpParser(grammar)

# Apply both rules to the tagged sentence.
output = parser.parse(tagged_words)

print(output)

In [None]:
# output.draw()

## Named Entity Recognition (NER)

In [None]:
sentence = """Jim Dowd served in the Mission Control Center as the System Engineering & Integration Office (SE&I) Representative for the Mission Evaluation Room (MER) console."""

# Tokenize, then POS-tag; ne_chunk is given the tagged tokens.
tokenized_words = word_tokenize(sentence)
tagged_words = nltk.pos_tag(tokenized_words)

# Named-entity chunking with binary=False.
N_E_R = nltk.ne_chunk(tagged_words, binary=False)

print(N_E_R)

In [None]:
# N_E_R.draw()

## WordNet

In [None]:
from nltk.corpus import wordnet



In [None]:
# Print name, definition, examples and lemmas for every synset of 'car'.
# The loop variable is called `synset` so it does not overwrite the global
# `words` token list created in the word-tokenizing cell.
for synset in wordnet.synsets('car'):
    print(f'{synset.name()}\n{synset.definition()}\n{synset.examples()}\n')

    for lemma in synset.lemmas():
        print(lemma)
    print('\n')

### Hypernyms: Hypernyms give us a more abstract term for a word.

In [None]:
# Take the first synset returned for 'rocket' and list its hypernyms.
word = wordnet.synsets('rocket')[0]

print(word.hypernyms())

### Hyponyms: Hyponyms give us a more specific term for a word.

In [None]:
# Take the first synset returned for 'rocket' and list its hyponyms.
word = wordnet.synsets('rocket')[0]

print(word.hyponyms())

In [None]:
# Names of all hyponym synsets. Iterating the result directly calls
# hyponyms() once instead of once per index.
[hyponym.name() for hyponym in word.hyponyms()]

### Get a name only

In [None]:
# Print just the name of the first lemma of the first 'rocket' synset.
word = wordnet.synsets('rocket')[0]
print(word.lemmas()[0].name())

### Synonyms.

In [None]:
# Collect the lemma names of every synset of 'rocket'. Duplicates are kept.
# The loop variable is called `synset` so it does not overwrite the global
# `words` token list created in the word-tokenizing cell.
synonyms = []

for synset in wordnet.synsets('rocket'):
    for lemma in synset.lemmas():
        synonyms.append(lemma.name())
synonyms

### Antonyms

In [None]:
# Collect the first antonym of every lemma of 'Natural' that has one.
# The loop variable is called `synset` so it does not overwrite the global
# `words` token list created in the word-tokenizing cell.
antonyms = []

for synset in wordnet.synsets('Natural'):
    for lemma in synset.lemmas():
        # Look the antonyms up once and reuse the result.
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())
antonyms

### Synonyms and antonyms

In [None]:
# Collect synonyms (all lemma names) and antonyms (first antonym of each
# lemma that has one) for 'Natural' in a single pass.
# The loop variable is called `synset` so it does not overwrite the global
# `words` token list created in the word-tokenizing cell.
synonyms = []
antonyms = []
for synset in wordnet.synsets('Natural'):
    for lemma in synset.lemmas():
        synonyms.append(lemma.name())
        # Look the antonyms up once and reuse the result.
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())


print(f'synonyms:\n{synonyms}\n')
print(f'antonyms:\n{antonyms}\n')

###  Finding the similarity between words

In [None]:
# First noun synset for each of the two words being compared.
word1 = wordnet.synsets('ship', pos='n')[0]
word2 = wordnet.synsets('dog', pos='n')[0]

# Wu-Palmer similarity between the two synsets.
word1.wup_similarity(word2)


# Bag-of-Words

In [None]:
# from collections import Counter

# text=text.lower()
# Counter(word_tokenize(text))

# Counter(text).most_common(2)
# Counter(word_tokenize(text)).most_common(2)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# sentences=['Jim is a person', 'Jim likes to fly.', 'Jim wants to walk his dog.']

# Unigram counts with scikit-learn's built-in English stopword list removed.
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1))

# One row per sentence, one column per vocabulary term.
B_O_W = cv.fit_transform(sentences).toarray()

# Total count of each term across all sentences.
sum_words = B_O_W.sum(axis=0)

# vocabulary_ maps term -> column index, which is used to look up its total.
words_freq = [(word, sum_words[idx]) for word, idx in cv.vocabulary_.items()]
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)


print(cv.vocabulary_)
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() returns an array, converted here to a plain list
# so the printed form stays a list of strings.
print(cv.get_feature_names_out().tolist())
print(B_O_W)
print(f'BOW shape: {B_O_W.shape}')
print(sum_words)
print(words_freq)

In [None]:
# Number of most frequent terms to keep.
n = 20
top_words = words_freq[:n]

# Column names are passed to the constructor rather than assigned afterwards,
# so an empty `top_words` still gives a correctly labelled (empty) frame
# instead of raising a length-mismatch error.
top_df = pd.DataFrame(top_words, columns=["Word", "Freq"])

top_df.head()

In [None]:
# Widen the default figure so the word labels fit.
sns.set(rc={'figure.figsize': (15, 8)})

# One bar per word, bar height is its frequency.
g = sns.barplot(x="Word", y="Freq", data=top_df)

# Re-apply the existing tick labels rotated by 30 degrees.
g.set_xticklabels(g.get_xticklabels(), rotation=30)

In [None]:
### 

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights with English stopwords removed; norm=None leaves the rows
# un-normalised.
vectorizer = TfidfVectorizer(norm=None, stop_words='english')

# Dense array: one row per sentence, one column per vocabulary term.
X = vectorizer.fit_transform(sentences).toarray()

print(vectorizer.vocabulary_)
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() returns an array, converted here to a plain list
# so the printed form stays a list of strings.
print(vectorizer.get_feature_names_out().tolist())
print(X)

In [None]:
# Dimensions of the dense TF-IDF array built in the previous cell.
X.shape

In [None]:
# """Example of Python client calling Knowledge Graph Search API."""
# from __future__ import print_function
# import json
# import urllib

# api_key = open('.api_key').read()
# query = 'Taylor Swift'
# service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
# params = {
#     'query': query,
#     'limit': 10,
#     'indent': True,
#     'key': api_key,
# }
# url = service_url + '?' + urllib.urlencode(params)
# response = json.loads(urllib.urlopen(url).read())
# for element in response['itemListElement']:
#   print(element['result']['name'] + ' (' + str(element['resultScore']) + ')')
