# Dependencies

In [None]:
import nltk
from nltk.corpus import treebank
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet
from nltk import ne_chunk
from textblob import TextBlob


import re
import urllib.request
from bs4 import BeautifulSoup
import requests
import os

# Classes

## NLP Analysis of Text

In [None]:
class NLP_My_Words():
    """Tokenize and analyze a piece of text with NLTK.

    Constructing an instance runs the whole pipeline and prints each stage:
    word tokenization, punctuation stripping, stop-word removal, word
    frequencies (with a plot), part-of-speech tagging and named entity
    recognition.

    Attributes set along the way:
        text: the input text, lower-cased unless ``case`` is not ``'y'``.
        tokens: word tokens with punctuation and English stop words removed.
        nltk_text: ``nltk.Text`` built from the punctuation-stripped tokens,
            before stop words are removed.
        sorted_tokens: sorted unique ``tokens``.
        tagged: ``(token, tag)`` pairs returned by ``nltk.pos_tag``.
        NER: result of ``ne_chunk`` over ``tagged``.
    """

    def __init__(self,text,case='y'):
        """Store the text and start the analysis chain.

        Args:
            text: raw text to analyze.
            case: ``'y'`` (default) lower-cases the text first; any other
                value keeps the original casing.
        """
        self.text = text.lower() if case == 'y' else text

        raw_word_count=len(self.text.split())
        print(f'Raw text word count: {raw_word_count}\n')
        self.tokenization()

    def tokenization(self):
        """Split the text into word tokens and strip punctuation characters.

        Tokens that become empty once punctuation is removed are dropped.
        Continues the chain with ``remove_stop_words``.
        """
        punctuation=re.compile(r'[-.?!,:;()|=/*"><\{}$&]')

        stripped = (
            punctuation.sub('', token)
            for token in nltk.word_tokenize(self.text)
        )
        self.tokens = [word for word in stripped if word]

        # Built before stop-word removal so context-based nltk.Text tools
        # still see the stop words.
        self.nltk_text=nltk.Text(self.tokens)

        self.remove_stop_words()

    def sentenance_tokenization(self):
        """Split the text into sentences, store them and return the list."""
        self.sentenance_tokens=sent_tokenize(self.text)

        return self.sentenance_tokens

    def word_freq(self):
        """Print each token's frequency, plot the top 30, then tag the words."""
        freq = nltk.FreqDist(self.tokens)

        for key,val in freq.items():
            print(f'{key}:{val}')

        freq.plot(30, cumulative=False)

        self.words_tagged()

    def remove_stop_words(self):
        """Drop English stop words from ``self.tokens`` and report the result.

        Also stores the sorted unique tokens in ``self.sorted_tokens`` and
        continues the chain with ``word_freq``.
        """
        # Load the stop-word list once and use a set for O(1) membership
        # tests, instead of reloading the corpus list for every token.
        stop_words = set(stopwords.words('english'))

        self.tokens = [token for token in self.tokens if token not in stop_words]
        self.sorted_tokens=sorted(set(self.tokens))
        num_words=len(self.tokens)
        print(f'Tokens: {self.tokens}\n')
        print(f'Tokens (sorted):{self.sorted_tokens}\n')
        print(f'Word count following stop words: {num_words}\n')
        self.word_freq()

    def words_tagged(self):
        """Part-of-speech tag the tokens, print the tags, then run NER."""
        self.tagged = nltk.pos_tag(self.tokens)

        print(f'Word Tags:{self.tagged}\n')

        self.Named_Entity_Recognition()

    def Named_Entity_Recognition(self):
        """Chunk the tagged tokens into named entities and print the result."""
        self.NER = ne_chunk(self.tagged)
        print(f'Named Entity Recognition:\n{self.NER}')

    def lexical_diversity(self):
        """Print unique tokens as a percentage of all tokens.

        An empty token list is reported as 0 instead of raising
        ``ZeroDivisionError``.
        """
        if self.tokens:
            lex_div=len(set(self.tokens))/len(self.tokens)
        else:
            lex_div=0.0
        print(f'Lexical Diversity:{(lex_div)*100}\n')
        
        

## Scraping Websites

In [None]:
class Web_Scrapping():
    """Download a web page, extract its text and run the NLP analysis on it.

    Constructing an instance runs the whole chain: fetch the HTML, reduce it
    to plain text with BeautifulSoup, then pass that text to ``NLP_My_Words``.

    Attributes set along the way:
        site: the URL given to the constructor.
        html: the raw response body (bytes).
        text: the text extracted from ``html``.
        analysis: the ``NLP_My_Words`` instance built from ``text``.
    """

    def __init__(self,site):
        """Store the URL and start the scrape.

        Args:
            site: URL of the page to download.
        """
        self.site=site
        self.web_scrape_html()

    def web_scrape_html(self):
        """Retrieve the site's HTML, then continue with text extraction."""
        import urllib.request

        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(self.site) as response:
            self.html = response.read()

        self.web_scrape_text()

    def web_scrape_text(self):
        """Extract the text from the downloaded HTML, then analyze it."""
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(self.html,"html5lib")

        self.text = soup.get_text(strip=True)

        self.analysis_of_site_text()

    def analysis_of_site_text(self):
        """Run ``NLP_My_Words`` on the page text and keep the result."""
        # Kept on the instance so the tokens, tags, etc. can be inspected
        # after construction instead of being thrown away.
        self.analysis=NLP_My_Words(self.text)
        
        
    

## Wordnet

In [None]:
class Wordnet_Stuff():
    """Look a word up in WordNet and print what is found.

    Constructing an instance prints the word, its first definition, the
    lemma names of all its synsets, and the first synset's definition and
    examples.
    """

    def __init__(self,word):
        """Store the word and print all the lookups.

        Args:
            word: the word to look up.
        """
        self.word=word
        print(f'The word is:{self.word}\n')
        self.definitions()
        self.syn_lem()
        self.synonym()

    def _synsets(self):
        """Return the list of WordNet synsets for the word (may be empty)."""
        from nltk.corpus import wordnet

        return wordnet.synsets(self.word)

    def definitions(self):
        """Print the definition of the word's first synset."""
        syn = self._synsets()
        if not syn:
            # Words unknown to WordNet have no synsets; indexing [0] would
            # raise IndexError.
            print(f'Definition:No WordNet entry found for {self.word!r}\n')
            return
        print(f'Definition:{syn[0].definition()}\n')

    def syn_lem(self):
        """Print the lemma names of every synset of the word."""
        synonyms = [
            lemma.name()
            for syn in self._synsets()
            for lemma in syn.lemmas()
        ]

        print(f'sny_lem:{synonyms}\n')

    def synonym(self):
        """Print the first synset's definition and usage examples."""
        syn = self._synsets()
        if not syn:
            print(f'syn Definition:No WordNet entry found for {self.word!r}\n')
            return

        print(f'syn Definition:{syn[0].definition()}\n')

        print(f'syn Examples:{syn[0].examples()}\n')


## TextBlob

In [None]:
class Sentiment():
    """Compute TextBlob sentiment scores for a piece of text."""

    def __init__(self,text):
        """Store the text; nothing is scored until ``perform_sentiment``.

        Args:
            text: the text to score.
        """
        self.text=text

    def call_sentenance_tokens(self):
        """Placeholder; not implemented yet."""
        pass

    def perform_sentiment(self):
        """Print the polarity and subjectivity of the stored text."""
        # Score the instance's own text once, rather than a global ``text``
        # through lambdas called with an undefined argument.
        sentiment=TextBlob(self.text).sentiment

        the_polarity=sentiment.polarity
        the_subjectivity=sentiment.subjectivity

        y=f'Polarity:{the_polarity}\nSubjectivity:{the_subjectivity}'

        print(y)
        

# Functions

In [None]:
def url_to_transcript(url):
    """Return the paragraph texts from a page's "shortDescription-col" element.

    Args:
        url: address of the page to download.

    Returns:
        A list with the text of each ``<p>`` inside the first element whose
        class is ``shortDescription-col``; an empty list when the page has no
        such element.
    """
    page=requests.get(url).text
    soup=BeautifulSoup(page,"html5lib")
    container=soup.find(class_="shortDescription-col")
    print(url)
    if container is None:
        # find() returns None when nothing matches; calling find_all on it
        # would raise AttributeError.
        return []
    return [p.text for p in container.find_all('p')]

In [None]:
def read_url_text(url, max_chars=1000):
    """Download a UTF-8 text resource and return the start of it.

    Prints the length of the full decoded text.

    Args:
        url: address of the resource to read.
        max_chars: number of leading characters to return (default 1000);
            ``None`` returns the whole text.

    Returns:
        The first ``max_chars`` characters of the decoded text.
    """
    from urllib import request

    # The context manager closes the connection once the body is read.
    with request.urlopen(url) as response:
        raw = response.read().decode('utf8')

    print(f'Length of the text file:{len(raw)}\n')

    return raw[:max_chars]

# Main

## Text Data

In [None]:
# Open the text dump with the application associated with .txt files.
# NOTE: os.startfile is only available on Windows.
os.startfile('C:/Users/Crystal/Desktop/sidebars/nlp_text_dump.txt')

In [None]:
# Paths of the available input files; only `base` is read below.
base='C:/Users/Crystal/Desktop/sidebars/nlp_text_dump.txt'
other='C:/Users/Crystal/Desktop/sidebars/Dowd Jimmie - 09022019.txt'
# Read the whole file; the context manager closes the handle afterwards.
with open(base) as f:
    text = f.read()


In [None]:
# Analyze the loaded text. case='y' lower-cases it, and the constructor runs
# the whole chain (tokenize, remove stop words, frequencies, tagging, NER),
# printing each stage.
w=NLP_My_Words(text,'y')

In [None]:
# Show the nltk.Text object; tokenization() builds it from the
# punctuation-stripped tokens, before stop words are removed.
w.nltk_text

In [None]:
# Print the collocations (word pairs that frequently occur together) in the text.
w.nltk_text.collocations()

In [None]:
# Print every occurrence of 'park' together with its surrounding context.
w.nltk_text.concordance('park')

In [None]:
# Print words that occur in the same contexts as 'poop'.
w.nltk_text.similar('poop')

In [None]:
# Print the contexts that 'dog' and 'jim' have in common.
w.nltk_text.common_contexts(['dog','jim'])

In [None]:
# Plot the positions at which each of these words occurs across the text.
w.nltk_text.dispersion_plot(['jim','dog','walked'])

In [None]:
# Plot the frequency distribution of the text's vocabulary.
w.nltk_text.plot()

In [None]:
# Print unique tokens as a percentage of all tokens (stop words already removed).
w.lexical_diversity()

## Scraping

In [None]:
# Page to scrape in the cells below.
site='https://www.oreilly.com/online-learning/enterprise.html'

In [None]:
# Constructing the scraper downloads the page, extracts its text and runs
# NLP_My_Words on that text.
s=Web_Scrapping(site=site)

## Word Study

In [None]:
# Constructing Wordnet_Stuff prints the word's first definition, the lemma
# names of all its synsets, and the first synset's examples.
word_study=Wordnet_Stuff('Natural')

## Sentiment Analysis

In [None]:
# The constructor only stores the text; no scores are computed until
# perform_sentiment() is called.
sent_analy=Sentiment(text)

# Extras

In [None]:
TextBlob(text).sentiment

def pol(x):
    """Return the TextBlob polarity score of the string *x*."""
    # Score the argument itself rather than the global `text`.
    return TextBlob(x).sentiment.polarity

In [None]:
# Polarity score of the loaded text.
pol(text)

In [None]:
# Take the first parsed sentence of the treebank file 'wsj_0001.mrg'
# and draw its parse tree.
t = treebank.parsed_sents('wsj_0001.mrg')[0]
t.draw()

In [None]:
# Collect the <p> texts under the element with class "shortDescription-col"
# on the page at `site`.
url_to_transcript(site)

In [None]:
# Download the text at this URL; read_url_text prints its full length and
# returns only the first 1000 characters.
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
raw=read_url_text(url)

In [None]:
# Run the full analysis chain on the downloaded excerpt (case defaults to
# 'y', so the text is lower-cased first).
z=NLP_My_Words(raw)

In [None]:
# Sample word list with duplicates, de-duplicated in the cells below.
# NOTE(review): looks like lemma names printed by Wordnet_Stuff.syn_lem — confirm.
q=['natural', 'natural', 'cancel', 'natural', 'natural', 'natural', 'natural', 'natural', 'natural', 'natural', 'instinctive', 'natural', 'raw', 'rude', 'natural', 'natural', 'born', 'innate', 'lifelike', 'natural']

In [None]:
# Unique words (unordered).
set(q)

In [None]:
# Unique words in sorted order.
sorted(set(q))