In [2]:
import nltk

In [None]:
# Fetch the NLTK resources used in this notebook non-interactively.
# nltk.download_shell() opens an interactive prompt that waits for keyboard
# input, which stalls "Restart Kernel -> Run All"; explicit downloads do not.
for resource in ('gutenberg', 'stopwords', 'punkt', 'wordnet', 'omw-1.4', 'movie_reviews'):
    nltk.download(resource)

In [None]:
# Make sure the Gutenberg sample corpus is available locally.
nltk.download('gutenberg')

In [None]:
# Short alias for the Gutenberg corpus reader, then list the file ids it exposes.
gb = nltk.corpus.gutenberg
gutenberg_file_ids = gb.fileids()
print("Gutenberg files:", gutenberg_file_ids)


In [None]:
# Word tokens of Macbeth, read from the Gutenberg corpus.
gutenberg_reader = nltk.corpus.gutenberg
macbeth = gutenberg_reader.words('shakespeare-macbeth.txt')

In [None]:
# Number of tokens in the text.
len(macbeth)

In [None]:
# Peek at the first ten tokens.
macbeth[:10]

In [None]:
# Same text, but split into sentences (each sentence is a list of tokens).
macbeth_reader = nltk.corpus.gutenberg
macbeth_sents = macbeth_reader.sents('shakespeare-macbeth.txt')
macbeth_sents[:5]


In [None]:
# Wrap the token sequence in nltk.Text to get its exploratory helpers,
# then print every occurrence of 'Stage' together with its surrounding words.
text = nltk.Text(macbeth)
text.concordance('Stage')


In [None]:
# Print the contexts in which 'Stage' appears in the text.
text.common_contexts(['Stage'])

In [None]:
# Print words that occur in contexts similar to those of 'Stage'.
text.similar('Stage')

In [None]:
# Frequency distribution over the raw tokens (no filtering yet).
fd = nltk.FreqDist(macbeth)

In [None]:
# Ten most frequent tokens with their counts.
fd.most_common(10)

In [None]:
# The stopword lists are a separate NLTK resource; fetch them before first use.
nltk.download('stopwords')

In [None]:
# English stopwords as a set for fast membership tests; show its size and a sample.
english_stopword_list = nltk.corpus.stopwords.words('english')
sw = set(english_stopword_list)
print(len(sw))
list(sw)[:10]


In [None]:
# Drop stopwords (compared case-insensitively) but keep each token's original casing,
# then recount.
macbeth_filtered = [
    token
    for token in macbeth
    if token.lower() not in sw
]
fd = nltk.FreqDist(macbeth_filtered)
fd.most_common(10)


In [None]:
import string
# Single punctuation characters to discard.
punctuation = set(string.punctuation)
# Lower-case every token once, then keep those that are neither a stopword
# nor a single punctuation character.
macbeth_filtered2 = [
    lowered
    for lowered in (token.lower() for token in macbeth)
    if not (lowered in sw or lowered in punctuation)
]


In [None]:
# Recount on the lower-cased, stopword- and punctuation-free tokens.
fd = nltk.FreqDist(macbeth_filtered2)
top_ten = fd.most_common(10)
top_ten


In [None]:
# Tokens longer than twelve characters.
long_words = [token for token in macbeth if len(token) > 12]

In [None]:
# Show the long tokens in sorted order (duplicates are kept).
sorted(long_words)

In [None]:
# Distinct tokens containing the substring 'ious', displayed alphabetically.
ious_words = {token for token in macbeth if 'ious' in token}
sorted(ious_words)


In [None]:
# Count adjacent token pairs in the filtered text and show the 15 most frequent.
bigram_pairs = nltk.bigrams(macbeth_filtered2)
bgrms = nltk.FreqDist(bigram_pairs)
bgrms.most_common(15)


In [None]:
# Same for adjacent token triples; show the 10 most frequent.
trigram_triples = nltk.trigrams(macbeth_filtered2)
tgrms = nltk.FreqDist(trigram_triples)
tgrms.most_common(10)


# Preprocessing steps

In [None]:
# Case normalisation: produce a lower-cased copy of the sample sentence.
text = 'This is a Demo Sentence'
lower_text = str.lower(text)
lower_text

In [None]:
# Download the 'punkt' resource before calling the tokenizer below.
nltk.download('punkt')

# Word tokenisation: split the sample sentence into word tokens.
text = 'This is a Demo Sentence'
tokens = nltk.word_tokenize(text)
tokens

In [None]:
# Sentence tokenisation: here `tokens` holds whole sentences, not words.
text = 'This is a Demo Sentence. This is another sentence'
tokens = nltk.sent_tokenize(text)
tokens

In [None]:
from nltk.tokenize import RegexpTokenizer

# Tokenise with a regular expression that matches runs of ASCII letters/digits,
# so punctuation and symbols never make it into the token list.
text = 'This% is a #!!@ Sentence full of punctuation marks :-) '
alnum_run = r'[a-zA-Z0-9]+'
regexpt = RegexpTokenizer(alnum_run)
tokens = regexpt.tokenize(text)
tokens

In [None]:
# Ensure the stopword lists are available (no-op if already downloaded).
nltk.download('stopwords')

from nltk.corpus import stopwords

# Stopword removal on a small sample.
text = 'This is a Demo Sentence. This is another sentence'
eng_sw = stopwords.words('english')
# Set copy for constant-time membership tests; eng_sw itself stays a list.
eng_sw_lookup = set(eng_sw)
tokens = nltk.word_tokenize(text)
# NLTK's English stopword list is lower-case, so compare the lower-cased token:
# otherwise capitalised stopwords such as 'This' slip through the filter.
# The original casing of the tokens that are kept is preserved.
clean_tokens = [word for word in tokens if word.lower() not in eng_sw_lookup]
clean_tokens

In [4]:
from nltk.stem import SnowballStemmer

# Stemming: reduce each token of the sample to its Snowball (English) stem.
text = 'This operation operates for the operator curiosity. A decisive decision'
stemmer = SnowballStemmer('english')
tokens = nltk.word_tokenize(text)
stemmed_tokens = list(map(stemmer.stem, tokens))
print(stemmed_tokens)

['this', 'oper', 'oper', 'for', 'the', 'oper', 'curios', '.', 'a', 'decis', 'decis']


In [9]:
# Download the WordNet resources before the lemmatizer in this cell is used.
nltk.download('omw-1.4')
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

# Lemmatisation: map each token to its WordNet lemma.
# lemmatize() is called with the token only, i.e. with its default part of speech.
text = 'A verb: I split, it splits. Splitted verbs.'
tokens = nltk.word_tokenize(text)
lmtzr = WordNetLemmatizer()

lemma_tokens = list(map(lmtzr.lemmatize, tokens))
print(lemma_tokens)


['A', 'verb', ':', 'I', 'split', ',', 'it', 'split', '.', 'Splitted', 'verb', '.']


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nelli\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nelli\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Using Text from the Web

In [None]:
from urllib import request

In [None]:
# Download a plain-text book from Project Gutenberg and decode it to str.
url = "https://www.gutenberg.org/files/2554/2554-0.txt"
# The with-block closes the HTTP response as soon as the body has been read,
# instead of leaving the connection open until garbage collection.
with request.urlopen(url) as response:
    raw = response.read().decode('utf8')


In [None]:
# Fetch an HTML page and show the beginning of its markup.
url = "https://news.bbc.co.uk/2/hi/health/2284783.stm"
# Read inside a with-block so the HTTP response is always closed.
with request.urlopen(url) as page:
    html = page.read().decode('utf8')
html[:120]


In [None]:
from bs4 import BeautifulSoup
# Strip the markup from the downloaded page, tokenise the remaining text and
# wrap it in nltk.Text for exploration.
soup = BeautifulSoup(html, "lxml")
raw = soup.get_text()
tokens = nltk.word_tokenize(raw)
text = nltk.Text(tokens)


# Sentiment Analysis

In [None]:
# Fetch the movie_reviews corpus used for the classification example.
nltk.download('movie_reviews')

In [None]:
import random
reviews = nltk.corpus.movie_reviews
# One (token list, category) pair per review file, for every category the
# corpus reader reports.
documents = [
    (list(reviews.words(fileid)), category)
    for category in reviews.categories()
    for fileid in reviews.fileids(category)
]
# Shuffle with a dedicated, seeded generator: the document order (and every
# slice taken from it later) is then the same on each run, and the global
# `random` state is left untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(documents)


In [None]:
# Re-join the tokens of the first document to read it as running text.
first_tokens = documents[0][0]
first_review = ' '.join(first_tokens)
print(first_review)


In [None]:
# Category label stored alongside the first document.
documents[0][1]

In [None]:
# Frequencies of the lower-cased tokens over the whole corpus.
all_words = nltk.FreqDist(w.lower() for w in reviews.words())
# Keep only the most frequent tokens as features. document_features() emits one
# dict entry per feature word for every document, so using the entire vocabulary
# makes the feature sets needlessly large and slow to build and train on.
NUM_WORD_FEATURES = 2000
word_features = [word for word, _count in all_words.most_common(NUM_WORD_FEATURES)]


In [None]:
def document_features(document, word_features):
    """Map each candidate feature word to whether it occurs in ``document``.

    Args:
        document: iterable of tokens making up one document.
        word_features: iterable of words to use as features.

    Returns:
        dict keyed by the string form of each feature word; the value is True
        when that word is among the document's tokens, False otherwise.
    """
    # Build the token set once so that every membership test is a hash lookup.
    present = set(document)
    return {'{}'.format(word): word in present for word in word_features}


In [None]:
# Turn every (tokens, label) pair into a (feature dict, label) pair.
featuresets = [
    (document_features(doc_tokens, word_features), label)
    for doc_tokens, label in documents
]
len(featuresets)

In [None]:
# Hold out the first TEST_SIZE feature sets for evaluation and train on all the rest.
# Splitting at a single index keeps the two parts disjoint and leaves no labelled
# document unused (slicing the training part from index 1500 skipped 500..1499).
TEST_SIZE = 500
train_set, test_set = featuresets[TEST_SIZE:], featuresets[:TEST_SIZE]
est_set = test_set  # former, misspelled name; kept in case another cell still refers to it
classifier = nltk.NaiveBayesClassifier.train(train_set)



In [None]:
# Print the ten features the trained classifier ranks as most informative.
classifier.show_most_informative_features(10)