## Creating n-grams from text

#### Users MUST shutdown kernels!!!

In [1]:
#import nltk
#nltk.download('popular', halt_on_error=False)

In [2]:
import pandas as pd
import nltk as nltk

In [3]:
book = '/project/msca/kadochnikov/data/books/book_11-0.txt'
book_short = '/project/msca/kadochnikov/data/books/3boat10_short.txt'
book_out = '/project/msca/kadochnikov/books/book_11-0_out.txt'

### Find the ten most common words in book - what is this book about?

In [4]:
import re
from collections import Counter
words = re.findall(r'\w+', open(book).read())
Counter(words).most_common(10)

[('the', 1686),
 ('and', 869),
 ('to', 799),
 ('a', 672),
 ('of', 606),
 ('I', 543),
 ('it', 540),
 ('she', 509),
 ('said', 456),
 ('in', 414)]

In [5]:
#lowercasing the words
import re
from collections import Counter
words = re.findall(r'\w+', open(book).read().lower())
Counter(words).most_common(10)

[('the', 1818),
 ('and', 940),
 ('to', 809),
 ('a', 690),
 ('of', 631),
 ('it', 610),
 ('she', 553),
 ('i', 543),
 ('you', 481),
 ('said', 462)]

### Concept of n-grams

In [6]:
sentence = 'quick brown fox jumps over the lazy dog'
n = 5
kgrams = nltk.ngrams(sentence.split(), n)
for grams in kgrams:
    print (grams)

('quick', 'brown', 'fox', 'jumps', 'over')
('brown', 'fox', 'jumps', 'over', 'the')
('fox', 'jumps', 'over', 'the', 'lazy')
('jumps', 'over', 'the', 'lazy', 'dog')


In [7]:
#applying NLTK
f = open(book)
raw = f.read()

words = nltk.tokenize.word_tokenize(raw)
fdist = nltk.FreqDist(words)

print(fdist)

#fdist.items() - will give all words
fdist.most_common(10)

<FreqDist with 3722 samples and 38329 outcomes>


[(',', 2565),
 ('’', 1769),
 ('the', 1681),
 ('‘', 1116),
 ('.', 901),
 ('and', 863),
 ('to', 794),
 ('a', 671),
 ('of', 603),
 ('I', 543)]

In [8]:
fdist_df = pd.DataFrame(fdist.most_common(),
                    columns=['Word', 'Frequency'])
fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,",",2565
1,’,1769
2,the,1681
3,‘,1116
4,.,901
5,and,863
6,to,794
7,a,671
8,of,603
9,I,543


### Exploring bi-grams and tri-grams

In [9]:
f = open(book)
raw = f.read()

tokens = nltk.word_tokenize(raw)

#Create your bigrams or trigrams
bgs = nltk.bigrams(tokens)
tgs = nltk.trigrams(tokens)

#compute frequency distribution for all the bigrams in the text
fdist_2 = nltk.FreqDist(bgs)
fdist_3 = nltk.FreqDist(tgs)

fdist_2_df = pd.DataFrame(fdist_2.most_common(),
                    columns=['Word', 'Frequency'])

fdist_3_df = pd.DataFrame(fdist_3.most_common(),
                    columns=['Word', 'Frequency'])

In [10]:
print(fdist_2_df.shape)

fdist_2_df.head(n=10)

(17292, 2)


Unnamed: 0,Word,Frequency
0,"(,, and)",456
1,"(., ‘)",427
2,"(,, ’)",397
3,"(’, said)",329
4,"(!, ’)",282
5,"(’, ‘)",241
6,"(’, t)",216
7,"(said, the)",206
8,"(’, s)",204
9,"(,, ‘)",199


In [11]:
print(fdist_3_df.shape)

fdist_3_df.head(n=10)

(28479, 2)


Unnamed: 0,Word,Frequency
0,"(,, ’, said)",215
1,"(’, said, the)",203
2,"(’, said, Alice)",115
3,"(., ‘, I)",69
4,"(!, ’, said)",65
5,"(,, ’, the)",59
6,"(I, ’, m)",57
7,"(*, *, *)",54
8,"(don, ’, t)",51
9,"(Alice, ., ‘)",50


### Creting custom n-grams

In [12]:
#this way we are not limited to bigrams and trigrams only.
n = 5
ngrams = nltk.ngrams(raw.split(), n)


fdist_n = nltk.FreqDist(ngrams)

fdist_n_df = pd.DataFrame(fdist_n.most_common(),
                    columns=['Word', 'Frequency'])

fdist_n_df.head(10)

Unnamed: 0,Word,Frequency
0,"(*, *, *, *, *)",48
1,"(the, Project, Gutenberg, Literary, Archive)",11
2,"(Project, Gutenberg, Literary, Archive, Founda...",9
3,"(to, the, Project, Gutenberg, Literary)",6
4,"(in, a, tone, of, great)",4
5,"(Will, you,, won’t, you,, will)",4
6,"(you,, won’t, you,, will, you,)",4
7,"(won’t, you,, will, you,, won’t)",4
8,"(you,, will, you,, won’t, you,)",4
9,"(the, terms, of, this, agreement)",4


In [13]:
#this way we are not limited to bigrams and trigrams only.
n = 10
ngrams = nltk.ngrams(raw.split(), n)


fdist_n = nltk.FreqDist(ngrams)

fdist_n_df = pd.DataFrame(fdist_n.most_common(),
                    columns=['Word', 'Frequency'])

fdist_n_df.head(10)

Unnamed: 0,Word,Frequency
0,"(*, *, *, *, *, *, *, *, *, *)",33
1,"(join, the, dance?, Will, you,, won’t, you,, w...",3
2,"(the, dance?, Will, you,, won’t, you,, will, y...",3
3,"(This, eBook, is, for, the, use, of, anyone, a...",2
4,"(eBook, is, for, the, use, of, anyone, anywher...",2
5,"(is, for, the, use, of, anyone, anywhere, at, ...",2
6,"(for, the, use, of, anyone, anywhere, at, no, ...",2
7,"(the, use, of, anyone, anywhere, at, no, cost,...",2
8,"(use, of, anyone, anywhere, at, no, cost, and,...",2
9,"(of, anyone, anywhere, at, no, cost, and, with...",2


### Saving n-grams

In [14]:
f2 = open(book_out, "w")


#Create your n-grams
n = 6
sixgrams = nltk.ngrams(raw.split(), n)

#compute frequency distribution for all the bigrams in the text
fdist = nltk.FreqDist(sixgrams)
for k,v in fdist.items():
    #print k,v
    f2.write(str(k)+"\t"+str(v)+"\n")
    
f2.close()

FileNotFoundError: [Errno 2] No such file or directory: '/project/msca/kadochnikov/books/book_11-0_out.txt'

In [None]:
!ls -l /home/kadochnikov/data/books/book_11-0_out*

## While we can also apply dictionary-based clean-up algorithms, they are not going to work on Big Data applications

In [None]:
#from nltk.corpus import stopwords

#default_stopwords = set(nltk.corpus.stopwords.words('english'))

f = open(book)
raw = f.read()

words = nltk.tokenize.word_tokenize(raw)

#stopwords = stopwords.words('english')
stopwords = set(nltk.corpus.stopwords.words('english'))

# Remove single-character tokens (mostly punctuation)
words = [word for word in words if len(word) > 1]

# Remove numbers
#words = [word for word in words if not word.isnumeric()]

# Remove punctuation
words = [word for word in words if word.isalpha()]

# Lowercase all words (default_stopwords are lowercase too)
words = [word.lower() for word in words]

# Remove stopwords
words = [word for word in words if word not in stopwords]

fdist = nltk.FreqDist(words)

print(fdist)

#fdist.items() - will give all words
fdist.most_common(10)