In [70]:
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import ngrams
from string import punctuation
from nltk import FreqDist
from collections import Counter

### Train Data

In [4]:
train = '''One of the great things about being able to choose your own courses is that you get the opportunity to explore. You can try classes in a lot of different subjects, or you can dive right into a favorite subject. You may choose to begin training for a career right away. Or you may pick a major after taking some time to check out your options. Colleges offer classes and majors in subjects you’ve studied in high school — plus many more that you haven’t.'''

### CaseFold

In [12]:
train_folded = train.casefold()

### Tokenizer

In [13]:
train_token = word_tokenize(train_folded)
print(train_token)

['one', 'of', 'the', 'great', 'things', 'about', 'being', 'able', 'to', 'choose', 'your', 'own', 'courses', 'is', 'that', 'you', 'get', 'the', 'opportunity', 'to', 'explore', '.', 'you', 'can', 'try', 'classes', 'in', 'a', 'lot', 'of', 'different', 'subjects', ',', 'or', 'you', 'can', 'dive', 'right', 'into', 'a', 'favorite', 'subject', '.', 'you', 'may', 'choose', 'to', 'begin', 'training', 'for', 'a', 'career', 'right', 'away', '.', 'or', 'you', 'may', 'pick', 'a', 'major', 'after', 'taking', 'some', 'time', 'to', 'check', 'out', 'your', 'options', '.', 'colleges', 'offer', 'classes', 'and', 'majors', 'in', 'subjects', 'you', '’', 've', 'studied', 'in', 'high', 'school', '—', 'plus', 'many', 'more', 'that', 'you', 'haven', '’', 't', '.']


### StopWords Removal 

In [44]:
stop_words = set(stopwords.words('english'))

train_filtered = []
for word in train_token:
    if word not in stop_words and word not in punctuation+"’"+'"'+'"'+'-'+'''+'''+'—':
        train_filtered.append(word)
print(train_filtered)

['one', 'great', 'things', 'able', 'choose', 'courses', 'get', 'opportunity', 'explore', 'try', 'classes', 'lot', 'different', 'subjects', 'dive', 'right', 'favorite', 'subject', 'may', 'choose', 'begin', 'training', 'career', 'right', 'away', 'may', 'pick', 'major', 'taking', 'time', 'check', 'options', 'colleges', 'offer', 'classes', 'majors', 'subjects', 'studied', 'high', 'school', 'plus', 'many']


### Lemmatizer

In [11]:
lemma = WordNetLemmatizer()
train_lem = []
for word in train_filtered:
    train_lem.append(lemma.lemmatize(word))
print(train_lem)

['one', 'great', 'thing', 'able', 'choose', 'course', 'get', 'opportunity', 'explore', 'try', 'class', 'lot', 'different', 'subject', 'dive', 'right', 'favorite', 'subject', 'may', 'choose', 'begin', 'training', 'career', 'right', 'away', 'may', 'pick', 'major', 'taking', 'time', 'check', 'option', 'college', 'offer', 'class', 'major', 'subject', 'studied', 'high', 'school', 'plus', 'many']


## N-Grams

In [35]:
def generate_N_grams(words,ngram=1):
    # Generate list of copies of words with each element length less than the previous element.
    terms = [words[i:] for i in range(0,ngram)]
    
    #Combines the lists to a set of specified ngrams
    temp = zip(*terms) # * specifies that terms have multiple elements.
    
    rs = [' '.join(ngram) for ngram in temp] #Combines the set with a space to make the specified ngram list
    return rs

In [39]:
#Unigram
train_unigram = generate_N_grams(train_lem,1)
print(train_unigram)

['one', 'great', 'thing', 'able', 'choose', 'course', 'get', 'opportunity', 'explore', 'try', 'class', 'lot', 'different', 'subject', 'dive', 'right', 'favorite', 'subject', 'may', 'choose', 'begin', 'training', 'career', 'right', 'away', 'may', 'pick', 'major', 'taking', 'time', 'check', 'option', 'college', 'offer', 'class', 'major', 'subject', 'studied', 'high', 'school', 'plus', 'many']


In [40]:
#Bigram
train_bigram = generate_N_grams(train_lem,2)
print(train_unigram)

['one', 'great', 'thing', 'able', 'choose', 'course', 'get', 'opportunity', 'explore', 'try', 'class', 'lot', 'different', 'subject', 'dive', 'right', 'favorite', 'subject', 'may', 'choose', 'begin', 'training', 'career', 'right', 'away', 'may', 'pick', 'major', 'taking', 'time', 'check', 'option', 'college', 'offer', 'class', 'major', 'subject', 'studied', 'high', 'school', 'plus', 'many']


In [41]:
#Trigram
train_trigram = generate_N_grams(train_lem,3)
print(train_trigram)

['one great thing', 'great thing able', 'thing able choose', 'able choose course', 'choose course get', 'course get opportunity', 'get opportunity explore', 'opportunity explore try', 'explore try class', 'try class lot', 'class lot different', 'lot different subject', 'different subject dive', 'subject dive right', 'dive right favorite', 'right favorite subject', 'favorite subject may', 'subject may choose', 'may choose begin', 'choose begin training', 'begin training career', 'training career right', 'career right away', 'right away may', 'away may pick', 'may pick major', 'pick major taking', 'major taking time', 'taking time check', 'time check option', 'check option college', 'option college offer', 'college offer class', 'offer class major', 'class major subject', 'major subject studied', 'subject studied high', 'studied high school', 'high school plus', 'school plus many']


In [46]:
# generate frequency of n-grams
freq_bi = FreqDist(train_bigram)
freq_tri = FreqDist(train_trigram)

In [49]:
freq_tri

FreqDist({'one great thing': 1, 'great thing able': 1, 'thing able choose': 1, 'able choose course': 1, 'choose course get': 1, 'course get opportunity': 1, 'get opportunity explore': 1, 'opportunity explore try': 1, 'explore try class': 1, 'try class lot': 1, ...})

In [71]:
d = Counter()
for el in freq_tri:
    sub_el = ' '.join(el.split(' ')[:-1])
    d[sub_el] += freq_tri[el]

In [72]:
print(d)

Counter({'one great': 1, 'great thing': 1, 'thing able': 1, 'able choose': 1, 'choose course': 1, 'course get': 1, 'get opportunity': 1, 'opportunity explore': 1, 'explore try': 1, 'try class': 1, 'class lot': 1, 'lot different': 1, 'different subject': 1, 'subject dive': 1, 'dive right': 1, 'right favorite': 1, 'favorite subject': 1, 'subject may': 1, 'may choose': 1, 'choose begin': 1, 'begin training': 1, 'training career': 1, 'career right': 1, 'right away': 1, 'away may': 1, 'may pick': 1, 'pick major': 1, 'major taking': 1, 'taking time': 1, 'time check': 1, 'check option': 1, 'option college': 1, 'college offer': 1, 'offer class': 1, 'class major': 1, 'major subject': 1, 'subject studied': 1, 'studied high': 1, 'high school': 1, 'school plus': 1})
