In [None]:
from collections import Counter
from pathlib import Path
import pickle
import re
import string
from typing import List

import nltk
from nltk.stem import PorterStemmer

# Fetch the NLTK resources this notebook relies on.
# 'punkt' backs nltk.word_tokenize on older NLTK releases; NLTK >= 3.9 looks for
# 'punkt_tab' instead, so both are requested to keep a fresh kernel working.
# 'stopwords' provides the English stop-word list used during tokenization.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)
stop_words = set(nltk.corpus.stopwords.words('english'))

from src import utils

# Directory layout, anchored at whatever utils.get_project_root() returns.
ROOT_DIR = utils.get_project_root()
DATA_DIR = ROOT_DIR / 'data'
DATA_RAW_DIR = DATA_DIR / 'raw/cs-410'
# The spelling "INTERMEDATE" is kept as-is: other cells refer to this name.
INTERMEDATE_DATA_DIR = DATA_DIR / 'intermediate'

### Punctuation used in string module

In [None]:
# Display the characters that the tokenization cell strips from every sentence.
string.punctuation

In [None]:
# Disabled alternative: collect every transcript segment and build a
# `Vocabulary` object from them.
# NOTE(review): `Vocabulary` is not imported in the cells shown here - confirm
# where it is defined before re-enabling this cell.
# with open(Path.joinpath(INTERMEDATE_DATA_DIR, 'transcripts.pkl'), 'rb') as f:
#         transcripts = pickle.load(f)

# all_segments = [] 
# for transcript_segments in transcripts.values():
#         all_segments.extend(transcript_segments)
# vocab = Vocabulary(all_segments, remove_stop_words=True, combine_ngrams=False, stem_words=False)

### Load processed transcripts and add text to list

In [None]:
# NOTE: pickle.load can execute arbitrary code, so only open pickle files that
# come from a trusted source.
with open(INTERMEDATE_DATA_DIR / 'transcripts.pkl', 'rb') as f:
    transcripts = pickle.load(f)

# Join the `.text` of every segment of every transcript with single spaces,
# lowercase the result, and split on '.' as a rough sentence boundary.
all_text = ' '.join(
    segment.text
    for transcript_segments in transcripts.values()
    for segment in transcript_segments
).lower().split('.')

In [None]:
# Both helpers are identical for every sentence, so build them once instead of
# on each loop iteration.
punctuation_table = str.maketrans('', '', string.punctuation)  # deletes all punctuation
number_pattern = re.compile(r"\b\d+[s]{0,1}\b")                # digit runs, optionally followed by one 's'

token_list = []
for sentence in all_text:
    # Punctuation is removed before tokenizing, so tokens never contain it.
    cleaned = sentence.strip().translate(punctuation_table)
    # Collapse every number into the single placeholder token 'NUMBER'.
    cleaned = number_pattern.sub('NUMBER', cleaned)

    # Tokenize and drop English stop words. A sentence that ends up with no
    # tokens still contributes an (empty) list, keeping token_list aligned
    # with all_text.
    tokens = [word for word in nltk.word_tokenize(cleaned) if word not in stop_words]
    token_list.append(tokens)

token_list[:3]

## Count Most Common N-Grams

In [None]:
def count_n_grams(tokenized_sentences: List[List[str]], n: int) -> Counter:
    '''
    Count every n-gram that occurs inside a sentence of a tokenized corpus.

    N-grams never cross sentence boundaries and no padding symbols are used, so
    a sentence with fewer than n tokens contributes nothing. Each n-gram is
    keyed by its tokens joined with '_' (e.g. 'a_b' for the bi-gram a, b).

    Input:
        - tokenized_sentences: List[List[str]] - list of tokenized sentences
        - n: int - size of n-gram (must be >= 1)

    Return: Counter - count of all n-grams

    Raises: ValueError - if n is smaller than 1
    '''
    if n < 1:
        raise ValueError('n must be >= 1, got {}'.format(n))

    counts = Counter()
    for sentence in tokenized_sentences:
        tokens = list(sentence)  # allow any iterable of tokens, not only lists
        # Zipping n successively shifted views of the sentence yields each run
        # of n consecutive tokens; zip stops at the shortest view, so no
        # partial n-grams are produced and no padding has to be filtered out.
        shifted_views = (tokens[start:] for start in range(n))
        counts.update('_'.join(n_gram) for n_gram in zip(*shifted_views))
    return counts

### Most Common 4-Grams

In [None]:
# Count every 4-gram in the tokenized sentences and show the 100 most frequent.
count_4_grams = count_n_grams(token_list, n=4)
count_4_grams.most_common(100)

### Most Common Tri-Grams

In [None]:
# Count every tri-gram in the tokenized sentences and show the 200 most frequent.
count_3_grams = count_n_grams(token_list, n=3)
count_3_grams.most_common(200)

### Most Common Bi-Grams

In [None]:
# Count every bi-gram in the tokenized sentences and show the 200 most frequent.
count_2_grams = count_n_grams(token_list, n=2)
count_2_grams.most_common(200)

### Create vocabulary set from tokens

In [None]:
# Distinct tokens across all tokenized sentences, as a sorted list.
vocab = sorted({token for sentence_tokens in token_list for token in sentence_tokens})
vocab[:10]

### Vocabulary Size

In [None]:
len(vocab)

### Porter Stemmer

In [None]:
# Stem every vocabulary word with NLTK's Porter stemmer and keep the distinct
# stems as a sorted list. PorterStemmer is imported in the notebook's import cell.
stemmer = PorterStemmer()
vocab_stemmed = sorted({stemmer.stem(word) for word in vocab})
vocab_stemmed[:10]

### Size of stemmed vocabulary

In [None]:
len(vocab_stemmed)

### Effects of Porter Stemmer

In [None]:
# Print each example word next to its Porter stem to show which related word
# forms the stemmer maps together. Reuses the `stemmer` instance created in the
# "Porter Stemmer" cell instead of importing and constructing it a second time.
tokens = ['probability', 'probabilistic', 'vector', 'vectors', 'word', 'words', 'computer', 'computation', 'computational']
tokens_stemmed = [stemmer.stem(word) for word in tokens]
for word, stem in zip(tokens, tokens_stemmed):
    print(word, '-', stem)