In [88]:
from collections import Counter
import datetime
import nltk
from nltk.text import TextCollection
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
from scipy import spatial
import seaborn as sns
from matplotlib import pyplot as plt
import string
import re

from typing import List
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

from src import utils
from src.data import process_transcripts
from src.data.make_corpus import Corpus, Vocabulary


ROOT_DIR = utils.get_project_root()
DATA_DIR = Path.joinpath(ROOT_DIR, 'data')
DATA_RAW_DIR = Path.joinpath(DATA_DIR, 'raw/cs-410')
INTERMEDATE_DATA_DIR = Path.joinpath(DATA_DIR, 'intermediate')

[nltk_data] Downloading package punkt to /home/bxjxrx7/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bxjxrx7/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [89]:
with open(Path.joinpath(INTERMEDATE_DATA_DIR, 'transcripts.pkl'), 'rb') as f:
        transcripts = pickle.load(f)

all_segments = [] 
for transcript_segments in transcripts.values():
        all_segments.extend(transcript_segments)
vocab = Vocabulary(all_segments, remove_stop_words=True)

In [90]:
with open(Path.joinpath(INTERMEDATE_DATA_DIR, 'transcripts.pkl'), 'rb') as f:
        transcripts = pickle.load(f)

all_text = [] 

for transcript_segments in transcripts.values():
    for segment in transcript_segments:
        all_text.append(segment.text)
all_text = ' '.join(all_text).lower().split('.')

In [91]:
token_list = []
for sentence in all_text:
    sentence = sentence.strip().translate(str.maketrans('', '', string.punctuation))
    if 'bits' in sentence:
        print(sentence)
    sentence = re.sub(r"\b\d+[s]{0,1}\b", 'NUMBER', sentence)     # replace numbers with this token
    
    tokens = nltk.word_tokenize(sentence)
    tokens = [word for word in tokens if word not in stop_words]
    token_list.append(tokens)

token_list

and so how can we leverage the skewed distributions of values to compress these values well in general we will use few bits to encode those frequent words at the cost of using longer bit string code those rare values
therefore we can use fewer bits for the small but highly frequent integers and thats cost of using more bits for larger integers
we can save on average even though sometimes when we see a large number we have to use a lot of bits
so now you can imagine how many bits do we have to use for a large number like 100 so how many bits do you have to use exactly for a number like 100 well exactly we have to use 100 bits
so its the same number of bits as the value of this number
imagine if you occasionally see a number like 1000 you have to use 1000 bits
now how do you decode this code now since these are variable length encoding methods you cant just count how many bits and then just stop
you cant say 8bits or 32bits then you will start another code
and its easy to show that for t

[['sound', 'lecture', 'natural', 'language', 'content', 'analysis'],
 ['natural',
  'language',
  'content',
  'analysis',
  'foundation',
  'text',
  'mining'],
 ['going', 'first', 'talk'],
 ['particular',
  'natural',
  'language',
  'processing',
  'factor',
  'present',
  'text',
  'data'],
 ['determines', 'algorithms', 'used', 'analyze', 'mine', 'text', 'data'],
 ['going',
  'take',
  'look',
  'basic',
  'concepts',
  'natural',
  'language',
  'first'],
 ['im',
  'going',
  'explain',
  'concepts',
  'using',
  'similar',
  'example',
  'youve',
  'seen'],
 ['dog', 'chasing', 'boy', 'playground'],
 ['simple', 'sentence'],
 ['read', 'sentence', 'dont', 'think', 'get', 'meaning'],
 ['computer', 'understand', 'sentence', 'computer', 'go', 'several', 'steps'],
 ['first',
  'computer',
  'needs',
  'know',
  'words',
  'segment',
  'words',
  'english'],
 ['easy', 'look', 'space'],
 ['computer',
  'need',
  'know',
  'categories',
  'words',
  'syntactical',
  'categories'],
 ['examp

In [92]:
def count_n_grams(tokenized_sentences: List[List[str]], n: int) -> Counter:

    n_grams_list = []
    for tokens in tokenized_sentences:
        n_grams_list.append(list(nltk.ngrams(tokens, n, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')))

    merged_n_grams = []
    for sentence in n_grams_list:
        for n_gram in sentence:
            s ='_'.join(n_gram)
            if '<s>' not in s and '</s>' not in s:
                merged_n_grams.append(s)

    return Counter(merged_n_grams)

In [93]:
count_4_grams = count_n_grams(token_list, n=4)
count_4_grams.most_common(100)

[('use_maximum_likelihood_estimator', 9),
 ('natural_language_processing_techniques', 8),
 ('simplest_vector_space_model', 8),
 ('like_vector_space_model', 8),
 ('lecture_going_continue_discussing', 8),
 ('opinion_mining_sentiment_analysis', 7),
 ('lecture_going_continue_talking', 7),
 ('query_vector_document_vector', 6),
 ('lecture_going_continue_discussion', 6),
 ('natural_language_content_analysis', 5),
 ('text_retrieval_text_mining', 5),
 ('lecture_going_talk_text', 5),
 ('distribution_used_generate_document', 5),
 ('take_away_probability_mass', 5),
 ('natural_language_processing_difficult', 4),
 ('matching_one_frequent_term', 4),
 ('next_lecture_going_talk', 4),
 ('text_data_actionable_knowledge', 4),
 ('nontext_data_text_data', 4),
 ('discussion_vector_space_model', 4),
 ('unique_query_terms_matched', 4),
 ('query_terms_matched_document', 4),
 ('instantiate_vector_space_model', 4),
 ('weighting_document_length_normalization', 4),
 ('lets_take_look_specific', 4),
 ('talked_push_ve

In [94]:
count_3_grams = count_n_grams(token_list, n=3)
count_3_grams.most_common(200)

[('vector_space_model', 81),
 ('natural_language_processing', 32),
 ('lets_take_look', 31),
 ('would_allow_us', 31),
 ('would_give_us', 30),
 ('maximum_likelihood_estimate', 26),
 ('lecture_going_talk', 25),
 ('background_language_model', 22),
 ('lecture_going_continue', 21),
 ('maximum_likelihood_estimator', 19),
 ('many_different_ways', 17),
 ('distribution_used_generate', 17),
 ('contextual_text_mining', 16),
 ('would_look_like', 12),
 ('particular_going_talk', 12),
 ('collection_language_model', 12),
 ('sum_query_words', 12),
 ('topic_mining_analysis', 11),
 ('use_maximum_likelihood', 11),
 ('unigram_language_model', 11),
 ('topic_word_distribution', 11),
 ('would_help_us', 10),
 ('lets_first_look', 10),
 ('count_word_document', 10),
 ('lecture_continue_discussion', 10),
 ('system_b_better', 10),
 ('documents_match_term', 9),
 ('lot_text_data', 9),
 ('gives_us_probability', 9),
 ('NUMBER_NUMBER_NUMBER', 9),
 ('document_language_model', 9),
 ('probability_word_given', 9),
 ('unigram

In [95]:
count_2_grams = count_n_grams(token_list, n=2)
count_2_grams.most_common(200)

[('text_data', 229),
 ('language_model', 111),
 ('text_mining', 106),
 ('vector_space', 97),
 ('space_model', 82),
 ('text_retrieval', 81),
 ('going_talk', 80),
 ('word_distribution', 74),
 ('NUMBER_NUMBER', 73),
 ('help_us', 71),
 ('relevant_documents', 67),
 ('lecture_going', 66),
 ('search_engine', 65),
 ('give_us', 61),
 ('lets_say', 60),
 ('solve_problem', 59),
 ('natural_language', 58),
 ('mixture_model', 57),
 ('theta_sub', 57),
 ('different_ways', 56),
 ('likelihood_function', 54),
 ('maximum_likelihood', 53),
 ('take_look', 52),
 ('allow_us', 52),
 ('ranking_function', 52),
 ('training_data', 51),
 ('probability_word', 49),
 ('lets_look', 47),
 ('topic_model', 46),
 ('text_categorization', 46),
 ('would_allow', 45),
 ('sound_lecture', 44),
 ('would_like', 44),
 ('going_use', 44),
 ('gives_us', 43),
 ('search_engines', 43),
 ('machine_learning', 41),
 ('allows_us', 41),
 ('looks_like', 40),
 ('time_series', 40),
 ('would_give', 39),
 ('inverted_index', 38),
 ('also_see', 37),
 

In [96]:
vocab = set()
for tokens in token_list:
    vocab.update(tokens)
vocab = sorted(list(vocab))
vocab

['1k',
 '32bits',
 '8bits',
 'NUMBER',
 'aa',
 'aand',
 'ab',
 'ability',
 'able',
 'abortion',
 'abounded',
 'absence',
 'absences',
 'absent',
 'absents',
 'absolute',
 'absolutely',
 'abstract',
 'abstracts',
 'accelerate',
 'accept',
 'accepted',
 'access',
 'accessed',
 'accidental',
 'accommodate',
 'accompanying',
 'according',
 'accordingly',
 'account',
 'accounting',
 'accounts',
 'accumulate',
 'accumulated',
 'accumulative',
 'accumulator',
 'accumulators',
 'accuracy',
 'accurate',
 'accurately',
 'achievable',
 'achieve',
 'achieved',
 'achieves',
 'achieving',
 'acl',
 'acoustic',
 'acquire',
 'acquired',
 'acquisition',
 'across',
 'act',
 'acted',
 'action',
 'actionable',
 'actions',
 'active',
 'actively',
 'activities',
 'activity',
 'actor',
 'acts',
 'actual',
 'actually',
 'ad',
 'adapt',
 'adaptation',
 'adapted',
 'adaptive',
 'add',
 'added',
 'adding',
 'addition',
 'additional',
 'additives',
 'address',
 'addressed',
 'addresses',
 'addressing',
 'adds',
 '

In [97]:
print(len(vocab))

4652


In [99]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
vocab_stemmed = sorted(list(set(stemmer.stem(word) for word in vocab)))
vocab_stemmed

['1k',
 '32bit',
 '8bit',
 'aa',
 'aand',
 'ab',
 'abil',
 'abl',
 'abort',
 'abound',
 'absenc',
 'absent',
 'absolut',
 'abstract',
 'acceler',
 'accept',
 'access',
 'accident',
 'accommod',
 'accompani',
 'accord',
 'accordingli',
 'account',
 'accumul',
 'accur',
 'accuraci',
 'achiev',
 'acl',
 'acoust',
 'acquir',
 'acquisit',
 'across',
 'act',
 'action',
 'activ',
 'actor',
 'actual',
 'ad',
 'adapt',
 'add',
 'addit',
 'address',
 'adequ',
 'adjac',
 'adject',
 'adjust',
 'adopt',
 'adult',
 'advanc',
 'advantag',
 'advertis',
 'advic',
 'affect',
 'affili',
 'afford',
 'afghanistan',
 'afternoon',
 'age',
 'agenc',
 'agent',
 'agglom',
 'aggreg',
 'aggress',
 'ago',
 'agre',
 'agreement',
 'ahead',
 'aid',
 'aim',
 'air',
 'airlin',
 'airport',
 'alcohol',
 'alert',
 'algebra',
 'algorithm',
 'align',
 'allevi',
 'allianc',
 'alloc',
 'allow',
 'almost',
 'alon',
 'along',
 'alpha',
 'alreadi',
 'also',
 'altern',
 'although',
 'altogeth',
 'alway',
 'ama',
 'amaz',
 'amazon

In [100]:
len(vocab_stemmed)

2855