In [1]:
#importing the dataset
import csv
# Read every row of the transcript CSV. newline='' is what the csv module
# documentation asks for, so that line breaks embedded inside quoted fields
# are handed to the parser untouched on every platform.
with open('transcripts.csv', encoding='latin-1', newline='') as TED_transcript:
    transcript_csv = csv.reader(TED_transcript)
    # Skip the header row; the None default keeps an empty file from raising StopIteration.
    next(transcript_csv, None)
    transcripts = list(transcript_csv)

# Keep only the first column of every row (the transcript text) and drop the
# second column (the url). Empty rows (e.g. blank lines) are skipped because
# they have no first column to read.
transcript_data: list[str] = [t[0] for t in transcripts if t]
#Sanity check
#print(type(transcript_data)) prints out list

In [None]:
#Preprocessing the data and Corpus Creation
import re
import inflect
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Shared inflect engine; preprocess_text_list uses it to spell out digit tokens as words.
p = inflect.engine()
# NLTK's English stop-word list, stored as a set so membership checks are fast.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be downloaded beforehand - confirm.
stop_words: set[str] = set(stopwords.words("english"))

def preprocess_text_list(text_list):
    """
    Preprocess a list of raw texts into lists of lower-cased word tokens.

    Each text has its punctuation replaced by whitespace, is tokenized with
    NLTK, lower-cased, has digit tokens spelled out as words, and has English
    stop words removed.

    Args:
        text_list (list[str]): the raw texts, one string per document

    Returns:
        list[list[str]]: one list of tokens per input text, in input order
    """
    processed_text = []
    for text in text_list:
        # replace every run of non-alphanumeric characters (punctuation, symbols) with a single space
        text = re.sub('[^A-Za-z0-9]+', ' ', text)
        # tokenize the words
        tokens = word_tokenize(text)

        # drop stop words using NLTK's stop-word list and convert numbers ('10') into words ('ten')
        filtered = []
        for word in tokens:
            word_lower = word.lower()
            if word_lower.isdigit():
                # inflect spells larger numbers as a phrase with commas, hyphens and
                # "and" (e.g. '1234' -> 'one thousand, two hundred and thirty-four').
                # Split the phrase into its individual words so that every corpus
                # token stays a single punctuation-free word.
                candidates = re.findall('[a-z]+', p.number_to_words(word_lower))
            else:
                candidates = [word_lower]
            # the stop-word filter also removes the "and" produced by number spelling
            filtered.extend(w for w in candidates if w not in stop_words)

        # one token list per input text
        processed_text.append(filtered)
    return processed_text

#Tokenize every transcript: the result is a list with one list of tokens per transcript
preprocessed_list_of_list: list[list[str]] = preprocess_text_list(transcript_data)


In [None]:
#Word2Vec (using logistic regression) - implemented as a class because the textbook algorithm has many moving parts
import numpy as np
import random
from collections import Counter
class word2vec:
    """
    Scaffolding for a word2vec-style model with negative sampling.

    The class stores the hyperparameters, builds the vocabulary, computes the
    noise distribution used to draw negative samples, and allocates the input
    and output embedding matrices.

    NOTE: the parameter-update loop is not implemented yet; train() currently
    only prepares the state listed above.
    """

    #initialize the embedding dimension, window size, negative samples, learning rate, minimum count of word, and number of epochs
    def __init__(self, embedding_dim = 100, window_size = 5, negative_samples = 10, learning_rate = 0.025, min_count = 1, epoch = 10):
        self.embedding_dim = embedding_dim
        self.window_size = window_size
        self.negative_samples = negative_samples
        self.learning_rate = learning_rate
        self.min_count = min_count
        self.epochs = epoch

        #Training variables (None until they are set from the training corpus)
        self.vocab = None
        self.vocab_size = None
        self.word_to_index = None
        self.index_to_word = None
        self.word_freq = None
        self.W_input = None
        self.W_output = None
        self.noise_words = None
        self.noise_prob = None

    def build_vocabulary(self, corpus):
        """
        Builds the vocabulary of the corpus (index words and count the frequency of the words)
        Args:
            corpus (list[list[str]]): a list of list of strings containing tokenized text of ted talk transcripts
        """
        word_count = Counter()
        #Count every word that appears in a sentence in the corpus of transcripts
        for sentence in corpus:
            word_count.update(sentence)

        #Keep the words seen at least min_count times, in order of first appearance
        self.vocab = [word for word, count in word_count.items() if count >= self.min_count]
        self.vocab_size = len(self.vocab)
        #Create a mapping of words (index to a word, word to an index)
        self.word_to_index = {word: index for index, word in enumerate(self.vocab)}
        self.index_to_word = {index: word for index, word in enumerate(self.vocab)}
        self.word_freq = {word: word_count[word] for word in self.vocab}

    def compute_noise_distribution(self, alpha = 0.75):
        """
        Compute the noise distribution used to draw negative samples.

        Every vocabulary word gets a probability proportional to its corpus
        count raised to the power alpha. The result is stored in:
            self.noise_words: array of vocabulary indices (0 .. vocab_size - 1)
            self.noise_prob: array of probabilities aligned with noise_words
        so that np.random.choice(self.noise_words, p=self.noise_prob) draws a
        negative sample.

        Args:
            alpha (float): exponent applied to the raw counts; values below 1
                flatten the distribution so rare words are drawn more often
                than their raw frequency

        Raises:
            ValueError: if build_vocabulary has not been called yet
        """
        if self.vocab is None:
            raise ValueError("build_vocabulary must be called before compute_noise_distribution")
        counts = np.array([self.word_freq[word] for word in self.vocab], dtype=np.float64)
        weighted = counts ** alpha
        total = weighted.sum()
        self.noise_words = np.arange(self.vocab_size)
        #An empty vocabulary yields an empty distribution instead of a division by zero
        self.noise_prob = weighted / total if total > 0 else weighted

    def init_embeddings(self):
        """
        Initializing the embedding matrices (input and output)

        Both matrices have shape (vocab_size, embedding_dim) and are filled
        with small uniform random values.

        Raises:
            ValueError: if build_vocabulary has not been called yet
        """
        if self.vocab_size is None:
            raise ValueError("build_vocabulary must be called before init_embeddings")
        limit = 0.5/self.embedding_dim #Small values to avoid saturation
        shape = (self.vocab_size, self.embedding_dim)
        self.W_input = np.random.uniform(-limit, limit, shape)
        self.W_output = np.random.uniform(-limit, limit, shape)

    def train(self, corpus):
        """
        Prepare the model for training on the given corpus.

        Builds the vocabulary, computes the noise distribution and initializes
        both embedding matrices.

        Args:
            corpus (list[list[str]]): The entire tokenized ted talk transcripts
        """
        self.build_vocabulary(corpus)
        self.compute_noise_distribution()
        self.init_embeddings()
        #TODO: the update loop over (target, context, negative sample) triples for
        #self.epochs epochs is not implemented yet; the embeddings stay at their
        #random initial values.
        
    

In [5]:
#PPMI

In [6]:
#TF-IDF

In [None]:
#Select word pairs
# set() accepts a single iterable, so passing two tuples raises TypeError;
# set literals build the intended sets of (word, word) pairs.
word_pair1 = {("technology", "innovation"), ("learning", "fun")}
word_pair2 = {("future", "artificial"), ("intelligence", "machine")}

In [8]:
#Cosine similarity

In [9]:
#Heatmap Analysis
import sklearn

Analysis from Heatmap