In [4]:
"""This file is used for some transformations"""

import torch
import numpy as np

class Toindex(object):
    def __init__(self,lookup_table):
        assert isinstance(lookup_table,list)
        self.lookup_table = lookup_table
        
    def __call__(self,sample):
        X_idx,y_idx = [],[]
        X,y = sample[0],sample[1]
        for sentence in X:
            word_idx = []
            for word in sentence:
                try:
                    word_idx.append(self.lookup_table.index(word))
                except:
                    word_idx.append(0)
            X_idx.append(word_idx)
            
        for sentence in y:
            word_idx = []
            for word in sentence:
                try:
                    word_idx.append(self.lookup_table.index(word))
                except:
                    word_idx.append(0)
            y_idx.append(word_idx)
            
        return np.array(X_idx),np.array(y_idx)
    
class ToOnehot(object):
    def __init__(self,lookup_table):
        assert isinstance(lookup_table,list)
        self.lookup_table = lookup_table
        
    def __call__(self,sample):
        vec = []
        X,y = sample[0],sample[1]
        for sentence in X:
            idx_vec = []
            for idx in sentence:
                zero = np.zeros(len(self.lookup_table))
                zero[idx] = 1
                idx_vec.append(zero)
            vec.append(idx_vec)
        return np.asarray(vec),np.asarray(y)
    
class ToTensor(object):
    def __call__(self,samples):
        X,y = samples[0],samples[1]
        assert isinstance(X,np.ndarray)
        X = torch.tensor(X)
        y = torch.tensor(y)
        return X,y
    
class CBOW_sum_up(object):
    def __call__(self,samples):
        X,y = samples[0],samples[1]
        return torch.mean(X,1).squeeze(),y
        
        
class Compose(object):
    def __init__(self,transforms):
        assert isinstance(transforms,list)
        self.transforms = transforms
    def __call__(self,samples):
        for transform in self.transforms:
            samples = transform(samples)
        return samples
    

    

In [9]:
"""This block is used to generate the dataset from the *.csv file"""
import numpy as np
import pandas as pd
import torch


class preprocessing():
    def __init__(self,vocab_size=250):
        self.vocab_size = vocab_size
        self.dict = {}
        self.vocab = ['UNK']
        self.exception = ['.',',','!','?','/','--','']
        self.read_csv()
        self.build_vocab()
    
        
    def read_csv(self):
        df = pd.read_csv('simpsons_dataset.csv')['spoken_words']
        self.data = []
        for i in range(df.shape[0]):

            if type(df[i]) != float:
                
                self.data.append(self._fliter(df[i]))
                
        
    def _fliter(self,spoken):
        spec_char = [',','.','?','!']
        for char in spec_char:
            spoken = spoken.replace(char,' '+char)
        spoken = spoken.lower()
        return spoken.split(' ')
    
    def _build_dict(self,vocab):
        if vocab not in self.dict.keys():
            self.dict[vocab] = 1
        else:
            self.dict[vocab]+= 1
            
    def build_vocab(self):
        for sentence in self.data:
            for vocab in sentence:
                self._build_dict(vocab)
                
        freq_vocab = sorted(list(self.dict.items()),key = lambda x: x[1],reverse=True)
        count = 0
        for vocab in freq_vocab:
            if vocab[0] not in self.exception:
                self.vocab.append(vocab[0])
                count += 1
        self.vocab = self.vocab[:self.vocab_size+1]
        
        
class CBOW_Dataset(torch.utils.data.Dataset):
    def __init__(self,preprocessing,window_size=1,transform=None):
        assert isinstance(preprocessing, object)
        self.window_size = window_size
        self.data = preprocessing.data
        self.freq = preprocessing.dict
        self.vocab = preprocessing.vocab
        self.transform = transform
        self.Generate()
        
        
    def select(self,sentence):
        train_data = []
        for target in sentence:
            count = 0
            if target in self.vocab:
                idx = sentence.index(target)
                if idx >= self.window_size and idx+self.window_size+1<=len(sentence):
                    X1 = sentence[idx-self.window_size:idx]
                    X2 = sentence[idx+1:idx+self.window_size+1]
                    X1.extend(X2)
                    count = sum(list(map(lambda x: 1 if x not in self.vocab else 0,X1)))
                    if count < self.window_size:
                        X1.append(target)
                        train_data.append(X1)
        return train_data
                
    def Generate(self):
        self.traindata = []
        for sentence in self.data:
                self.traindata.extend(self.select(sentence))
                
    def __len__(self):
        return len(self.traindata)

    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.to_list()
            
        data = self.traindata[index]            
        samples = [data[:-1]],[[data[-1]]]
        
        if self.transform:
            samples = self.transform(samples)
        
        return samples
    
    
class Skip_Gram_Dataset(torch.utils.data.Dataset):
    def __init__(self,preprocessing,window_size=1,transform=None):
        assert isinstance(preprocessing, object)
        self.window_size = window_size
        self.data = preprocessing.data
        self.freq = preprocessing.dict
        self.vocab = preprocessing.vocab
        self.transform = transform
        self.Generate()
        
        
    def select(self,sentence):
        train_data = []
        for target in sentence:
            count = 0
            if target in self.vocab:
                idx = sentence.index(target)
                if idx >= self.window_size and idx+self.window_size+1<=len(sentence):
                    X1 = sentence[idx-self.window_size:idx]
                    X2 = sentence[idx+1:idx+self.window_size+1]
                    X1.extend(X2)
                    for cnt_word in X1:
                        if cnt_word in self.vocab:
                            train_data.append([target,cnt_word])
                            
        return train_data
                
    def Generate(self):
        self.traindata = []
        for sentence in self.data:
                self.traindata.extend(self.select(sentence))
                
    def __len__(self):
        return len(self.traindata)

    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.to_list()
            
        data = self.traindata[index]            
        samples = [[data[0]]],[[data[-1]]]
        
        if self.transform:
            samples = self.transform(samples)
        
        return samples
    


In [10]:
"""Here is the definition of the model"""
import torch
import torch.nn as nn

class Embedding(nn.Module):
    def __init__(self,v_dim,hidden_size):
        super(Embedding,self).__init__()
        self.fc1 = nn.Linear(v_dim, hidden_size,bias=False)
        self.out = nn.Linear(hidden_size, v_dim)
        self.act = nn.Softmax()
    def forward(self,x):
        x = self.fc1(x)
        x = self.out(x)
        out = self.act(x)
        return out


In [11]:
from torch.utils.data import DataLoader

vocab_parser = Dataset.preprocessing(vocab_size=350)

dataset = CBOW_Dataset(vocab_parser,
                       window_size=1,
                       transform=Compose([Toindex(vocab_parser.vocab),
                                          ToOnehot(vocab_parser.vocab),
                                          ToTensor(),
                                          CBOW_sum_up()]))

train_loader = DataLoader(dataset,batch_size=50000,shuffle=True)

model = Embedding(351,128)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-1)

num_epochs = 5
total_step = len(train_loader)

for epoch in range(num_epochs):
    for i, (X,y) in enumerate(train_loader):
        X = X.type(torch.FloatTensor)
        y = y.type(torch.LongTensor).squeeze()
        
        outputs = model(X)
        loss = criterion(outputs,y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        

        print ("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}"
                .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

# torch.save(model.state_dict(),'CBOW.pt')

  


Epoch [1/5], Step [1/6] Loss: 5.8612
Epoch [1/5], Step [2/6] Loss: 5.8567
Epoch [1/5], Step [3/6] Loss: 5.7258
Epoch [1/5], Step [4/6] Loss: 5.6802
Epoch [1/5], Step [5/6] Loss: 5.6615
Epoch [1/5], Step [6/6] Loss: 5.6496
Epoch [2/5], Step [1/6] Loss: 5.6506
Epoch [2/5], Step [2/6] Loss: 5.6495
Epoch [2/5], Step [3/6] Loss: 5.6464
Epoch [2/5], Step [4/6] Loss: 5.6414
Epoch [2/5], Step [5/6] Loss: 5.6328
Epoch [2/5], Step [6/6] Loss: 5.6221
Epoch [3/5], Step [1/6] Loss: 5.6340
Epoch [3/5], Step [2/6] Loss: 5.6315
Epoch [3/5], Step [3/6] Loss: 5.6319
Epoch [3/5], Step [4/6] Loss: 5.6319
Epoch [3/5], Step [5/6] Loss: 5.6308
Epoch [3/5], Step [6/6] Loss: 5.6434
Epoch [4/5], Step [1/6] Loss: 5.6324
Epoch [4/5], Step [2/6] Loss: 5.6312
Epoch [4/5], Step [3/6] Loss: 5.6294
Epoch [4/5], Step [4/6] Loss: 5.6309
Epoch [4/5], Step [5/6] Loss: 5.6299
Epoch [4/5], Step [6/6] Loss: 5.6246
Epoch [5/5], Step [1/6] Loss: 5.6283
Epoch [5/5], Step [2/6] Loss: 5.6279
Epoch [5/5], Step [3/6] Loss: 5.6283
E

In [14]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader


vocab_parser = Dataset.preprocessing(vocab_size=350)

dataset = Skip_Gram_Dataset(vocab_parser,
                            window_size=2,
                            transform=Compose([Toindex(vocab_parser.vocab),
                                               ToOnehot(vocab_parser.vocab),
                                               ToTensor()]))

train_loader = DataLoader(dataset,batch_size=80000,shuffle=True)

model = Embedding(351,128)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-1)

num_epochs = 5
total_step = len(train_loader)

for epoch in range(num_epochs):
    for i, (X,y) in enumerate(train_loader):
        X = X.type(torch.FloatTensor).squeeze()
        y = y.type(torch.LongTensor).squeeze()
        # print(X.shape)
        # Forward pass
        outputs = model(X)
        # print(outputs.shape)
        loss = criterion(outputs,y)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        

        print ("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}"
                .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

#torch.save(model.state_dict(),'Skip_Gram.pt')



  


Epoch [1/5], Step [1/20] Loss: 5.8664
Epoch [1/5], Step [2/20] Loss: 5.8566
Epoch [1/5], Step [3/20] Loss: 5.8028
Epoch [1/5], Step [4/20] Loss: 5.7847
Epoch [1/5], Step [5/20] Loss: 5.7812
Epoch [1/5], Step [6/20] Loss: 5.7806
Epoch [1/5], Step [7/20] Loss: 5.7797
Epoch [1/5], Step [8/20] Loss: 5.7814
Epoch [1/5], Step [9/20] Loss: 5.7797
Epoch [1/5], Step [10/20] Loss: 5.7806
Epoch [1/5], Step [11/20] Loss: 5.7791
Epoch [1/5], Step [12/20] Loss: 5.7812
Epoch [1/5], Step [13/20] Loss: 5.7804
Epoch [1/5], Step [14/20] Loss: 5.7787
Epoch [1/5], Step [15/20] Loss: 5.7811
Epoch [1/5], Step [16/20] Loss: 5.7807
Epoch [1/5], Step [17/20] Loss: 5.7795
Epoch [1/5], Step [18/20] Loss: 5.7797
Epoch [1/5], Step [19/20] Loss: 5.7802
Epoch [1/5], Step [20/20] Loss: 5.7778
Epoch [2/5], Step [1/20] Loss: 5.7804
Epoch [2/5], Step [2/20] Loss: 5.7784
Epoch [2/5], Step [3/20] Loss: 5.7784
Epoch [2/5], Step [4/20] Loss: 5.7807
Epoch [2/5], Step [5/20] Loss: 5.7814
Epoch [2/5], Step [6/20] Loss: 5.7806
E

In [17]:
import numpy as np

v_dim = 350
vocab_parser = Dataset.preprocessing(v_dim)

model = Embedding(v_dim+1,128)
model.load_state_dict(torch.load('CBOW.pt'))


vocab = vocab_parser.vocab
print("Embedding trained by CBOW")

test = 'homer'
print("Keyword is ",test)
index = vocab.index(test)

embedding = list(model.fc1.parameters())[0].t()

word = embedding[index]
dis = []
for i in embedding:
    dis.append(torch.sqrt(torch.sum((word-i)**2)))
dis = np.argsort(dis)[1:20]
print("The most relevant vocabulary:")

for i in dis:
    print(vocab[i])

Embedding trained by CBOW
Keyword is  homer
The most relevant vocabulary:
lisa
moe
bart
,"
marge
wow
milhouse
huh
UNK
whoa
maggie
ow
dad
c'mon
mother
me
burns
sir
flanders


In [18]:
import numpy as np

v_dim = 350
vocab_parser = Dataset.preprocessing(v_dim)

model = Embedding(v_dim+1,128)
model.load_state_dict(torch.load('Skip_Gram.pt'))


vocab = vocab_parser.vocab

print("Embedding trained by Skip_Gram")
test = 'homer'
print("Keyword is ",test)
index = vocab.index(test)

embedding = list(model.fc1.parameters())[0].t()

word = embedding[index]
dis = []
for i in embedding:
    dis.append(torch.sqrt(torch.sum((word-i)**2)))
dis = np.argsort(dis)[1:20]
print("The most relevant vocabulary:")

for i in dis:
    print(vocab[i])
    
    

Embedding trained by Skip_Gram
Keyword is  homer
The most relevant vocabulary:
mom
honey
marge
three
,"
maggie
they
'em
every
moe
everything
two
cool
again
hear
that
okay
remember
sure
