In [29]:
import pandas as pd
import pickle as pk
from tqdm import tqdm
import torch
import torch.nn as nn
import dictionary_corpus
from dictionary_corpus import Corpus
import numpy as np
from sklearn.decomposition import PCA, SparsePCA, KernelPCA, IncrementalPCA
from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import os
types= ['h1','h2']
out_dir = 'contextual_embeddings/'
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 30)

matplotlib.use('webagg')
# load the model and the corpus
model = torch.load('hidden650_batch128_dropout0.2_lr20.0.pt',map_location=torch.device('cpu'))
corpus = Corpus('')
# print("Vocab size %d", ntokens)



In [30]:
def euclidean(x,y):
        dist = np.linalg.norm(x-y)
        return dist.round(2)
def cos(x,y):
    dist = np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y))
    return dist.round(2)

In [42]:
def sent_feature_extraction(sent,target_word,file_type, model=model, corpus=corpus, which='h1'): 
    with torch.no_grad():
        words = sent.split()
        tokenized_sent=[]
        for word in words:
            if word in corpus.dictionary.word2idx.keys():
                wordidx = corpus.dictionary.word2idx[word]
            else:
                #print(f'{word} is to be deleted')
                wordidx = corpus.dictionary.word2idx['<unk>']
            tokenized_sent.append(wordidx)
        # initailize the hidden state of the model
        hidden = model.init_hidden(1)
        # iterate through the whole sentence
        for i, wordid in enumerate(tokenized_sent):
            word = corpus.dictionary.idx2word[wordid]
            # we are not gonna use the output
            _,hidden,emb= model(torch.as_tensor(wordid).reshape(1,1),hidden)
            # the hidden embedding is the embedding we wanted.
            # four of them correposinding to h0 h1 c0 c1
            if i ==len(tokenized_sent):
                print(f'{target} not in sentence {sent}_{file_type}, please check')
            if word == target_word:
                if which == types[0]:
                    hidden_embedding = hidden[0][0].view(650).numpy()  
                if which == types[1]:
                    hidden_embedding = hidden[0][1].view(650).numpy() # so this is actually the output 650 vector
                if which =='c0':
                    hidden_embedding = hidden[1][0].view(650).numpy() 
                if which == 'c1':
                    hidden_embedding = hidden[1][1].view(650).numpy()
                if which == 'emb':
                    hidden_embedding = emb.view(650).numpy()
                df = {'label':words[i-1]+' '+target_word+file_type,'tensor':hidden_embedding}
                return df

In [56]:
def get_target_features(file,model=model,which='h1'):
    if '_' not in file:
        print('wrong file name.')
        exit()
    # file name must be in the form of chicken_0_animal.txt
    # because 'data/' are literally 5 charactes 
    tmp = file[5:] 
    target_word = tmp.split('_')[0]
    file_type = tmp.split('_')[1]
    # get tar word features from sentences in a file containing lines of sentences
    d = {'tensors':[],'labels':[]}
    with open(file,'r') as f:
        for i,line in enumerate(f):
            if len(line)<3:
            # if line is not a sentence, continue
                continue
            else:
            # extract the features of the sentence return df
                sent_feature = sent_feature_extraction(line,target_word,file_type,which=which)
                d['tensors'].append(sent_feature['tensor'])
                d['labels'].append(sent_feature['label'])
        df = pd.DataFrame.from_dict(d)
        df['file'] = df['labels'].apply(lambda x: x.split(' ')[1][-1])
        df['prev'] = df['labels'].apply(lambda x: x.split(' ')[0])
        df['target'] = df['labels'].apply(lambda x: x.split(' ')[1][:-1])
        df = df.sort_values(by = 'target').reset_index().drop(['index'],axis=1)
        if i==0:
            raise NameError(f'{file} is a file that has no content')
    return df

In [57]:
# get_target_features('data/bank_1_ins.txt','c0')

In [58]:
def extract_all_file(directory='data',out_dir='contextual_embeddings',types=types):
# iterate over files in that directory
    for i in types:
        df = pd.DataFrame()
        for filename in tqdm(os.listdir(directory)):
            f = os.path.join(directory, filename)
            # checking if it is a file
            if os.path.isfile(f):
                delta_df = get_target_features(f,which = i)
                df = pd.concat([df,delta_df])          
            else:
                continue
        df = df.reset_index().drop(['index'],axis=1)
        with open(f'{out_dir}/all_sent_{i}','wb') as f:
            pk.dump(df,f)      

In [59]:
extract_all_file()

100%|███████████████████████████████████████████| 64/64 [00:47<00:00,  1.35it/s]
100%|███████████████████████████████████████████| 64/64 [00:55<00:00,  1.14it/s]


In [60]:
!ls contextual_embeddings

all_sent_h1  all_sent_h2


In [61]:
# with open(f'{out_dir}/all_sent_c0','rb') as f:
#     c0 = pk.load(f)
# with open(f'{out_dir}/all_sent_c1','rb') as f:
#     c1 = pk.load(f)
with open(f'{out_dir}/all_sent_{types[0]}','rb') as f:
    h1 = pk.load(f)
with open(f'{out_dir}/all_sent_{types[1]}','rb') as f:
    h2 = pk.load(f)
# pre processing


In [62]:
h1

Unnamed: 0,tensors,labels,file,prev,target
0,"[0.018521532, 0.045597155, 0.002335213, -0.058...",Looks like2,2,Looks,like
1,"[0.07884548, 0.033203643, 0.01734861, -0.12496...",yourself like2,2,yourself,like
2,"[0.0039849347, 0.016776048, 0.024410218, -0.02...",support like2,2,support,like
3,"[0.11227927, 0.03317975, 0.001646313, -0.10016...",OS like2,2,OS,like
4,"[0.007854236, 0.0059105917, 0.009929255, -0.06...",around like2,2,around,like
...,...,...,...,...,...
1164,"[0.043512758, 0.4072944, 0.035686746, 0.064607...",glass door1,1,glass,door
1165,"[0.0052277055, 0.12090539, -0.0071009444, 0.11...",a door1,1,a,door
1166,"[0.006641631, 0.0688867, 0.017595066, -0.02978...",their door1,1,their,door
1167,"[-0.00979613, 0.1276253, 0.021119574, -0.03380...",my door1,1,my,door


In [65]:
l= []
for key, g in h1.groupby(by='target'):
    l.append(key)
#     print(key=='went' or key=='line')
print(l)

['Switzerland', 'bank', 'book', 'books', 'can', 'chicken', 'computer', 'dates', 'door', 'duck', 'even', 'fish', 'form', 'lamb', 'like', 'line', 'mistakes', 'moves', 'pencil', 'potato', 'power', 'pupil', 'questions', 'rock', 'salmon', 'tears', 'thought', 'tomatoes', 'transistor', 'watch', 'went', 'wind']
