In [1]:
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
import numpy as np
import random
import pickle

import torch
import torchvision
import torchvision.transforms as transforms

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [27]:
# helper function
def select_n_random(data, labels, n=100):
    '''
    Selects n random datapoints and their corresponding labels from a dataset.

    Parameters
    ----------
    data, labels : indexable containers of equal length that accept a list
        of integer positions as an index (e.g. numpy arrays, torch tensors).
    n : number of distinct positions to draw (default 100).

    Returns
    -------
    (data[idx], labels[idx]) where idx is a list of n distinct positions.

    Raises
    ------
    AssertionError if data and labels differ in length.
    ValueError (from random.sample) if n exceeds len(data) or is negative.
    '''
    assert len(data) == len(labels), "data and labels must have the same length"

    # Sample from every valid position, including index 0.
    indices = random.sample(range(len(data)), n)
    return data[indices], labels[indices]

def text_to_index_array(p_new_dic, tweets_list):
    '''
    Mapping text data to index matrix.

    Each tweet has the literal tokens "<user>" and "<url>" and any newline
    characters removed, is split on whitespace, and every remaining word is
    replaced by its index in p_new_dic.

    Parameters
    ----------
    p_new_dic : mapping {'word': idx}.
    tweets_list : iterable of tweet strings.

    Returns
    -------
    numpy array with dtype=object built from the per-tweet index lists.
    Words missing from p_new_dic are encoded as 0.
    '''
    new_tweets = []
    for tweet in tweets_list:
        words = tweet.replace("<user>", "").replace("\n", "").replace("<url>", "").split()
        new_tweet = []
        for word in words:
            try:
                new_tweet.append(p_new_dic[word])
            except KeyError:
                # Only an out-of-vocabulary word maps to 0; any other error
                # (bad mapping type, interrupt, ...) is allowed to surface.
                new_tweet.append(0)
        new_tweets.append(new_tweet)
    return np.array(new_tweets, dtype=object)

def text_cut_to_same_long(tweets_list):
    '''
    Cut the data to the same specified length.

    Every row of tweets_list is truncated to the module-level `maxlen`;
    rows shorter than `maxlen` are right-padded with zeros so that ragged
    input (e.g. an object array of index lists) is accepted as well as a
    rectangular 2-D array.

    Parameters
    ----------
    tweets_list : sequence of index sequences (each sliceable and sized).

    Returns
    -------
    numpy array of shape (len(tweets_list), maxlen) with dtype=object.
    '''
    data_num = len(tweets_list)
    padded = np.zeros((data_num, maxlen))
    for row, tweet in enumerate(tweets_list):
        clipped = tweet[:maxlen]
        # Write only as many columns as the tweet provides; the rest stay 0.
        padded[row, :len(clipped)] = clipped
    return np.array(padded, dtype=object)
    
def creat_wordvec_tensor(embedding_weights, X_T):
    '''
    Map the index matrix into a word vector matrix.

    For every index sequence in X_T, the embedding row of each index is
    copied into a zero-initialised array of shape
    (len(X_T), maxlen, vocab_dim); positions past the end of a sequence
    stay zero. `maxlen` and `vocab_dim` are read from module scope.
    '''
    vectors = np.zeros((len(X_T), maxlen, vocab_dim))
    for row, indices in enumerate(X_T):
        for col, idx in enumerate(indices):
            vectors[row, col, :] = embedding_weights[int(idx), :]
    return vectors

def creat_wordvec_mean_tensor(embedding_weights, X_T):
    '''
    Map the index matrix into a mean word vector matrix.

    Each index sequence in X_T becomes one row holding the average of the
    embedding rows it references; an empty sequence yields an all-zero row.
    The row width `vocab_dim` is read from module scope.
    '''
    averaged = np.zeros((len(X_T), vocab_dim))
    for row, indices in enumerate(X_T):
        total = np.zeros((vocab_dim,))
        count = 0
        for idx in indices:
            total += embedding_weights[int(idx), :]
            count += 1
        # Guard the division so an empty sequence keeps its zero vector.
        averaged[row, :] = total / count if count != 0 else total
    return averaged

# helper function to show an image
def matplotlib_imshow(img, one_channel=False):
    '''
    Draw a tensor image on the current matplotlib axes.

    When one_channel is true the tensor is first averaged over dim 0 and
    drawn with the "Greys" colormap; otherwise its axes are reordered with
    (1, 2, 0) before drawing. Pixel values are rescaled via img / 2 + 0.5.
    '''
    if one_channel:
        img = img.mean(dim=0)
    pixels = (img / 2 + 0.5).numpy()  # unnormalize, then hand over to numpy
    if not one_channel:
        plt.imshow(np.transpose(pixels, (1, 2, 0)))
        return
    plt.imshow(pixels, cmap="Greys")

In [28]:
vocab_dim = 20  # Width of one embedding vector (used for the zero row below)
maxlen = 25  # Maximum length of text retention

embedding_weights = np.load("embeddings.npy")
# Set a zero vector for words that do not appear in the vocabulary
# (prepended as row 0, so every real embedding moves down by one row).
embedding_weights = np.r_[np.zeros((1, vocab_dim)), embedding_weights]

# NOTE(security): pickle.load can execute arbitrary code; only load a
# vocab.pkl that comes from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open("vocab.pkl", 'rb') as f:
    index_dict = pickle.load(f)    # index dictionary {'word': idx}

# Index each word + 1 because of the zero vector
for key, value in index_dict.items():
    index_dict[key] = value + 1

with open("../twitter-datasets/train_neg.txt", "r", encoding='UTF-8') as f:
    neg_data = f.readlines()
with open("../twitter-datasets/train_pos.txt", "r", encoding='UTF-8') as f:
    pos_data = f.readlines()

# Negative tweets first, then positive ones; label_list follows the same order.
data = neg_data + pos_data

label_list = ([0] * len(neg_data) + [1] * len(pos_data))


train_x = text_to_index_array(index_dict, data)
train_y = np.array(label_list)


In [29]:
# from torch.nn.utils.rnn import pad_sequence
# from torch.utils.data import TensorDataset, DataLoader

# # Cut the data to the same specified length 
# train_x = pad_sequence([torch.from_numpy(np.array(x)) for x in train_x],batch_first=True).float() 
# train_x = text_cut_to_same_long(train_x)

# # Index to vector
# train_x = creat_wordvec_tensor(embedding_weights,train_x)

# train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
# train_loader = DataLoader(train_data, shuffle=True, batch_size=8)

In [30]:
# Collapse every tweet into a single vocab_dim-wide row: the average of the
# embedding rows of its word indices. Out-of-vocabulary words (index 0) add
# the zero vector but still count toward the average.
train_mean_x = creat_wordvec_mean_tensor(embedding_weights,train_x)

In [None]:
# TensorBoard writer; 'Glove_vec_visual' is the same directory the
# %tensorboard cell at the end of the notebook passes as --logdir.
writer = SummaryWriter('Glove_vec_visual')

In [55]:
import tensorflow as tf
import tensorboard as tb
# Workaround: point tf.io.gfile at TensorBoard's stub implementation.
# NOTE(review): this is the commonly used fix for add_embedding failing when
# TensorFlow is installed next to TensorBoard — confirm it is still needed.
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

# Select random tweets (as mean word vectors) and their sentiment labels.
# train_x is a dtype=object array of index lists, which torch.from_numpy
# cannot convert; the float matrix train_mean_x is what gets projected.
word, labels = select_n_random(torch.from_numpy(train_mean_x), train_y)

In [60]:
# Translate each sampled numeric label into its emoticon tag
# (0 -> ':(', 1 -> ':)').
classes = (':(', ':)')
class_labels = [classes[label_idx] for label_idx in labels]

# Log the sampled vectors as embeddings, tagged with their emoticon,
# then close the writer.
writer.add_embedding(word, metadata=class_labels)
writer.close()



In [None]:
%tensorboard --logdir=Glove_vec_visual --host=127.0.0.1

2022-12-07 15:39:21.290725: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

TensorBoard 2.9.1 at http://127.0.0.1:6006/ (Press CTRL+C to quit)
