In [1]:
!pip install bcolz

Collecting bcolz
  Downloading bcolz-1.2.1.tar.gz (1.5 MB)
[?25l[K     |▎                               | 10 kB 21.7 MB/s eta 0:00:01[K     |▌                               | 20 kB 22.0 MB/s eta 0:00:01[K     |▊                               | 30 kB 24.3 MB/s eta 0:00:01[K     |█                               | 40 kB 14.3 MB/s eta 0:00:01[K     |█▏                              | 51 kB 11.9 MB/s eta 0:00:01[K     |█▍                              | 61 kB 13.7 MB/s eta 0:00:01[K     |█▋                              | 71 kB 12.1 MB/s eta 0:00:01[K     |█▉                              | 81 kB 13.1 MB/s eta 0:00:01[K     |██                              | 92 kB 14.4 MB/s eta 0:00:01[K     |██▎                             | 102 kB 13.3 MB/s eta 0:00:01[K     |██▌                             | 112 kB 13.3 MB/s eta 0:00:01[K     |██▊                             | 122 kB 13.3 MB/s eta 0:00:01[K     |███                             | 133 kB 13.3 MB/s eta 0:00:01[K    

In [2]:
import numpy as np 
import torch
import torch.nn as nn
import torch.nn.functional as F
from google.colab import files
import bcolz
import pickle
from torch.utils.data import TensorDataset, DataLoader

import csv
import json
import pandas as pd
from collections import Counter
import spacy
import re
nlp = spacy.load('en_core_web_sm')
#stopwords = nlp.Defaults.stop_words

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
path = '/content/drive/MyDrive/505/project/'

data = pd.read_csv(f'{path}/data.csv')
print(data.shape)

(96562, 9)


In [30]:
# Train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data['retweets'], train_size = 71680, random_state=42)
print(len(X_train), len(X_test))

71680 24882


In [33]:
X_train_tweet = X_train.tweet
X_test_tweet = X_test.tweet

X_train_meta = X_train.iloc[:,:-1]
X_test_meta = X_test.iloc[:,:-1]

In [49]:
# Building vocabulary for training data
def word_count(data):
  words_counter = Counter()
  for line in data:
    words =  line.split()
    for w in words:
      words_counter.update([w])
  
  words_counter_clean = {k:v for k,v in words_counter.items() if v > 1} # Removing the words that only appear once
  sorted_words = sorted(words_counter_clean, key = words_counter_clean.get, reverse = True) # Sorting the words frequency in desc order
  sorted_words = ['UNK','PAD', '<s>', '</s>' ] + sorted_words 

  return words_counter, words_counter_clean, sorted_words
  
words_counter, words_counter_clean, sorted_words = word_count(X_train_tweet)

In [50]:
# Not using slicing window padding
def padding(data, seq_len):
  sequences = []
  for line in data:
    line = f"{'<s>'} {line} {'</s>'}"
    n_token = len(line.split())
    
    if n_token >= seq_len:
      seq = line.split()[:seq_len] 
      sequences.append(" ".join(seq))

    else:
      seq = line.split()
      for i in range(seq_len - n_token):
          seq.append('PAD')
      sequences.append(" ".join(seq))
  return sequences

X_train_pad = padding(X_train_tweet, 20)
X_test_pad = padding(X_test_tweet, 20)

In [51]:
# replace the words that only appear once with UNKNOWN
def generate_sentence(data):
  sequences = []
  for line in data:
    temp = []
    words = line.split()
    for word in words:
      if word in sorted_words:
        temp.append(word)
      else:
        temp.append('UNK')
    sequences.append(" ".join(temp))
  return sequences

X_train_final = generate_sentence(X_train_pad)
X_test_final = generate_sentence(X_test_pad)

In [52]:
# Using tweets training data vocabulary

# Dictionaries to store the word to index mappings and vice versa
word2idx = {o:i for i,o in enumerate(sorted_words)}
idx2word = {i:o for i,o in enumerate(sorted_words)}


# convert text sequences to integer sequences
X_train_int = np.zeros((len(X_train_final), 20), dtype = int)
for i, data in enumerate(X_train_final):
  X_train_int[i] = [word2idx[w] for w in data.split()]

X_test_int = np.zeros((len(X_test_final), 20), dtype = int)
for i, data in enumerate(X_test_final):
  X_test_int[i] = [word2idx[w] for w in data.split()]



In [53]:
# convert lists to numpy arrays
X_train_int = np.array(X_train_int)
y_train_int = np.array(y_train)

X_test_int = np.array(X_test_int)
y_test_int = np.array(y_test)

In [54]:
from torch.utils.data import DataLoader, TensorDataset

# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(X_train_int), torch.from_numpy(X_train_meta.to_numpy()), torch.from_numpy(y_train_int))
test_data = TensorDataset(torch.from_numpy(X_test_int),torch.from_numpy(X_test_meta.to_numpy()), torch.from_numpy(y_test_int))

# dataloaders
batch_size = 256

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

# Glove Embeddings

In [None]:
vectors = bcolz.open(f'{glove_path}/6B.100.dat')[:]
words = pickle.load(open(f'{glove_path}/6B.100_words.pkl', 'rb'))
words += ['<UNK>', '<s>', '</s>', 'PAD']
vocab_list_glove = set(words)
new_vecs = np.random.normal(loc=0.0, scale=.6, size=(4,100) )
vectors = np.vstack((vectors, new_vecs))
word2idx = pickle.load(open(f'{glove_path}/6B.100_idx.pkl', 'rb'))
word2idx['<UNK>'] = 400000
word2idx['<s>'] = 400001
word2idx['</s>'] = 400002
word2idx['PAD'] = 400003

In [None]:
# Using glove weights
glove = {w: vectors[word2idx[w]] for w in words}
matrix_len = len(sorted_words)
weights_matrix = np.zeros((matrix_len, 100))
words_found = 0

for i, word in enumerate(sorted_words):
  try: 
    weights_matrix[i] = glove[word] # if alr in the vocab, load its pre-trained word vector.
    words_found += 1
  except KeyError:
    weights_matrix[i] = np.random.normal(scale=0.6, size=(100, ))

# Neural Net

### Retweet Network: Takes in a tweet as input, can use embedded version, and can any combination of bidirectional, LSTM, GRU, concatenates it with metadata vector, and uses a feedforward neural net with 1 hidden layer to perform a regression prediction on the retweet count. 

#### Parameter custom_embeddings is either a tuple: (weight_matrix , none_trainable), or None.
#### none_trainable is either True or False or Nothing

In [56]:
def create_emb_layer(weights_matrix, non_trainable=False):
  num_embeddings, embedding_dim = weights_matrix.shape
  emb_layer = nn.Embedding(num_embeddings, embedding_dim)
  emb_layer.load_state_dict({'weight': torch.from_numpy(weights_matrix)})
  if non_trainable:
      emb_layer.weight.requires_grad = False
  return emb_layer, num_embeddings, embedding_dim

class RetweetNet(nn.Module):
  def __init__(self, vocab_size, hidden_state_sizes, meta_data_len, output_size, embedding_dim, hidden_dim, 
                 n_layers, drop_prob=0.5, custom_embeddings = None, bidirectional = False, GRU = False):
    super().__init__()
    self.GRU_val = GRU
    self.bidirectional = bidirectional
    self.output_size = output_size
    self.n_layers = n_layers
    self.hidden_dim = hidden_dim
        
    if custom_embeddings is None: 
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
    else: 
        assert len(custom_embeddings) == 2 and isinstance(custom_embeddings, tuple), "custom embeddings must be of form: (weight_matrix, non_trainable)"
        weights_matrix, non_trainable = custom_embeddings
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, non_trainable)
        
    if GRU == False: 
        self.Gate = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True, bidirectional = bidirectional)
    else: 
        self.Gate = nn.GRU(embedding_dim, hidden_dim, n_layers, 
                              dropout=drop_prob, batch_first=True, bidirectional = bidirectional)
    self.dropout = nn.Dropout(0.2)
    self.fc1 = nn.Linear(hidden_dim, hidden_state_sizes[0])
    self.relu = nn.ReLU()
        
    #hidden_state_sizes[0] is the size of the output of lstm 
    self.fc2 = nn.Linear(hidden_state_sizes[0] + meta_data_len, hidden_state_sizes[1])
        
    #hidden_state_sizes[1] is the size of the first and only hidden layer
    self.fc3 = nn.Linear(hidden_state_sizes[1], 1)

        
  def forward(self, x, meta_data, hidden):
    batch_size = x.size(0)
    x = x.long()
    embeds = self.embedding(x)
    gru_out, hidden = self.Gate(embeds, hidden)
    gru_out = gru_out.contiguous().view(-1, self.hidden_dim)
    
    out = self.dropout(gru_out)
    out = self.fc1(out)

    out = out.view(batch_size, -1, self.hidden_dim)
    out = out[:,-1, :] 

    # combine hidden state and meta_data
    ################# ################# #################
    #out = torch.cat((out, meta_data), dim = 1) #meta_data is of shape (batch_size, -1)
        
    out = self.fc2(out)
        
    # applying dropout before relu since relu already sets some neurons to 0
    out = self.dropout(out)
    out = self.relu(out)
    out = self.fc3(out)
        
    return out, hidden
    
  def init_hidden(self, batch_size):
    weight = next(self.parameters()).data
    n = 1
    if self.bidirectional == True: 
      n = 2
    if self.GRU_val == False:
      return (weight.new(self.n_layers * n, batch_size, self.hidden_dim).zero_().to('cuda'),
              weight.new(self.n_layers * n, batch_size, self.hidden_dim).zero_().to('cuda'))
    else:
      return  weight.new(self.n_layers * n, batch_size, self.hidden_dim).zero_().to('cuda')


def train_retweet_predictor(model, epochs = 100, print_every = 1000, clip = 5, valid_loss_min = np.Inf, lr=0.005, batch_size = 400, device = 'cuda', GRU = False, weight_decay = 1e-5): 
  counter = 0
  model.train()
    
  criterion = nn.MSELoss()
    
  # weight decay is the l2 regularization penalty 
  optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    
  for i in range(epochs):
    h = model.init_hidden(batch_size)
    for tweets, meta_data, labels in train_loader:
      counter += 1
      if GRU == False: 
        h = tuple([each.data for each in h])
      else:
        h = h.data
      tweets, meta_data, labels = tweets.to(device), meta_data.to(device), labels.to(device)
      model.zero_grad()
      output, h = model(tweets, meta_data, h)
      loss = criterion(output.squeeze(), labels.float())
      loss.backward()
      nn.utils.clip_grad_norm_(model.parameters(), clip)
      optimizer.step()
      print("Epoch: {}/{}...".format(i+1, epochs),
            "Step: {}...".format(counter),
            "Loss: {:.6f}...".format(loss.item()))
            


In [57]:
vocab_size = len(sorted_words)
output_size = len(X_train_int)
embedding_dim = 100
hidden_dim = 256
n_layers = 2

net = RetweetNet(vocab_size = vocab_size, hidden_state_sizes = [256,128], meta_data_len = 0, output_size = output_size, embedding_dim = embedding_dim, hidden_dim  = hidden_dim, 
                 n_layers =n_layers, GRU = True, bidirectional = True)
net.cuda()

RetweetNet(
  (embedding): Embedding(15336, 100)
  (Gate): GRU(100, 256, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=256, out_features=256, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=1, bias=True)
)

In [58]:
train_retweet_predictor(net, epochs = 10, batch_size = 256, device = 'cuda', lr = 1e-06,GRU = True)

Epoch: 1/10... Step: 1... Loss: 857.322632...
Epoch: 1/10... Step: 2... Loss: 214.468567...
Epoch: 1/10... Step: 3... Loss: 47.391979...
Epoch: 1/10... Step: 4... Loss: 117.921532...
Epoch: 1/10... Step: 5... Loss: 88.494453...
Epoch: 1/10... Step: 6... Loss: 68.248985...
Epoch: 1/10... Step: 7... Loss: 59.379642...
Epoch: 1/10... Step: 8... Loss: 86.315208...
Epoch: 1/10... Step: 9... Loss: 38.278004...
Epoch: 1/10... Step: 10... Loss: 12552.679688...
Epoch: 1/10... Step: 11... Loss: 19.119463...
Epoch: 1/10... Step: 12... Loss: 267.201752...
Epoch: 1/10... Step: 13... Loss: 17106.505859...
Epoch: 1/10... Step: 14... Loss: 68.908417...
Epoch: 1/10... Step: 15... Loss: 129.118210...
Epoch: 1/10... Step: 16... Loss: 258.386536...
Epoch: 1/10... Step: 17... Loss: 51.409512...
Epoch: 1/10... Step: 18... Loss: 143.212906...
Epoch: 1/10... Step: 19... Loss: 5764.625000...
Epoch: 1/10... Step: 20... Loss: 41.276821...
Epoch: 1/10... Step: 21... Loss: 54.114433...
Epoch: 1/10... Step: 22... L

### Visualize RetweetNet

In [59]:
!pip install torchviz

Collecting torchviz
  Downloading torchviz-0.0.2.tar.gz (4.9 kB)
Building wheels for collected packages: torchviz
  Building wheel for torchviz (setup.py) ... [?25l[?25hdone
  Created wheel for torchviz: filename=torchviz-0.0.2-py3-none-any.whl size=4150 sha256=0108fda8d4a39f8e36420853eb1fcd356cf1ac971fcd7e5b8c065ef75d9066f5
  Stored in directory: /root/.cache/pip/wheels/04/38/f5/dc4f85c3909051823df49901e72015d2d750bd26b086480ec2
Successfully built torchviz
Installing collected packages: torchviz
Successfully installed torchviz-0.0.2


In [60]:
dummy_x = torch.from_numpy(np.zeros((256,20)).astype(np.int64)).to('cuda')
dummy_meta = torch.from_numpy(np.zeros((256,8)).astype(np.int64)).to('cuda')
dummy_hidden = torch.from_numpy(np.zeros((2, 256, 256)).astype(np.float32)).to('cuda')

In [None]:
from torchviz import make_dot

yhat = net(dummy_x, dummy_meta, dummy_hidden)
make_dot(yhat, params = dict(list(net.named_parameters()) ), show_attrs=True,show_saved=True).render('something', format = 'png')

In [62]:
!pip install hiddenlayer

Collecting hiddenlayer
  Downloading hiddenlayer-0.3-py3-none-any.whl (19 kB)
Installing collected packages: hiddenlayer
Successfully installed hiddenlayer-0.3


In [None]:
import hiddenlayer as hl

transforms = [ hl.transforms.Prune('Constant') ] # Removes Constant nodes from graph.

graph = hl.build_graph(net, (dummy_x, dummy_meta, dummy_hidden), transforms=transforms)
graph.theme = hl.graph.THEMES['blue'].copy()
graph.save('rnn_hiddenlayer', format='png')

In [None]:
def error_retweet_predictor(model, batch_size = 359, device = 'cuda', GRU = False): 
  test_losses = []
  num_correct = []
  model.cuda()

  h = model.init_hidden(batch_size)
  criterion = nn.MSELoss()

  model.eval()
  for tweets, meta_data, labels in test_loader:
    if GRU == True: 
      h = h.data
    else: 
      h = tuple([each.data for each in h])
    tweets, meta_data, labels = tweets.to(device), meta_data.to(device), labels.to(device)
    output, h = model(tweets, meta_data, h)
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    pred = torch.round(output.squeeze())
        
    errors = torch.sum(torch.square(pred - labels.float().view_as(pred)), axis= 1)/(predicted_x.size()[0]
    num_correct.append(np.squeeze(errors.cpu().numpy()))    
  print("Test loss: {:.3f}".format(np.mean(test_losses)))
  print("Test accuracy: {:.3f}%".format(np.mean(num_correct)))