In [None]:
!pip install bcolz



In [None]:
import numpy as np 
import torch
import torch.nn as nn
import torch.nn.functional as F
from google.colab import files
import bcolz
import pickle
from torch.utils.data import TensorDataset, DataLoader

import csv
import json
import pandas as pd
from collections import Counter
import spacy
import re
nlp = spacy.load('en_core_web_sm')
#stopwords = nlp.Defaults.stop_words

In [None]:
# Mount Google Drive at /content/drive so the project files read below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Project folder on the mounted Drive.
path = '/content/drive/MyDrive/505/project/'

# Read the tweet table and report its (rows, columns) shape.
csv_location = f'{path}/data.csv'
data = pd.read_csv(csv_location)
print(data.shape)

(96562, 9)


In [None]:
# Whitespace-token count of every tweet (each cell is stringified first).
tweets_count = [len(str(item).split()) for item in data['tweet']]

data['tweet_len'] = tweets_count
len(tweets_count)


96562

In [None]:
from collections import Counter

# Value distribution of each media-flag column.
for media_col in ('video', 'photo', 'animated_gif'):
  print(f'{media_col}:', Counter(data[media_col]))

video: Counter({0: 96562})
photo: Counter({0: 88522, 1: 8040})
animated_gif: Counter({0: 96562})


In [None]:
# Columns kept for modelling. Later cells slice columns by position, so the
# order matters: metadata first, then 'tweet', then the target 'retweets'.
needed = ['user_friends_count', 'user_followers_count', 'favorite_count',
       'user_statuses_count', 'tweet','retweets']

# Keep tweets with more than one token, restricted to the needed columns.
has_several_tokens = data['tweet_len'] > 1
data = data.loc[has_several_tokens, needed]
df = data.iloc[:96000]
print(df.shape)


(96000, 6)


In [None]:
# Preview the first two rows of the modelling frame.
df.head(2)

Unnamed: 0,user_friends_count,user_followers_count,favorite_count,user_statuses_count,tweet,retweets
0,13723,120597,0,153810,<user> hi no you can not withdraw funds but yo...,0
1,13723,120597,0,153810,<user> hi can you please share more informatio...,0


In [None]:
# Ten most frequent retweet counts, as (value, occurrences) pairs.
Counter(df.retweets).most_common(10)


[(0, 64830),
 (1, 8133),
 (2, 4240),
 (3, 3280),
 (4, 2523),
 (5, 2007),
 (6, 1506),
 (7, 1226),
 (8, 932),
 (9, 801)]

In [None]:
# Split rows into "never retweeted" and "retweeted at least once".
zero_retweets = df['retweets'] == 0
class_0 = df[zero_retweets]
class_not0 = df[~zero_retweets]

In [None]:
# Undersample the zero-retweet class to 8133 rows (the size of the 1-retweet
# group in the distribution printed earlier). The fixed seed makes the sample,
# and therefore every result derived from it, reproducible across kernel
# restarts; 42 matches the seed used for the train/test split.
class_0_under = class_0.sample(8133, random_state=42)
print(class_0_under.shape)

(8133, 6)


In [None]:
# Stack the undersampled zero class on top of the non-zero rows (row-wise concat).
balanced_parts = (class_0_under, class_not0)
df2 = pd.concat(balanced_parts, axis=0)
df2.shape

(39303, 6)

In [None]:
# Keep the first 32000 rows of the concatenated frame.
# NOTE(review): df2 is not shuffled before this slice, so the rows dropped are
# all taken from the tail of the non-zero-retweet block — confirm this is intended.
df3 = df2.iloc[:32000]

In [None]:
# Train test split
from sklearn.model_selection import train_test_split

# Every column except the target is a feature; 'retweets' is the regression target.
feature_frame = df3.drop(columns='retweets')
target = df3['retweets']
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, target, test_size=0.2, random_state=42)
print(len(X_train), len(X_test))

25600 6400


In [None]:
# Text column fed to the sequence model.
X_train_tweet = X_train['tweet']
X_test_tweet = X_test['tweet']

# Everything except the tweet text: the numeric metadata columns.
X_train_meta = X_train.drop(columns='tweet')
X_test_meta = X_test.drop(columns='tweet')

In [None]:
# Building vocabulary for training data
def word_count(data):
  """Count whitespace tokens and build the frequency-ordered vocabulary.

  Args:
    data: iterable of tweets; every item is converted with ``str`` before
      being split on whitespace.

  Returns:
    A tuple ``(words_counter, words_counter_clean, sorted_words)``:
      * words_counter: ``Counter`` of every token.
      * words_counter_clean: dict of the tokens seen more than once.
      * sorted_words: the reserved tokens ``'UNK', 'PAD', '<s>', '</s>'``
        followed by the kept tokens, most frequent first (ties keep
        first-seen order because the sort is stable).
  """
  special_tokens = ['UNK', 'PAD', '<s>', '</s>']

  words_counter = Counter()
  for line in data:
    # One update per line instead of one per word.
    words_counter.update(str(line).split())

  # Removing the words that only appear once
  words_counter_clean = {k: v for k, v in words_counter.items() if v > 1}
  # Sorting the words by frequency in descending order
  by_frequency = sorted(words_counter_clean, key=words_counter_clean.get, reverse=True)
  # A corpus token that equals a reserved token must not be listed twice,
  # otherwise the vocabulary holds duplicates and index lookups built from
  # it silently point at the later copy.
  sorted_words = special_tokens + [w for w in by_frequency if w not in special_tokens]

  return words_counter, words_counter_clean, sorted_words
  
# Build the vocabulary from the training tweets only.
words_counter, words_counter_clean, sorted_words = word_count(X_train_tweet)

In [None]:
# Fixed-length padding (no sliding window)
def padding(data, seq_len):
  """Wrap each line in <s> ... </s> and force it to exactly ``seq_len`` tokens.

  Lines with at least ``seq_len`` tokens (markers included) are cut to the
  first ``seq_len`` tokens; shorter lines are right-filled with 'PAD'.
  Returns a list of space-joined token strings, one per input line.
  """
  padded = []
  for line in data:
    tokens = f"<s> {line} </s>".split()
    missing = seq_len - len(tokens)
    if missing > 0:
      tokens = tokens + ['PAD'] * missing
    else:
      tokens = tokens[:seq_len]
    padded.append(" ".join(tokens))
  return padded

# Pad or truncate every tweet to 20 tokens, counting the <s> / </s> markers.
X_train_pad = padding(X_train_tweet, 20)
X_test_pad = padding(X_test_tweet, 20)

In [None]:
# replace the words that are not in the vocabulary with UNKNOWN
def generate_sentence(data, vocab=None):
  """Replace every out-of-vocabulary token with 'UNK'.

  Args:
    data: iterable of space-separated token strings.
    vocab: iterable of known tokens. Defaults to the notebook-level
      ``sorted_words`` so existing one-argument calls keep working.

  Returns:
    List of space-joined strings, one per input line, with the same number
    of tokens as the input line.
  """
  if vocab is None:
    vocab = sorted_words
  # Build the lookup once: testing membership in a list scans the whole
  # vocabulary for every token, a set answers in constant time.
  known = set(vocab)

  sequences = []
  for line in data:
    sequences.append(" ".join(w if w in known else 'UNK' for w in line.split()))
  return sequences

# Replace out-of-vocabulary tokens with 'UNK' in both splits.
X_train_final = generate_sentence(X_train_pad)
X_test_final = generate_sentence(X_test_pad)

In [None]:
# Using tweets training data vocabulary

# Dictionaries to store the word to index mappings and vice versa
word2idx = {o:i for i,o in enumerate(sorted_words)}
idx2word = {i:o for i,o in enumerate(sorted_words)}

# Token count of every padded sentence; must equal the length passed to padding().
SEQ_LEN = 20


def encode_sentences(sentences, mapping, seq_len):
  """Turn space-separated sentences into an integer matrix.

  Args:
    sentences: list of strings, each holding exactly ``seq_len`` tokens.
    mapping: dict from token to integer index.
    seq_len: number of tokens (columns) per sentence.

  Returns:
    ``np.ndarray`` of shape ``(len(sentences), seq_len)`` with integer dtype.

  Raises:
    KeyError: if a token is missing from ``mapping``.
    ValueError: if a sentence does not hold ``seq_len`` tokens.
  """
  encoded = np.zeros((len(sentences), seq_len), dtype = int)
  # The loop variable is deliberately not called `data`: that name is the
  # DataFrame loaded at the top of the notebook and must not be rebound.
  for row, sentence in enumerate(sentences):
    encoded[row] = [mapping[w] for w in sentence.split()]
  return encoded


# convert text sequences to integer sequences
X_train_int = encode_sentences(X_train_final, word2idx, SEQ_LEN)
X_test_int = encode_sentences(X_test_final, word2idx, SEQ_LEN)



In [None]:
# Materialise features and targets as NumPy arrays (np.array copies its input).
X_train_int, X_test_int = np.array(X_train_int), np.array(X_test_int)
y_train_int, y_test_int = np.array(y_train), np.array(y_test)

In [None]:
from torch.utils.data import DataLoader, TensorDataset


def _to_dataset(tokens, meta, targets):
  """Bundle token ids, the metadata frame and the targets into one TensorDataset."""
  return TensorDataset(torch.from_numpy(tokens),
                       torch.from_numpy(meta.to_numpy()),
                       torch.from_numpy(targets))


# create Tensor datasets
train_data = _to_dataset(X_train_int, X_train_meta, y_train_int)
test_data = _to_dataset(X_test_int, X_test_meta, y_test_int)

# dataloaders
batch_size = 256

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# The evaluation loop hands the hidden state from one batch to the next, so a
# shuffled test loader makes the reported test metrics depend on the random
# batch order. A fixed order keeps evaluation reproducible.
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

# Glove Embeddings

In [None]:
# NOTE(review): `glove_path` is not assigned in any cell visible above — it must
# be defined before this cell for a clean "Restart & Run All".
vectors = bcolz.open(f'{glove_path}/6B.100.dat')[:]

# pickle.load can execute arbitrary code: only load files you created or trust.
# The `with` blocks close the file handles that a bare open() would leak.
with open(f'{glove_path}/6B.100_words.pkl', 'rb') as words_file:
  words = pickle.load(words_file)
# NOTE: this rebinds `word2idx`, replacing the tweet-vocabulary mapping built
# earlier; from here on `word2idx` indexes rows of `vectors`.
with open(f'{glove_path}/6B.100_idx.pkl', 'rb') as idx_file:
  word2idx = pickle.load(idx_file)

# Tokens that get a fresh random vector appended after the pre-trained rows.
extra_tokens = ['<UNK>', '<s>', '</s>', 'PAD']
first_new_row = len(vectors)  # index of the first appended row

words += extra_tokens
vocab_list_glove = set(words)
new_vecs = np.random.normal(loc=0.0, scale=.6, size=(len(extra_tokens), vectors.shape[1]))
vectors = np.vstack((vectors, new_vecs))
for offset, token in enumerate(extra_tokens):
  # Derived from the actual row count instead of hard-coding 400000..400003.
  word2idx[token] = first_new_row + offset

In [None]:
# Using glove weights
# token -> vector lookup over the whole GloVe vocabulary
glove = {w: vectors[word2idx[w]] for w in words}
matrix_len = len(sorted_words)
weights_matrix = np.zeros((matrix_len, 100))
words_found = 0

# One row per vocabulary entry: the pre-trained vector when GloVe knows the
# token, a random normal vector otherwise.
for i, word in enumerate(sorted_words):
  pretrained = glove.get(word)
  if pretrained is None:
    weights_matrix[i] = np.random.normal(scale=0.6, size=(100, ))
  else:
    weights_matrix[i] = pretrained
    words_found += 1

# Neural Net

### Retweet Network: takes a tweet as input (optionally through pre-trained embeddings), encodes it with an LSTM or a GRU that can be unidirectional or bidirectional, concatenates the encoding with the metadata vector, and uses a feedforward neural net with one hidden layer to predict the retweet count as a regression target.

#### Parameter `custom_embeddings` is either a tuple `(weights_matrix, non_trainable)` or `None`.
#### `non_trainable` is either `True` or `False` (it defaults to `False` in `create_emb_layer`).

In [None]:
def create_emb_layer(weights_matrix, non_trainable=False):
  """Build an ``nn.Embedding`` initialised from a NumPy weight matrix.

  Args:
    weights_matrix: 2-D array of shape (num_embeddings, embedding_dim).
    non_trainable: when truthy, the embedding weights are frozen.

  Returns:
    (embedding_layer, num_embeddings, embedding_dim)
  """
  vocab_rows, vector_dim = weights_matrix.shape
  layer = nn.Embedding(vocab_rows, vector_dim)
  # Copy the values into the existing parameter (cast to its dtype).
  with torch.no_grad():
    layer.weight.copy_(torch.from_numpy(weights_matrix))
  layer.weight.requires_grad = not non_trainable
  return layer, vocab_rows, vector_dim

class RetweetNet(nn.Module):
  """Recurrent regressor that predicts a retweet count from a tweet.

  Pipeline: embedding -> LSTM/GRU (optionally bidirectional) -> last-step
  summary -> fc1 -> optional concatenation with metadata -> fc2 -> ReLU -> fc3.

  Args:
    vocab_size: number of embedding rows when ``custom_embeddings`` is None.
    hidden_state_sizes: ``[fc1_out, fc2_out]`` — size of the sequence
      encoding and of the single hidden layer of the regression head.
    meta_data_len: number of metadata features concatenated to the sequence
      encoding. With 0 the ``meta_data`` argument of ``forward`` is ignored.
    output_size: stored on the instance only; the head always emits 1 value.
    embedding_dim: embedding width (overridden by ``custom_embeddings``).
    hidden_dim: hidden size of the recurrent layer, per direction.
    n_layers: number of stacked recurrent layers.
    drop_prob: dropout between stacked recurrent layers.
    custom_embeddings: ``None`` or a tuple ``(weights_matrix, non_trainable)``.
    bidirectional: run the recurrent layer in both directions.
    GRU: use ``nn.GRU`` instead of ``nn.LSTM``.
  """
  def __init__(self, vocab_size, hidden_state_sizes, meta_data_len, output_size, embedding_dim, hidden_dim, 
                 n_layers, drop_prob=0.5, custom_embeddings = None, bidirectional = False, GRU = False):
    super().__init__()
    self.GRU_val = GRU
    self.bidirectional = bidirectional
    self.output_size = output_size
    self.n_layers = n_layers
    self.hidden_dim = hidden_dim
    self.meta_data_len = meta_data_len
    self.num_directions = 2 if bidirectional else 1

    if custom_embeddings is None:
      self.embedding = nn.Embedding(vocab_size, embedding_dim)
    else:
      assert len(custom_embeddings) == 2 and isinstance(custom_embeddings, tuple), "custom embeddings must be of form: (weight_matrix, non_trainable)"
      weights_matrix, non_trainable = custom_embeddings
      self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, non_trainable)

    rnn_cls = nn.LSTM if GRU == False else nn.GRU
    self.Gate = rnn_cls(embedding_dim, hidden_dim, n_layers, dropout=drop_prob,
                        batch_first=True, bidirectional=bidirectional)
    self.dropout = nn.Dropout(0.2)
    # A bidirectional layer emits forward and backward features side by side,
    # so the encoder output is hidden_dim * num_directions wide.
    self.fc1 = nn.Linear(hidden_dim * self.num_directions, hidden_state_sizes[0])
    self.relu = nn.ReLU()

    # hidden_state_sizes[0] is the size of the sequence encoding
    self.fc2 = nn.Linear(hidden_state_sizes[0] + meta_data_len, hidden_state_sizes[1])

    # hidden_state_sizes[1] is the size of the first and only hidden layer
    self.fc3 = nn.Linear(hidden_state_sizes[1], 1)

  def forward(self, x, meta_data, hidden):
    """Run one batch.

    Args:
      x: integer token ids, (batch, seq_len).
      meta_data: (batch, meta_data_len) features; ignored when the model was
        built with ``meta_data_len == 0``.
      hidden: initial recurrent state as returned by ``init_hidden``.

    Returns:
      ``(out, hidden)`` where ``out`` is (batch, 1) and ``hidden`` is the
      final recurrent state.
    """
    embeds = self.embedding(x.long())
    rnn_out, hidden = self.Gate(embeds, hidden)  # (batch, seq, num_directions * hidden_dim)

    if self.num_directions == 2:
      # The forward direction has read the whole sequence at the last step,
      # the backward direction at the first step.
      summary = torch.cat((rnn_out[:, -1, :self.hidden_dim],
                           rnn_out[:, 0, self.hidden_dim:]), dim=1)
    else:
      summary = rnn_out[:, -1, :]

    out = self.fc1(self.dropout(summary))

    # combine sequence encoding and meta_data
    if self.meta_data_len > 0:
      # NOTE(review): metadata is concatenated as given (cast to float); it is
      # not rescaled here — consider normalising it upstream.
      out = torch.cat((out, meta_data.to(out.dtype)), dim=1)

    out = self.fc2(out)

    # applying dropout before relu since relu already sets some neurons to 0
    out = self.dropout(out)
    out = self.relu(out)
    out = self.fc3(out)

    return out, hidden

  def init_hidden(self, batch_size):
    """Return a zero initial state on the same device/dtype as the model.

    LSTM models get a ``(h0, c0)`` tuple, GRU models a single tensor; each
    tensor is (n_layers * num_directions, batch_size, hidden_dim).
    """
    reference = next(self.parameters())
    shape = (self.n_layers * self.num_directions, batch_size, self.hidden_dim)

    def zeros():
      # Follow the model's device instead of assuming 'cuda'.
      return torch.zeros(shape, dtype=reference.dtype, device=reference.device)

    if self.GRU_val == False:
      return (zeros(), zeros())
    return zeros()


def train_retweet_predictor(model, epochs = 100, print_every = 200, clip = 5, valid_loss_min = np.inf, lr=0.005, batch_size = 400, device = 'cuda', GRU = False, weight_decay = 1e-5, loader = None):
  """Train ``model`` with MSE loss and Adam.

  Args:
    model: network exposing ``init_hidden(batch_size)`` and
      ``forward(tweets, meta_data, hidden) -> (output, hidden)``.
    epochs: number of passes over the loader.
    print_every: log the current batch loss every this many steps. The
      default is 200 because the previous implementation overwrote the
      argument with 200; the argument is now honoured.
    clip: max gradient norm.
    valid_loss_min: unused; kept for backward compatibility.
    lr: Adam learning rate.
    batch_size: batch size used for the initial hidden state of each epoch.
    device: device the batches are moved to.
    GRU: unused; the hidden-state type is detected from the state itself.
      Kept for backward compatibility.
    weight_decay: L2 regularization penalty passed to Adam.
    loader: DataLoader yielding ``(tweets, meta_data, labels)``. Defaults to
      the notebook-level ``train_loader``.

  Returns:
    List with one dict per epoch: ``{'epoch': int, 'Training Loss': float}``
    (mean batch loss of that epoch).
  """
  if loader is None:
    loader = train_loader

  def fresh_hidden(n):
    # New zero state on the training device, for LSTM (tuple) or GRU (tensor).
    state = model.init_hidden(n)
    if isinstance(state, tuple):
      return tuple(s.to(device) for s in state)
    return state.to(device)

  counter = 0
  model.train()

  criterion = nn.MSELoss()

  # weight decay is the l2 regularization penalty
  optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

  training_stats = []
  for epoch in range(epochs):
    total_train_loss = 0.0
    n_batches = 0

    h = fresh_hidden(batch_size)
    for tweets, meta_data, labels in loader:
      counter += 1
      n_batches += 1
      tweets, meta_data, labels = tweets.to(device), meta_data.to(device), labels.to(device)

      carried = h[0].size(1) if isinstance(h, tuple) else h.size(1)
      if carried != tweets.size(0):
        # Ragged (usually last) batch: the carried state has the wrong batch
        # dimension and would make the recurrent layer raise.
        h = fresh_hidden(tweets.size(0))
      elif isinstance(h, tuple):
        # Detach so gradients do not flow into previous batches.
        h = tuple(each.detach() for each in h)
      else:
        h = h.detach()

      model.zero_grad()
      output, h = model(tweets, meta_data, h)
      # squeeze only the feature axis so a batch of one keeps its batch axis
      loss = criterion(output.squeeze(-1), labels.float())
      loss.backward()
      total_train_loss += loss.item()
      nn.utils.clip_grad_norm_(model.parameters(), clip)
      optimizer.step()

      if counter % print_every == 0:
        print("Epoch: {}/{}...".format(epoch+1, epochs),
            "Step: {}...".format(counter),
            "Loss: {:.6f}...".format(loss.item()))

    avg_train_loss = total_train_loss / n_batches if n_batches else float('nan')
    training_stats.append({'epoch': epoch + 1, 'Training Loss': avg_train_loss})

  return training_stats
      



In [None]:
vocab_size = len(sorted_words)
# The network predicts one value per tweet (its last layer has a single output
# unit); this was previously set to the number of training rows.
output_size = 1
embedding_dim = 100
hidden_dim = 256
n_layers = 2

# Bidirectional 2-layer GRU; meta_data_len = 0 means the metadata is not fed to the head.
net = RetweetNet(
    vocab_size = vocab_size,
    hidden_state_sizes = [256,128],
    meta_data_len = 0,
    output_size = output_size,
    embedding_dim = embedding_dim,
    hidden_dim  = hidden_dim,
    n_layers = n_layers,
    GRU = True,
    bidirectional = True,
)
net.cuda()

RetweetNet(
  (embedding): Embedding(10429, 100)
  (Gate): GRU(100, 256, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=256, out_features=256, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=1, bias=True)
)

In [None]:
# Train the GRU model for 10 epochs with batches of 256 on the GPU.
# NOTE(review): the logged batch loss below ranges from ~1e2 to ~9e4 — the
# target is the raw retweet count; confirm whether it should be rescaled.
train_retweet_predictor(net, epochs = 10, batch_size = 256, device = 'cuda', lr = 1e-06,GRU = True)

Epoch: 2/10... Step: 200... Loss: 165.845245...
Epoch: 4/10... Step: 400... Loss: 2252.553223...
Epoch: 6/10... Step: 600... Loss: 745.528320...
Epoch: 8/10... Step: 800... Loss: 664.221497...
Epoch: 10/10... Step: 1000... Loss: 92333.000000...


In [None]:
def error_retweet_predictor(model, batch_size = 359, device = 'cuda', GRU = False, loader = None):
  """Evaluate ``model`` and print/return its MSE and exact-match accuracy.

  A prediction counts as correct when, rounded to the nearest integer, it
  equals the label.

  Args:
    model: network exposing ``init_hidden(batch_size)`` and
      ``forward(tweets, meta_data, hidden) -> (output, hidden)``.
    batch_size: batch size used for the initial hidden state.
    device: device the model and the batches are moved to.
    GRU: unused; the hidden-state type is detected from the state itself.
      Kept for backward compatibility.
    loader: DataLoader yielding ``(tweets, meta_data, labels)``. Defaults to
      the notebook-level ``test_loader``.

  Returns:
    ``(mean_batch_loss, accuracy)`` as floats.
  """
  if loader is None:
    loader = test_loader

  def fresh_hidden(n):
    # New zero state on the evaluation device, for LSTM (tuple) or GRU (tensor).
    state = model.init_hidden(n)
    if isinstance(state, tuple):
      return tuple(s.to(device) for s in state)
    return state.to(device)

  test_losses = []
  num_correct = 0
  # Honour `device` instead of always moving the model to CUDA.
  model.to(device)
  criterion = nn.MSELoss()

  model.eval()
  # No gradients are needed for evaluation; skipping them saves memory and time.
  with torch.no_grad():
    h = fresh_hidden(batch_size)
    for tweets, meta_data, labels in loader:
      tweets, meta_data, labels = tweets.to(device), meta_data.to(device), labels.to(device)

      carried = h[0].size(1) if isinstance(h, tuple) else h.size(1)
      if carried != tweets.size(0):
        # Ragged (usually last) batch: rebuild the state with the right size.
        h = fresh_hidden(tweets.size(0))

      output, h = model(tweets, meta_data, h)
      # squeeze only the feature axis so a batch of one keeps its batch axis
      preds = output.squeeze(-1)
      targets = labels.float()
      test_losses.append(criterion(preds, targets).item())
      num_correct += int(torch.round(preds).eq(targets).sum().item())

  n_samples = len(loader.dataset)
  test_acc = num_correct / n_samples
  mean_loss = float(np.mean(test_losses))
  print(num_correct, n_samples)
  print("Test loss: {:.3f}".format(mean_loss))
  print("Test accuracy: {:.3f}".format(test_acc))
  return mean_loss, test_acc

In [None]:
# Evaluate the trained GRU model on the held-out split.
error_retweet_predictor(net, batch_size = 256, device = 'cuda', GRU = True) 

1619 6400
Test loss: 1469.236
Test accuracy: 0.253


### Visualize RetweetNet

In [None]:
!pip install torchviz

Collecting torchviz
  Downloading torchviz-0.0.2.tar.gz (4.9 kB)
Building wheels for collected packages: torchviz
  Building wheel for torchviz (setup.py) ... [?25l[?25hdone
  Created wheel for torchviz: filename=torchviz-0.0.2-py3-none-any.whl size=4150 sha256=0108fda8d4a39f8e36420853eb1fcd356cf1ac971fcd7e5b8c065ef75d9066f5
  Stored in directory: /root/.cache/pip/wheels/04/38/f5/dc4f85c3909051823df49901e72015d2d750bd26b086480ec2
Successfully built torchviz
Installing collected packages: torchviz
Successfully installed torchviz-0.0.2


In [None]:
# Dummy batch (256 sequences of 20 token ids) used only to draw the network.
dummy_x = torch.zeros((256, 20), dtype=torch.int64, device='cuda')
# Same number of metadata columns as the real metadata frame.
dummy_meta = torch.zeros((256, X_train_meta.shape[1]), dtype=torch.int64, device='cuda')
# Let the model size its own initial state: `net` is a 2-layer bidirectional
# GRU, so the state needs 4 leading slots, not the 2 that were hard-coded.
dummy_hidden = net.init_hidden(256)

In [None]:
from torchviz import make_dot

# Forward pass on the dummy batch; the result is the (prediction, hidden) pair.
yhat = net(dummy_x, dummy_meta, dummy_hidden)

# Draw the autograd graph with parameter names and write it to 'something.png'.
named_params = dict(net.named_parameters())
autograd_graph = make_dot(yhat, params = named_params, show_attrs=True, show_saved=True)
autograd_graph.render('something', format = 'png')

In [None]:
!pip install hiddenlayer

Collecting hiddenlayer
  Downloading hiddenlayer-0.3-py3-none-any.whl (19 kB)
Installing collected packages: hiddenlayer
Successfully installed hiddenlayer-0.3


In [None]:
import hiddenlayer as hl

# Graph transforms passed to build_graph below.
transforms = [ hl.transforms.Prune('Constant') ] # Removes Constant nodes from graph.

# Build the layer graph of `net` from the dummy inputs.
graph = hl.build_graph(net, (dummy_x, dummy_meta, dummy_hidden), transforms=transforms)
# Use a copy of the built-in 'blue' theme.
graph.theme = hl.graph.THEMES['blue'].copy()
# Save the drawing as a PNG under the name 'rnn_hiddenlayer'.
graph.save('rnn_hiddenlayer', format='png')