In [1]:
import torch

import torch.nn as nn
from torch.utils import data
from transformers import RobertaTokenizer, RobertaModel

# Hyper-parameters of the run whose checkpoint is loaded further down:
# bs64_lr3.0e-02_drop0_nlayer3_hiddim256
# In the cells shown here only 'batch_size', 'hidden_dim', 'n_layers',
# 'bidirectional' and 'dropout' are read; the other keys are kept for reference.

args = {
    'batch_size': 64,
    'lr': 3.0e-02,  # was `.0e-02` (== 0.0), which contradicts the run name above
    'hidden_dim': 256,
    'n_layers': 3,
    'bidirectional': True,
    'dropout': 0,
    'n_epochs': None,
    'b1': None,
    'b2': None,
    'weight_decay': None,
    'lr_decay': 0.7
}

In [2]:
# Load the pretrained 'roberta-base' tokenizer; it is used below to turn text into token ids.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [3]:
import pandas as pd

# Read the tweet table. The path is relative, so it resolves against the
# notebook's working directory.
df = pd.read_csv("../data/hydrated_data_ieee/final_data/shallow_processed.csv")
df

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweet_id,text,retweet_count,favorite_count,possibly_sensitive,withheld_in_countries,user.id,user.name,...,user.listed_count,user.created_at,user.favourites_count,user.statuses_count,user.default_profile,user.default_profile_image,user.withheld_in_countries,date,sentiment_score,expanded_text
0,0,0,1.240728e+18,holy shit I hate every person in this video,70104,0,,,2.022756e+08,Typreme Kicks.👟❤️📈🤙🏽,...,9,Wed Oct 13 17:48:24 +0000 2010,89995,63405,True,False,,Mar 19,-0.500000,holy shit I hate every person in this video
1,1,1,1.240728e+18,There you have it.\r\n\r\nProof that this is a...,1662,0,,,3.524761e+08,Jake,...,3,Wed Aug 10 17:37:59 +0000 2011,9823,21872,False,False,,Mar 19,0.000000,There you have it.\r\n\r\nProof that this is a...
2,2,2,1.240728e+18,"Among other things, what this shows is they're...",6624,0,,,9.281008e+17,Fifty Pound Head,...,0,Wed Nov 08 03:24:12 +0000 2017,1651,1617,False,False,,Mar 19,-0.062500,"Among other things, what this shows is they ar..."
3,3,3,1.240729e+18,this incompetent racist asshole,14445,0,False,,1.585784e+07,Brad,...,2,Fri Aug 15 00:33:06 +0000 2008,8538,11553,False,False,,Mar 19,-0.350000,this incompetent racist asshole
4,4,4,1.240729e+18,"Among other things, what this shows is they're...",6624,0,,,1.001945e+18,Pamela ( No DMs),...,3,Wed May 30 21:56:16 +0000 2018,190490,99423,True,False,,Mar 19,-0.062500,"Among other things, what this shows is they ar..."
5,5,5,1.240729e+18,[someone] Tacking on Chinese is just redundant...,0,11,,,2.923801e+09,Blumen,...,0,Mon Dec 15 22:41:21 +0000 2014,30498,6876,False,False,,Mar 19,-0.066667,[someone] Tacking on Chinese is just redundant...
6,6,6,1.240729e+18,this is my shocked face that paying big money ...,3,0,,,3.265605e+08,Billy Sierra,...,12,Thu Jun 30 03:31:26 +0000 2011,44343,16186,True,False,,Mar 19,0.000000,this is my shocked face that paying big money ...
7,7,7,1.240729e+18,"Corona please. We are tired, we get it.",29077,0,,,1.171508e+18,jojo,...,0,Tue Sep 10 19:37:59 +0000 2019,11900,5393,True,False,,Mar 19,-0.400000,"Corona please. We are tired, we get it."
8,8,8,1.240729e+18,Interviewing a bottle of hand sanitizer in 2020,54405,0,False,,2.359074e+09,Mó,...,7,Sat Feb 22 18:21:04 +0000 2014,110778,63461,False,False,,Mar 19,-0.500000,Interviewing a bottle of hand sanitizer in 2020
9,9,9,1.240729e+18,This is Generation Z. \r\n\r\nI want to name t...,104047,0,,,2.476930e+09,Destinoo✨,...,1,Sun May 04 13:50:16 +0000 2014,25510,27642,True,False,,Mar 19,0.500000,This is Generation Z. \r\n\r\nI want to name t...


In [4]:
# Encode every "expanded_text" cell into a list of token ids, special tokens included.
# NOTE(review): no truncation is requested and non-string cells (e.g. NaN) are not
# handled here — confirm the column is clean and within the encoder's length limit.
tokenized = df["expanded_text"].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

In [5]:
def get_max_len(tokenized):
    """Return the length of the longest sequence in ``tokenized.values``.

    ``tokenized`` must expose a ``.values`` iterable of sized items
    (e.g. a pandas Series of token-id lists). Returns 0 when it is empty.
    """
    return max((len(seq) for seq in tokenized.values), default=0)

# Right-pad every token-id list with 0 up to the longest sequence so that all
# rows fit in one rectangular tensor.
# NOTE(review): 0 is used as the padding id, and the model's forward() derives its
# attention mask from `text != 0` — check this against tokenizer.pad_token_id.
max_len = get_max_len(tokenized)
padded = torch.tensor([i + [0] * (max_len - len(i)) for i in tokenized.values])

In [6]:
class Dataset(data.Dataset):
    """Minimal PyTorch dataset that serves samples from one indexable container."""

    def __init__(self, x):
        """Keep a reference to ``x``, which must expose ``.shape`` and support indexing."""
        self.x = x

    def __len__(self):
        """Return the number of samples, i.e. the size of the first dimension of ``x``."""
        return self.x.shape[0]

    def __getitem__(self, index):
        """Return the sample stored at position ``index``."""
        return self.x[index]

In [24]:
# Wrap the padded token-id tensor so a DataLoader can batch it.
dataset = Dataset(padded)

In [25]:
# Batch the dataset for inference. shuffle=False and drop_last=False keep one
# prediction per row, in row order; the later positional assignment of the
# predictions back onto `df` relies on that.
dataset_loader = torch.utils.data.DataLoader(dataset,
                                             batch_size=args["batch_size"],
                                             shuffle=False,
                                             drop_last=False)

In [9]:
class BERTGRUSentiment(nn.Module):
    """Sequence classifier: transformer encoder (run without gradients) -> GRU -> linear head.

    Args:
        bert: Encoder module. It is called as ``bert(text, attention_mask=...)`` and
            element ``[0]`` of its return value is used as the per-token embeddings.
        hidden_dim: Hidden size of the GRU.
        output_dim: Number of output logits.
        n_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU runs in both directions.
        dropout: Dropout probability, applied between GRU layers (only when
            ``n_layers >= 2``) and to the final hidden state before the linear head.
    """

    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):

        super().__init__()

        self.bert = bert

        # Take the embedding width from the encoder's config when it exposes
        # ``hidden_size``; otherwise fall back to 768, the value that used to be
        # hard-coded here.
        bert_config = getattr(bert, 'config', None)
        embedding_dim = getattr(bert_config, 'hidden_size', 768)

        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          batch_first=True,
                          # nn.GRU only applies dropout between layers, so it is
                          # forced to 0 for a single-layer GRU.
                          dropout=0 if n_layers < 2 else dropout)

        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return the logits for a batch of token-id sequences.

        Args:
            text: Integer tensor of shape [batch size, sent len].

        Returns:
            Tensor of shape [batch size, out dim] with unnormalised class scores.
        """
        # text = [batch size, sent len]

        # Mask is 1 wherever the token id is non-zero and 0 where it is 0.
        # NOTE(review): this treats id 0 as padding — confirm that matches the
        # tokenizer in use (any real token with id 0 is masked out as well).
        attention_mask = text.masked_fill(text != 0, 1)

        # The encoder is only used as a feature extractor: no gradients flow into it.
        with torch.no_grad():
            embedded = self.bert(text, attention_mask=attention_mask)[0]

        # embedded = [batch size, sent len, emb dim]

        _, hidden = self.rnn(embedded)

        # hidden = [n layers * n directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # Concatenate the last two entries of `hidden` (the two directions of
            # the top layer) along the feature axis.
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])

        # hidden = [batch size, hid dim * n directions]

        output = self.out(hidden)

        # output = [batch size, out dim]

        return output

In [13]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [15]:
# Pretrained 'roberta-base' encoder that the classifier wraps.
bert = RobertaModel.from_pretrained('roberta-base')

# Build the classifier from the hyper-parameters in `args`; the literal 3 is
# passed as `output_dim` (number of output logits). The model is moved to `device`.
model = BERTGRUSentiment(bert,
                         args['hidden_dim'],
                         3,
                         args['n_layers'],
                         args['bidirectional'],
                         args['dropout']).to(device)

In [19]:
# Restore the trained weights. map_location=device lets a checkpoint that was saved
# from a GPU load on a CPU-only machine, matching the CPU fallback used for `device`.
model.load_state_dict(torch.load("../experiments/unprocessed_bs64_lr3.0e-02_drop0_nlayer3_hiddim256/bs64_lr3.0e-02_drop0_nlayer3_hiddim256.pt", map_location=device))

<All keys matched successfully>

In [26]:
from tqdm import tqdm  # fixed: the progress-bar module is `tqdm`, not `tdqm`

softmax = torch.nn.Softmax(dim=1)

def get_prediction(data):
    """Run the model on one batch and return its top class per sample.

    Args:
        data: Batch of token ids, already on the model's device.

    Returns:
        The result of ``torch.max(probabilities, dim=1)``: ``.values`` holds the
        winning softmax probability and ``.indices`` the winning class id.
    """
    pred = model(data)
    pred_prob = softmax(pred)
    pred_hard = torch.max(pred_prob, dim=1)
    return pred_hard

# Inference only: make sure dropout and other train-time behaviour is switched off.
model.eval()

# One torch.max result per batch, in loader order.
result = []

# The loop variable is called `batch` so that it does not rebind `data`, the
# `torch.utils.data` module imported at the top of the notebook.
with torch.no_grad():
    for batch in tqdm(dataset_loader):
        result.append(get_prediction(batch.to(device)))

TypeError: expected Tensor as element 0 in argument 0, but got torch.return_types.max

In [27]:
# Inspect the collected per-batch torch.max results (one list entry per batch).
result

[torch.return_types.max(
 values=tensor([0.4480, 0.6534, 0.7322, 0.5845, 0.7322, 0.5796, 0.7719, 0.5019, 0.8598,
         0.4979, 0.4762, 0.6057, 0.4480, 0.6261, 0.7060, 0.5257, 0.7440, 0.4889,
         0.7741, 0.9649, 0.4480, 0.8802, 0.7696, 0.5387, 0.6247, 0.6773, 0.5128,
         0.7410, 0.7959, 0.5848, 0.5128, 0.6823, 0.4792, 0.9108, 0.4979, 0.5845,
         0.7169, 0.4979, 0.4466, 0.9078, 0.8802, 0.7322, 0.4556, 0.4979, 0.7696,
         0.5173, 0.5128, 0.8539, 0.5183, 0.4686, 0.5015, 0.5151, 0.7696, 0.7540,
         0.6336, 0.5128, 0.8176, 0.5936, 0.7696, 0.5139, 0.5168, 0.5482, 0.8846,
         0.8824], device='cuda:0'),
 indices=tensor([0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,
         1, 2, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
         1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1], device='cuda:0')),
 torch.return_types.max(
 values=tensor([0.4697, 0.8457, 0.7322, 0.6336, 0.9016, 0.9095, 0.6667, 0.6489, 0.7710,
     

In [31]:
# Size of the last batch's index tensor; it may be smaller than batch_size
# because the loader was built with drop_last=False.
result[-1].indices.size()

torch.Size([54])

In [37]:
# Scratch check: build an empty tensor and look at its size.
a = torch.tensor([])
a.size()

torch.Size([0])

In [33]:
# Scratch check of torch.cat along dim 0.
# NOTE(review): the recorded output does not match `a` as defined in the cell
# listed just before this one — it looks like it came from an earlier `a`; confirm.
torch.cat((a, a), 0)

tensor([1, 2, 3, 1, 2, 3])

In [56]:
# Gather the predicted class ids of every batch into one 1-D tensor.
# A single torch.cat over the list avoids re-copying the growing tensor on
# every iteration, which the previous concat-in-a-loop did.
batch_indices = [torch_ret_max.indices for torch_ret_max in result]

if batch_indices:
    predicted_indices = torch.cat(batch_indices, 0)
else:
    # No batches at all: keep the previous result, an empty int64 tensor on `device`.
    predicted_indices = torch.tensor([], dtype=torch.int64).to(device)

In [58]:
import numpy as np

# Bring the predictions to host memory as a NumPy array.
# The guard makes the cell safe to re-run: once the name already holds an array,
# calling .cpu() on it again would fail.
if torch.is_tensor(predicted_indices):
    predicted_indices = predicted_indices.cpu().numpy()

In [62]:
# Sanity check: there must be exactly one prediction per DataFrame row.
len(predicted_indices) == len(df)

True

In [63]:
# Attach the predicted class id to each row. The array is assigned by position,
# so it relies on the predictions being in the same order as the rows of `df`.
# Note that this adds the column to `df` in place.
df["predicted_sentiment_indice"] = predicted_indices

In [64]:
# Display the frame, now including the "predicted_sentiment_indice" column.
df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweet_id,text,retweet_count,favorite_count,possibly_sensitive,withheld_in_countries,user.id,user.name,...,user.created_at,user.favourites_count,user.statuses_count,user.default_profile,user.default_profile_image,user.withheld_in_countries,date,sentiment_score,expanded_text,predicted_sentiment_indice
0,0,0,1.240728e+18,holy shit I hate every person in this video,70104,0,,,2.022756e+08,Typreme Kicks.👟❤️📈🤙🏽,...,Wed Oct 13 17:48:24 +0000 2010,89995,63405,True,False,,Mar 19,-0.500000,holy shit I hate every person in this video,0
1,1,1,1.240728e+18,There you have it.\r\n\r\nProof that this is a...,1662,0,,,3.524761e+08,Jake,...,Wed Aug 10 17:37:59 +0000 2011,9823,21872,False,False,,Mar 19,0.000000,There you have it.\r\n\r\nProof that this is a...,1
2,2,2,1.240728e+18,"Among other things, what this shows is they're...",6624,0,,,9.281008e+17,Fifty Pound Head,...,Wed Nov 08 03:24:12 +0000 2017,1651,1617,False,False,,Mar 19,-0.062500,"Among other things, what this shows is they ar...",1
3,3,3,1.240729e+18,this incompetent racist asshole,14445,0,False,,1.585784e+07,Brad,...,Fri Aug 15 00:33:06 +0000 2008,8538,11553,False,False,,Mar 19,-0.350000,this incompetent racist asshole,1
4,4,4,1.240729e+18,"Among other things, what this shows is they're...",6624,0,,,1.001945e+18,Pamela ( No DMs),...,Wed May 30 21:56:16 +0000 2018,190490,99423,True,False,,Mar 19,-0.062500,"Among other things, what this shows is they ar...",1
5,5,5,1.240729e+18,[someone] Tacking on Chinese is just redundant...,0,11,,,2.923801e+09,Blumen,...,Mon Dec 15 22:41:21 +0000 2014,30498,6876,False,False,,Mar 19,-0.066667,[someone] Tacking on Chinese is just redundant...,0
6,6,6,1.240729e+18,this is my shocked face that paying big money ...,3,0,,,3.265605e+08,Billy Sierra,...,Thu Jun 30 03:31:26 +0000 2011,44343,16186,True,False,,Mar 19,0.000000,this is my shocked face that paying big money ...,0
7,7,7,1.240729e+18,"Corona please. We are tired, we get it.",29077,0,,,1.171508e+18,jojo,...,Tue Sep 10 19:37:59 +0000 2019,11900,5393,True,False,,Mar 19,-0.400000,"Corona please. We are tired, we get it.",1
8,8,8,1.240729e+18,Interviewing a bottle of hand sanitizer in 2020,54405,0,False,,2.359074e+09,Mó,...,Sat Feb 22 18:21:04 +0000 2014,110778,63461,False,False,,Mar 19,-0.500000,Interviewing a bottle of hand sanitizer in 2020,1
9,9,9,1.240729e+18,This is Generation Z. \r\n\r\nI want to name t...,104047,0,,,2.476930e+09,Destinoo✨,...,Sun May 04 13:50:16 +0000 2014,25510,27642,True,False,,Mar 19,0.500000,This is Generation Z. \r\n\r\nI want to name t...,1


In [65]:
# Persist the frame with the prediction column.
# index=False: writing the row index would add yet another "Unnamed: 0"-style
# column the next time this file is read back (the frame already carries
# "Unnamed: 0", "Unnamed: 0.1" and "Unnamed: 0.2" from earlier round trips).
df.to_csv("../data/hydrated_data_ieee/final_data/shallow_processed_with_prediction.csv", index=False)