In [34]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from tqdm import tqdm


%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

RANDOM_SEED = 23
np.random.seed(RANDOM_SEED)

torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [35]:
# Show which compute device was selected in the configuration cell.
device

device(type='cuda', index=0)

In [36]:
# Stance label names; the position of a name is the index of its one-hot column.
classes_ = np.array(['FAVOR', 'AGAINST', 'NONE'])
# Label name -> one-hot integer vector (row i of the identity matrix for label i).
classes = {
    str(label): one_hot
    for label, one_hot in zip(classes_, np.eye(len(classes_), dtype=int))
}

In [37]:
def _read_stance_file(path, target):
    """Read one tab-separated stance file and keep the rows about `target`.

    Each kept row is expected to have at least four tab-separated fields:
    id, target, tweet, stance. The literal '#SemST' is removed from the line
    before splitting.

    Returns:
        (tweets, labels, longest) where `tweets` is a list of str, `labels`
        is the matching list of values looked up in the module-level
        `classes` mapping, and `longest` is the maximum tweet length in
        characters (0 when nothing matched).

    Raises:
        KeyError: if a matching row carries a stance not present in `classes`.
    """
    tweets, labels, longest = [], [], 0
    with open(path, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.replace('#SemST', '').strip().split('\t')
            # Blank or truncated rows used to raise IndexError; skip them.
            if len(fields) < 4:
                continue
            # Header row, compared after stripping for both splits.
            if fields[0].strip() == 'ID':
                continue
            # Substring match, so a short target also matches longer names.
            if target not in fields[1].strip():
                continue
            tweet = fields[2]
            longest = max(longest, len(tweet))
            tweets.append(tweet)
            labels.append(classes[fields[3].strip()])
    return tweets, labels, longest


def train_and_test(train_file, test_file, target):
    """Load the train and test tweets for one stance target.

    Args:
        train_file: path to the tab-separated training file.
        test_file: path to the tab-separated test file.
        target: target string; rows whose target column contains it are kept.

    Returns:
        A 6-tuple ``(x_train, y_train, x_test, y_test, sentence_maxlen, pp)``:
        tweet texts and label vectors for each split, the longest tweet
        length in characters across both splits, and the number of training
        examples.
    """
    x_train, y_train, train_longest = _read_stance_file(train_file, target)
    x_test, y_test, test_longest = _read_stance_file(test_file, target)

    sentence_maxlen = max(train_longest, test_longest)
    pp = len(x_train)
    return x_train, y_train, x_test, y_test, sentence_maxlen, pp





In [38]:
# Corpus stored under stance_mohammed/ (names suffixed _m).
train_data_file_m = '/data/parush/stance_mohammed/train.txt'
test_data_file_m = '/data/parush/stance_mohammed/test.txt'
TARGETS_m = [
    'Atheism',
    'Climate Change is a Real Concern',
    'Feminist Movement',
    'Hillary Clinton',
    'Legalization of Abortion',
]

# Corpus stored under SomasundaranWiebe-politicalDebates/ (names suffixed _s).
train_data_file_s = '/data/parush/SomasundaranWiebe-politicalDebates/train.txt'
test_data_file_s = '/data/parush/SomasundaranWiebe-politicalDebates/test.txt'
TARGETS_s = [
    'god',
    'healthcare',
    'guns',
    'gayRights',
    'abortion',
    'creation',
]

# Corpus stored under Data_MPCHI/ (names suffixed _q).
train_data_file_q = '/data/parush/Data_MPCHI/train.txt'
test_data_file_q = '/data/parush/Data_MPCHI/test.txt'
TARGETS_q = [
    'Are E-Cigarettes safe?',
    'Does MMR Vaccine lead to autism in children?',
    'Does Sunlight exposure lead to skin cancer?',
    'Does Vitamin C prevent common cold?',
    'Should women take HRT post-menopause?',
]


In [39]:
# Load the split for TARGETS_m[4] ('Legalization of Abortion'); the training texts and labels are discarded, only the test split is kept.


_, _, test_texts, test_labels, sen_maxlen, pp = train_and_test(train_data_file_m, test_data_file_m, TARGETS_m[4])




In [40]:
# One row per test tweet: raw text in TITLE, one-hot stance vector in target_list.
df_test = pd.DataFrame({
    'TITLE': test_texts,
    'target_list': test_labels,
})

In [41]:
# Display the test frame (columns TITLE and target_list) for a visual sanity check.
df_test

Unnamed: 0,TITLE,target_list
0,Need a ProLife R.E. Agent? - Support a ProLife...,"[0, 1, 0]"
1,Where is the childcare program @joanburton whi...,"[0, 1, 0]"
2,I get several requests with petitions to save ...,"[0, 1, 0]"
3,"we must always see others as Christ sees us,we...","[0, 1, 0]"
4,PRAYERS FOR BABIES Urgent prayer one in Lexing...,"[0, 1, 0]"
...,...,...
275,@MetalheadMonty @tom_six I followed him before...,"[0, 0, 1]"
276,"For he who avenges blood remembers, he does no...","[0, 1, 0]"
277,Life is sacred on all levels. Abortion does no...,"[0, 1, 0]"
278,"@ravensymone U refer to ""WE"" which =""YOU"" & a ...","[0, 1, 0]"


In [42]:
# Sequence length handed to the dataset/tokenizer as max_length.
MAX_LEN = 128

# Batch sizes; VALID_BATCH_SIZE is the one used by the test loader.
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32

# Training settings (not consumed by the evaluation cells visible here).
EPOCHS = 4
LEARNING_RATE = 1e-5

# WordPiece tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [43]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text row per access.

    Args:
        dataframe: frame with a 'TITLE' column (text) and a 'target_list'
            column (per-row label vector).
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            'input_ids', 'attention_mask' and 'token_type_ids'.
        max_len: value forwarded to the tokenizer as ``max_length``; inputs are
            padded/truncated to it by the tokenizer.

    Each item is a dict with keys 'ids', 'mask', 'token_type_ids' (long
    tensors) and 'targets' (float tensor).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.title = dataframe['TITLE']
        self.targets = self.data.target_list
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.title)

    def __getitem__(self, index):
        """Tokenize and return the row at *position* ``index``."""
        # Use positional .iloc rather than label lookup: samplers hand out
        # positions 0..len-1, and the frame's index labels need not match them
        # (e.g. after filtering or a train/test split without reset_index).
        title = str(self.title.iloc[index])
        # Collapse any run of whitespace (tabs, newlines, repeats) to one space.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,  # no second segment
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [44]:

# Wrap the test frame so each row is tokenized (with max_length=MAX_LEN) when it is accessed.
test_set = CustomDataset(df_test,  tokenizer, MAX_LEN)

In [45]:
# DataLoader keyword arguments; shuffling is enabled only for the training configuration.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

# No training loader is built here; only the test split is evaluated.
test_loader = DataLoader(test_set, **test_params)

In [46]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a 768 -> 3 linear head.

    The attribute names ``l1``/``l2``/``l3`` become prefixes of the
    state_dict keys, so they must stay as they are for saved checkpoints
    to load.
    """

    def __init__(self):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(p=0.3)
        self.l3 = torch.nn.Linear(in_features=768, out_features=3)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw (pre-activation) 3-way scores for a batch."""
        encoder_outputs = self.l1(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        # With return_dict=False the encoder returns a 2-tuple; only the
        # second element is fed to the head.
        _, pooled = encoder_outputs
        return self.l3(self.l2(pooled))

# Build the classifier (loads the pretrained 'bert-base-uncased' encoder) and move it to the selected device.
model = BERTClass()
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [47]:
# Restore the fine-tuned weights. map_location remaps the stored tensors onto
# the active device, so a checkpoint saved on a GPU also loads when `device`
# has fallen back to CPU.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(
    torch.load(
        '/home/p/parush/bert_data/best_model_trained_on_abortion.pt',
        map_location=device,
    )['state_dict']
)

<All keys matched successfully>

In [48]:
# Inference mode: disables dropout; no_grad below skips autograd bookkeeping.
model.eval()

test_targets = []
test_outputs = []
with torch.no_grad():
    for data in test_loader:
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)

        # Collect one row per example as plain Python lists.
        test_targets += targets.cpu().tolist()
        test_outputs += torch.sigmoid(outputs).cpu().tolist()

# Each class is thresholded independently at 0.5.
# NOTE(review): with independent thresholds a tweet can end up with zero or
# several predicted stances although the targets are one-hot; confirm this
# matches how the checkpoint was trained, otherwise consider an argmax.
test_preds = (np.asarray(test_outputs) > 0.5).astype(int)

# labels=[0, 1] limits the report to the first two label columns
# (FAVOR and AGAINST in the `classes` mapping); NONE is left out.
print(classification_report(test_targets, test_preds, labels=[0, 1], digits=4))

              precision    recall  f1-score   support

           0     0.4444    0.4348    0.4396        46
           1     0.7531    0.6455    0.6952       189

   micro avg     0.6860    0.6043    0.6425       235
   macro avg     0.5988    0.5401    0.5674       235
weighted avg     0.6927    0.6043    0.6451       235
 samples avg     0.4946    0.5071    0.4988       235



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
