In [1]:
import torch
from covid import RobertaClassifier, SentimentData
import pandas as pd
from transformers import RobertaTokenizer
from torch.utils.data import DataLoader
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
import multiprocessing as mp
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


There are 1 GPU(s) available.
Device name: Tesla K80


In [3]:
df_name = 'can-apr2021-apr2022.csv'
# df_name = 'us-mar-apr2022-headline.csv'
# df_name = 'us_news_commentary.csv'

df = pd.read_csv('csv/no_txt_' + df_name, parse_dates=['date'], index_col=[0]) 
print(f'Number of unique articles: {df.article_id.nunique()}')

Number of unique articles: 164481


### Reformatting and tokenizing

In [4]:
df.page_num.fillna('None', inplace=True)
assert (df.isna().sum() == 0).all() == True

article_id    0
date          0
publisher     0
title         0
page_num      0
pairs         0
dtype: int64

In [5]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Instantiate Finetuned Classifier
finetuned_rob = RobertaClassifier().to(device)
finetuned_rob.load_state_dict(torch.load('/home/ec2-user/SageMaker/pre_trained_model/covid_checkpoint.pth', map_location=device));
tokenizer = RobertaTokenizer.from_pretrained("/home/ec2-user/SageMaker/pre_trained_tokenizer")


def tokenizer_mp(sent):
    return len(tokenizer.encode(sent, add_special_tokens=True))

There are 1 GPU(s) available.
Device name: Tesla K80




In [6]:
##------------------------------ USE THIS FOR Covid ------------------------------
with mp.Pool(mp.cpu_count()) as pool:
    df['len_tokenized'] = pool.map(tokenizer_mp, df['pairs'])    
    
df = df[(df['len_tokenized'] < 500) & (df['len_tokenized'] > 20)]
df['sentences'] = df['pairs']

Token indices sequence length is longer than the specified maximum sequence length for this model (783 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (600 > 512). Running this sequence through the model will result in indexing errors


In [None]:
##------------------------------ USE THIS FOR TITLES ------------------------------

with mp.Pool(mp.cpu_count()) as pool:
    df['len_tokenized'] = pool.map(tokenizer_mp, df['title'])    
    
# For TITLES:
df = df[df['len_tokenized'] < 500]
df['sentences'] = df['title']

In [None]:
MAX_LEN = df.len_tokenized.max()
print(f'Max length of tokenized pair sentences: {MAX_LEN}')
print(f'Percentage of sentences with a tokenized length greater than 300 {len(df[df.len_tokenized > 300])/len(df)}.')

**Using a subset of the original dataframe to speed up model runtime**

In [8]:
subset_df = df # Using entire timeline.
print(subset_df.article_id.nunique())
print(df.article_id.nunique())
subset_df.reset_index(drop=True, inplace=True)

164116
164116


## **Predicting with Roberta**

In [9]:
def rob_predict(model, testing_loader):
    model.eval()
    prediction_list = []
    score_list = []
    with torch.no_grad(): 
        for data in tqdm(testing_loader):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            outputs = F.softmax(model(ids, mask, token_type_ids), dim=-1)
            score, prediction = torch.max(outputs.data, dim=1)
            prediction_list.extend(prediction.cpu().numpy())
            score_list.extend(score.cpu().numpy())
        return prediction_list , score_list
    
def run_model(dataframe):
    testing_set = SentimentData(dataframe, tokenizer , max_len = 500)
    testing_loader = DataLoader(testing_set, batch_size=15, num_workers=0) 
    prediction_list, score_list = rob_predict(finetuned_rob, testing_loader)
    interm = pd.DataFrame({'prediction': prediction_list,'score': score_list})
    test_df = pd.concat((dataframe, interm),axis=1)
    return test_df

In [None]:
%%time
predicted_df = run_model(dataframe = subset_df)

  1%|          | 334/53967 [17:11<46:01:19,  3.09s/it]

In [18]:
df_name = 'can-apr2020-mar2021.csv'

In [19]:
predicted_df.to_csv('csv/analysis_' + df_name)
predicted_df.prediction.value_counts()

0    487933
1    321568
Name: prediction, dtype: int64

In [20]:
print('csv/analysis_' + df_name)

csv/analysis_can-apr2020-mar2021.csv
