In [20]:
import torch
from covid import RobertaClassifier, SentimentData
import pandas as pd
from transformers import RobertaTokenizer
from torch.utils.data import DataLoader
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [21]:
# Name of the dataset file this run scores; it is appended to the
# 'csv/no_txt_' and 'csv/analysis_' prefixes in later cells.
# Alternative input: 'corona_news_sep-mar2022.csv'
df_name = "NS_sep-dec2020.csv"

In [22]:
%%time
# Read the CSV selected by `df_name` and report how many distinct
# article ids it contains.
input_csv = 'csv/no_txt_' + df_name
df = pd.read_csv(input_csv)
n_articles = df['article_id'].nunique()
print(f'Number of unique articles: {n_articles}')

Number of unique articles: 34725
CPU times: user 2.28 s, sys: 200 ms, total: 2.48 s
Wall time: 2.48 s


### Reformatting and tokenizing

In [23]:
df.columns[:2] # Inspect the two leading columns: only the first ('Unnamed: 0') is noise and is dropped in the next cell; article_id is kept and used later.

Index(['Unnamed: 0', 'article_id'], dtype='object')

In [24]:
# Remove the unnamed leading column shown by the previous cell. It is dropped
# by name (not by position) so that re-running this cell cannot strip a real
# column such as article_id.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df['date'] = pd.to_datetime(df['date'])
# Discard incomplete rows first and renumber afterwards, so the resulting
# index is contiguous (0..n-1) instead of keeping gaps where rows were removed.
df = df.dropna().reset_index(drop=True)
# Displayed as the cell output: mean of the text_len column.
np.mean(df.text_len)

4651.042840185605

In [25]:
# Choose the compute device: CUDA when it is usable, otherwise the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if use_gpu:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

# Build the classifier on the chosen device and restore the saved weights,
# remapping the checkpoint tensors onto that same device.
checkpoint_path = '/home/ec2-user/SageMaker/pre_trained_model/covid_checkpoint (1).pth'
finetuned_rob = RobertaClassifier().to(device)
finetuned_rob.load_state_dict(torch.load(checkpoint_path, map_location=device))

# Load the tokenizer from its local directory.
tokenizer = RobertaTokenizer.from_pretrained("/home/ec2-user/SageMaker/pre_trained_tokenizer")

There are 1 GPU(s) available.
Device name: Tesla K80




In [None]:
def _count_tokens(pair_text):
    """Return the number of token ids produced for one text, special tokens included."""
    return len(tokenizer.encode(pair_text, add_special_tokens=True))

# Store the tokenized length of every row's `pairs` value (with a progress bar).
df['len_tokenized'] = df['pairs'].progress_apply(_count_tokens)

  1%|▏         | 2719/203874 [00:01<01:58, 1691.82it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (707 > 512). Running this sequence through the model will result in indexing errors
 55%|█████▌    | 112576/203874 [00:56<00:43, 2087.59it/s]

In [None]:
# Token counts only exist once the tokenizer has run, so over-long rows can
# only be filtered out at this point.
# Optional: save the ids of the articles that are about to be dropped.
# bad_articles = df[df.len_tokenized >= 500].article_id
# bad_articles.to_csv('csv/bad_articles_'+ df_name)
is_short_enough = df['len_tokenized'] < 500
df = df[is_short_enough]

In [None]:
# Largest tokenized length among the remaining rows; run_model passes this
# value to SentimentData as max_len.
MAX_LEN = df.len_tokenized.max()
print(f'Max length of tokenized pair sentences: {MAX_LEN}')
# The mean of a boolean mask is the share of True rows. It is formatted as a
# real percentage because the message announces a percentage (the raw 0-1
# fraction was printed before).
share_over_300 = (df.len_tokenized > 300).mean()
print(f'Percentage of sentences with a tokenized length greater than 300 {share_over_300:.2%}.')

In [None]:
# Presumably the ids of articles already identified as duplicates -- TODO confirm the file's contents.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use it with a trusted 'duplicates.npy'.
check_duplicates = np.load(file='duplicates.npy', allow_pickle=True)
# Vectorised count of flagged rows (replaces Python's builtin sum iterating the Series).
is_duplicate = df.article_id.isin(check_duplicates)
# NOTE(review): this counts rows whose article_id is flagged, not distinct
# article ids -- confirm which of the two the message is meant to report.
print(f'Number of duplicate articles: {is_duplicate.sum()}') # Checking for duplicate articles.
# df = df[~df.article_id.isin(check_duplicates)] # ONLY RUN THIS FOR TIMES OTHER THAN THE FIRST RUN.

**Optionally restricting the dataframe to a date window to speed up model runtime (the full timeline is used below)**

In [None]:
# Optional date windows for a faster run (uncomment one to restrict the timeline):
# subset_df = df[(df.date < pd.Timestamp(2021,1,1))]
# subset_df = df[(df.date < pd.Timestamp(2021,6,1))]
# second_subset_df = df[(df.date >= pd.Timestamp(2021,6,1))]
# NOTE(review): a window previously listed here (>= 2022-01-01 and <= 2021-12-31) can never match a row.

# Whole timeline: subset_df is the very same object as df, so renumbering the
# rows once covers both names.
subset_df = df
subset_df.reset_index(drop=True, inplace=True)
for frame in (subset_df, df):
    print(frame.article_id.nunique())

## **Predicting with RoBERTa**

In [None]:
def rob_predict(model, testing_loader):
    """Run inference over a loader and collect the top class and its probability.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``. Its output
            is soft-maxed over the last axis and reduced with a max over
            axis 1, so it is assumed to be (batch, n_classes) -- TODO confirm.
        testing_loader: Iterable of batches; every batch is indexed with the
            keys ``'ids'``, ``'mask'`` and ``'token_type_ids'``.

    Returns:
        tuple: ``(prediction_list, score_list)`` -- per-sample arg-max class
        indices and the matching soft-max probabilities, as NumPy scalars in
        loader order.

    Note:
        Batches are moved to the module-level ``device``; the model is expected
        to live on that device already.
    """
    model.eval()
    prediction_list = []
    score_list = []
    # Gradients are never needed for inference; disabling them saves memory.
    with torch.no_grad():
        for data in tqdm(testing_loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            probabilities = F.softmax(model(ids, mask, token_type_ids), dim=-1)
            # No `.data` detour: under no_grad the tensor carries no graph, and
            # `.data` is a legacy accessor that bypasses autograd's checks.
            score, prediction = torch.max(probabilities, dim=1)

            prediction_list.extend(prediction.cpu().numpy())
            score_list.extend(score.cpu().numpy())
    return prediction_list, score_list
    
def run_model(dataframe, max_len=None, batch_size=10):
    """Score every row of ``dataframe`` and return it with prediction columns.

    Args:
        dataframe: Input rows; handed to ``SentimentData`` together with the
            module-level ``tokenizer``.
        max_len: Length passed to ``SentimentData`` as ``max_len``. Defaults to
            the module-level ``MAX_LEN`` when omitted.
        batch_size: Number of rows per inference batch (default 10).

    Returns:
        pandas.DataFrame: the input rows, renumbered 0..n-1, with two extra
        columns, ``prediction`` and ``score``, taken from ``rob_predict``.
    """
    if max_len is None:
        max_len = MAX_LEN
    # concat(axis=1) aligns on index labels. The predictions are numbered
    # 0..n-1, so the input must be too; otherwise rows are paired with the
    # wrong prediction and NaN-padded rows appear.
    frame = dataframe.reset_index(drop=True)
    testing_set = SentimentData(frame, tokenizer, max_len=max_len)
    # The loader is left unshuffled so predictions come back in row order.
    testing_loader = DataLoader(testing_set, batch_size=batch_size, num_workers=0)
    prediction_list, score_list = rob_predict(finetuned_rob, testing_loader)
    interm = pd.DataFrame({'prediction': prediction_list, 'score': score_list})
    return pd.concat((frame, interm), axis=1)

In [None]:
%%time
# Score the selected frame with the fine-tuned classifier.
predicted_df = run_model(subset_df)

In [None]:
# Write the scored frame under the name selected by `df_name`.
# (Commented-out overrides previously listed here: 'B5_sep_dec2020.csv',
# 'B5_jan_mar2022.csv', 'Health' + df_name, 'B5_sep-dec2020.csv',
# 'B5_jan-mar2022.csv'.)
analysis_csv = 'csv/analysis_' + df_name
predicted_df.to_csv(analysis_csv)
# Cell output: number of rows per predicted class.
predicted_df['prediction'].value_counts()

In [None]:
# Echo the path of the analysis CSV for the current `df_name`.
print(f'csv/analysis_{df_name}')