In [1]:
import torch
from covid import RobertaClassifier, SentimentData
from roberta_prediction import tokenize_covid_dataframe, regular_news_tokenize
import pandas as pd
from transformers import RobertaTokenizer
from torch.utils.data import DataLoader
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
import multiprocessing as mp
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


There are 1 GPU(s) available.
Device name: Tesla K80


In [3]:
# Dataset selector: keep exactly one `df_name` assignment active. The same name
# is reused further down to build the output path 'csv/analysis_' + df_name.
df_name = 'can-apr2021-apr2022.csv'
# df_name = 'us-mar-apr2022-headline.csv'
# df_name = 'us_news_commentary.csv'

# Read csv/no_txt_<df_name>: the `date` column is parsed as datetimes and the
# first CSV column becomes the frame's index (so the index is whatever was
# stored in the file, not necessarily 0..n-1).
df = pd.read_csv('csv/no_txt_' + df_name, parse_dates=['date'], index_col=[0]) 
print(f'Number of unique articles: {df.article_id.nunique()}')

Number of unique articles: 164481


**Regular News**

In [None]:
# Builds the frame whose `prediction` column is saved and counted in the next cell.
# NOTE(review): regular_news_tokenize is imported from roberta_prediction and is
# not visible here — presumably it tokenizes and classifies the articles; confirm.
predicted_df = regular_news_tokenize(df)

In [None]:
# Write the predictions to csv/analysis_<df_name>, then display the number of
# rows per predicted class.
# NOTE(review): the Covid News section further down writes to this same path,
# so running both sections in one session overwrites this file.
predicted_df.to_csv('csv/analysis_' + df_name)
predicted_df.prediction.value_counts()

**Covid News**

In [None]:
# Unpacks the four objects the prediction cells below rely on as notebook
# globals: the frame to score, the device tensors are moved to, the tokenizer
# handed to SentimentData, and the model passed to rob_predict.
# NOTE(review): tokenize_covid_dataframe is imported from roberta_prediction and
# is not visible here; headlines=False presumably selects article text rather
# than headlines — confirm.
subset_df, device, tokenizer, finetuned_rob  = tokenize_covid_dataframe(df, headlines=False)

## **Predicting with Roberta**

In [9]:
def rob_predict(model, testing_loader, device=None):
    """Run inference over ``testing_loader`` and collect the top class and its probability.

    Parameters
    ----------
    model
        Module called as ``model(ids, mask, token_type_ids)``. Its output is
        passed through a softmax over the last dimension, so it is treated as
        per-class logits.
    testing_loader
        Iterable of batches; each batch is indexed with the keys ``'ids'``,
        ``'mask'`` and ``'token_type_ids'`` and every value must be a tensor.
    device : torch.device or str, optional
        Device the batch tensors are moved to. When omitted, the device of the
        model's first parameter is used (CPU if the model has no parameters),
        so the function does not depend on a notebook-level ``device`` variable.

    Returns
    -------
    tuple of (list, list)
        ``prediction_list`` holds the index of the highest-probability class
        for every sample and ``score_list`` the matching softmax probability,
        both in loader order.
    """
    if device is None:
        # Follow the model rather than a global: inputs must live on the same
        # device as the weights for the forward pass to succeed.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()  # switch off dropout / batch-norm updates for inference
    prediction_list = []
    score_list = []
    with torch.no_grad():  # no autograd bookkeeping needed while predicting
        for data in tqdm(testing_loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            probabilities = F.softmax(model(ids, mask, token_type_ids), dim=-1)
            # torch.max over dim 1 yields (max value, argmax) per sample.
            score, prediction = torch.max(probabilities, dim=1)
            prediction_list.extend(prediction.cpu().numpy())
            score_list.extend(score.cpu().numpy())
    return prediction_list, score_list
    
def run_model(dataframe, max_len=500, batch_size=15):
    """Score ``dataframe`` with the fine-tuned model and append the results.

    Relies on the notebook-level ``tokenizer`` and ``finetuned_rob`` objects.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows to score; handed unchanged to ``SentimentData``.
    max_len : int, optional
        Forwarded to ``SentimentData`` as ``max_len`` (default 500, the value
        previously hard-coded). Presumably the maximum token length — confirm
        against ``SentimentData``.
    batch_size : int, optional
        Batch size of the ``DataLoader`` (default 15, the value previously
        hard-coded).

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` with two extra columns, ``prediction`` and ``score``,
        one value per input row.
    """
    testing_set = SentimentData(dataframe, tokenizer, max_len=max_len)
    testing_loader = DataLoader(testing_set, batch_size=batch_size, num_workers=0)
    prediction_list, score_list = rob_predict(finetuned_rob, testing_loader)
    # Build the result on the input's own index. pd.concat(axis=1) aligns rows
    # by index label, so a fresh 0..n-1 index would mis-pair predictions (and
    # add NaN rows) whenever `dataframe` is a filtered subset or otherwise
    # carries a non-default index. With a default RangeIndex the output is
    # identical to the previous behaviour.
    interm = pd.DataFrame(
        {'prediction': prediction_list, 'score': score_list},
        index=dataframe.index,
    )
    test_df = pd.concat((dataframe, interm), axis=1)
    return test_df

In [None]:
%%time
# Score every row of subset_df. Long-running: the recorded progress bar below
# estimated roughly 46 hours at about 3 s per batch.
predicted_df = run_model(dataframe=subset_df)

  1%|          | 334/53967 [17:11<46:01:19,  3.09s/it]

In [18]:
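# Optional override: uncomment to save the results under a different dataset name.
# NOTE: the recorded output of the final cell shows this override was active when
# the results were saved, although the data-loading cell sets 'can-apr2021-apr2022.csv'.
# df_name = 'can-apr2020-mar2021.csv'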

In [19]:
# Write the predictions to csv/analysis_<df_name>, then display the number of
# rows per predicted class.
# NOTE(review): this is the same output path used by the Regular News save cell;
# whichever section runs last overwrites the file.
predicted_df.to_csv('csv/analysis_' + df_name)
predicted_df.prediction.value_counts()

0    487933
1    321568
Name: prediction, dtype: int64

In [20]:
# Echo the path built from the current df_name, i.e. where the cell above wrote
# its CSV if df_name has not changed since.
print('csv/analysis_' + df_name)

csv/analysis_can-apr2020-mar2021.csv
