# Introduction

This notebook runs experiments that train a RoBERTa model on different splits of the AG News Corpus dataset. The main aim is to train a model that classifies news articles into one of four categories:
1. World News
2. Sports News
3. Business News
4. Science / Technology News

# Libraries Needed

In [None]:
# !pip install simpletransformers

In [None]:
# Mount Google Drive at /content/drive; every data/model path below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import torch

# NOTE(review): later cells call ClassificationModel / ClassificationArgs, but the
# import below is commented out — re-enable it before running on a fresh kernel.
# from simpletransformers.classification import ClassificationModel, ClassificationArgs
# import pandas as pd
import logging


# Root logger at INFO; the "transformers" logger is raised to WARNING.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
from IPython.display import display
from sklearn.metrics import accuracy_score
import os
import torch
import torch.nn as nn
# NOTE(review): wandb.log(...) is called in later cells, but this import is commented out.
# import wandb
import json

In [None]:
torch.cuda.is_available()

True

# Data

In [None]:
df = pd.read_csv("/content/drive/MyDrive/fsdl_project/train.csv", index_col=False)

In [None]:
test_df = pd.read_csv("/content/drive/MyDrive/fsdl_project/test.csv", index_col=False)

In [None]:
df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


1-World

2-Sports

3-Business 

4-Sci/Tech

In [None]:
# Build the model inputs for both frames: lower-cased "<title> <description>" text
# and zero-based labels (Class Index minus one).
for frame in (df, test_df):
    lowered_title = frame['Title'].str.lower()
    lowered_description = frame['Description'].str.lower()
    frame['text'] = lowered_title.str.cat(lowered_description, sep=" ")
    frame['labels'] = frame['Class Index'].sub(1)

INFO:numexpr.utils:NumExpr defaulting to 2 threads.


In [None]:
display(df)
display(test_df)

Unnamed: 0,Class Index,Title,Description,text,labels
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",wall st. bears claw back into the black (reute...,2
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,carlyle looks toward commercial aerospace (reu...,2
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,oil and economy cloud stocks' outlook (reuters...,2
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,iraq halts oil exports from main southern pipe...,2
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","oil prices soar to all-time record, posing new...",2
...,...,...,...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...,pakistan's musharraf says won't quit as army c...,0
119996,2,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...,renteria signing a top-shelf deal red sox gene...,1
119997,2,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...,saban not going to dolphins yet the miami dolp...,1
119998,2,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...,today's nfl games pittsburgh at ny giants time...,1


Unnamed: 0,Class Index,Title,Description,text,labels
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...,fears for t n pension after talks unions repre...,2
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o...",the race is on: second private team sets launc...,3
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...,ky. company wins grant to study peptides (ap) ...,3
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...,prediction unit helps forecast wildfires (ap) ...,3
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...,calif. aims to limit farm-related smog (ap) ap...,3
...,...,...,...,...,...
7595,1,Around the world,Ukrainian presidential candidate Viktor Yushch...,around the world ukrainian presidential candid...,0
7596,2,Void is filled with Clement,With the supply of attractive pitching options...,void is filled with clement with the supply of...,1
7597,2,Martinez leaves bitter,Like Roger Clemens did almost exactly eight ye...,martinez leaves bitter like roger clemens did ...,1
7598,3,5 of arthritis patients in Singapore take Bext...,SINGAPORE : Doctors in the United States have ...,5 of arthritis patients in singapore take bext...,2


In [None]:
# Seeded 80/20 split of df into training and validation rows, selected by index label.
np.random.seed(100)
train_idx = np.random.choice(df.index, size=int(0.8 * df.shape[0]), replace=False)
valid_idx = set(df.index).difference(train_idx)

# Validation rows are exactly the rows whose index was not drawn for training.
in_train = df.index.isin(train_idx)
train_df = df.loc[in_train]
valid_df = df.loc[~in_train]

In [None]:
train_df.shape[0] + valid_df.shape[0] == df.shape[0]

True

In [None]:
# makedirs with exist_ok=True keeps this cell re-runnable and creates missing parents.
os.makedirs("/content/drive/MyDrive/fsdl_project/data", exist_ok=True)

In [None]:
# makedirs with exist_ok=True keeps this cell re-runnable and creates missing parents.
os.makedirs("/content/drive/MyDrive/fsdl_project/data/baseline", exist_ok=True)

In [None]:
# makedirs with exist_ok=True keeps this cell re-runnable and creates missing parents.
os.makedirs("/content/drive/MyDrive/fsdl_project/data/active_learning", exist_ok=True)

In [None]:
# Persist the three baseline splits as gzip-compressed, pipe-delimited CSV files.
for frame, out_path in (
    (train_df, "/content/drive/MyDrive/fsdl_project/data/baseline/train.csv.gz"),
    (valid_df, "/content/drive/MyDrive/fsdl_project/data/baseline/valid.csv.gz"),
    (test_df, "/content/drive/MyDrive/fsdl_project/data/baseline/test.csv.gz"),
):
    frame.to_csv(out_path, sep="|", index=False, compression="gzip")

In [None]:
# Reload the saved baseline splits (pipe-delimited, gzip inferred from the extension).
train_df, valid_df, test_df = [
    pd.read_csv(split_path, sep="|", index_col=False)
    for split_path in (
        "/content/drive/MyDrive/fsdl_project/data/baseline/train.csv.gz",
        "/content/drive/MyDrive/fsdl_project/data/baseline/valid.csv.gz",
        "/content/drive/MyDrive/fsdl_project/data/baseline/test.csv.gz",
    )
]

In [None]:
train_df.head()

Unnamed: 0,Class Index,Title,Description,text,labels
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...","reuters - short-sellers, wall street's dwindli...",2
1,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,reuters - soaring crude prices plus worries\ab...,2
2,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,reuters - authorities have halted oil export\f...,2
3,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","afp - tearaway world oil prices, toppling reco...",2
4,3,"Stocks End Up, But Near Year Lows (Reuters)",Reuters - Stocks ended slightly higher on Frid...,reuters - stocks ended slightly higher on frid...,2


In [None]:
# Overwrite the model input text with the lower-cased description only, for every split.
for frame in (train_df, valid_df, test_df):
    frame['text'] = frame['Description'].str.lower()

# Model

In [None]:
## Training RoBERTa on full training set to obtain baseline accuracy on test data

# Model configuration: 5 epochs, evaluation during training, early stopping with
# patience 3, and a fixed manual seed.
model_args = ClassificationArgs(
    num_train_epochs=5,
    overwrite_output_dir=True,
    train_batch_size=16,
    max_seq_length=250,
    wandb_project='active_learning_baseline_v2',
    best_model_dir="/content/drive/MyDrive/fsdl_project/model/baseline/best_model/20210418",
    cache_dir="/content/drive/MyDrive/fsdl_project/cache/baseline/20210418",
    eval_batch_size=16,
    evaluate_during_training=True,
    evaluate_during_training_verbose=True,
    manual_seed=100,
    # Absolute path, matching best_model_dir / cache_dir. A relative
    # "content/drive/..." would resolve against the current working directory
    # instead of the mounted Drive.
    output_dir="/content/drive/MyDrive/fsdl_project/output/baseline/20210418",
    use_early_stopping=True,
    early_stopping_patience=3,
)


# Create a ClassificationModel with four output labels, starting from roberta-base.
model = ClassificationModel(
    "roberta", "roberta-base", args=model_args, use_cuda=True, num_labels=4,
)

In [None]:
model.train_model(train_df=train_df, eval_df=valid_df, accuracy=accuracy_score)

In [None]:
## Loading model for inference
# Load from the directory the training run was configured to write its best
# checkpoint to (model_args.best_model_dir) rather than the project root.
model = ClassificationModel(
    "roberta", model_args.best_model_dir,
)

In [None]:
valid_result, valid_model_outputs, valid_wrong_predictions = model.eval_model(valid_df, accuracy=accuracy_score)

In [None]:
train_result, train_model_outputs, train_wrong_predictions = model.eval_model(train_df, accuracy=accuracy_score)

In [None]:
sf = nn.Softmax(dim=1)

In [None]:
# Recompute validation accuracy from the raw outputs returned by eval_model(valid_df):
# softmax -> argmax gives the predicted class, compared against the true labels.
np.mean(torch.argmax(sf(torch.tensor(valid_model_outputs)), dim=1).numpy() == valid_df['labels'].values.ravel())

0.941625

In [None]:
# Log the training-set accuracy from eval_model(train_df); the 'accuracy' key comes
# from the accuracy=accuracy_score metric passed to eval_model.
wandb.log({'best_train_accuracy': train_result['accuracy']})

In [None]:
test_df.head()

Unnamed: 0,Class Index,Title,Description,text,labels
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...,fears for t n pension after talks unions repre...,2
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o...",the race is on: second private team sets launc...,3
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...,ky. company wins grant to study peptides (ap) ...,3
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...,prediction unit helps forecast wildfires (ap) ...,3
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...,calif. aims to limit farm-related smog (ap) ap...,3


In [None]:

test_result, test_model_outputs, test_wrong_predictions = model.eval_model(test_df, accuracy = accuracy_score)

In [None]:
log_to_wandb

{'test_accuracy': 0.9388157894736842,
 'test_eval_loss': 0.24408445681959978,
 'test_mcc': 0.9184682069651259,
 'train_accuracy': 0.96821875,
 'train_eval_loss': 0.11901225684736952,
 'train_mcc': 0.9576493149383133,
 'valid_eval_loss': 0.22651522111527933,
 'valid_mcc': 0.9222124561368541}

In [None]:
# Merge the per-split evaluation metrics into one flat dict, prefixing each key
# with the split name (test_*, train_*, valid_*).
log_to_wandb = {f'test_{key}': item for key, item in test_result.items()}
log_to_wandb.update({f'train_{key}': item for key, item in train_result.items()})
# valid_result is the dict returned by eval_model(valid_df).
log_to_wandb.update({f'valid_{key}': item for key, item in valid_result.items()})

In [None]:
import json

In [None]:
import os

In [None]:
# makedirs with exist_ok=True keeps this cell re-runnable and creates missing parents.
os.makedirs("/content/drive/MyDrive/fsdl_project/baseline", exist_ok=True)

In [None]:
# Write the collected baseline metrics to Drive as 4-space-indented JSON.
with open("/content/drive/MyDrive/fsdl_project/baseline/exp_stats.json", 'w') as stats_file:
    stats_file.write(json.dumps(log_to_wandb, indent=4))

In [None]:
wandb.log(log_to_wandb)

In [None]:
# Draw a 10% subsample of df. random_state pins the draw so the subsequent
# active-learning split is reproducible across kernel restarts.
train_df = df.sample(int(df.shape[0]*0.1), random_state=100)

In [None]:
train_df.head()

Unnamed: 0,Class Index,Title,Description
40760,1,"Saudi Troops, Gunmen Clash in Riyadh","Saudi security forces, battling a wave of terr..."
85670,1,Rebels Kill 45 in Attacks in Iraq #39;s Baquba,Insurgent attacks and clashes killed 45 people...
114608,1,"Arab press roundup: December 13, 2004",Arab newspaper highlighted and commented on PL...
91544,2,NCAA Wins Right to Limit Tournaments,New Mexico #39;s Mark Walters (5) is almost tr...
113198,4,"Netflix CEO Rates Blockbuster, Amazon Threats ...",Reuters - Netflix Inc chief\executive Reed Has...


Training on reduced dataset for active learning

In [None]:
# Seeded split of train_df: 30% of the rows form the initial labelled training set,
# the remainder is the pool still to be annotated.
np.random.seed(100)
train_al_idx = np.random.choice(train_df.index, size=int(0.3 * train_df.shape[0]), replace=False)
annotate_idx = list(set(train_df.index) - set(train_al_idx))

# Rows not drawn into the training set are exactly the annotation pool.
in_train_al = train_df.index.isin(train_al_idx)
train_df_al = train_df.loc[in_train_al]
annotate_df = train_df.loc[~in_train_al]

In [None]:
display(set(annotate_idx).intersection(train_al_idx))
display(annotate_df.shape[0] + train_df_al.shape[0] == train_df.shape[0])

set()

True

In [None]:
# Keep the original row label as an explicit column. assign() returns a new frame,
# which avoids writing into a slice of train_df (SettingWithCopyWarning).
annotate_df = annotate_df.assign(idx=annotate_df.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Rows whose text contains the literal spreadsheet error marker "#name?".
# regex=False matters: as a regex, "?" would make the "e" optional and over-match.
# na=False treats missing text as a non-match so the mask is purely boolean.
annotate_df[annotate_df['text'].str.contains("#name?", regex=False, na=False)].sort_values("text")

Unnamed: 0,Class Index,Title,Description,text,labels,idx
14245,2,Sabres agree to terms with 2003 first-round pi...,#NAME?,#name?,1,14245
18296,2,Top of 3rd,#NAME?,#name?,1,18296
19109,2,"Blues re-sign D Backman, four others",#NAME?,#name?,1,19109
22140,2,Wild re-sign D Schultz,#NAME?,#name?,1,22140
23174,2,Predators re-sign D Zidlicky,#NAME?,#name?,1,23174
36665,2,"- UMPIRES: Home,Andy Fletcher; First, Tim Welk...",#NAME?,#name?,1,36665
80894,1,Lynching of agents puts Mexico focus on vigila...,#NAME?,#name?,0,80894


In [None]:
annotate_df.loc[annotate_df.idx == 9566, 'text'].values[0]

'coach joins the s p 500, and others stand to benefit from the leather in the weather.'

In [None]:
# NOTE(review): str.contains() is called without a pattern argument, so this cell
# cannot run as written — TODO supply the intended search pattern or remove the cell.
annotate_df[annotate_df['text'].str.contains()]

In [None]:
## Loading all data to get accuracy scores on them, logits, and probability

# Read the four active-learning files (pipe-delimited, gzip inferred from the extension).
train_df_al, valid_df, test_df, annotate_df = [
    pd.read_csv(split_path, sep="|", index_col=False)
    for split_path in (
        "/content/drive/MyDrive/fsdl_project/data/active_learning/train.csv.gz",
        "/content/drive/MyDrive/fsdl_project/data/active_learning/valid.csv.gz",
        "/content/drive/MyDrive/fsdl_project/data/active_learning/test.csv.gz",
        "/content/drive/MyDrive/fsdl_project/data/active_learning/annotate.csv.gz",
    )
]

FileNotFoundError: ignored

In [None]:
train_df_al.head()

Unnamed: 0,Class Index,Title,Description,text,labels
0,3,Money Funds Fell in Latest Week (AP),AP - Assets of the nation's retail money marke...,ap - assets of the nation's retail money marke...,2
1,3,Fed minutes show dissent over inflation (USATO...,USATODAY.com - Retail sales bounced back a bit...,usatoday.com - retail sales bounced back a bit...,2
2,3,Safety Net (Forbes.com),Forbes.com - After earning a PH.D. in Sociolog...,forbes.com - after earning a ph.d. in sociolog...,2
3,3,No Need for OPEC to Pump More-Iran Gov,TEHRAN (Reuters) - OPEC can do nothing to dou...,tehran (reuters) - opec can do nothing to dou...,2
4,3,Shell 'could be target for Total',Oil giant Shell could be bracing itself for a ...,oil giant shell could be bracing itself for a ...,2


In [None]:
annotate_df.head()

Unnamed: 0,Class Index,Title,Description,text,labels,idx
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...","reuters - short-sellers, wall street's dwindli...",2,0
1,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,reuters - soaring crude prices plus worries\ab...,2,1
2,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,reuters - authorities have halted oil export\f...,2,2
3,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","afp - tearaway world oil prices, toppling reco...",2,3
4,3,"Stocks End Up, But Near Year Lows (Reuters)",Reuters - Stocks ended slightly higher on Frid...,reuters - stocks ended slightly higher on frid...,2,4


In [None]:
## We are going to take only 1000 randomly choosen examples from above train data to train our model
train_df = train_df_al.sample(1000, random_state=100)

In [None]:
train_df.to_csv("/content/drive/MyDrive/fsdl_project/data/active_learning/train_1000.csv.gz", sep="|", index=False, compression="gzip")

In [None]:
train_df_6000 = train_df_al.sample(6000, random_state=100)

In [None]:
train_df_6000.to_csv("/content/drive/MyDrive/fsdl_project/data/active_learning/train_6000.csv.gz", sep="|", index=False, compression="gzip")

In [None]:
# train_df_al.to_csv("/content/drive/MyDrive/fsdl_project/data/active_learning/train.csv.gz", sep="|", index=False, compression='gzip')
# annotate_df.to_csv("/content/drive/MyDrive/fsdl_project/data/active_learning/annotate.csv.gz", sep="|", index=False, compression='gzip')
# valid_df.to_csv("/content/drive/MyDrive/fsdl_project/data/active_learning/valid.csv.gz", sep="|", index=False, compression='gzip')
# test_df.to_csv("/content/drive/MyDrive/fsdl_project/data/active_learning/test.csv.gz", sep="|", index=False, compression='gzip')

In [None]:
## Training RoBERTa on the 6000-example training sample to obtain baseline accuracy on test data

# Model configuration: 5 epochs, evaluation during training, early stopping with
# patience 3, and a fixed manual seed.
mid_model_args = ClassificationArgs(
    num_train_epochs=5,
    overwrite_output_dir=True,
    train_batch_size=16,
    max_seq_length=256,
    wandb_project='active_learning_6000_20210512',
    best_model_dir="/content/drive/MyDrive/fsdl_project/model/active_learning/6000/best_model/20210512",
    cache_dir="/content/drive/MyDrive/fsdl_project/cache/active_learning/6000/20210512",
    eval_batch_size=16,
    evaluate_during_training=True,
    evaluate_during_training_verbose=True,
    manual_seed=100,
    # Absolute path, matching best_model_dir / cache_dir. A relative
    # "content/drive/..." would resolve against the current working directory
    # instead of the mounted Drive.
    output_dir="/content/drive/MyDrive/fsdl_project/output/active_learning/6000/20210512",
    # no_cache = True,
    use_early_stopping=True,
    early_stopping_patience=3,
)


# Create a ClassificationModel with four output labels, starting from roberta-base.
mid_model = ClassificationModel(
    "roberta", "roberta-base", args=mid_model_args, use_cuda=True, num_labels=4,
)

# Train on the 6000-row sample, evaluating on valid_df during training.
mid_model.train_model(train_df=train_df_6000, eval_df=valid_df, accuracy=accuracy_score)


In [None]:
# Evaluate the 6000-sample model on each split (test, then valid, then train).
test_result, test_model_outputs, test_wrong_predictions = mid_model.eval_model(
    test_df, accuracy=accuracy_score
)
valid_result, valid_model_outputs, valid_wrong_predictions = mid_model.eval_model(
    valid_df, accuracy=accuracy_score
)
train_result, train_model_outputs, train_wrong_predictions = mid_model.eval_model(
    train_df_6000, accuracy=accuracy_score
)

# Flatten the per-split metric dicts into one dict with split-prefixed keys.
log_to_wandb = {
    f'{split}_{key}': item
    for split, split_result in (
        ('test', test_result),
        ('train', train_result),
        ('valid', valid_result),
    )
    for key, item in split_result.items()
}
log_to_wandb


In [None]:
# makedirs creates the missing result/ and active_learning/ parents as needed and
# does not fail when the directory already exists.
os.makedirs("/content/drive/MyDrive/fsdl_project/result/active_learning/20210512", exist_ok=True)

In [None]:
log_to_wandb

{'test_accuracy': 0.9052631578947369,
 'test_eval_loss': 0.46176427139533,
 'test_mcc': 0.8736933468410781,
 'train_accuracy': 0.9883333333333333,
 'train_eval_loss': 0.046081207289981344,
 'train_mcc': 0.9844555672177369,
 'valid_accuracy': 0.907125,
 'valid_eval_loss': 0.4330419914466329,
 'valid_mcc': 0.8761829392366339}

In [None]:
import json

# Write the 6000-sample run metrics to Drive as 4-space-indented JSON.
with open("/content/drive/MyDrive/fsdl_project/result/active_learning/20210512/initial_train_stats_6000_20210512.json", 'w') as stats_file:
    stats_file.write(json.dumps(log_to_wandb, indent=4))

In [None]:
# ## Training RoBERTa on 1000 training sample to obtain baseline accuracy on test data

# # Optional model configuration

# al_model_args = ClassificationArgs(num_train_epochs=5, 
#                                 overwrite_output_dir= True, 
#                                 train_batch_size=16,
#                                 max_seq_length=256, 
#                                 wandb_project= 'active_learning_1000_20210510', 
#                                 best_model_dir="/content/drive/MyDrive/fsdl_project/model/active_learning/1000/best_model/20210510",
#                                 cache_dir="/content/drive/MyDrive/fsdl_project/cache/active_learning/1000/20210510",
#                                 eval_batch_size=16,
#                                 evaluate_during_training=True,
#                                 evaluate_during_training_verbose=True,
#                                 manual_seed=100,
#                                 output_dir="content/drive/MyDrive/fsdl_project/output/active_learning/1000/20210510",
#                                 # no_cache = True,
#                                 use_early_stopping=True,
#                                 early_stopping_patience=3,
#                                 )


# # Create a ClassificationModel
# al_model = ClassificationModel(
# "roberta", "roberta-base", args=al_model_args, use_cuda=True, num_labels=4,
# )

# al_model.train_model(train_df=train_df, eval_df=valid_df, accuracy=accuracy_score)


In [None]:
### Initial trained model on large training data
##-----------------------------------------------###


## Training RoBERTa on truncated training set to obtain baseline accuracy on test data

# Optional model configuration

# al_model_args = ClassificationArgs(num_train_epochs=5, 
#                                 overwrite_output_dir= True, 
#                                 train_batch_size=16,
#                                 max_seq_length=256, 
#                                 wandb_project= 'active_learning_20210510', 
#                                 best_model_dir="/content/drive/MyDrive/fsdl_project/model/active_learning/best_model/20210510",
#                                 cache_dir="/content/drive/MyDrive/fsdl_project/cache/active_learning/20210510",
#                                 eval_batch_size=16,
#                                 evaluate_during_training=True,
#                                 evaluate_during_training_verbose=True,
#                                 manual_seed=100,
#                                 output_dir="content/drive/MyDrive/fsdl_project/output/active_learning/20210510",
#                                 # no_cache = True,
#                                 use_early_stopping=True,
#                                 early_stopping_patience=3,
#                                 )


# # Create a ClassificationModel
# al_model = ClassificationModel(
# "roberta", "roberta-base", args=al_model_args, use_cuda=True, num_labels=4,
# )

# al_model.train_model(train_df=train_df_al, eval_df=valid_df, accuracy=accuracy_score)

##--------------------------------

In [None]:
# ## loading model
# NOTE(review): in the visible cells al_model_args is only assigned inside
# commented-out code, so this cell relies on state from an earlier session —
# TODO confirm which run's best_model_dir is intended and define it explicitly.
al_model = ClassificationModel(
"roberta", al_model_args.best_model_dir
)

In [None]:
test_result, test_model_outputs, test_wrong_predictions = al_model.eval_model(test_df, accuracy = accuracy_score)
valid_result, valid_model_outputs, valid_wrong_predictions = al_model.eval_model(valid_df, accuracy = accuracy_score)
train_result, train_model_outputs, train_wrong_predictions = al_model.eval_model(train_df, accuracy = accuracy_score)

In [None]:
test_result

{'accuracy': 0.8688157894736842,
 'eval_loss': 0.42330945989803265,
 'mcc': 0.8268612843070937}

In [None]:
# Flatten the per-split metric dicts into one dict with split-prefixed keys
# (test_*, train_*, valid_*), in that order.
log_to_wandb = {
    f'{split}_{key}': item
    for split, split_result in (
        ('test', test_result),
        ('train', train_result),
        ('valid', valid_result),
    )
    for key, item in split_result.items()
}

In [None]:
log_to_wandb

{'test_accuracy': 0.9230263157894737,
 'test_eval_loss': 0.3959990933331612,
 'test_mcc': 0.8974095014721511,
 'train_accuracy': 0.9876736111111111,
 'train_eval_loss': 0.053259740840294396,
 'train_mcc': 0.9835782903636949,
 'valid_accuracy': 0.9295,
 'valid_eval_loss': 0.3732603291405637,
 'valid_mcc': 0.9060107576776867}

In [None]:
# os.mkdir("/content/drive/MyDrive/fsdl_project/result/")
# os.mkdir("/content/drive/MyDrive/fsdl_project/result/active_learning/")
# makedirs creates any missing parent directories and does not fail on re-run.
os.makedirs("/content/drive/MyDrive/fsdl_project/result/active_learning/20210421", exist_ok=True)
# os.mkdir("/content/drive/MyDrive/fsdl_project/result/baseline")

In [None]:
import json
import os

# Write the collected metrics as 4-space-indented JSON. The target directory is
# created first so open() cannot fail with FileNotFoundError when it is missing.
stats_path = "/content/drive/MyDrive/fsdl_project/result/active_learning/20210410/initial_train_stats_1000_20210510.json"
os.makedirs(os.path.dirname(stats_path), exist_ok=True)
with open(stats_path, 'w') as outfile:
    json.dump(log_to_wandb, outfile, indent=4)

In [None]:
## loading annotate_df

annotate_df = pd.read_csv("/content/drive/MyDrive/fsdl_project/data/active_learning/annotate.csv.gz", sep="|", index_col=False)

In [None]:
display(annotate_df.head(2))

Unnamed: 0,Class Index,Title,Description,text,labels,idx
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...","reuters - short-sellers, wall street's dwindli...",2,0
1,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,reuters - soaring crude prices plus worries\ab...,2,1


In [None]:
### Making predictions on the annotation dataset; they are saved further below to output/active_learning/20210510 for annotation
# predict() takes a plain list of strings and returns (predicted labels, raw model outputs).
annotate_text = annotate_df['text'].values.tolist()
annotate_predictions, annotate_raw_output = al_model.predict(annotate_text)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/67200 [00:00<?, ?it/s]

Exception ignored in: <finalize object at 0x7f4d106168d0; dead>
Traceback (most recent call last):
  File "/usr/lib/python3.7/weakref.py", line 572, in __call__
    return info.func(*info.args, **(info.kwargs or {}))
  File "/usr/lib/python3.7/tempfile.py", line 936, in _cleanup
    _rmtree(name)
  File "/usr/lib/python3.7/shutil.py", line 485, in rmtree
    onerror(os.lstat, path, sys.exc_info())
  File "/usr/lib/python3.7/shutil.py", line 483, in rmtree
    orig_st = os.lstat(path)
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp5y2g38ta'
Exception ignored in: <finalize object at 0x7f4d106169d0; dead>
Traceback (most recent call last):
  File "/usr/lib/python3.7/weakref.py", line 572, in __call__
    return info.func(*info.args, **(info.kwargs or {}))
  File "/usr/lib/python3.7/tempfile.py", line 936, in _cleanup
    _rmtree(name)
  File "/usr/lib/python3.7/shutil.py", line 485, in rmtree
    onerror(os.lstat, path, sys.exc_info())
  File "/usr/lib/python3.7/shutil.p

  0%|          | 0/4200 [00:00<?, ?it/s]

Exception ignored in: <finalize object at 0x7f4d106168d0; dead>
Traceback (most recent call last):
  File "/usr/lib/python3.7/weakref.py", line 572, in __call__
    return info.func(*info.args, **(info.kwargs or {}))
  File "/usr/lib/python3.7/tempfile.py", line 936, in _cleanup
    _rmtree(name)
  File "/usr/lib/python3.7/shutil.py", line 485, in rmtree
    onerror(os.lstat, path, sys.exc_info())
  File "/usr/lib/python3.7/shutil.py", line 483, in rmtree
    orig_st = os.lstat(path)
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp5y2g38ta'


In [None]:
import torch.nn as nn
import torch

In [None]:
sfm = nn.Softmax(dim=1)

In [None]:
# Turn the raw model outputs into class probabilities and keep, for every row,
# the probability of the most likely class. Both results end up as NumPy arrays.
annotate_raw_output_tensor = torch.from_numpy(annotate_raw_output)
annotate_class_prob = sfm(annotate_raw_output_tensor)
max_prob = annotate_class_prob.max(dim=1).values.numpy()
annotate_class_prob = annotate_class_prob.numpy()

In [None]:
annotate_df.head(2)

Unnamed: 0,Class Index,Title,Description,text,labels,idx
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...","reuters - short-sellers, wall street's dwindli...",2,0
1,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,reuters - soaring crude prices plus worries\ab...,2,1


In [None]:
# Assemble one wide array per annotation row. Column order:
# idx, Title, Description, text, raw outputs, class probabilities, max prob, predicted label.
# column_stack turns each 1-D input into a column and keeps 2-D inputs as they are.
annotate_df_with_pred = np.column_stack((
    annotate_df.idx.values,
    annotate_df.Title.values,
    annotate_df.Description.values,
    annotate_df.text.values,
    annotate_raw_output,
    annotate_class_prob,
    max_prob,
    np.array(annotate_predictions),
))

In [None]:
# Column names for the stacked prediction array. The order must match the order
# in which the columns were stacked: idx, Title, Description, text, then the four
# logits, the four class probabilities, the max probability and the predicted label.
col_names = ['idx',
             'title',
             'description',
             'text',
             'logit_0', 'logit_1', 'logit_2', 'logit_3',
             'prob_0', 'prob_1', 'prob_2', 'prob_3',
             'max_prob',
             'label_pred'
             ]

In [None]:

annotate_df_with_pred = pd.DataFrame(annotate_df_with_pred, columns=col_names)

In [None]:
annotate_df_with_pred['annotated_labels'] = ''
annotate_df_with_pred['sampling_method'] = ''

In [None]:
annotate_df_with_pred.head()

Unnamed: 0,idx,text,title,description,logit_0,logit_1,logit_2,logit_3,prob_0,prob_1,prob_2,prob_3,max_prob,label_pred,annotated_labels,sampling_method
0,0,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...","reuters - short-sellers, wall street's dwindli...",-1.64648,-1.90039,2.11523,1.56641,0.0143577,0.0111382,0.617703,0.356801,0.617703,2,,
1,1,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,reuters - soaring crude prices plus worries\ab...,0.0162964,-2.00391,3.34375,-1.1543,0.0341179,0.00452501,0.950774,0.0105828,0.950774,2,,
2,2,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,reuters - authorities have halted oil export\f...,3.80469,-1.4668,0.199097,-1.95312,0.965749,0.00496007,0.0262407,0.00304985,0.965749,0,,
3,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","afp - tearaway world oil prices, toppling reco...",1.12891,-2.0293,2.77734,-1.55762,0.158492,0.00673623,0.823976,0.010796,0.823976,2,,
4,4,"Stocks End Up, But Near Year Lows (Reuters)",Reuters - Stocks ended slightly higher on Frid...,reuters - stocks ended slightly higher on frid...,0.172119,-1.75195,3.11914,-1.28711,0.0489535,0.00714775,0.932521,0.0113775,0.932521,2,,


In [None]:
# os.mkdir("/content/drive/MyDrive/fsdl_project/output/active_learning")
# makedirs creates any missing parent directories and does not fail on re-run.
os.makedirs("/content/drive/MyDrive/fsdl_project/output/active_learning/20210510", exist_ok=True)

In [None]:
annotate_df_with_pred.to_csv("/content/drive/MyDrive/fsdl_project/output/active_learning/20210510/annotate.csv.gz", index=False,
                             compression="gzip")

In [None]:
annotate_df_with_pred.head()

In [None]:
log_to_wandb

{'test_accuracy': 0.9292105263157895,
 'test_eval_loss': 0.37604381556634936,
 'test_mcc': 0.9056558484190013,
 'train_accuracy': 0.9859375,
 'train_eval_loss': 0.06025019081414535,
 'train_mcc': 0.9812553280559401,
 'valid_accuracy': 0.9294583333333334,
 'valid_eval_loss': 0.3652915442798403,
 'valid_mcc': 0.9059680833454649}