# Task 1: Automatic classification of tweets that report adverse effects

In [1]:
!pip install -q transformers contractions imbalanced-learn ekphrasis

[K     |████████████████████████████████| 4.0 MB 5.3 MB/s 
[K     |████████████████████████████████| 80 kB 6.9 MB/s 
[K     |████████████████████████████████| 880 kB 37.9 MB/s 
[K     |████████████████████████████████| 596 kB 32.9 MB/s 
[K     |████████████████████████████████| 77 kB 5.6 MB/s 
[K     |████████████████████████████████| 6.6 MB 32.6 MB/s 
[K     |████████████████████████████████| 106 kB 25.4 MB/s 
[K     |████████████████████████████████| 287 kB 48.5 MB/s 
[K     |████████████████████████████████| 45 kB 2.3 MB/s 
[K     |████████████████████████████████| 53 kB 1.2 MB/s 
[?25h  Building wheel for ekphrasis (setup.py) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


## 1. Import all the necessary libraries and data files

In [2]:
import numpy as np
import pandas as pd

import warnings
import torch
import torch.nn as nn
import time

from sklearn.metrics import classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertModel, BertTokenizerFast
from transformers import RobertaTokenizerFast, RobertaModel
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from tqdm import tqdm
from sklearn.metrics import f1_score

warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=None

In [3]:
train_filename = "task2_en_training.tsv"
val_filename = "task2_en_validation.tsv"

In [4]:
# Load data
train = pd.read_csv(train_filename, sep="\t")
validation = pd.read_csv(val_filename, sep="\t")

In [5]:
print(f"Shape of training data is {train.shape} and validation data is {validation.shape}")

Shape of training data is (20544, 4) and validation data is (5134, 4)


In [6]:
# Train top 5 rows
train.head().style.set_caption("Task 1: Train dataset")

Unnamed: 0,tweet_id,user_id,class,tweet
0,344266386467606528,809439366,0,"depression hurts, cymbalta can help"
1,349220537903489025,323112996,0,"@jessicama20045 right, but cipro can make things much worse...and why give bayer more of your money? they already screwed you once w/ essure"
2,351421773079781378,713100330,0,@fibby1123 are you on paxil .. i need help
3,326594278472171520,543113070,0,@redicine the lamotrigine and sjs just made chaos more vengeful and sadistic.
4,345567138376994816,138795534,0,"have decided to skip my #humira shot today. my body's having hysterics, need time to simmer down #rheum"


## 2. Prepare the data - Clean & Prepare for Model

In [7]:
# Drop unwanted columns
train.drop(['tweet_id', 'user_id'], axis=1, inplace=True)
validation.drop(['tweet_id', 'user_id'], axis=1, inplace=True)

In [8]:
# Referred from: https://github.com/cbaziotis/ekphrasis

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

Word statistics files not found!
Downloading... done!
Unpacking... done!
Reading twitter - 1grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_1grams.txt
Reading twitter - 2grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_2grams.txt
Reading twitter - 1grams ...


In [9]:
train['clean_tweets'] = [" ".join(text_processor.pre_process_doc(tweet)) for tweet in train.tweet]
validation['clean_tweets'] = [" ".join(text_processor.pre_process_doc(tweet)) for tweet in validation.tweet]

In [10]:
# Train top 5 rows after pre-processing
train[['class', 'clean_tweets']].head()

Unnamed: 0,class,clean_tweets
0,0,"depression hurts , cymbalta can help"
1,0,"<user> right , but cipro can make things much worse . <repeated> and why give bayer more of your money ? they already screwed you once w / essure"
2,0,<user> are you on paxil . <repeated> i need help
3,0,<user> the lamotrigine and sjs just made chaos more vengeful and sadistic .
4,0,"have decided to skip my <hashtag> humira </hashtag> shot today . my body ' s having hysterics , need time to simmer down <hashtag> rheum </hashtag>"


In [11]:
BATCH_SIZE = 32
N_EPOCHS = 5

In [12]:
# Define BERT tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [13]:
# Tokenize train and validation data
train_enc_rob = tokenizer.batch_encode_plus(train.clean_tweets.to_list(), padding="longest", truncation=True, max_length=128, return_tensors="pt")
valid_enc_rob = tokenizer.batch_encode_plus(validation.clean_tweets.to_list(), padding="longest", truncation=True, max_length=128, return_tensors="pt")

In [14]:
train_enc_rob.keys()

dict_keys(['input_ids', 'attention_mask'])

In [15]:
train_enc_rob.input_ids.shape, train_enc_rob.attention_mask.shape

(torch.Size([20544, 128]), torch.Size([20544, 128]))

In [16]:
def get_dataloader_rob(encoding, target):
    data = (TensorDataset(encoding.input_ids, encoding.attention_mask, target))
    sampler = RandomSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=BATCH_SIZE)
    return dataloader

In [17]:
train_dataloader_rob = get_dataloader_rob(train_enc_rob, torch.tensor(train['class'].to_list()))
valid_dataloader_rob = get_dataloader_rob(valid_enc_rob, torch.tensor(validation['class'].to_list()))

In [18]:
# Sanity check that the tensors returned by the dataloader are correct
for batch in train_dataloader_rob:
    input_ids, attn_mask, target = batch
    print(input_ids.shape, attn_mask.shape, target.shape)
    break

torch.Size([32, 128]) torch.Size([32, 128]) torch.Size([32])


## 3. Model Building - Roberta

In [19]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [20]:
class ROBERTAclassifier(nn.Module):
    def __init__(self, transformer):
        super(ROBERTAclassifier, self).__init__()
        self.transformer = transformer
        self.linear_layer = nn.Linear(768, 2)
    
    def forward(self, ip_ids, attn_mask):
        op = self.transformer(input_ids=ip_ids,
                              attention_mask=attn_mask)
        return  self.linear_layer(op["pooler_output"])

In [21]:
def count_parameter(model):
    return sum(para.numel() for para in model.parameters() if para.requires_grad)

In [22]:
transformer_rob = RobertaModel.from_pretrained("roberta-base")
model_rob = ROBERTAclassifier(transformer_rob).to(device)
print(f"The model has {count_parameter(model_rob)} trainable parameters.")

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The model has 124647170 trainable parameters.


In [23]:
# Define optimizer 
criterion_rob = torch.nn.CrossEntropyLoss()
optim_rob = torch.optim.AdamW(model_rob.parameters(), lr = 2e-5)

In [None]:
def train_model_rob(model, dataloader, clip=1.0):
    model.train()

    epoch_loss = 0
    batch_num = 0
    pred, target = [], []

    for index, batch in tqdm(enumerate(dataloader)):
        batch = tuple(row.to(device) for row in batch)
        input_ids, attn_mask, y = batch

        optim_rob.zero_grad()
        output = model(input_ids, attn_mask)
        loss = criterion_rob(output, y)
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optim_rob.step()

        epoch_loss += loss.item()
        batch_num += 1
        pred.extend(torch.argmax(output, -1).tolist())
        target.extend(y.tolist())
    
    return epoch_loss/batch_num, f1_score(target, pred)

def evaluate_rob(model, dataloader):
    model.eval()

    epoch_loss = 0
    batch_num = 0
    pred, target = list(), list()

    for index, batch in enumerate(dataloader):
        batch = tuple(row.to(device) for row in batch)
        input_ids, attn_mask, y = batch
        
        with torch.no_grad():
            output = model(input_ids, attn_mask)
            loss = criterion_rob(output, y)
            
            epoch_loss += loss.item()
            batch_num += 1
            pred.extend(torch.argmax(output, -1).tolist())
            target.extend(y.tolist())
    
    return epoch_loss/batch_num, f1_score(target, pred), pred, target

In [None]:
best_valid_loss_rob = float('inf')
total_train_loss_rob, total_valid_loss_rob = list(), list()

In [None]:
for epoch in tqdm(range(N_EPOCHS)):
    train_loss, train_f1_score = train_model_rob(model_rob, train_dataloader_rob)
    total_train_loss_rob.append(train_loss)

    valid_loss, valid_f1_score, pred, target = evaluate_rob(model_rob, valid_dataloader_rob)
    total_valid_loss_rob.append(valid_loss)

    if valid_loss < best_valid_loss_rob:
        best_valid_loss_rob = valid_loss
        best_pred, best_target = pred, target
        torch.save(model_rob.state_dict(), "model_least_loss_rob.pt")
        print("\nBest Model Saved!!\n")
    
    torch.save(model_rob.state_dict(), "model_checkpoint_rob" + str(epoch) + ".pt")
    print("Checkpoint Model Saved!\n")

    print(f"Epoch: {epoch+1:02}")
    print(f"Train Total Loss: {train_loss:.3f} | Train F1 Score: {train_f1_score:.3f}")
    print(f"Valid Total Loss: {valid_loss:.3f} | Valid F1 Score: {valid_f1_score:.3f}")
    print("-"*20)

  0%|          | 0/5 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:00,  1.28it/s][A
2it [00:01,  1.43it/s][A
3it [00:02,  1.47it/s][A
4it [00:02,  1.50it/s][A
5it [00:03,  1.52it/s][A
6it [00:04,  1.53it/s][A
7it [00:04,  1.54it/s][A
8it [00:05,  1.55it/s][A
9it [00:05,  1.54it/s][A
10it [00:06,  1.54it/s][A
11it [00:07,  1.54it/s][A
12it [00:07,  1.54it/s][A
13it [00:08,  1.54it/s][A
14it [00:09,  1.53it/s][A
15it [00:09,  1.53it/s][A
16it [00:10,  1.53it/s][A
17it [00:11,  1.52it/s][A
18it [00:11,  1.53it/s][A
19it [00:12,  1.52it/s][A
20it [00:13,  1.52it/s][A
21it [00:13,  1.53it/s][A
22it [00:14,  1.52it/s][A
23it [00:15,  1.52it/s][A
24it [00:15,  1.52it/s][A
25it [00:16,  1.52it/s][A
26it [00:17,  1.51it/s][A
27it [00:17,  1.51it/s][A
28it [00:18,  1.51it/s][A
29it [00:19,  1.52it/s][A
30it [00:19,  1.51it/s][A
31it [00:20,  1.50it/s][A
32it [00:21,  1.50it/s][A
33it [00:21,  1.50it/s][A
34it [00:22,  1.50it/s][A
35it [00:23,  1.49it/s][A
36it


Best Model Saved!!



 20%|██        | 1/5 [07:45<31:00, 465.07s/it]

Checkpoint Model Saved!

Epoch: 01
Train Total Loss: 0.216 | Train F1 Score: 0.452
Valid Total Loss: 0.251 | Valid F1 Score: 0.571
--------------------



0it [00:00, ?it/s][A
1it [00:00,  1.40it/s][A
2it [00:01,  1.48it/s][A
3it [00:02,  1.50it/s][A
4it [00:02,  1.50it/s][A
5it [00:03,  1.51it/s][A
6it [00:03,  1.52it/s][A
7it [00:04,  1.51it/s][A
8it [00:05,  1.51it/s][A
9it [00:05,  1.51it/s][A
10it [00:06,  1.51it/s][A
11it [00:07,  1.52it/s][A
12it [00:07,  1.52it/s][A
13it [00:08,  1.52it/s][A
14it [00:09,  1.51it/s][A
15it [00:09,  1.52it/s][A
16it [00:10,  1.52it/s][A
17it [00:11,  1.51it/s][A
18it [00:11,  1.51it/s][A
19it [00:12,  1.51it/s][A
20it [00:13,  1.51it/s][A
21it [00:13,  1.51it/s][A
22it [00:14,  1.52it/s][A
23it [00:15,  1.51it/s][A
24it [00:15,  1.51it/s][A
25it [00:16,  1.50it/s][A
26it [00:17,  1.51it/s][A
27it [00:17,  1.50it/s][A
28it [00:18,  1.51it/s][A
29it [00:19,  1.50it/s][A
30it [00:19,  1.50it/s][A
31it [00:20,  1.50it/s][A
32it [00:21,  1.51it/s][A
33it [00:21,  1.50it/s][A
34it [00:22,  1.50it/s][A
35it [00:23,  1.50it/s][A
36it [00:23,  1.50it/s][A
37it [00:24,  


Best Model Saved!!



 40%|████      | 2/5 [15:29<23:14, 464.87s/it]

Checkpoint Model Saved!

Epoch: 02
Train Total Loss: 0.144 | Train F1 Score: 0.701
Valid Total Loss: 0.214 | Valid F1 Score: 0.632
--------------------



0it [00:00, ?it/s][A
1it [00:00,  1.40it/s][A
2it [00:01,  1.47it/s][A
3it [00:02,  1.49it/s][A
4it [00:02,  1.50it/s][A
5it [00:03,  1.50it/s][A
6it [00:04,  1.51it/s][A
7it [00:04,  1.51it/s][A
8it [00:05,  1.52it/s][A
9it [00:05,  1.52it/s][A
10it [00:06,  1.51it/s][A
11it [00:07,  1.51it/s][A
12it [00:07,  1.51it/s][A
13it [00:08,  1.51it/s][A
14it [00:09,  1.51it/s][A
15it [00:09,  1.51it/s][A
16it [00:10,  1.51it/s][A
17it [00:11,  1.51it/s][A
18it [00:11,  1.51it/s][A
19it [00:12,  1.51it/s][A
20it [00:13,  1.51it/s][A
21it [00:13,  1.51it/s][A
22it [00:14,  1.51it/s][A
23it [00:15,  1.51it/s][A
24it [00:15,  1.51it/s][A
25it [00:16,  1.51it/s][A
26it [00:17,  1.51it/s][A
27it [00:17,  1.50it/s][A
28it [00:18,  1.50it/s][A
29it [00:19,  1.51it/s][A
30it [00:19,  1.51it/s][A
31it [00:20,  1.51it/s][A
32it [00:21,  1.51it/s][A
33it [00:21,  1.51it/s][A
34it [00:22,  1.51it/s][A
35it [00:23,  1.51it/s][A
36it [00:23,  1.50it/s][A
37it [00:24,  

Checkpoint Model Saved!

Epoch: 03
Train Total Loss: 0.104 | Train F1 Score: 0.791
Valid Total Loss: 0.229 | Valid F1 Score: 0.604
--------------------



0it [00:00, ?it/s][A
1it [00:00,  1.56it/s][A
2it [00:01,  1.52it/s][A
3it [00:01,  1.52it/s][A
4it [00:02,  1.52it/s][A
5it [00:03,  1.52it/s][A
6it [00:03,  1.52it/s][A
7it [00:04,  1.51it/s][A
8it [00:05,  1.51it/s][A
9it [00:05,  1.52it/s][A
10it [00:06,  1.51it/s][A
11it [00:07,  1.51it/s][A
12it [00:07,  1.51it/s][A
13it [00:08,  1.51it/s][A
14it [00:09,  1.51it/s][A
15it [00:09,  1.51it/s][A
16it [00:10,  1.51it/s][A
17it [00:11,  1.50it/s][A
18it [00:11,  1.50it/s][A
19it [00:12,  1.50it/s][A
20it [00:13,  1.50it/s][A
21it [00:13,  1.50it/s][A
22it [00:14,  1.50it/s][A
23it [00:15,  1.50it/s][A
24it [00:15,  1.50it/s][A
25it [00:16,  1.50it/s][A
26it [00:17,  1.51it/s][A
27it [00:17,  1.51it/s][A
28it [00:18,  1.51it/s][A
29it [00:19,  1.51it/s][A
30it [00:19,  1.51it/s][A
31it [00:20,  1.51it/s][A
32it [00:21,  1.51it/s][A
33it [00:21,  1.50it/s][A
34it [00:22,  1.50it/s][A
35it [00:23,  1.51it/s][A
36it [00:23,  1.51it/s][A
37it [00:24,  

Checkpoint Model Saved!

Epoch: 04
Train Total Loss: 0.077 | Train F1 Score: 0.880
Valid Total Loss: 0.244 | Valid F1 Score: 0.632
--------------------



0it [00:00, ?it/s][A
1it [00:00,  1.55it/s][A
2it [00:01,  1.52it/s][A
3it [00:01,  1.52it/s][A
4it [00:02,  1.52it/s][A
5it [00:03,  1.52it/s][A
6it [00:03,  1.52it/s][A
7it [00:04,  1.51it/s][A
8it [00:05,  1.51it/s][A
9it [00:05,  1.51it/s][A
10it [00:06,  1.51it/s][A
11it [00:07,  1.51it/s][A
12it [00:07,  1.51it/s][A
13it [00:08,  1.51it/s][A
14it [00:09,  1.51it/s][A
15it [00:09,  1.51it/s][A
16it [00:10,  1.51it/s][A
17it [00:11,  1.51it/s][A
18it [00:11,  1.50it/s][A
19it [00:12,  1.50it/s][A
20it [00:13,  1.51it/s][A
21it [00:13,  1.51it/s][A
22it [00:14,  1.50it/s][A
23it [00:15,  1.50it/s][A
24it [00:15,  1.50it/s][A
25it [00:16,  1.51it/s][A
26it [00:17,  1.51it/s][A
27it [00:17,  1.51it/s][A
28it [00:18,  1.50it/s][A
29it [00:19,  1.50it/s][A
30it [00:19,  1.50it/s][A
31it [00:20,  1.50it/s][A
32it [00:21,  1.50it/s][A
33it [00:21,  1.50it/s][A
34it [00:22,  1.50it/s][A
35it [00:23,  1.50it/s][A
36it [00:23,  1.49it/s][A
37it [00:24,  

Checkpoint Model Saved!

Epoch: 05
Train Total Loss: 0.054 | Train F1 Score: 0.920
Valid Total Loss: 0.376 | Valid F1 Score: 0.591
--------------------





In [None]:
print(classification_report(best_target, best_pred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96      4660
           1       0.58      0.70      0.63       474

    accuracy                           0.92      5134
   macro avg       0.77      0.82      0.80      5134
weighted avg       0.93      0.92      0.93      5134



In [None]:
!zip -r content.zip ../content/ 

  adding: ../content/ (stored 0%)
  adding: ../content/.config/ (stored 0%)
  adding: ../content/.config/.feature_flags_config.yaml (deflated 23%)
  adding: ../content/.config/gce (stored 0%)
  adding: ../content/.config/.last_survey_prompt.yaml (stored 0%)
  adding: ../content/.config/.metricsUUID (stored 0%)
  adding: ../content/.config/.last_opt_in_prompt.yaml (stored 0%)
  adding: ../content/.config/config_sentinel (stored 0%)
  adding: ../content/.config/logs/ (stored 0%)
  adding: ../content/.config/logs/2022.04.19/ (stored 0%)
  adding: ../content/.config/logs/2022.04.19/14.22.58.550404.log (deflated 86%)
  adding: ../content/.config/logs/2022.04.19/14.23.29.232715.log (deflated 54%)
  adding: ../content/.config/logs/2022.04.19/14.23.28.447853.log (deflated 55%)
  adding: ../content/.config/logs/2022.04.19/14.22.14.343189.log (deflated 91%)
  adding: ../content/.config/logs/2022.04.19/14.23.07.556024.log (deflated 54%)
  adding: ../content/.config/logs/2022.04.19/14.22.38.448952

In [None]:
from google.colab import drive

In [None]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from glob import glob

In [None]:
for filepath in glob("*.pt"):
    !cp -r $filepath /content/gdrive/My\ Drive/Colab\ Notebooks/NLP\ Final\ Project/
    time.sleep(10)

In [None]:
# !cp -r model_least_loss_rob.pt /content/gdrive/My\ Drive/Colab\ Notebooks/NLP\ Final\ Project/

In [None]:
# with open('/content/gdrive/My Drive/', 'w') as handle:
#     handle.write()

In [24]:
# Loading the saved model
output_model = 'model_least_loss_rob.pt'

model_test = ROBERTAclassifier(transformer_rob).to(device)
model_test.load_state_dict(torch.load(output_model, map_location=device))

<All keys matched successfully>

In [None]:
# Training set

# Set model to evaluation
# model_test.eval()

# y_pred_train, y_true_train = [], []
# train_indexes_list = []

# for index, batch in enumerate(train_dataloader_rob):
#     batch = tuple(row.to(device) for row in batch)
#     input_ids, attn_mask, target, indexes = batch
    
#     with torch.no_grad():
#         output = model_test(input_ids, attn_mask)
        
#         y_pred_train.extend(torch.argmax(output, -1).tolist())
#         y_true_train.extend(target.tolist())
#         train_indexes_list.extend(indexes.tolist())

In [None]:
# print(f"F1-score: {f1_score(y_true_train, y_pred_train)}\n", f"Classification report: \n{classification_report(y_true_train, y_pred_train)}", sep='\n')

F1-score: 0.9996512033484479

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       739
           1       1.00      1.00      1.00      1433

    accuracy                           1.00      2172
   macro avg       1.00      1.00      1.00      2172
weighted avg       1.00      1.00      1.00      2172



In [26]:
# Validation set
model_test.eval()

y_pred_valid, y_true_valid = [], []
valid_indexes_list = []

for index, batch in enumerate(valid_dataloader_rob):
    batch = tuple(row.to(device) for row in batch)
    input_ids, attn_mask, target = batch
    
    with torch.no_grad():
        output = model_test(input_ids, attn_mask)
        
        y_pred_valid.extend(torch.argmax(output, -1).tolist())
        y_true_valid.extend(target.tolist())
        # valid_indexes_list.extend(indexes.tolist())

In [27]:
print(f"F1-score: {f1_score(y_true_valid, y_pred_valid)}\n", f"Classification report: \n{classification_report(y_true_valid, y_pred_valid)}", sep='\n')

F1-score: 0.6323809523809524

Classification report: 
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      4660
           1       0.58      0.70      0.63       474

    accuracy                           0.92      5134
   macro avg       0.77      0.82      0.80      5134
weighted avg       0.93      0.92      0.93      5134



In [29]:
validation['pred'] = y_pred_valid

In [35]:
pd.options.display.max_rows=None

In [36]:
validation[validation['class'] != validation.pred]

Unnamed: 0,class,tweet,clean_tweets,pred
2,0,"@doctorchristian scared to start fluoxetine, what's you're opinion on them? xx","<user> scared to start fluoxetine , what ' s you are opinion on them ? xx",1
5,1,"""u wailed all night; now y'r disembodied sobbing all damn'd day, ghost?"" • effexor-xr®discontinuation syndrome ain't exactly heaven, angel",""" u wailed all night ; now y ' r disembodied sobbing all damn ' d day , ghost ? "" • effexor - xr ® discontinuation syndrome ain ' t exactly heaven , angel",0
7,1,"not that anyone noticed, but my #ambienwithdrawl only lasted a few days. why? because i got another scrip. i need it while i'm on levaquin.","not that anyone noticed , but my <hashtag> ambien withdrawl </hashtag> only lasted a few days . why ? because i got another scrip . i need it while i am on levaquin .",0
21,1,i did it. it hurt quite a bit more this time. but i did it. #humira #rheum #spoonie,i did it . it hurt quite a bit more this time . but i did it . <hashtag> humira </hashtag> <hashtag> rheum </hashtag> <hashtag> spoonie </hashtag>,0
22,1,"rt @ianibbo: finding out i'm allergic to fluoxetine was a bit of a shock tho. mind, only someone from yorkshire could be actually allergic …","rt <user> : finding out i am allergic to fluoxetine was a bit of a shock tho . mind , only someone from yorkshire could be actually allergic …",0
25,1,"@thekatieduke pt with similar found out trazodone can do the same. md said it was common. ""trazodone gives you a bone"". he is a poet.","<user> pt with similar found out trazodone can do the same . md said it was common . "" trazodone gives you a bone "" . he is a poet .",0
26,1,@thebloggess your lucky! i have to do infusions of remicade for my ra. i would happily do humera or enbrel if it didnt make me stop breathin,<user> your lucky ! i have to do infusions of remicade for my ra . i would happily do humera or enbrel if it didnt make me stop breathin,0
34,1,off to see the gi consultant this week. hope theres something other than humira to try as not working also hair falling out.,off to see the gi consultant this week . hope theres something other than humira to try as not working also hair falling out .,0
56,0,#prozac come and help me please!,<hashtag> prozac </hashtag> come and help me please !,1
57,0,@anorexic0 @ewdustin -generic drugs as effective as amgen's enbrel for arthritis- @fiinging -: a blockbuster drug made b... @jesusislana,<user> <user> - generic drugs as effective as amgen ' s enbrel for arthritis - <user> - : a blockbuster drug made b . <repeated> <user>,1


In [37]:
validation[validation['class'] == validation.pred]

Unnamed: 0,class,tweet,clean_tweets,pred
0,0,i don't fucking need humira,i do not fucking need humira,0
1,0,"my retake is next friday, if i bloody fail again then my doc is going to need to up my fluoxetine and amitriptyline dosages again! :l","my retake is next friday , if i bloody fail again then my doc is going to need to up my fluoxetine and amitriptyline dosages again ! : l",0
3,0,"@intuitivegal1 ok, if you stopped taking the lamictal, give 90mg a week.","<user> ok , if you stopped taking the lamictal , give 9 0 mg a week .",0
4,0,novartis announces secukinumab (ain457) demonstrated superiority to enbrel® in head-to-head phase iii psoriasis study .…,novartis announces secukinumab ( ain457 ) demonstrated superiority to enbrel ® in head - to - head phase i <elongated> psoriasis study . …,0
6,0,@irapaps you're so fucking selfish. i've got lamotrigine! no more fat pills :),<user> you are so fucking selfish . i have got lamotrigine ! no more fat pills <happy>,0
8,0,"@netnewsbuzz yes dear,now take ur prozac and call for knaidels","<user> yes dear , now take ur prozac and call for knaidels",0
9,0,"popular antidepressants zoloft, prozac and paxil can lower libido and prevent orgasms","popular antidepressants zoloft , prozac and paxil can lower libido and prevent orgasms",0
10,0,oh. i thought the cymbalta commercial was a joke honestly...,oh . i thought the cymbalta commercial was a joke honestly . <repeated>,0
11,0,@c4dispatches eeeeek. just chucked my victoza in the bin. i will take my chances with the diabetes #diabetes,<user> ek <elongated> . just chucked my victoza in the bin . i will take my chances with the diabetes <hashtag> diabetes </hashtag>,0
12,0,at the cypriot art and archaeology exhibition at the @nicholsonmuseum. love the cypro-archaic bird jugs! never seen anything like them.,at the cypriot art and archaeology exhibition at the <user> . love the cypro - archaic bird jugs ! never seen anything like them .,0


In [None]:
# training.loc[train_indexes_list, 'ADR'] = y_pred_train
# validation.loc[valid_indexes_list, 'ADR'] = y_pred_valid

In [None]:
# training.to_csv("training_data_with_ADR.csv")
# validation.to_csv("validation_data_with_ADR.csv")

In [None]:
# Cross-check that it is correctly mapped
# print(f"F1-score: {f1_score(training.label, training.ADR)}\n", f"Classification report: \n{classification_report(training.label, training.ADR)}", sep='\n')

F1-score: 0.9996512033484479

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       739
           1       1.00      1.00      1.00      1433

    accuracy                           1.00      2172
   macro avg       1.00      1.00      1.00      2172
weighted avg       1.00      1.00      1.00      2172

