# Mount drive


In [1]:
# If running this notebook on Google Colab, uncomment the lines below to mount Google Drive:
# from google.colab import drive

# drive.mount('/content/drive')

In [2]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from transformers import AutoTokenizer

from sklearn.model_selection import train_test_split

import pandas as pd
import time
import datetime
import random
import numpy as np
import csv

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import RandomSampler, DataLoader, Subset
from torch.utils.data import TensorDataset, random_split, SequentialSampler

#from tqdm.notebook import tqdm
from tqdm import tqdm
# https://stackoverflow.com/questions/42212810/tqdm-in-jupyter-notebook-prints-new-progress-bars-repeatedly

# Register tqdm with pandas so that the `progress_apply` family of methods
# (apply with a progress bar) becomes available on DataFrames/Series.
tqdm.pandas()

In [3]:
# importing our helper functions
# from src.models.preprocessing_bert import *
# from src.models.BERT.helper_bert import *
# from src.models.BERT.models import *
# from src.models.BERT.train import *
# from src.helper import *

from preprocessing_bert import *
from helpers_bert import *
from models_bert import *
from train_bert import *
from helpers import *

In [4]:
!pip install transformers



In [5]:
# Enable IPython's autoreload extension in mode 2, so that edits to the imported
# helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Paths & load data

In [6]:
# Directory layout used throughout the notebook (all paths relative to the
# working directory and terminated with a slash so they can be concatenated).
PATH_DATA = './data/'
PATH_PREPROCESSING = f'{PATH_DATA}preprocessing/'
PATH_SRC = './src/'

In [8]:
# Load the positive and negative training tweets from PATH_DATA.
# small_dataset=1 presumably selects the reduced dataset -- confirm in helpers.
train_pos, train_neg = load_tweets(
    PATH_DATA,
    small_dataset=1,
)

In [9]:
# Everything in this notebook runs on the CPU.
# The helper-based device selection is kept here for reference:
# device = gpu_cpu_setup()
device = "cpu"

In [10]:
# Combine both tweet lists into one frame, then tokenize a 2000-row slice
# (rows 99000..100999) with sequences capped at 40 tokens.
# NOTE(review): the slice presumably straddles the positive/negative boundary
# of the frame so both classes are represented -- confirm against create_input_df.
df = create_input_df(train_pos, train_neg)
input_ids, attention_masks, labels = tokenize_with_autoencoder(
    df[99000:101000],
    max_len=40,
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 5085.34it/s]

encoding done
concatenating





In [11]:
# Full train/validation setup: wrap the tensors in one dataset, split it with
# proportion=0.9, and build a shuffled training loader plus a non-random
# validation loader.
full_dataset = TensorDataset(input_ids, attention_masks, labels)
train_ds, val_ds = train_val_split(full_dataset, proportion=0.9)
train_dataloader = DataLoader(
    train_ds,
    batch_size=32,
    shuffle=True,
)
val_dataloader = as_dataloader(val_ds, random=False)

In [12]:
# Smoke-test setup: same split as above (helper's default proportion), but only
# the first 64 training and first 32 validation samples are used, to check
# quickly that the training function runs end to end.
# Note: this overwrites the loaders built in the previous cell.
full_dataset = TensorDataset(input_ids, attention_masks, labels)
train_ds, val_ds = train_val_split(full_dataset)
train_dataloader = DataLoader(
    Subset(train_ds, np.arange(64)),
    batch_size=32,
    shuffle=True,
)
val_dataloader = as_dataloader(
    Subset(val_ds, np.arange(32)),
    random=False,
)

In [17]:
# # Only train set
# train_dataloader = DataLoader(full_dataset, shuffle=True, batch_size = 16)
# val_dataloader = None

In [17]:
# Build the BERT model with the project's custom classifier
# (nb_hidden=500 presumably sets the hidden width of that classifier -- confirm
# in models_bert) and move it to the selected device.
# Alternative kept for reference -- the stock Hugging Face classifier:
#   BertForSequenceClassification.from_pretrained("bert-base-uncased",
#                                                 num_labels = 2,
#                                                 output_attentions = False,
#                                                 output_hidden_states = False)
model = BertWithCustomClassifier(nb_hidden=500)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertWithCustomClassifier(
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768

In [18]:
epochs = 1
# Total optimisation steps = number of batches times number of epochs.
total_steps = epochs * len(train_dataloader)

# AdamW over all model parameters (also tried: lr = 1e-5).
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-8,
)

# Linear learning-rate schedule whose warm-up covers 10% of the total steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=round(total_steps * 0.10),
    num_training_steps=total_steps,
)

In [19]:
# Run the training helper and keep whatever statistics it returns.
training_stats = train_bert_class_with_params(
    train_dataloader,
    val_dataloader,
    model,
    optimizer,
    scheduler,
    epochs,
    device=device,
    random_seed=42,
    validate=True,
    # Checkpointing / logging options.
    PATH_DATA=PATH_DATA,
    txt_header='BERT_custom_classifier_smallds',
    save_epoch=True,
    save_N_steps=100000,
    step_print=100000,
    # Freezing options -- exact semantics are defined in the training helper
    # (TODO confirm what freez_steps / frozen_epochs control).
    freezing=True,
    freez_steps=100,
    frozen_epochs=1,
)


Training...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.31s/it]


  Train accuracy: 0.5938

  Average training loss: 0.6928
  Training epochtook: 0:00:03

Running Validation...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.17s/it]

  Accuracy: 0.5312
  Validation Loss: 0.6879
  Validation took: 0:00:01

Training complete!
Total training took 0:00:04 (h:mm:ss)





# Submission


In [20]:
# Rebuild the custom-classifier architecture and restore its weights from disk.
# The checkpoint name presumably matches the file written by the training run
# above (txt_header 'BERT_custom_classifier_smallds', first epoch) -- confirm.
# Alternative kept for reference -- the stock Hugging Face classifier:
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
#                                                       num_labels = 2,
#                                                       output_attentions = False,
#                                                       output_hidden_states = False)
model = BertWithCustomClassifier(nb_hidden=500)
model_name = 'BERT/BERT_custom_classifier_smallds_epoch_0'
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved on a GPU machine can still be loaded when running on CPU (and vice versa).
model.load_state_dict(
    torch.load(PATH_DATA + 'models/' + model_name + '.pkl', map_location=device)
)
model.eval()  # inference mode: disables the dropout layers
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertWithCustomClassifier(
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768

In [21]:
# Read the test tweets as a single fixed-width column covering the first 280
# characters of each line, then expose that column under the name 'text'.
df_test = pd.read_fwf(
    PATH_DATA + 'twitter-datasets/test_data.txt',
    header=None,
    names=['Tweet'],
    colspecs=[(0, 280)],
).rename(columns={"Tweet": "text"})

In [22]:
# Tokenize the test tweets with the same maximum length (40) as the training data.
# Note: this rebinds input_ids / attention_masks, replacing the training tensors.
input_ids, attention_masks = tokenize_with_autotokenizer_test(
    df_test,
    max_len=40,
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:02<00:00, 4958.27it/s]


In [23]:
# The test set has no labels; attach dummy labels (all ones) so the dataset has
# the same (input_ids, attention_mask, label) layout as the training data.
# The number of dummy labels is taken from input_ids rather than hard-coded,
# because TensorDataset requires every tensor to share the same first dimension.
test_dataset = TensorDataset(
    input_ids,
    attention_masks,
    torch.ones(input_ids.size(0), dtype=torch.long),
)

In [24]:
# Non-random loader over the test data, in batches of 32.
# NOTE(review): only the first 64 test samples are included here (quick check);
# pass the full test_dataset instead to predict on every test tweet.
test_dataloader = as_dataloader(
    Subset(test_dataset, np.arange(64)),
    batch_size=32,
    random=False,
)

In [25]:
# Run the model over the test loader; the helper returns the predictions and
# the matching ids used for the submission file.
y_pred, ids = make_prediction(
    model,
    test_dataloader,
    device,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.20s/it]


In [26]:
# Print quick diagnostics about the predictions (counts and unique values, as
# shown in the output below) to catch degenerate results before submitting.
pred_sanity_checks(y_pred)

tensor(64)
tensor(0)
tensor(64)
tensor([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])
unique values [-1]


In [27]:
# Write the predictions to <PATH_DATA>submissions/output_<name>.csv.
# NOTE(review): the name says "bigds", while the checkpoint loaded above is the
# "smallds" one -- confirm the name matches the model actually used.
name = 'bert_custom_bigds_2ep_2elr_91eval_40lentest'
create_csv_submission(
    ids,
    y_pred,
    f'{PATH_DATA}submissions/output_{name}.csv',
)