In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory use).
!nvidia-smi

Wed Apr 28 15:08:36 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    30W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Setup

We'll need [the Transformers library](https://huggingface.co/transformers/) by Hugging Face:

In [None]:
# watermark provides the %watermark magic used below to print package versions.
!pip install -q -U watermark

In [None]:
# Hugging Face Transformers: source of the BERT model and tokenizer imported below.
!pip install -qq transformers

In [None]:
# (Re)load the watermark extension, then print the Python/IPython versions (-v)
# and the installed versions of the listed packages (-p).
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

Python implementation: CPython
Python version       : 3.7.10
IPython version      : 5.5.0

numpy       : 1.19.5
pandas      : 1.1.5
torch       : 1.8.1+cu101
transformers: 4.5.1



In [None]:
#@title Setup & Config
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Global seaborn look: white grid and larger fonts. The 'muted' palette chosen
# here is replaced by the custom palette set a few lines below.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

# Custom six-colour palette (hex codes) applied to subsequent plots.
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

# Default matplotlib figure size as (width, height).
rcParams['figure.figsize'] = 12, 8

# Seed NumPy's and PyTorch's global random generators with one shared value.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA GPU when available, otherwise the CPU. The bare `device`
# expression on the last line makes the choice the cell's displayed output.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

## Data Exploration

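We'll load our data from Google Drive: the cells below mount the drive and check that the community-labels folder is present (the preprocessed tweets themselves are read in the *Load data* section).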

In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive; force_remount=True asks for a fresh
# mount even if the drive is already mounted in this session.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# List the community-labels folder to confirm the mounted drive is readable.
ls "/content/gdrive/My Drive/SNLP project/data/community_labels"

community_labels.csv


In [None]:
# jsonlines is the only new package here; watermark and transformers were
# already installed in the Setup section, so the last two lines are repeats.
!pip install jsonlines
!pip install -q -U watermark
!pip install -qq transformers



In [None]:
import csv
import time
import jsonlines
import pandas as pd
import numpy as np
import transformers
import torch

In [None]:
# Use the first CUDA GPU when one is available and fall back to the CPU
# otherwise, so later `.to(device)` calls do not fail on a runtime without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## Load data

In [None]:
# Project root on the mounted Google Drive and, relative to it, the folder that
# holds the preprocessed tweets. Both end with "/" so they can be concatenated.
PROJECT_PATH = "/content/gdrive/MyDrive/SNLP project/"
DATA_FOLDER_PATH = 'data/preprocessed_tweets/'

# Assemble the full CSV path first, then read it into a DataFrame.
sampled_tweets_csv = ''.join([PROJECT_PATH, DATA_FOLDER_PATH, 'df_clean_sampled_binary.csv'])
df_clean_sample = pd.read_csv(sampled_tweets_csv)

In [None]:
# Hyperparameters shared by the cells below.
LR = 2e-5 # Learning rate
EPOCHS = 4  # number of training epochs (presumably for a training loop not shown here -- confirm)
DROPOUT = 0.3  # dropout probability
N_CLUSTERS = 2  # number of target clusters, i.e. output classes
BATCH_SIZE = 32  # examples per batch
MAX_SEQ_LEN = 74  # maximum token length passed to the tokenizer for each tweet
WARMUP_STEPS = 0  # presumably warm-up steps for a learning-rate scheduler -- confirm

# Name of the pretrained BERT checkpoint; the tokenizer is loaded from the same
# name so that it matches the model built later from this constant.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
tokenizer = transformers.BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

## Preprocess tweet content

## Grid search


In [None]:
# scikit-learn is pinned to 0.24.1; skorch is installed unpinned, so its version
# is whatever pip resolves at install time.
!pip install scikit-learn==0.24.1
!pip install skorch



In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import BertConfig, BertModel

class SentimentClassifier(nn.Module):
  """Pretrained BERT encoder followed by dropout and a linear output layer."""

  def __init__(self, n_classes, dropout):
    """Build the network.

    Args:
      n_classes: size of the output layer (one logit per class).
      dropout: dropout probability applied before the output layer.
    """
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=dropout)
    hidden_size = self.bert.config.hidden_size
    self.out = nn.Linear(hidden_size, n_classes)

  def forward(self, input_ids, attention_mask, **kwargs):
    """Return the output-layer logits for a batch.

    Any extra keyword arguments are accepted and ignored.
    """
    bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Element 1 of the BERT output is used as the pooled sequence representation.
    pooled = bert_outputs[1]
    dropped = self.drop(pooled)
    logits = self.out(dropped)
    return logits

class TwitterClimateSkorchDataset(Dataset):
  """Dataset yielding one ``(input_ids, attention_mask, cluster)`` tuple per tweet.

  Items are plain tuples so that a single component can be selected by
  position (e.g. with skorch's ``SliceDataset(ds, idx=k)``).
  """

  def __init__(self, data, tokenizer, max_len):
    """Store the tweets, labels and tokenizer settings.

    Args:
      data: table with a 'content' column (tweet text) and a 'Cluster' column
        (label); both are read through ``.values``.
      tokenizer: object exposing ``encode_plus`` (e.g. a BertTokenizer).
      max_len: length every encoded tweet is padded or truncated to.
    """
    self.tweets = data['content'].values
    self.clusters = data['Cluster'].values
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of tweets."""
    return len(self.tweets)

  def __getitem__(self, item):
    """Tokenize tweet ``item`` and return ``(input_ids, attention_mask, cluster)``."""
    tweet = str(self.tweets[item])  # str() guards against non-string cells
    cluster = self.clusters[item]

    # padding='max_length' replaces the deprecated pad_to_max_length=True, and
    # truncation=True makes the truncation to max_length explicit instead of
    # relying on the tokenizer's warned-about implicit default.
    encoding = self.tokenizer.encode_plus(
      tweet,
      add_special_tokens=True,
      max_length=self.max_len,
      padding='max_length',
      truncation=True,
      return_token_type_ids=False,
      return_attention_mask=True,
      return_tensors='pt',
    )

    # The tokenizer returns tensors with a leading batch dimension of 1;
    # flatten() drops it. Tensors are placed on the notebook-level `device`.
    input_ids = encoding['input_ids'].flatten().to(device)
    attention_mask = encoding['attention_mask'].flatten().to(device)
    clusters = torch.tensor(cluster, dtype=torch.long, device=device)

    return input_ids, attention_mask, clusters

In [None]:
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingGridSearchCV, GridSearchCV
#from skorch.classifier import NeuralNetClassifier
from skorch.helper import SliceDict, SliceDataset
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# skorch & sklearn need the dataset wrapped in SliceDataset for GridSearchCV:
# each SliceDataset exposes one position of the (input_ids, attention_mask,
# cluster) tuples returned by the dataset.
ds = TwitterClimateSkorchDataset(df_clean_sample, tokenizer, MAX_SEQ_LEN)
X0 = SliceDataset(ds, idx=0)
X1 = SliceDataset(ds, idx=1)
X_train = SliceDict(input_ids=X0, attention_mask=X1)

# Build the label tensor straight from the labels stored on the dataset.
# Iterating SliceDataset(ds, idx=2) would tokenize every tweet only to read its
# label. The labels are kept in CPU memory, as before.
y_train = torch.tensor(ds.clusters.tolist(), dtype=torch.long, device="cpu")

print(X_train.shape)
print(y_train.shape)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


(49598,)
torch.Size([49598])


In [None]:
from torch.optim import AdamW
from skorch.classifier import NeuralNetClassifier, NeuralNetBinaryClassifier

# Note: when defining arguments below, the syntax [object_name]__[argument]=[value]
# defines the parameter for the given object
# E.g. SentimentClassifier is given module__n_classes as argument

net = NeuralNetClassifier(
    module=SentimentClassifier,
    optimizer=AdamW,
    criterion=nn.CrossEntropyLoss,
    max_epochs=3,
    verbose=1,
    batch_size=BATCH_SIZE,
    train_split=None, # sklearn grid search already does this
    device=device, # same device the dataset places its tensors on
    # optimizer args
    optimizer__lr=LR,
    # module args, taken from the shared config constants; dropout, lr and
    # batch size are overridden per candidate by the grid below
    module__n_classes=N_CLUSTERS,
    module__dropout=DROPOUT,
)

# here we define parameters to perform Grid Search on
params = {
    'module__dropout': [0.1, 0.2, 0.3, 0.4, 0.5],
    'optimizer__lr': [2e-5, 1e-5],
    'batch_size': [32, 64]
}

# refit: Refit an estimator using the best found parameters on the whole dataset
# cv: Controls number of folds with StratifiedKFold
# scoring: for classification we use accuracy, or balanced_accuracy if dealing with imbalanced dataset

# to get an estimate of cell runtime for GridSearchCV, calculate
# len(params_1) x ... x len(params_n) x cv x max_epochs x one_epoch_runtime
# with HalvingGridSearchCV it's not so linear because epoch runtimes change
# as the number of samples grows from one iteration to the next

gs = HalvingGridSearchCV(
    net,
    params,
    refit=False, # no need to refit using the best parameters for now
    scoring='accuracy', # switch to balanced_accuracy with uneven datasets
    min_resources=1000, # resources to use in iter=0 of halving grid search
    factor=2,
    max_resources=8000, # max resources to use in iter=n of halving grid search
    cv=2,
    verbose=2
    )

gs.fit(X_train, y_train)

best_score, best_params = gs.best_score_, gs.best_params_

print(best_score)
print(best_params)

n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 1000
max_resources_: 8000
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 20
n_resources: 1000
Fitting 2 folds for each of 20 candidates, totalling 40 fits
  epoch    train_loss     dur
-------  ------------  ------
      1        [36m0.7100[0m  5.2408
      2        [36m0.6617[0m  5.2323
      3        [36m0.5844[0m  5.1951
[CV] END batch_size=32, module__dropout=0.1, optimizer__lr=2e-05; total time=  21.3s
  epoch    train_loss     dur
-------  ------------  ------
      1        [36m0.7136[0m  5.2414
      2        [36m0.6618[0m  5.2449
      3        [36m0.5463[0m  5.2392
[CV] END batch_size=32, module__dropout=0.1, optimizer__lr=2e-05; total time=  21.3s
  epoch    train_loss     dur
-------  ------------  ------
      1        [36m0.7101[0m  5.1995
      2        [36m0.6943[0m  5.2087
      3        [36m0.6412[0m  5.2112
[CV] END batch_size=32, module__drop