In [16]:
# follow the tutorial in the hugging face textbook for classification,
# then turn the process of tokenization, training and testing into a function,
# such that the only variable that needs to be passed is the dataset  


import torch
import wget
import os

import pandas as pd
import zipfile

from datasets import load_dataset



#### Get Download Data

In [14]:
emotions = load_dataset("emotion")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

### Parse Downloaded Data

In [7]:
df = pd.read_csv("./cola_public/raw/in_domain_train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

# Display 10 random rows from the data.
df.sample(10)

Number of training sentences: 8,551



Unnamed: 0,sentence_source,label,label_notes,sentence
4690,ks08,0,*,Loren was relied on by Pavarotti and Hepburn o...
3624,ks08,1,,John looked up the inside of the chimney.
7749,ad03,1,,Who is it obvious that Plato loves.
248,cj99,0,*,"He gets angry, the longer John has to wait."
909,bc01,0,*,"My uncle didn't buy anything for Christmas, bu..."
2515,l-93,0,*,Sharon brought Willa breathless.
944,bc01,1,,We like our friends and they do too.
1552,r-67,1,,"Tom washed, and Dick waxed, and Harry polished..."
5758,c_13,1,,The extremely tired students hated syntactic t...
7587,sks13,0,*,What they saw is Bill leave.


In [None]:
sentences = df.sentence.values
labels = df.label.values

from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Print the original sentence.
print(' Original: ', sentences[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to(device)



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
text = "Tokenizing text is a core task of NLP"
encoded_text = tokenizer(text)

tokenizer.decode(encoded_text['input_ids']) , tokenizer.convert_ids_to_tokens(encoded_text['input_ids'])

('[CLS] tokenizing text is a core task of nlp [SEP]',
 ['[CLS]',
  'token',
  '##izing',
  'text',
  'is',
  'a',
  'core',
  'task',
  'of',
  'nl',
  '##p',
  '[SEP]'])

In [12]:
tokenizer.model_max_length

512

In [None]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

def tokenize2(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)



In [15]:
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [17]:
emotions.set_format(type="pandas")

In [19]:
def label_int2str(row):
    return emotions["train"].features["label"].int2str(row)

df["label_name"] = df["label"].apply(label_int2str)
df.head()

Unnamed: 0,sentence_source,label,label_notes,sentence,label_name
0,gj04,1,,"Our friends won't buy this analysis, let alone...",joy
1,gj04,1,,One more pseudo generalization and I'm giving up.,joy
2,gj04,1,,One more pseudo generalization or I'm giving up.,joy
3,gj04,1,,"The more we study verbs, the crazier they get.",joy
4,gj04,1,,Day by day the facts are getting murkier.,joy


In [21]:
# create a whole function/tab that takes in a dataset and model_checkpoint, and returns predictions.
# This way i parttion the proces and thus isolate the problem if it does exist with the data.
# I expect the example i have like the tutorial to have a 80% to 90% accuracy

# skip straight to using the Trainer API.
