In [90]:
import pandas as pd
from datasets import load_dataset
import re
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import DistilBertTokenizer
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification
from transformers import EarlyStoppingCallback

In [92]:
dataset = load_dataset("fancyzhx/yelp_polarity")

In [94]:
train_df = pd.DataFrame(dataset['train'])
test_df = pd.DataFrame(dataset['test'])

In [96]:
train_df.head()

Unnamed: 0,text,label
0,"Unfortunately, the frustration of being Dr. Go...",0
1,Been going to Dr. Goldberg for over 10 years. ...,1
2,I don't know what Dr. Goldberg was like before...,0
3,I'm writing this review to give you a heads up...,0
4,All the food is great here. But the best thing...,1


In [98]:
test_df.head()

Unnamed: 0,text,label
0,"Contrary to other reviews, I have zero complai...",1
1,Last summer I had an appointment to get new ti...,0
2,"Friendly staff, same starbucks fair you get an...",1
3,The food is good. Unfortunately the service is...,0
4,Even when we didn't have a car Filene's Baseme...,1


In [100]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560000 entries, 0 to 559999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    560000 non-null  object
 1   label   560000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 8.5+ MB


In [102]:
train_df.isnull().sum()

text     0
label    0
dtype: int64

In [104]:
train_df.sample(5)['text'].values

array(["Happy hour from 3-6, wish I would've known!",
       "I wanted to like Uni's.  The staff is friendly and the restaurant is clean.  The original order from the Sushi selection form was lost so we were asked to complete another one.   A number of items were ordered for the four of us and the food was fair.  The order was mixed up though and food we didn't order was presented to us and other items we did order never showed up.  One meal in particular (a sushi/sashimi combo) took an hour to arrive.  The Manager was apologetic and removed that meal from the bill.  I'll give them one more try and I hope they get their act together",
       'Great car collection but they refused to take their free coupon online without it being printed out which is absurd, a waste of money and trees.  Further the Imperial Palace is truly a dump, it should be torn down.  It smells like sewage and the elevators do not work well.',
       "1. Poor Server\\n2. charged for extra items\\n3. server delivered

In [106]:
def clean_text(text):
    text = text.replace('\\n', ' ') 
    text = text.replace('\n', ' ') 
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('\\"', '"')
    text = text.strip()     
    return text
train_df['text'] = train_df['text'].apply(clean_text)
test_df['text'] = test_df['text'].apply(clean_text)

In [108]:
train_df.duplicated(subset='text').sum()

49

In [110]:
train_df = train_df.drop_duplicates(subset='text').reset_index(drop=True)

In [112]:
train_df.duplicated(subset='text').sum()

0

In [114]:
test_df.duplicated(subset='text').sum()

0

In [116]:
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

In [118]:

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = dataset['test']

In [120]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
def tokenize_function(example):
    return tokenizer(example['text'], padding='max_length', truncation=True, max_length=256)

In [122]:
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/503955 [00:00<?, ? examples/s]

Map:   0%|          | 0/55996 [00:00<?, ? examples/s]

In [55]:
tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_val = tokenized_val.rename_column("label", "labels")
tokenized_test = tokenized_test.rename_column("label", "labels")

In [56]:
tokenized_train

Dataset({
    features: ['text', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 503955
})

In [57]:
tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_val = tokenized_val.remove_columns(["text"])
tokenized_test = tokenized_test.remove_columns(["text"])

In [58]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
!pip install -U transformers




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [60]:
import transformers
print(transformers.__version__)
print(transformers.TrainingArguments.__init__.__code__.co_varnames)

4.51.3
('self', 'output_dir', 'overwrite_output_dir', 'do_train', 'do_eval', 'do_predict', 'eval_strategy', 'prediction_loss_only', 'per_device_train_batch_size', 'per_device_eval_batch_size', 'per_gpu_train_batch_size', 'per_gpu_eval_batch_size', 'gradient_accumulation_steps', 'eval_accumulation_steps', 'eval_delay', 'torch_empty_cache_steps', 'learning_rate', 'weight_decay', 'adam_beta1', 'adam_beta2', 'adam_epsilon', 'max_grad_norm', 'num_train_epochs', 'max_steps', 'lr_scheduler_type', 'lr_scheduler_kwargs', 'warmup_ratio', 'warmup_steps', 'log_level', 'log_level_replica', 'log_on_each_node', 'logging_dir', 'logging_strategy', 'logging_first_step', 'logging_steps', 'logging_nan_inf_filter', 'save_strategy', 'save_steps', 'save_total_limit', 'save_safetensors', 'save_on_each_node', 'save_only_model', 'restore_callback_states_from_checkpoint', 'no_cuda', 'use_cpu', 'use_mps_device', 'seed', 'data_seed', 'jit_mode_eval', 'use_ipex', 'bf16', 'fp16', 'fp16_opt_level', 'half_precision_ba

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy="epoch", 
    save_strategy="epoch",  
    logging_strategy="epoch",
    logging_dir='./logs',
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    disable_tqdm=False, 
    report_to="none"
)

In [62]:
early_stopping = EarlyStoppingCallback(early_stopping_patience=2) 

In [None]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    
    acc = accuracy.compute(predictions=predictions, references=labels)
    f1_score = f1.compute(predictions=predictions, references=labels, average='weighted')
    precision_score = precision.compute(predictions=predictions, references=labels, average='weighted')
    recall_score = recall.compute(predictions=predictions, references=labels, average='weighted')
    
    return {
        'accuracy': acc['accuracy'],
        'f1': f1_score['f1'],
        'precision': precision_score['precision'],
        'recall': recall_score['recall']
    }


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

**Training**

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset.select(range(5000)),
    eval_dataset=test_dataset.select(range(1000)),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.19919,0.928,0.927967,0.92812,0.928
2,0.243400,0.229946,0.929,0.929017,0.929185,0.929
3,0.243400,0.286207,0.935,0.934941,0.935425,0.935
4,0.072000,0.331801,0.944,0.943974,0.944138,0.944
5,0.017400,0.352874,0.941,0.941002,0.941006,0.941
6,0.017400,0.364365,0.941,0.941006,0.941029,0.941


TrainOutput(global_step=1878, training_loss=0.08946032150865743, metrics={'train_runtime': 762.857, 'train_samples_per_second': 39.326, 'train_steps_per_second': 2.462, 'total_flos': 3946665830400000.0, 'train_loss': 0.08946032150865743, 'epoch': 6.0})