# **Transformers Approach**
*State-of-the-art Machine Learning for PyTorch, TensorFlow and JAX.*
*Transformers provides APIs to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you time from training a model from scratch. The models can be used across different modalities.*

In [None]:
! pip install transformers[sentencepiece] -Uqq

In [None]:
! pip install pandas
! pip install numpy
! pip install matplotlib
! pip install scikit-learn

In [None]:
import numpy as np
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification
from transformers import Trainer
from transformers import TrainingArguments
from transformers import DataCollatorWithPadding

In [None]:
import pandas as pd

*Add the file name of the dataset*

In [None]:
# Placeholder: replace with the path of the CSV to train on. Later cells read
# its 'feedback' (text) and 'labels' columns.
file_name = '<file_name>.csv'

In [None]:
# Load the raw CSV into a DataFrame.
dataset = pd.read_csv(file_name)

In [None]:
import matplotlib.pyplot as plt
# Histogram of the raw 'labels' column, to inspect the class balance.
dataset['labels'].hist()

In [None]:
# Non-null value count per column of the raw data.
dataset.count()

feedback    3737
labels      3737
dtype: int64

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
def preprocess(dataset,data_column):
  """Add a normalised copy of a text column to ``dataset``.

  The new column is named ``f"{data_column}_processed"``. Its text is
  lower-cased, runs of whitespace are collapsed to single spaces, and every
  character that is neither a word character nor whitespace is removed.
  Missing values stay missing instead of raising, so they can be dropped later.

  Args:
    dataset: pandas DataFrame holding ``data_column``; it is modified in place.
    data_column: name of the text column to normalise.

  Returns:
    The same DataFrame object, now carrying the extra processed column.
  """
  processed_column = f"{data_column}_processed"
  # The vectorised .str methods propagate NaN/None rather than failing on them.
  normalised = dataset[data_column].str.lower().str.split().str.join(" ")
  # regex=True is required: without it recent pandas treats the pattern as a
  # literal string and no punctuation would be removed. The raw string keeps
  # the backslashes intact.
  dataset[processed_column] = normalised.str.replace(r'[^\w\s]', '', regex=True)
  return dataset

In [None]:
# Adds the 'feedback_processed' column used by the filtering and tokenization cells.
dataset = preprocess(dataset, 'feedback')

*To remove null rows*

In [None]:
# Discard every row (axis 0) that has a missing value in any column.
dataset = dataset.dropna(axis=0, how='any')

*Keep only reviews that contain more than one space, i.e. at least 3 words*

In [None]:
# Keep rows whose processed text contains more than one space character.
dataset = dataset[dataset['feedback_processed'].str.count(' ').gt(1)]

In [None]:
# Renumber rows 0..n-1 after filtering and discard the old index; the split
# cell below indexes rows using the positions returned by the splitter.
dataset = dataset.reset_index(drop=True)

In [None]:
# Non-null value count per column after cleaning.
dataset.count()

feedback              3273
labels                3273
feedback_processed    3273
dtype: int64

In [None]:
# Histogram of the 'labels' column after cleaning.
dataset['labels'].hist()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer id mapping from the cleaned data, then overwrite
# the 'labels' column with the encoded ids.
label_encoder = LabelEncoder().fit(dataset['labels'])
dataset['labels'] = label_encoder.transform(dataset['labels'])

In [None]:
# Number of non-null encoded labels.
dataset['labels'].count()

3273

In [None]:
import pyarrow as pa
# One stratified 80/20 split; random_state fixes the shuffle so the split is
# reproducible across runs.
s = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields *positional* row indices, so rows are selected with .iloc
# rather than label-based .loc. With n_splits=1 there is exactly one
# (train, test) pair, so it is taken directly with next().
train_index, test_index = next(s.split(dataset, dataset['labels']))
train_set = dataset.iloc[train_index]
test_set = dataset.iloc[test_index]

In [None]:
# Histograms of the train and test splits, to compare their distributions.
train_set.hist()
test_set.hist()

In [None]:
!pip install folium==0.2.1 -Uqq

[?25l[K     |████▊                           | 10 kB 32.5 MB/s eta 0:00:01[K     |█████████▍                      | 20 kB 38.4 MB/s eta 0:00:01[K     |██████████████                  | 30 kB 41.5 MB/s eta 0:00:01[K     |██████████████████▊             | 40 kB 45.5 MB/s eta 0:00:01[K     |███████████████████████▍        | 51 kB 36.7 MB/s eta 0:00:01[K     |████████████████████████████    | 61 kB 40.4 MB/s eta 0:00:01[K     |████████████████████████████████| 69 kB 7.9 MB/s 
[?25h  Building wheel for folium (setup.py) ... [?25l[?25hdone


In [None]:
! pip install datasets -Uqq

[K     |████████████████████████████████| 325 kB 36.6 MB/s 
[K     |████████████████████████████████| 1.1 MB 53.1 MB/s 
[K     |████████████████████████████████| 136 kB 73.8 MB/s 
[K     |████████████████████████████████| 212 kB 67.4 MB/s 
[K     |████████████████████████████████| 127 kB 74.5 MB/s 
[K     |████████████████████████████████| 94 kB 4.6 MB/s 
[K     |████████████████████████████████| 271 kB 68.0 MB/s 
[K     |████████████████████████████████| 144 kB 69.3 MB/s 
[?25h

In [None]:
import datasets
# Wrap each pandas split in an Arrow-backed Dataset, keyed by split name.
d = datasets.DatasetDict({
    split_name: datasets.Dataset(pa.Table.from_pandas(split_frame))
    for split_name, split_frame in (("train", train_set), ("test", test_set))
})


In [None]:
# Reset the output format of every split to the library default.
d.reset_format()

In [None]:
# Name of the pretrained checkpoint; the model cell further down loads the
# same checkpoint so tokenizer and model match.
checkpoint = "distilbert-base-uncased"

# Load the tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
# Collator handed to the Trainer; it pads the tokenized examples of a batch
# using this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``feedback_processed`` entries of ``examples`` with truncation enabled."""
    processed_text = examples["feedback_processed"]
    encoded = tokenizer(processed_text, truncation=True)
    return encoded


# Apply the tokenizer to every split; batched=True passes groups of rows to
# tokenize_function instead of one row at a time.
tokenized_datasets = d.map(tokenize_function, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Display the splits with their columns and row counts.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['feedback', 'labels', 'feedback_processed', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 2618
    })
    test: Dataset({
        features: ['feedback', 'labels', 'feedback_processed', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 655
    })
})

In [None]:
# Drop the text and pandas-index columns so only 'labels', 'input_ids' and
# 'attention_mask' remain, then make indexing return torch tensors.
tokenized_datasets = tokenized_datasets.remove_columns(
    ["feedback", "__index_level_0__", "feedback_processed"]
)
tokenized_datasets.set_format(type="torch")

In [None]:
# Inspect the first training example after formatting.
tokenized_datasets['train'][0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1]),
 'input_ids': tensor([  101, 14044,  2007,  2832,  1998,  3274,  2147,   102]),
 'labels': tensor(2)}

In [None]:
# The "train" split is used for training and the "test" split for evaluation.
full_train_dataset, full_eval_dataset = (
    tokenized_datasets["train"],
    tokenized_datasets["test"],
)


In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score
from datasets import load_metric

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer.

    Args:
        eval_pred: pair ``(logits, labels)``. The predicted class of each row
            is the argmax of ``logits`` over the last axis.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of predictions
        equal to the reference labels.
    """
    # Accuracy is computed locally with NumPy rather than via
    # ``datasets.load_metric``: that helper is deprecated/removed in newer
    # ``datasets`` releases and needs a network download of the metric script.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

In [None]:
# Size the classification head from the number of distinct encoded labels
# instead of hard-coding the class count, so the head always matches the data.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=int(dataset['labels'].nunique())
)

# Hyper parameters for training
*If a hyperparameter is not provided, its default value is used.*
[More info](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments)

1. learning_rate
2. num_train_epochs
3. per_device_train_batch_size
4. seed
5. optim="adamw_torch" (default optimizer "adamw_hf")





In [None]:
# Training configuration; any argument not set here keeps its
# TrainingArguments default.
training_args = TrainingArguments(
    output_dir="test_trainer",
    seed=36,
    optim="adamw_torch",
    per_device_train_batch_size=8,
    num_train_epochs=5,
    learning_rate=5.691013656357132e-06,
)

*The Trainer class provides an API for feature-complete training in PyTorch for most standard use cases [more info](https://huggingface.co/docs/transformers/main_classes/trainer)*

In [None]:
# Assemble the Trainer from the model, its configuration, the tokenized
# splits, the padding collator and the accuracy callback.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=full_train_dataset,
    eval_dataset=full_eval_dataset,
    compute_metrics=compute_metrics,
)
# The trailing semicolon suppresses the cell's output repr.
trainer.train();

*Runs the evaluation against the test set*

In [None]:
# Evaluate on the eval_dataset given to the Trainer (the "test" split).
trainer.evaluate()

*Save the trained model to the Trainer's output directory (`test_trainer`)*

In [None]:
# No path is given, so the Trainer saves to its configured output directory.
trainer.save_model()

*Load the trained model from the `test_trainer` directory into an inference pipeline*

In [None]:
from transformers import pipeline 
# Build an inference pipeline from the model saved in the 'test_trainer' directory.
classifier = pipeline(task='sentiment-analysis', model='test_trainer')

In [None]:
import string
def preprocess_incoming(str_data):
  """Normalise one incoming text before it is classified.

  Mirrors the steps of ``preprocess``: lower-case the text, collapse runs of
  whitespace to single spaces, then remove every character that is neither a
  word character nor whitespace. Using the same rule for training and
  inference avoids a mismatch on non-ASCII punctuation and underscores.

  Args:
    str_data: the raw text (``str``).

  Returns:
    The normalised text.
  """
  import re  # local import keeps this helper self-contained

  collapsed = " ".join(str_data.lower().split())
  return re.sub(r'[^\w\s]', '', collapsed)

In [None]:
def predict(string):
  """Classify one feedback text and return its integer class id.

  The text is first normalised with ``preprocess_incoming`` and the normalised
  text is what gets passed to the ``classifier`` pipeline, so inference sees
  the same kind of input the model was trained on.

  Args:
    string: raw feedback text.

  Returns:
    0, 1 or 2 for the pipeline labels ``LABEL_0`` / ``LABEL_1`` / ``LABEL_2``;
    ``None`` if the pipeline reports any other label.
  """
  label_to_id = {
      "LABEL_0": 0,  # NEGATIVE
      "LABEL_1": 1,  # NEUTRAL
      "LABEL_2": 2,  # POSITIVE
  }
  data = preprocess_incoming(string)
  prediction = classifier(data)
  # The pipeline returns a list with one result dict per input text.
  return label_to_id.get(prediction[0]['label'])
  

*Loading a validation dataset*

In [None]:
# Separate labelled CSV used only for the accuracy check below; later cells
# read its 'feedback' and 'labels' columns.
VALIDATION_SET = pd.read_csv("First dataset.csv")

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the validation labels with the encoder that was fitted on the
# training labels, so every class maps to the same integer id the model was
# trained with. Re-fitting a new encoder here would derive the ids from the
# validation labels alone, which can differ (e.g. when a class is absent).
# transform() raises ValueError if the validation data has an unseen label.
VALIDATION_SET['labels'] = label_encoder.transform(VALIDATION_SET['labels'])
y_valid = VALIDATION_SET['labels'].tolist()

In [None]:
# Predicted class id for every validation text, in row order.
y_pred = [predict(feedback_text) for feedback_text in VALIDATION_SET['feedback']]

*Validation accuracy*

In [None]:
# Fraction of validation rows whose predicted id equals the encoded label.
accuracy_score(y_valid, y_pred)

0.8606965174129353

In [None]:
# Quick check of the pipeline on three hand-written examples.
classifier(["good experience", "bad experience", "normal"])