In [1]:
!pip install transformers




In [2]:
!pip install accelerate -U



In [21]:
import pandas as pd
import numpy as np
from transformers import TFAutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder

In [4]:
from google.colab import drive

# This will prompt you to sign in with your Google account and generate an authentication code.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
df = pd.read_csv('/content/drive/MyDrive/Dataset/CleanedEcommerce.csv')

In [6]:
text = 'desc'
label = 'label'
num_classes = df[label].nunique()

In [7]:
df.head()

Unnamed: 0,label,desc
0,Household,paper plane design frame wall hang motiv offic...
1,Household,saf frame paint wood 30 inch x 10 inch special...
2,Household,saf textur modern art print frame paint synthe...
3,Household,saf flower print frame paint synthet 13 5 inch...
4,Household,incred gift india wooden happi birthday uniqu ...


In [8]:
df.isnull().sum()

label    0
desc     1
dtype: int64

In [9]:
df= df.dropna()

In [10]:
from transformers import BertTokenizer, BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=num_classes)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:

model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [12]:
model = model.to('cuda')

In [13]:
label_encoder = LabelEncoder()

df[label] = label_encoder.fit_transform(df[label])

In [14]:
max_length = 128

X = list(df[text])
y = list(df[label])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,stratify=y)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=max_length)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=max_length)

In [15]:
X_train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [16]:
print(X_train_tokenized['attention_mask'][0])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [17]:
len(X_train),len(X_val)

(22240, 5561)

In [18]:

# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [19]:
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [20]:
train_dataset[5]

{'input_ids': tensor([  101,  7861, 22083,  5332, 18676,  3886,  3614,  2275,  2461,  2275,
          1018,  2235,  2321,  4642,  1016,  7473,  2502,  2322,  4642,  1016,
          7473,  7861, 22083,  5332, 13012, 19510,  7318, 13523,  2191,  2152,
         24209, 11475,  3775, 18676,  3886,  7812,  2173,  2980,  5383, 21628,
          2140,  3614,  2980,  5383,  5477,  8490, 14175,  6305, 11586, 21628,
          2140, 14175,  6305,  2173,  2980,  5383,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [22]:
def compute_metrics(p):
    print(type(p))
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred,average='weighted')
    precision = precision_score(y_true=labels, y_pred=pred,average='weighted')
    f1 = f1_score(y_true=labels, y_pred=pred,average='weighted')
    class_report = classification_report(y_true =labels, y_pred = pred)

    return {"Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1, "Classification Report :\n" :class_report}

In [48]:
# Define Trainer
args = TrainingArguments(
    output_dir="/content/Output",
    num_train_epochs=3,
    per_device_train_batch_size=64,

)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,

)

In [49]:
trainer.train()

Step,Training Loss
500,0.2021
1000,0.0953


TrainOutput(global_step=1044, training_loss=0.14517580732075192, metrics={'train_runtime': 1384.2366, 'train_samples_per_second': 48.2, 'train_steps_per_second': 0.754, 'total_flos': 4388771212001280.0, 'train_loss': 0.14517580732075192, 'epoch': 3.0})

In [50]:
trainer.evaluate()

Trainer is attempting to log a value of "              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1251
           1       0.98      0.98      0.98      1135
           2       0.95      0.94      0.95      1062
           3       0.96      0.96      0.96      2113

    accuracy                           0.96      5561
   macro avg       0.96      0.96      0.96      5561
weighted avg       0.96      0.96      0.96      5561
" of type <class 'str'> for key "eval/Classification Report :
" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 0.15617042779922485,
 'eval_Accuracy': 0.9611580650962057,
 'eval_Precision': 0.9611657486265676,
 'eval_Recall': 0.9611580650962057,
 'eval_F1 Score': 0.9611597276903546,
 'eval_Classification Report :\n': '              precision    recall  f1-score   support\n\n           0       0.96      0.96      0.96      1251\n           1       0.98      0.98      0.98      1135\n           2       0.95      0.94      0.95      1062\n           3       0.96      0.96      0.96      2113\n\n    accuracy                           0.96      5561\n   macro avg       0.96      0.96      0.96      5561\nweighted avg       0.96      0.96      0.96      5561\n',
 'eval_runtime': 42.3256,
 'eval_samples_per_second': 131.386,
 'eval_steps_per_second': 16.444,
 'epoch': 3.0}