In [None]:
# Mount Google Drive inside the Colab VM (Colab-only step).
from google.colab import drive

MOUNT_POINT = '/content/gdrive'
drive.mount(MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
%%shell
# Install/upgrade the Hugging Face libraries used by the cells below.
pip install -U accelerate
pip install -U transformers
pip install datasets evaluate

In [None]:
import nltk
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import plot_model
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, AutoModelForSequenceClassification, TrainingArguments
import pandas as pd

# Fine-tuning a custom model on BERT

In [None]:
# Read the preprocessed tweets CSV from the mounted Drive and preview the first rows.
tweets_csv = "./gdrive/MyDrive/data/tweets_preprocessed.csv"
df = pd.read_csv(tweets_csv)
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,label
0,cb774db0d1,"i`d have responded, if i were going","I`d have responded, if I were going",neutral,1
1,549e992a42,sooo sad i will miss you here in san diego!!!,Sooo SAD,negative,0
2,088c60f138,my boss is bullying me...,bullying me,negative,0
3,9642c003ef,what interview! leave me alone,leave me alone,negative,0
4,358bd9e861,"sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,0


In [None]:
# Drop rows whose `text` value is missing; `inplace=True` mutates `df` directly
# rather than returning a new frame.
df.dropna(subset=['text'], inplace=True)

In [None]:
# Class balance: how many tweets fall under each sentiment value.
df.sentiment.value_counts()

neutral     11117
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the sentiment strings, then overwrite `label` with the
# resulting integer codes.
lc = LabelEncoder().fit(df['sentiment'])
df['label'] = lc.transform(df['sentiment'])

# Show which sentiment string each integer code (by position) stands for.
print(lc.classes_)
df

['negative' 'neutral' 'positive']


Unnamed: 0,textID,text,selected_text,sentiment,label
0,cb774db0d1,"i`d have responded, if i were going","I`d have responded, if I were going",neutral,1
1,549e992a42,sooo sad i will miss you here in san diego!!!,Sooo SAD,negative,0
2,088c60f138,my boss is bullying me...,bullying me,negative,0
3,9642c003ef,what interview! leave me alone,leave me alone,negative,0
4,358bd9e861,"sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,0
...,...,...,...,...,...
27475,4eac33d1c0,wish we could come see you on denver husband...,d lost,negative,0
27476,4f4c4fc327,i`ve wondered about rake to. the client has ...,", don`t force",negative,0
27477,f67aae2310,yay good for both of you. enjoy the break - y...,Yay good for both of you.,positive,2
27478,ed167662a5,but it was worth it ****.,But it was worth it ****.,positive,2


In [None]:
# Load the tokenizer and the sequence-classification model from the same Hub checkpoint.
checkpoint = "MarieAngeA13/Sentiment-Analysis-BERT"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
trained_model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
)


In [None]:
# Display the loaded model's module tree as the cell output.
trained_model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
def tokenize_func(text):
    """Tokenize the "text" entry of an example (or batch of examples).

    Intended to be passed to `datasets`' `map`; with `batched=True` the
    argument is a dict whose "text" value is a list of strings.

    Args:
        text: Mapping with a "text" key holding the string(s) to tokenize.

    Returns:
        The tokenizer output, with every sequence padded or truncated to
        128 tokens.
    """
    # No `return_tensors`: the model fine-tuned in this notebook is the PyTorch
    # `AutoModelForSequenceClassification`, so building TensorFlow tensors here
    # was an unnecessary conversion (and an unnecessary TensorFlow dependency).
    return tokenizer(
        text["text"],
        padding="max_length",
        max_length=128,
        truncation=True,
    )

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test split; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.3,
    random_state=42,
)

In [None]:
from datasets import Dataset, DatasetDict

# Wrap each split in a `Dataset` with "text" and "label" columns.
ds = DatasetDict()
for split_name, texts, labels in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
):
    split_frame = pd.DataFrame(zip(texts, labels), columns=["text", "label"])
    ds[split_name] = Dataset.from_pandas(split_frame)

# Spot-check one training example.
ds["train"][10]

{'text': "ninja sushi for lunch but dominic's was out of sour gummy worms ",
 'label': 1}

In [None]:
# Apply the tokenizer to both splits, one batch of examples at a time.
ds_t = ds.map(function=tokenize_func, batched=True)

ds_t

Map:   0%|          | 0/19236 [00:00<?, ? examples/s]

Map:   0%|          | 0/8244 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 19236
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8244
    })
})

In [None]:
import numpy as np
# from datasets import load_metric
import evaluate


# Load the metric once, instead of re-loading it on every evaluation pass.
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A `(logits, labels)` pair; it is unpacked into exactly
            those two values.

    Returns:
        The result of the accuracy metric's `compute`, comparing the
        arg-max class of each logits row against `labels`.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)

    return accuracy_metric.compute(predictions=predictions, references=labels)

In [None]:
import inspect

from transformers import TrainingArguments, Trainer

# Newer transformers releases rename `evaluation_strategy` to `eval_strategy`
# and deprecate the old name. Pick whichever keyword the installed version
# accepts, so this cell runs on both old and new releases.
_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Evaluate on the test split at the end of each of the 3 epochs.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=3,
    **{_strategy_kwarg: "epoch"},
)

trainer = Trainer(
    model=trained_model,
    args=training_args,
    train_dataset=ds_t["train"],
    eval_dataset=ds_t["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; the returned training summary is shown as the cell output.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5807,0.564432,0.788088
2,0.3989,0.649336,0.804828
3,0.2374,0.896406,0.803372


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

TrainOutput(global_step=7215, training_loss=0.41005968132204096, metrics={'train_runtime': 1766.677, 'train_samples_per_second': 32.665, 'train_steps_per_second': 4.084, 'total_flos': 3795937277561856.0, 'train_loss': 0.41005968132204096, 'epoch': 3.0})

In [None]:
# Write the fine-tuned model to the mounted Drive.
model_dir = "./gdrive/MyDrive/tweet_classification"
trainer.save_model(model_dir)