<a href="https://colab.research.google.com/github/Am-ShivA/Am-ShivA/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import nlp
import warnings
import tokenizers
import transformers

from sklearn.metrics import accuracy_score

In [8]:
!pip install transformers



In [9]:
!pip install nlp



In [10]:
!pip install tokenizers



In [11]:
# Silence every warning issued through the `warnings` module from here on
# (no category or module filter is given, so nothing is exempt).
warnings.filterwarnings('ignore')

In [12]:
# Load the "train" and "test" splits of the "emo" dataset; passing a list of
# split names yields one dataset object per name, unpacked in the same order.
train, test = nlp.load_dataset("emo", split = ["train", "test"])

Downloading:   0%|          | 0.00/5.41k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

Downloading and preparing dataset emo/emo2019 (download: 3.21 MiB, generated: 2.72 MiB, post-processed: Unknown sizetotal: 5.93 MiB) to /root/.cache/huggingface/datasets/emo/emo2019/1.0.0/5fa43514af79018263c393e3a7d72d7ad03e8c44563e4c27bd3173bf9418b578...


Downloading:   0%|          | 0.00/2.87M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/495k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset emo downloaded and prepared to /root/.cache/huggingface/datasets/emo/emo2019/1.0.0/5fa43514af79018263c393e3a7d72d7ad03e8c44563e4c27bd3173bf9418b578. Subsequent calls will reuse this data.


In [13]:
# Flatten each split into a single space-separated string made of the "text"
# field of every example; these strings become the tokenizer training corpus.
train_text = " ".join(example["text"] for example in train)
test_text = " ".join(example["text"] for example in test)

In [14]:
import os

# Directory that receives the plain-text corpora used to train the tokenizer.
dir_path = '../data'

# exist_ok=True creates the directory when it is missing and is a no-op when it
# already exists, without a separate existence check that could race.
os.makedirs(dir_path, exist_ok=True)

# Write with an explicit UTF-8 encoding: without it open() falls back to the
# platform's locale encoding, which can fail on (or mis-encode) non-ASCII
# characters. NOTE(review): assumes the dataset text may contain such
# characters and that the tokenizer trainer reads these files as UTF-8 - confirm.
with open(os.path.join(dir_path, 'train.txt'), 'w', encoding='utf-8') as f:
    f.write(train_text)
with open(os.path.join(dir_path, 'test.txt'), 'w', encoding='utf-8') as f:
    f.write(test_text)


In [15]:
# Writes the two corpus strings to ../data/train.txt and ../data/test.txt.
# The encoding is set explicitly to UTF-8 so the result does not depend on the
# platform's locale encoding, which can fail on non-ASCII characters.
with open('../data/train.txt', 'w', encoding='utf-8') as f:
    f.write(train_text)
with open('../data/test.txt', 'w', encoding='utf-8') as f:
    f.write(test_text)

In [16]:
# Create a BertWordPieceTokenizer with its default settings (no vocabulary file
# is passed); its vocabulary is learned by the train() call in a later cell.
tokenizer = tokenizers.BertWordPieceTokenizer()

In [17]:
# Requested vocabulary size handed to the tokenizer trainer.
vocab_size = 5000

# Train the tokenizer on both corpus files written earlier.
# NOTE(review): the test split's text is part of the training corpus for the
# vocabulary - confirm that this is intended.
tokenizer.train(
    files=['../data/train.txt', '../data/test.txt'],
    vocab_size=vocab_size,
    min_frequency=50,
)

In [18]:

# Directory that receives the trained tokenizer's vocabulary file.
dir_path = '../tokenizers/emo-mobilebert/'

# exist_ok=True creates the directory when it is missing and is a no-op when it
# already exists, without a separate existence check that could race.
os.makedirs(dir_path, exist_ok=True)

# Save the tokenizer model into that directory; the call is left as the last
# expression so the list of written files is shown as the cell output.
tokenizer.save_model(dir_path)


['../tokenizers/emo-mobilebert/vocab.txt']

In [19]:
# Load a MobileBertTokenizerFast from the directory the trained vocabulary was
# saved to. This rebinds `tokenizer`, replacing the BertWordPieceTokenizer
# object that was used for training.
tokenizer = transformers.MobileBertTokenizerFast.from_pretrained('../tokenizers/emo-mobilebert/')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'MobileBertTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'MobileBertTokenizerFast'.


In [20]:
# Build a MobileBertConfig whose vocab_size equals the number of entries in the
# tokenizer's vocabulary; no other argument is passed, so the remaining
# settings keep the constructor defaults.
config = transformers.MobileBertConfig(vocab_size = len(tokenizer.get_vocab()))

In [21]:
# Number of target classes for the classification model built from this config.
config.num_labels = 4

In [22]:
# Store max_length = 128 on the config.
# NOTE(review): the tokenize() cell passes only padding = True (no max_length or
# truncation), so this value is not applied when the text is tokenized in this
# notebook - confirm what it is meant to control.
config.max_length = 128

In [23]:
# Map every class index in [0, config.num_labels) to the label name that the
# training split's "label" feature reports for it.
id2label = {
    class_idx: train.features["label"].int2str(class_idx)
    for class_idx in range(config.num_labels)
}
id2label

{0: 'others', 1: 'happy', 2: 'sad', 3: 'angry'}

In [24]:
# Attach the index -> label-name mapping to the model configuration.
config.id2label = id2label

In [25]:
# Inverse mapping (label name -> class index), derived from id2label.
config.label2id = {label_name: class_idx for class_idx, label_name in id2label.items()}

In [26]:
# Instantiate the sequence classifier directly from the config object (not via
# from_pretrained), so no checkpoint weights are loaded here.
model = transformers.MobileBertForSequenceClassification(config)

In [27]:
# Display the configuration attached to the model as the cell output.
model.config

MobileBertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_activation": true,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 512,
  "id2label": {
    "0": "others",
    "1": "happy",
    "2": "sad",
    "3": "angry"
  },
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "intra_bottleneck_size": 128,
  "key_query_shared_bottleneck": true,
  "label2id": {
    "angry": 3,
    "happy": 1,
    "others": 0,
    "sad": 2
  },
  "layer_norm_eps": 1e-12,
  "max_length": 128,
  "max_position_embeddings": 512,
  "model_type": "mobilebert",
  "normalization_type": "no_norm",
  "num_attention_heads": 4,
  "num_feedforward_networks": 4,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "transformers_version": "4.34.1",
  "trigram_input": true,
  "true_hidden_size": 128,
  "type_vocab_size": 2,
  "use_bottleneck": true,
  "use_bottleneck_attention": false,
  "vocab_size": 2016
}

In [28]:
def tokenize(batch):
    """Tokenize the ``'text'`` entry of ``batch`` with the notebook-level ``tokenizer``.

    Args:
        batch: Mapping with a ``'text'`` entry holding the texts to encode.

    Returns:
        The object returned by calling ``tokenizer`` on those texts with
        ``padding = True``.
    """
    texts = batch['text']
    return tokenizer(texts, padding = True)

In [29]:
# Tokenize each split with batch_size equal to that split's own length, so the
# whole split is handed to tokenize() as a single batch.
train_dataset = train.map(tokenize, batched = True, batch_size = len(train))
# The batch size is taken from `test` itself (previously len(train)), so the
# single-batch behaviour no longer depends on the test split being the smaller one.
test_dataset = test.map(tokenize, batched = True, batch_size = len(test))

# Return only the model inputs and the label, formatted as torch tensors.
train_dataset.set_format('torch', columns = ['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns = ['input_ids', 'attention_mask', 'label'])

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
pip install dill==0.3.5.1




In [31]:
def compute_metrics(pred):
    """Compute the accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (an array whose last axis is reduced with
            ``argmax`` to obtain the predicted class per example).

    Returns:
        dict: ``{'accuracy': <accuracy_score of labels vs. predicted classes>}``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [32]:
# Training hyper-parameters. Only the arguments listed here are set explicitly;
# everything else keeps the TrainingArguments defaults.
training_args = transformers.TrainingArguments(
    output_dir='./results',            # where training output is written
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',              # where logs are written
)

# Bind the model, the datasets and the metric function into a Trainer:
# train_dataset is used for training, test_dataset for evaluation.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


In [33]:
pip install accelerate -U




In [34]:
# Run training with the model, arguments and train_dataset bound to `trainer`;
# the returned TrainOutput is shown as the cell output.
trainer.train()

Step,Training Loss
500,1.2607
1000,1.0643
1500,0.8262
2000,0.72
2500,0.6639
3000,0.641
3500,0.6017
4000,0.5515
4500,0.5152
5000,0.5222


TrainOutput(global_step=18850, training_loss=0.4833225790575265, metrics={'train_runtime': 5128.9915, 'train_samples_per_second': 58.803, 'train_steps_per_second': 3.675, 'total_flos': 6996866253062400.0, 'train_loss': 0.4833225790575265, 'epoch': 10.0})

In [35]:
# Evaluate on the Trainer's eval_dataset (test_dataset); the returned metrics
# dict, including the accuracy from compute_metrics, is shown as the cell output.
trainer.evaluate()

{'eval_loss': 0.4263741970062256,
 'eval_accuracy': 0.8565982937012162,
 'eval_runtime': 40.5151,
 'eval_samples_per_second': 135.974,
 'eval_steps_per_second': 4.27,
 'epoch': 10.0}

In [36]:
# Save the trainer's model to ../models/emo-mobilebert/ so it can be reloaded
# later with from_pretrained.
trainer.save_model("../models/emo-mobilebert/")

In [37]:
# Save the tokenizer files into the same directory the trained vocabulary was
# written to; the tuple of written paths is shown as the cell output.
tokenizer.save_pretrained("../tokenizers/emo-mobilebert/")

('../tokenizers/emo-mobilebert/tokenizer_config.json',
 '../tokenizers/emo-mobilebert/special_tokens_map.json',
 '../tokenizers/emo-mobilebert/vocab.txt',
 '../tokenizers/emo-mobilebert/added_tokens.json',
 '../tokenizers/emo-mobilebert/tokenizer.json')

In [38]:
# Reload the saved classifier from disk through the Auto* factory.
transformer = transformers.AutoModelForSequenceClassification.from_pretrained("../models/emo-mobilebert/")

In [39]:
# Reload the saved tokenizer from disk through the Auto* factory; this rebinds
# the `tokenizer` name used by the earlier cells.
tokenizer = transformers.AutoTokenizer.from_pretrained("../tokenizers/emo-mobilebert/")

In [40]:
# Wrap the reloaded model and tokenizer in a 'sentiment-analysis' pipeline, then
# classify one sentence; the prediction is shown as the cell output.
nlp_sentence_classif = transformers.pipeline(
    'sentiment-analysis',
    model=transformer,
    tokenizer=tokenizer,
)
nlp_sentence_classif("I've never had such a bad day in my life")

[{'label': 'sad', 'score': 0.9783978462219238}]

In [41]:
# Reuse the `nlp_sentence_classif` pipeline built in the earlier cell instead of
# constructing an identical one again from the same model and tokenizer.
nlp_sentence_classif("i am very happy today, lets party.")

[{'label': 'happy', 'score': 0.9835114479064941}]

In [49]:
# Reuse the `nlp_sentence_classif` pipeline built in the earlier cell instead of
# constructing an identical one again from the same model and tokenizer.
nlp_sentence_classif("The joke was very funny. He was laughing. ")

[{'label': 'happy', 'score': 0.9868102073669434}]