In [2]:
%%capture
%pip install transformers
%pip install evaluate

In [3]:
import pandas as pd
import datasets
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer

# Load the dataset
train_df = pd.read_csv('./NLP Dataset/final_train.csv')
test_df = pd.read_csv('./NLP Dataset/final_test.csv')

# Rename the columns to match the expected column names
train_df = train_df.rename(columns={'unprocessed_text': 'text', 'class': 'label'})
test_df = test_df.rename(columns={'unprocessed_text': 'text', 'class': 'label'})

#remove tweeetid
train_df = train_df.drop('tweetid',axis=1,)
train_df = test_df.drop('tweetid',axis=1,)


In [4]:
train_df.head(25)

Unnamed: 0,text,label
0,she sprinkled adderall on her granola with ric...,m
1,vyvanse had me telling my life story to a 50 y...,c
2,@Elianok10 @letthembekids2 @invisable22 @kpana...,c
3,@lisarinna Now that i've got xanax in my smoot...,a
4,adderall had me doing sit ups at midnight,c
5,"i take cymbalta, gabapentin, morphine, flexeri...",c
6,"why does everyone want adderall today man, shi...",m
7,No Mattah what that hoe said or wanna switch u...,m
8,@melistittsss i got you with a xanax if you ne...,a
9,Have blurry vision?You probably have Autophili...,m


In [5]:
# Convert the labels to integers (m=2, c=1, a=0, u=3)
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['label'])
test_df['label'] = label_encoder.transform(test_df['label'])

In [6]:
print(train_df['text'].dtype)     # Check data type of 'text' column in train_df
print(train_df['text'].shape)     # Check shape of 'text' column in train_df
print(test_df['text'].dtype)      # Check data type of 'text' column in test_df
print(test_df['text'].shape)      # Check shape of 'text' column in test_df
print(train_df['text'])
train_df.head(25)

object
(3271,)
object
(3271,)
0       she sprinkled adderall on her granola with ric...
1       vyvanse had me telling my life story to a 50 y...
2       @Elianok10 @letthembekids2 @invisable22 @kpana...
3       @lisarinna Now that i've got xanax in my smoot...
4               adderall had me doing sit ups at midnight
                              ...                        
3266    See that girl who definitely WASN'T smuggling ...
3267    that one time i took adderall and tried to wri...
3268    Here's to me taking penicillin thinking it was...
3269    @manofletterspdx @hotspurjp I bet I'd have a h...
3270    @UKinUSA hi. Can you possibly advise. Can I br...
Name: text, Length: 3271, dtype: object


Unnamed: 0,text,label
0,she sprinkled adderall on her granola with ric...,2
1,vyvanse had me telling my life story to a 50 y...,1
2,@Elianok10 @letthembekids2 @invisable22 @kpana...,1
3,@lisarinna Now that i've got xanax in my smoot...,0
4,adderall had me doing sit ups at midnight,1
5,"i take cymbalta, gabapentin, morphine, flexeri...",1
6,"why does everyone want adderall today man, shi...",2
7,No Mattah what that hoe said or wanna switch u...,2
8,@melistittsss i got you with a xanax if you ne...,0
9,Have blurry vision?You probably have Autophili...,2


In [7]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [8]:
# Tokenize the dataset
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)

tokenized_train_dataset = datasets.Dataset.from_pandas(train_df)
tokenized_train_dataset = tokenized_train_dataset.map(tokenize, batched=True)

tokenized_test_dataset = datasets.Dataset.from_pandas(test_df)
tokenized_test_dataset = tokenized_test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/3271 [00:00<?, ? examples/s]

Map:   0%|          | 0/3271 [00:00<?, ? examples/s]

In [9]:
tokenized_train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 3271
})

In [10]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results2',          # Output directory
    evaluation_strategy = "epoch",   # Evaluation strategy
    # learning_rate=2e-5,              # Learning rate
    # per_device_train_batch_size=16,  # Batch size
    # num_train_epochs=3,              # Number of training epochs
    # weight_decay=0.01,               # Weight decay
    # push_to_hub=False,
)

In [11]:
# Define the trainer
trainer = Trainer(
    model=model,                         # The model to be trained
    args=training_args,                  # Training arguments
    train_dataset=tokenized_train_dataset, # Training dataset
    eval_dataset=tokenized_test_dataset,  # Evaluation dataset
)

In [12]:
# Train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3271
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1227
  Number of trainable parameters = 124648708


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1227 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, tweetid. If text, tweetid are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3271
  Batch size = 8


  0%|          | 0/409 [00:00<?, ?it/s]

{'eval_loss': 0.5760082006454468, 'eval_runtime': 557.5498, 'eval_samples_per_second': 5.867, 'eval_steps_per_second': 0.734, 'epoch': 1.0}


Saving model checkpoint to ./results2/checkpoint-500
Configuration saved in ./results2/checkpoint-500/config.json


{'loss': 0.8375, 'learning_rate': 2.962510187449063e-05, 'epoch': 1.22}


Model weights saved in ./results2/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, tweetid. If text, tweetid are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3271
  Batch size = 8


  0%|          | 0/409 [00:00<?, ?it/s]

{'eval_loss': 0.3344159424304962, 'eval_runtime': 556.1039, 'eval_samples_per_second': 5.882, 'eval_steps_per_second': 0.735, 'epoch': 2.0}


Saving model checkpoint to ./results2/checkpoint-1000
Configuration saved in ./results2/checkpoint-1000/config.json


{'loss': 0.5141, 'learning_rate': 9.250203748981255e-06, 'epoch': 2.44}


Model weights saved in ./results2/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, tweetid. If text, tweetid are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3271
  Batch size = 8


  0%|          | 0/409 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.2156720757484436, 'eval_runtime': 555.4765, 'eval_samples_per_second': 5.889, 'eval_steps_per_second': 0.736, 'epoch': 3.0}
{'train_runtime': 17380.6174, 'train_samples_per_second': 0.565, 'train_steps_per_second': 0.071, 'train_loss': 0.6210866056036541, 'epoch': 3.0}


TrainOutput(global_step=1227, training_loss=0.6210866056036541, metrics={'train_runtime': 17380.6174, 'train_samples_per_second': 0.565, 'train_steps_per_second': 0.071, 'train_loss': 0.6210866056036541, 'epoch': 3.0})

In [13]:
# Evaluate the model
eval_results = trainer.evaluate(tokenized_test_dataset)
print(eval_results)

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, tweetid. If text, tweetid are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3271
  Batch size = 8


  0%|          | 0/409 [00:00<?, ?it/s]

{'eval_loss': 0.2156720757484436, 'eval_runtime': 572.8592, 'eval_samples_per_second': 5.71, 'eval_steps_per_second': 0.714, 'epoch': 3.0}


In [15]:
tokenized_train_dataset['attention_mask'][0][-1]

0

In [16]:
#calculate overall f1 score

import numpy as np

# Make predictions on the test set
predictions, _, _ = trainer.predict(tokenized_test_dataset)

# Convert predicted probabilities to predicted classes
predicted_classes = np.argmax(predictions, axis=1)

# Calculate F1 score
from sklearn.metrics import f1_score
f1 = f1_score(test_df['label'], predicted_classes, average='macro')
print("F1 score:", f1)

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, tweetid. If text, tweetid are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3271
  Batch size = 8


  0%|          | 0/409 [00:00<?, ?it/s]

F1 score: 0.922167434579988


In [17]:
# Calculate F1 score for label 0 (a)
f1 = f1_score(test_df['label'], predicted_classes, labels=[0], average='macro')
print("F1 score for label 0:", f1)

F1 score for label 0: 0.8471794871794872
