In [4]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

RANDOM_SEED = 69

In [5]:
# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [6]:
# Check distribution of dataset

data = pd.read_csv('finetuning_dataset.csv')
twitter_data = data[data['source'] == 'Twitter'].sample(frac=0.3, random_state=RANDOM_SEED)
other_data = data[data['source'] != 'Twitter']
sampled_data = pd.concat([twitter_data, other_data])

print(sampled_data['source'].value_counts().sum())

387528


In [7]:
# Perform train-test split
train_df, eval_df = train_test_split(
    sampled_data,
    test_size=0.1,
    stratify=sampled_data['source'],  # Preserve source distribution
    random_state=RANDOM_SEED
)

train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# Inspect the splits
print(f"Training size: {len(train_dataset)}, Evaluation size: {len(eval_dataset)}")

Training size: 348775, Evaluation size: 38753


In [8]:
# Load Tokenizer and Model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Tokenize Dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

tokenized_train = tokenized_train.map(lambda examples: {'labels': torch.tensor(examples['polarity'], dtype=torch.long)})
tokenized_eval = tokenized_eval.map(lambda examples: {'labels': torch.tensor(examples['polarity'], dtype=torch.long)})

Map:   0%|          | 0/348775 [00:00<?, ? examples/s]

Map:   0%|          | 0/38753 [00:00<?, ? examples/s]

Map:   0%|          | 0/348775 [00:00<?, ? examples/s]

Map:   0%|          | 0/38753 [00:00<?, ? examples/s]

In [10]:
tokenized_train

Dataset({
    features: ['text', 'polarity', 'source', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 348775
})

In [11]:
tokenized_eval

Dataset({
    features: ['text', 'polarity', 'source', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 38753
})

In [12]:
# Inspect the First Row of Tokenized Train Dataset
row = tokenized_train[0]

# Print `input_ids`
print("Input IDs:", row['input_ids'])

# Decode Back to Text (Optional)
decoded_text = tokenizer.decode(row['input_ids'], skip_special_tokens=True)
print("Decoded Text:", decoded_text)

Input IDs: [101, 1996, 5790, 1045, 2435, 2323, 2941, 2022, 1037, 5717, 2349, 2000, 3532, 8013, 2326, 1012, 2044, 8110, 2041, 1996, 2433, 2006, 1996, 4037, 2445, 2006, 2026, 3025, 3319, 2035, 1045, 2363, 2001, 1037, 7514, 2011, 1000, 6207, 1000, 1999, 8013, 2326, 4129, 2033, 2000, 2655, 1012, 1045, 1005, 2310, 2525, 5287, 2055, 1996, 3573, 1012, 2023, 3006, 2006, 10688, 27136, 2015, 1010, 6719, 1998, 29407, 2135, 1012, 8013, 2326, 2005, 2023, 2194, 2003, 2512, 4839, 4630, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Decoded Text: the rating i gave should actually be a zero due to poor customer service. after filling out the form on the website given on my previous review all i received was a reply by " apple " in customer service telling me to call. i've already spoken about the store. this market on speedway stinks, literally and figuratively. customer service for this compa

In [13]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [14]:
batch_size = 16
epochs = 3
total_steps = (len(tokenized_train) // batch_size) * epochs

In [15]:
training_args = TrainingArguments(
    output_dir="./basemodel_results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=epochs,
    weight_decay=0.01,
    warmup_steps=int(0.1 * total_steps),
    save_strategy="epoch",
    save_total_limit=10,
    logging_dir="./logs",
    report_to="none",
    logging_steps=500,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

trainer.evaluate()

model.save_pretrained('./basemodel_sentiment_model')
tokenizer.save_pretrained('./basemodel_sentiment_model')

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2339,0.272031,0.892731,0.896378,0.861565,0.934123
2,0.1801,0.263124,0.905478,0.901794,0.931694,0.873753
3,0.1468,0.343867,0.909246,0.908481,0.910067,0.906899


('./basemodel_sentiment_model/tokenizer_config.json',
 './basemodel_sentiment_model/special_tokens_map.json',
 './basemodel_sentiment_model/vocab.txt',
 './basemodel_sentiment_model/added_tokens.json')

In [17]:
trainer.predict(tokenized_eval)

PredictionOutput(predictions=array([[ 3.6200027 , -3.0432713 ],
       [ 0.25838566, -0.622732  ],
       [ 3.6041794 , -2.993711  ],
       ...,
       [ 3.0676858 , -2.0811012 ],
       [ 2.5497437 , -1.5759307 ],
       [ 3.6805344 , -3.3739052 ]], dtype=float32), label_ids=array([0, 1, 0, ..., 0, 0, 0]), metrics={'test_loss': 0.2631244361400604, 'test_accuracy': 0.9054782855520863, 'test_f1': 0.9017936137698062, 'test_precision': 0.9316935349842114, 'test_recall': 0.8737531172069826, 'test_runtime': 77.7548, 'test_samples_per_second': 498.4, 'test_steps_per_second': 31.162})

In [18]:
!zip -r basemodel_sentiment_model.zip basemodel_sentiment_model/

  adding: basemodel_sentiment_model/ (stored 0%)
  adding: basemodel_sentiment_model/config.json (deflated 49%)
  adding: basemodel_sentiment_model/model.safetensors (deflated 7%)
  adding: basemodel_sentiment_model/vocab.txt (deflated 53%)
  adding: basemodel_sentiment_model/tokenizer_config.json (deflated 75%)
  adding: basemodel_sentiment_model/special_tokens_map.json (deflated 42%)


In [19]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
!cp basemodel_sentiment_model.zip /content/drive/MyDrive/Capstone/