In [1]:
# Import modules
import sys
sys.path.append('..')
from src.data_processing import DataPreprocessor
from src.model_training import ModelTrainer
import numpy as np

In [2]:
# Setup
model_name = "distilbert-base-uncased"
output_dir = "../models/sentiment_model"

In [None]:
# Prepare data
print("Preparing dataset...")
preprocessor = DataPreprocessor(model_name, max_length=256)
tokenized_dataset = preprocessor.load_and_prepare_data(
    "imdb",
    # subset_size=1000
)

Preparing dataset...
Loading imdb...
Tokenizing the dataset...


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [4]:
# Cell 4: VERIFY label balance
train_labels = np.array(tokenized_dataset['train']['labels'])
test_labels = np.array(tokenized_dataset['test']['labels'])

print("Training set label distribution:")
print(f"  Negative (0): {np.sum(train_labels == 0)} ({np.mean(train_labels == 0)*100:.1f}%)")
print(f"  Positive (1): {np.sum(train_labels == 1)} ({np.mean(train_labels == 1)*100:.1f}%)")

print("\nTest set label distribution:")
print(f"  Negative (0): {np.sum(test_labels == 0)} ({np.mean(test_labels == 0)*100:.1f}%)")
print(f"  Positive (1): {np.sum(test_labels == 1)} ({np.mean(test_labels == 1)*100:.1f}%)")

Training set label distribution:
  Negative (0): 12500 (50.0%)
  Positive (1): 12500 (50.0%)

Test set label distribution:
  Negative (0): 12500 (50.0%)
  Positive (1): 12500 (50.0%)


In [5]:
# Initialize trainer
trainer_obj = ModelTrainer(model_name, num_labels=2)
training_args = trainer_obj.setup_training(output_dir=output_dir)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


loading model: distilbert-base-uncased


In [6]:
# Train model
trainer = trainer_obj.train(
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    training_args=training_args
)

Starting training...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3027,0.303977,0.89692,0.942083,0.84584,0.891371
2,0.1959,0.327958,0.91492,0.919111,0.90992,0.914492
3,0.0973,0.392854,0.91324,0.912284,0.9144,0.913341


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Model saved to ../models/sentiment_model


In [7]:
# Evaluate on test set
print("\nFinal evaluation on test set:")
results = trainer.evaluate()
for key, value in results.items():
    print(f"{key}: {value:.4f}")


Final evaluation on test set:


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
TOKENIZERS_PARALLELISM=(true | false)


eval_loss: 0.3280
eval_accuracy: 0.9149
eval_precision: 0.9191
eval_recall: 0.9099
eval_f1: 0.9145
eval_runtime: 236.3583
eval_samples_per_second: 105.7720
eval_steps_per_second: 6.6130
epoch: 3.0000


In [8]:
# Save tokenizer needed for inference
preprocessor.tokenizer.save_pretrained(output_dir)
print(f"\nTokenizer saved to {output_dir}")


Tokenizer saved to ../models/sentiment_model
