In [81]:
# Category 1, Dataset Preparation 

# 1. Synthetic Dataset Creation and Augmentation 

import torch
import transformers
import nlpaug.augmenter.word as naw
import torch
print("NLPAug imported successfully")
import random

print(f"Torch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Current device: {torch.cuda.get_device_name(0)}")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
# Sample positive and negative reviews
positive_reviews = [
    "This product is amazing!",
    "I highly recommend this.",
    "It's the best I've ever used.",
    "Excellent quality and value.",
    "Five stars!",
    "Great customer service.",
    "Exactly what I was looking for.",
    "Very satisfied with my purchase.",
    "Outstanding performance.",
    "Worth every penny!"
]

negative_reviews = [
    "This product is terrible.",
    "I would not recommend this.",
    "It's the worst I've ever used.",
    "Poor quality and overpriced.",
    "One star!",
    "Horrible customer service.",
    "Not what I expected at all.",
    "Very disappointed with my purchase.",
    "Unreliable performance.",
    "Complete waste of money!"
]

# Generate 500 base sentences by repeating and slightly modifying the samples
all_reviews = []
for i in range(250):
    # Add some random variation to avoid exact duplicates
    pos_review = positive_reviews[i % len(positive_reviews)]
    neg_review = negative_reviews[i % len(negative_reviews)]

    # Add simple variations to make the dataset more diverse
    if random.random() > 0.5:
        pos_review = "Honestly, " + pos_review
    if random.random() > 0.5:
        neg_review = "Unfortunately, " + neg_review

    all_reviews.append(pos_review)
    all_reviews.append(neg_review)

# Initialize augmenters
aug_insert = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="insert",
    aug_p=0.1  # Probability of augmenting each word
)

aug_sub = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="substitute",
    aug_p=0.1,
    stopwords=['not', 'no', 'never']  # Prevent changing sentiment-critical words
)


# Augment the dataset
augmented_reviews = []
for review in all_reviews:
    try:
        # Insert words
        aug_text = aug_insert.augment(review)[0]
        augmented_reviews.append(aug_text)

        # Substitute words
        aug_text = aug_sub.augment(review)[0]
        augmented_reviews.append(aug_text)

        # Simple word deletion (manual approach)
        words = review.split()
        if len(words) > 3:  # Only delete if we have enough words
            del_idx = random.randint(0, len(words)-1)
            words.pop(del_idx)
            aug_text = " ".join(words)
            augmented_reviews.append(aug_text)

    except Exception as e:
        print(f"Error augmenting review: {review}")
        print(f"Error message: {str(e)}")
        continue

# Combine original and augmented reviews
final_reviews = all_reviews + augmented_reviews

# Print some statistics
print(f"Original reviews: {len(all_reviews)}")
print(f"Augmented reviews: {len(augmented_reviews)}")
print(f"Total reviews: {len(final_reviews)}")

# Print some examples
print("\nExample augmentations:")
for i in range(3):
    orig_idx = random.randint(0, len(all_reviews)-1)
    aug_idx = random.randint(0, len(augmented_reviews)-1)
    print(f"\nOriginal: {all_reviews[orig_idx]}")
    print(f"Augmented: {augmented_reviews[aug_idx]}")

NLPAug imported successfully
Torch version: 2.5.1+cu121
Transformers version: 4.48.1
CUDA available: True
Current device: NVIDIA GeForce RTX 4080 SUPER
cuda
Original reviews: 500
Augmented reviews: 1362
Total reviews: 1862

Example augmentations:

Original: Outstanding performance.
Augmented: not what i originally expected at all.

Original: Honestly, It's the best I've ever used.
Augmented: fortunately, great customer service.

Original: Honestly, Exactly what I was looking for.
Augmented: Honestly, This is amazing!


In [None]:
# 2. Handling Missing Values


from transformers import pipeline
import random
import torch
from tqdm import tqdm

class ReviewReconstructor:
    def __init__(self, device=None):
        self.device = device if device else ('cuda' if torch.cuda.is_available() else 'cpu')
        self.generator = self._create_generator()
        # Sentiment keywords for better context understanding
        self.positive_keywords = {'amazing', 'recommend', 'best', 'excellent', 'stars', 'great', 
                                'exactly', 'satisfied', 'outstanding', 'worth'}
        self.negative_keywords = {'terrible', 'not', 'worst', 'poor', 'horrible', 'disappointed', 
                                'unreliable', 'waste', 'unfortunately'}

    def _create_generator(self):
        return pipeline(
            "text2text-generation",
            model="t5-base",
            device=0 if self.device == 'cuda' else -1,
            clean_up_tokenization_spaces=True
        )

    def _detect_sentiment(self, text):
        """Detect sentiment based on keyword presence."""
        text_lower = text.lower()
        pos_count = sum(1 for word in self.positive_keywords if word in text_lower)
        neg_count = sum(1 for word in self.negative_keywords if word in text_lower)
        return 'positive' if pos_count > neg_count else 'negative'

    def _get_context(self, text_list, current_idx):
        # Get surrounding context
        prev_texts = [t for t in text_list[max(0, current_idx - 2):current_idx] if t.strip()]
        next_texts = [t for t in text_list[current_idx + 1:current_idx + 3] if t.strip()]
        
        # Combine context
        context_text = " ".join(prev_texts + next_texts)
        sentiment = self._detect_sentiment(context_text)
        
        # Format prompt with sentiment guidance
        prompt = f"complete {sentiment} review:"
        if prev_texts:
            prompt += f" {' '.join(prev_texts)}"
        prompt += " [MISSING]"
        if next_texts:
            prompt += f" {' '.join(next_texts)}"
            
        return prompt, sentiment

    def _clean_generated_text(self, text, sentiment):
        """Clean and validate generated text."""
        # Remove common prefix artifacts
        artifacts = [
            "review::", "negative review:", "complete positive review:",
            "complete negative review:", "positive review:", "complete review:", ":", "True"
        ]
        for artifact in artifacts:
            text = text.replace(artifact, "").strip()

        # Remove [MISSING] placeholders
        text = text.replace("[MISSING]", "").strip()

        # Ensure proper sentence structure
        if len(text.split()) < 3:
            text = "This product is excellent!" if sentiment == 'positive' else "This product is disappointing."

        # Ensure proper ending punctuation
        if not any(text.endswith(char) for char in ".!?"):
            text += "."

        return text

    def reconstruct_texts(self, reviews, missing_indices, batch_size=32):
        """Reconstruct missing texts with batched processing."""
        reconstructed = reviews.copy()

        # Process in batches
        for i in tqdm(range(0, len(missing_indices), batch_size)):
            batch_indices = missing_indices[i:i + batch_size]
            prompts = []
            sentiments = []
            
            # Prepare batch
            for idx in batch_indices:
                prompt, sentiment = self._get_context(reviews, idx)
                prompts.append(prompt)
                sentiments.append(sentiment)
            
            # Generate texts
            generated = self.generator(
                prompts,
                do_sample=True,
                temperature=0.7,
                top_p=0.7,
                top_k=25,
                repetition_penalty=1.4,
                num_return_sequences=1,
                no_repeat_ngram_size=3,
                max_length=50,
                batch_size=batch_size
            )
            
            # Process generated texts
            for idx, gen, sentiment in zip(batch_indices, generated, sentiments):
                # The generated output is already a dictionary with 'generated_text' key
                text = gen['generated_text']  # Removed the [0] indexing
                cleaned_text = self._clean_generated_text(text, sentiment)
                reconstructed[idx] = cleaned_text
            
        return reconstructed

# Initialize reconstructor
reconstructor = ReviewReconstructor()
missing_percentage=0.1

# Create gaps
num_missing = int(len(final_reviews) * missing_percentage)
missing_indices = random.sample(range(len(final_reviews)), num_missing)
reviews_with_gaps = final_reviews.copy()

for idx in missing_indices:
    reviews_with_gaps[idx] = ""

# Reconstruct
reconstructed = reconstructor.reconstruct_texts(reviews_with_gaps, missing_indices)

# Print results
print(f"\nTotal reviews: {len(final_reviews)}")
print(f"Gaps created: {num_missing}")
print("\nReconstruction Examples:")

# Show some examples
sample_size = min(5, len(missing_indices))
for idx in random.sample(missing_indices, sample_size):
    print(f"Reconstructed: {reconstructed[idx]}")
    


Device set to use cuda:0
100%|██████████| 6/6 [00:07<00:00,  1.26s/it]


Total reviews: 1867
Gaps created: 186

Reconstruction Examples:
Reconstructed: a total  this product is just terrible. this product screams terrible.
Reconstructed: review poor work and overpriced. Poor and over priced.
Reconstructed: this product line is terrible. this product appeared terrible.  honestly, i personally recommend this.
Reconstructed: Sadly, Very disappointed with my purchase.  Unfortunately, Unreliable performance.
Reconstructed: a complete  Unfortunately, Not what I expected at all.





In [99]:
# 3. Kaggle Dataset Preprocessing 

# Install required packages
#!pip install kaggle transformers scikit-learn pandas numpy

import os
import pandas as pd

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from datasets import Dataset

# Configure Kaggle API (you'll need to upload your kaggle.json)
os.environ['KAGGLE_CONFIG_DIR'] = '/content'

# Download IMDB dataset
#!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

# Extract dataset
#!unzip imdb-dataset-of-50k-movie-reviews.zip

# Load and preprocess data
#df = pd.read_csv('IMDB Dataset.csv')
df = pd.read_csv('C:/Users/Kone/Downloads/archive/IMDB Dataset.csv')
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['review'].values, 
    df['sentiment'].values,
    test_size=0.2,
    random_state=42
)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize function
def tokenize_function(texts):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

# Baseline Model (Logistic Regression)
# Tokenize with basic approach
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train logistic regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
lr_preds = lr_model.predict(X_test_tfidf)
print("\nLogistic Regression Results:")
print(classification_report(y_test, lr_preds))

# Transformer Model (TinyBERT)
# Prepare datasets
train_dataset = Dataset.from_dict({
    'text': X_train,
    'label': y_train
})
test_dataset = Dataset.from_dict({
    'text': X_test,
    'label': y_test
})

# Tokenize datasets
train_dataset = train_dataset.map(
    lambda x: tokenizer(x['text'], padding=True, truncation=True),
    batched=True
)
test_dataset = test_dataset.map(
    lambda x: tokenizer(x['text'], padding=True, truncation=True),
    batched=True
)

# Load TinyBERT
model = AutoModelForSequenceClassification.from_pretrained(
    'prajjwal1/bert-tiny',
    num_labels=2
)
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Training parameters
num_epochs = 1
batch_size = 16
learning_rate = 2e-5

# Create data loaders
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    
    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

# Evaluation
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

bert_preds = np.array(predictions)
print("\nTinyBERT Results:")
print(classification_report(true_labels, bert_preds))


Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.89      0.89     10000
weighted avg       0.90      0.90      0.89     10000



Map: 100%|██████████| 40000/40000 [00:24<00:00, 1653.68 examples/s]
Map: 100%|██████████| 10000/10000 [00:05<00:00, 1672.08 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.44105073331296446

TinyBERT Results:
              precision    recall  f1-score   support

           0       0.89      0.84      0.86      4961
           1       0.85      0.90      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [None]:
# Category 2: Tokenization

# Install required packages
#!pip install transformers tokenizers datasets

from transformers import AutoTokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
from datasets import load_dataset
import pandas as pd

# 4. Tokenizer Comparison
def compare_tokenizers():
    # Initialize tokenizers
    tokenizers = {
        'BERT': AutoTokenizer.from_pretrained('bert-base-uncased'),
        'GPT2': AutoTokenizer.from_pretrained('gpt2'),
        'RoBERTa': AutoTokenizer.from_pretrained('roberta-base')
    }
    
    # Sample text for comparison
    text = "The quick brown fox jumps over the lazy dog! Let's see how different tokenizers handle this."
    
    results = []
    for name, tokenizer in tokenizers.items():
        tokens = tokenizer.tokenize(text)
        results.append({
            'Tokenizer': name,
            'Tokens': tokens,
            'Num_Tokens': len(tokens),
            'Vocabulary_Size': tokenizer.vocab_size
        })
    
    return pd.DataFrame(results)

# 5. Custom Tokenizer Training 
def train_custom_tokenizer():
    # Load a small dataset for training
    dataset = load_dataset("imdb", split="train[:1000]")
    
    # Initialize a new tokenizer (BPE model)
    tokenizer = Tokenizer(models.BPE())
    
    # Set up pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    
    # Prepare training
    trainer = trainers.BpeTrainer(
        vocab_size=25000,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    )
    
    # Create iterator of texts for training
    def batch_iterator():
        batch_size = 1000
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]
    
    # Train tokenizer
    tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
    
    # Save tokenizer
    tokenizer.save("custom_tokenizer.json")
    
    return tokenizer

# Test Custom Tokenizer
def test_tokenizers(custom_tokenizer):
    # Test text
    test_text = "This is a test of our custom tokenizer! Let's see how it performs."
    
    # Load saved custom tokenizer
    custom_tokenizer = Tokenizer.from_file("custom_tokenizer.json")
    
    # Encode and decode
    encoded = custom_tokenizer.encode(test_text)
    decoded = custom_tokenizer.decode(encoded.ids)
    
    return {
        'Original': test_text,
        'Encoded_IDs': encoded.ids[:10],  # First 10 tokens
        'Decoded': decoded,
        'Num_Tokens': len(encoded.ids)
    }

# Run comparisons
print("Comparing standard tokenizers:")
comparison_df = compare_tokenizers()
print(comparison_df)

print("\nTraining custom tokenizer...")
custom_tokenizer = train_custom_tokenizer()

print("\nTesting custom tokenizer:")
test_results = test_tokenizers(custom_tokenizer)
print(test_results)

# Optional: Save custom tokenizer for reuse
#custom_tokenizer.save_pretrained("./custom_tokenizer") todo

Comparing standard tokenizers:
  Tokenizer                                             Tokens  Num_Tokens  \
0      BERT  [the, quick, brown, fox, jumps, over, the, laz...          22   
1      GPT2  [The, Ġquick, Ġbrown, Ġfox, Ġjumps, Ġover, Ġth...          20   
2   RoBERTa  [The, Ġquick, Ġbrown, Ġfox, Ġjumps, Ġover, Ġth...          20   

   Vocabulary_Size  
0            30522  
1            50257  
2            50265  

Training custom tokenizer...

Testing custom tokenizer:
{'Original': "This is a test of our custom tokenizer! Let's see how it performs.", 'Encoded_IDs': [417, 162, 120, 3288, 146, 1427, 6741, 6900, 9325, 5], 'Decoded': "ĠThis Ġis Ġa Ġtest Ġof Ġour Ġcustom Ġtoken izer ! ĠLet 's Ġsee Ġhow Ġit Ġperforms .", 'Num_Tokens': 17}


In [10]:
# Category 3: Pre-trained Models 
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    BlipProcessor, 
    BlipForConditionalGeneration
)
from datasets import load_dataset
import numpy as np
from PIL import Image
from tqdm import tqdm

class TextDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=512):
        self.encodings = tokenizer(
            dataset["text" if "text" in dataset.features else "sms"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )
        self.labels = torch.tensor(dataset["label"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.labels[idx]
        }

    def __len__(self):
        return len(self.labels)

def train_model(model, train_loader, val_loader, num_epochs, device, learning_rate=2e-5):
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    model.to(device)
    
    best_accuracy = 0
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        # Validation
        model.eval()
        correct = 0
        total = 0
        val_loss = 0
        
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)
                
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                
                val_loss += outputs.loss.item()
                predictions = torch.argmax(outputs.logits, dim=-1)
                correct += (predictions == labels).sum().item()
                total += labels.size(0)
        
        accuracy = correct / total
        print(f"Epoch {epoch + 1}: Train Loss = {total_loss/len(train_loader):.4f}, "
              f"Val Loss = {val_loss/len(val_loader):.4f}, "
              f"Val Accuracy = {accuracy:.4f}")
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), f"best_model_epoch_{epoch+1}.pt")
    
    return model

def train_distilbert():
    dataset = load_dataset("imdb")
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2
    )
    
    train_dataset = TextDataset(dataset["train"], tokenizer)
    val_dataset = TextDataset(dataset["test"], tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = train_model(model, train_loader, val_loader, num_epochs=3, device=device)
    
    return model, tokenizer

def train_bert_spam():
    dataset = load_dataset("sms_spam")
    split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
    
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=2
    )
    
    train_dataset = TextDataset(split_dataset["train"], tokenizer)
    val_dataset = TextDataset(split_dataset["test"], tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = train_model(model, train_loader, val_loader, num_epochs=1, device=device)
    
    return model, tokenizer

def setup_blip_captioning():
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    
    def generate_caption(image_path, processor, model):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        
        image = Image.open(image_path).convert('RGB')
        inputs = processor(images=image, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model.generate(**inputs)
        caption = processor.decode(outputs[0], skip_special_tokens=True)
        
        return caption
    
    def translate_caption(caption):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        translator = AutoModelForSeq2SeqLM.from_pretrained(f"Helsinki-NLP/opus-mt-tc-big-en-fi")
        translator_tokenizer = AutoTokenizer.from_pretrained(f"Helsinki-NLP/opus-mt-tc-big-en-fi")
        
        translator.to(device)
        inputs = translator_tokenizer(caption, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = translator.generate(**inputs)
        translation = translator_tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        return translation
    
    return processor, model, generate_caption, translate_caption

In [3]:
print("Training DistilBERT for sentiment analysis...")
distilbert_model, distilbert_tokenizer = train_distilbert()

Training DistilBERT for sentiment analysis...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 1563/1563 [03:11<00:00,  8.16it/s]


Epoch 1: Train Loss = 0.2523, Val Loss = 0.2294, Val Accuracy = 0.9048


Epoch 2/3: 100%|██████████| 1563/1563 [03:12<00:00,  8.12it/s]


Epoch 2: Train Loss = 0.1404, Val Loss = 0.2250, Val Accuracy = 0.9210


Epoch 3/3: 100%|██████████| 1563/1563 [03:11<00:00,  8.17it/s]


Epoch 3: Train Loss = 0.0716, Val Loss = 0.2275, Val Accuracy = 0.9295


In [11]:
print("\nTraining BERT for spam classification...")
bert_model, bert_tokenizer = train_bert_spam()


Training BERT for spam classification...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/1: 100%|██████████| 279/279 [01:07<00:00,  4.15it/s]


Epoch 1: Train Loss = 0.0768, Val Loss = 0.0499, Val Accuracy = 0.9883


In [2]:
print("\nSetting up BLIP for image captioning...")
blip_processor, blip_model, caption_fn, translate_fn = setup_blip_captioning()



# Test BLIP with sample images
sample_images = ["image-1.jpg", "image-2.jpg", "image-3.jpg", "image-4.jpg", "image-5.jpg"]
for img_path in sample_images:
    try:
        caption = caption_fn(img_path, blip_processor, blip_model)
        translation = translate_fn(caption)
        print(f"\nImage: {img_path}")
        print(f"Caption: {caption}")
        print(f"Translation: {translation}")
    except FileNotFoundError:
        print(f"Image {img_path} not found. Please provide valid image paths.")


Setting up BLIP for image captioning...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development



Image: image-1.jpg
Caption: a bowl of oranges with a half of a grape
Translation: kulhollinen appelsiineja, joissa on puolikas rypälettä

Image: image-2.jpg
Caption: the tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori
Translation: tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori tori

Image: image-3.jpg
Caption: a fluffy orange cat with a white face
Translation: pörröinen oranssi kissa, jolla on valkoiset kasvot

Image: image-4.jpg
Caption: the old bridge in mostar, bosnia
Translation: Mostarissa sijaitseva vanha silta, bosnia

Image: image-5.jpg
Caption: a river with trees and bushes in the background
Translation: joki, jonka taustalla on puita ja pensaita


In [66]:
# Categorimport os
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
import numpy as np
import torch
from typing import Dict

# Get the directory where the notebook is located
notebook_dir = os.path.dirname(os.path.abspath('C:/Users/Kone/ftllm/wk02/ex02.ipynb'))

# Create paths relative to the notebook directory
results_dir = os.path.join(notebook_dir, 'ray_tune_results')
logs_dir = os.path.join(notebook_dir, 'ray_logs')

# Create directories
os.makedirs(results_dir, exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Initialize ray
ray.shutdown()
ray.init(num_gpus=1)
print(ray.cluster_resources())
print(torch.cuda.is_available())

class TuneTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._report_to_ray_metrics = True

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        if self._report_to_ray_metrics:
            self._report_to_ray(metrics, state.global_step)
        return super().on_evaluate(args, state, control, metrics, **kwargs)

def model_init():
    """Initialize a new model for each trial"""
    return AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=2
    )

def hp_space(trial):
    """Define the hyperparameter search space"""
    return {
        "learning_rate": tune.loguniform(1e-5, 1e-3),
        "per_device_train_batch_size": tune.choice([16, 32, 64]),
        "per_device_eval_batch_size": tune.choice([16, 32, 64]),
        "num_train_epochs": 3,
        "warmup_steps": tune.choice([100, 200, 300]),
        "dropout": tune.uniform(0.1, 0.5)
    }

def train_with_ray_tune(num_trials=20):
    # Load dataset (using IMDB for example)
    dataset = load_dataset("imdb")
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    
    # Tokenize function
    def tokenize_function(examples):
        # Preserve the label information
        encoding = tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512
        )
        encoding["labels"] = examples["label"]  # Retain the labels
        return encoding
    
    # Prepare dataset
    tokenized_datasets = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"]
    )
    
    # Define training arguments template
    training_args = TrainingArguments(
        output_dir=results_dir,
        evaluation_strategy="steps",
        eval_steps=500,
        save_strategy="no",
        logging_dir=logs_dir,
    )
    
    # Define ASHA scheduler
    scheduler = ASHAScheduler(
        max_t=3,
        grace_period=1,
        reduction_factor=2,
        brackets=1,
    )
    
    def train_func(config):
        # Update training arguments with trial config
        for key, value in config.items():
            if hasattr(training_args, key):
                setattr(training_args, key, value)
        
        # Initialize trainer
        trainer = TuneTrainer(
            model_init=model_init,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
        )
        
        # Train and evaluate
        trainer.train()
    
    # Define a shorter trial directory name creator
    def trial_name_creator(trial):
        return f"t_{trial.trial_id}"

    analysis = tune.run(
        train_func,
        config=hp_space(None),
        storage_path="C:/ray_temp",
        trial_dirname_creator=trial_name_creator,
        name="tune_bert",
        num_samples=num_trials,
        scheduler=scheduler,
        metric="accuracy",
        mode="max",
        resources_per_trial={
            "cpu": 1,
            "gpu": 1 if torch.cuda.is_available() else 0
        },
        progress_reporter=tune.CLIReporter()
    )
    
    # Get best trial
    best_trial = analysis.get_best_trial("accuracy", "max", "last")
    print("Best trial config:", best_trial.config)
    print("Best trial final accuracy:", best_trial.last_result["accuracy"])
    
    return best_trial.config

# Run hyperparameter optimization
print("Starting hyperparameter optimization...")
best_config = train_with_ray_tune(num_trials=20)

# Save best configuration
import json
with open("best_hyperparameters.json", "w") as f:
    json.dump(best_config, f)

print("\nHyperparameter optimization complete!")
print("Best configuration saved to best_hyperparameters.json")


2025-02-02 13:43:54,067	INFO worker.py:1841 -- Started a local Ray instance.


{'accelerator_type:G': 1.0, 'node:__internal_head__': 1.0, 'CPU': 16.0, 'memory': 10202755892.0, 'node:127.0.0.1': 1.0, 'object_store_memory': 5101377945.0, 'GPU': 1.0}
True
Starting hyperparameter optimization...


2025-02-02 13:44:02,331	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2025-02-02 13:44:03 (running for 00:00:00.82)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Logical resource usage: 1.0/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: C:/Users/Kone/AppData/Local/Temp/ray/session_2025-02-02_13-43-52_377343_24792/artifacts/2025-02-02_13-44-02/tune_bert/driver_artifacts
Number of trials: 20/20 (20 PENDING)


== Status ==
Current time: 2025-02-02 13:44:08 (running for 00:00:05.84)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Logical resource usage: 1.0/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: C:/Users/Kone/AppData/Local/Temp/ray/session_2025-02-02_13-43-52_377343_24792/artifacts/2025-02-02_13-44-02/tune_bert/driver_artifacts
Number of trials: 20/20 (20 PENDING)




[36m(train_func pid=25824)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[36m(train_func pid=25824)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[36m(train_func pid=25824)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[36m(train_func pid=25824)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/4689 [00:00<?, ?it/s]
2025-02-02 13:44:11,126	ERROR tune_controller.py:1331 -- Trial task failed for trial train_func_ff99a_00000
Traceback (most recent call last):
  File "c:\Users\Kone\ftllm\yes\Lib\site-packages\ray\air\execution\_internal\event_manager.py", line 

<IPython.core.display.HTML object>
== Status ==
Current time: 2025-02-02 13:44:13 (running for 00:00:10.88)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Logical resource usage: 1.0/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: C:/Users/Kone/AppData/Local/Temp/ray/session_2025-02-02_13-43-52_377343_24792/artifacts/2025-02-02_13-44-02/tune_bert/driver_artifacts
Number of trials: 20/20 (1 ERROR, 19 PENDING)
Number of errored trials: 1
+------------------------+--------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             |   # failures | error file                                                                                                                                                     |
|------------------------+--------------+---------------------------------------------------------------

[36m(train_func pid=30712)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[36m(train_func pid=30712)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2025-02-02 13:44:18 (running for 00:00:15.93)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Logical resource usage: 1.0/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: C:/Users/Kone/AppData/Local/Temp/ray/session_2025-02-02_13-43-52_377343_24792/artifacts/2025-02-02_13-44-02/tune_bert/driver_artifacts
Number of trials: 20/20 (1 ERROR, 18 PENDING, 1 RUNNING)
Number of errored trials: 1
+------------------------+--------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             |   # failures | error file                                                                                                                                                     |
|------------------------+--------------+---------------------------------------------------------------------------------------

[36m(train_func pid=30712)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[36m(train_func pid=30712)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/2346 [00:00<?, ?it/s]
2025-02-02 13:44:19,334	ERROR tune_controller.py:1331 -- Trial task failed for trial train_func_ff99a_00001
Traceback (most recent call last):
  File "c:\Users\Kone\ftllm\yes\Lib\site-packages\ray\air\execution\_internal\event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "c:\Users\Kone\ftllm\yes\Lib\site-packages\ray\_private\auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Kone\ftllm\yes\Lib\site-packages\ray\_private\client_mode_hook.py", line 103, in wra

<IPython.core.display.HTML object>
== Status ==
Current time: 2025-02-02 13:44:23 (running for 00:00:20.96)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Logical resource usage: 1.0/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: C:/Users/Kone/AppData/Local/Temp/ray/session_2025-02-02_13-43-52_377343_24792/artifacts/2025-02-02_13-44-02/tune_bert/driver_artifacts
Number of trials: 20/20 (2 ERROR, 18 PENDING)
Number of errored trials: 2
+------------------------+--------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             |   # failures | error file                                                                                                                                                     |
|------------------------+--------------+---------------------------------------------------------------

2025-02-02 13:44:24,832	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/ray_temp/tune_bert' in 0.0080s.


== Status ==
Current time: 2025-02-02 13:44:24 (running for 00:00:22.50)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Logical resource usage: 1.0/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: C:/Users/Kone/AppData/Local/Temp/ray/session_2025-02-02_13-43-52_377343_24792/artifacts/2025-02-02_13-44-02/tune_bert/driver_artifacts
Number of trials: 20/20 (2 ERROR, 18 PENDING)
+------------------------+----------+-----------------+-----------+-----------------+------------------------+------------------------+----------------+
| Trial name             | status   | loc             |   dropout |   learning_rate |   per_device_eval_batc |   per_device_train_bat |   warmup_steps |
|                        |          |                 |           |                 |                 h_size |                ch_size |                |
|------------------------+----------+-----------------+-----------+-----------------+------------------------+----

2025-02-02 13:44:25,851	ERROR tune.py:1037 -- Trials did not complete: [train_func_ff99a_00000, train_func_ff99a_00001]
2025-02-02 13:44:25,852	INFO tune.py:1041 -- Total run time: 23.52 seconds (22.49 seconds for the tuning loop).
Resume experiment with: tune.run(..., resume=True)
- train_func_ff99a_00002: FileNotFoundError('Could not fetch metrics for train_func_ff99a_00002: both result.json and progress.csv were not found at C:/ray_temp/tune_bert/t_ff99a_00002')
- train_func_ff99a_00003: FileNotFoundError('Could not fetch metrics for train_func_ff99a_00003: both result.json and progress.csv were not found at C:/ray_temp/tune_bert/t_ff99a_00003')
- train_func_ff99a_00004: FileNotFoundError('Could not fetch metrics for train_func_ff99a_00004: both result.json and progress.csv were not found at C:/ray_temp/tune_bert/t_ff99a_00004')
- train_func_ff99a_00005: FileNotFoundError('Could not fetch metrics for train_func_ff99a_00005: both result.json and progress.csv were not found at C:/ray_

AttributeError: 'NoneType' object has no attribute 'config'

ModuleNotFoundError: No module named 'ray_tune_init'