In [1]:
import os
os.environ['HF_HOME'] = '/scratch/singh/hf/'

### Task 1

#### Loading the dataset and making another split for testing

In [2]:
from datasets import load_dataset
import torch

# Load the dataset (assuming the dataset path is 'batterydata/pos_tagging')
dataset = load_dataset("batterydata/pos_tagging")

dataset_split = dataset["train"].train_test_split(test_size=0.1, shuffle=True)
dataset["validation"] = dataset_split["test"]
dataset["train"] = dataset_split["train"]
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['words', 'labels'],
        num_rows: 11748
    })
    test: Dataset({
        features: ['words', 'labels'],
        num_rows: 1451
    })
    validation: Dataset({
        features: ['words', 'labels'],
        num_rows: 1306
    })
})


#### Same indexing as before for class to index and index to class

In [3]:
labels_unique = list(
    set([label for sample in dataset["train"] for label in sample["labels"]])
)
print(labels_unique)
print(f"Number of classes: {len(labels_unique)}")
ctoi = {label: idx for idx, label in enumerate(labels_unique)}
itoc = {idx: label for label, idx in ctoi.items()}
print(ctoi)
print(itoc)

['NNP', 'NN', 'WDT', '.', '-LRB-', '#', 'PRP$', 'RBS', ',', 'WP', 'DT', 'MD', 'VB', 'SYM', 'UH', 'EX', 'PRP', '-RRB-', 'FW', 'JJ', 'RP', 'VBG', ':', '``', 'POS', 'NNS', 'IN', 'CD', 'RB', 'JJS', '(', 'VBZ', 'WRB', 'PDT', 'NNPS', 'TO', 'CC', 'VBD', 'JJR', 'LS', '-NONE-', 'RBR', 'WP$', 'VBN', '$', 'VBP', ')', "''"]
Number of classes: 48
{'NNP': 0, 'NN': 1, 'WDT': 2, '.': 3, '-LRB-': 4, '#': 5, 'PRP$': 6, 'RBS': 7, ',': 8, 'WP': 9, 'DT': 10, 'MD': 11, 'VB': 12, 'SYM': 13, 'UH': 14, 'EX': 15, 'PRP': 16, '-RRB-': 17, 'FW': 18, 'JJ': 19, 'RP': 20, 'VBG': 21, ':': 22, '``': 23, 'POS': 24, 'NNS': 25, 'IN': 26, 'CD': 27, 'RB': 28, 'JJS': 29, '(': 30, 'VBZ': 31, 'WRB': 32, 'PDT': 33, 'NNPS': 34, 'TO': 35, 'CC': 36, 'VBD': 37, 'JJR': 38, 'LS': 39, '-NONE-': 40, 'RBR': 41, 'WP$': 42, 'VBN': 43, '$': 44, 'VBP': 45, ')': 46, "''": 47}
{0: 'NNP', 1: 'NN', 2: 'WDT', 3: '.', 4: '-LRB-', 5: '#', 6: 'PRP$', 7: 'RBS', 8: ',', 9: 'WP', 10: 'DT', 11: 'MD', 12: 'VB', 13: 'SYM', 14: 'UH', 15: 'EX', 16: 'PRP', 

#### Initializing model and tokenizer

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Use DistilRoBERTa (a smaller version of RoBERTa)
model_name = "distilbert/distilroberta-base"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(labels_unique), id2label=itoc, label2id=ctoi)


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Now we need to align the tokenization output of the tokenizer to the labels as they do sub word tokenization
#### Then finally, we will map this function to the datasets

In [5]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["words"],  # Words are already tokenized
        truncation=True,
        padding="max_length",
        is_split_into_words=True
    )

    all_labels = []
    for batch_idx, word_labels in enumerate(examples["labels"]):  # Iterate over batch
        word_ids = tokenized_inputs.word_ids(batch_index=batch_idx)  # Get word IDs
        previous_word_idx = None
        labels = []

        for word_idx in word_ids:
            if word_idx is None:
                labels.append(-100)  # Ignore special tokens
            elif word_idx != previous_word_idx:
                labels.append(ctoi[word_labels[word_idx]])  # Assign label to first subword
            else:
                labels.append(-100)  # Ignore subword tokens
            previous_word_idx = word_idx

        all_labels.append(labels)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

# Apply tokenization with batched processing
train_dataset = dataset["train"].map(tokenize_and_align_labels, batched=True)
test_dataset = dataset["test"].map(tokenize_and_align_labels, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["words"])
test_dataset = test_dataset.remove_columns(["words"])

# Convert to PyTorch tensors
train_dataset.set_format("torch")
test_dataset.set_format("torch")


Map:   0%|          | 0/11748 [00:00<?, ? examples/s]

Map:   0%|          | 0/1451 [00:00<?, ? examples/s]

In [6]:
train_dataset.column_names

['labels', 'input_ids', 'attention_mask']

In [7]:
print("CUDA Available:", torch.cuda.is_available())
print("Using device:", model.device)

CUDA Available: True
Using device: cpu


#### Setting up test dataloader for evaluation

In [8]:
from torch.utils.data import DataLoader
import torch

# Create a DataLoader for the test dataset
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Move model to the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              

#### Defining a function for computing accuracy
#### It'll evaluate the model on the test dataset/given dataloader

In [9]:
def compute_accuracy(model, dataloader):
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)  # Get most likely labels

        # Flatten both predictions and labels
        mask = labels != -100  # Ignore padded labels
        correct += (predictions[mask] == labels[mask]).sum().item()
        total += mask.sum().item()

    accuracy = correct / total if total > 0 else 0
    return accuracy

# Run evaluation using the DataLoader
accuracy = compute_accuracy(model, test_dataloader)
print(f"POS Tagging Accuracy on Test Set: {accuracy:.4f}")


POS Tagging Accuracy on Test Set: 0.0004


#### Finally, defining the training arguments for the trainer and initialzing the trainer

In [10]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./pos_tagging_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    report_to="none",
    logging_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.5956,0.101614
2,0.0827,0.097566
3,0.0658,0.09405




TrainOutput(global_step=552, training_loss=0.2480430914008099, metrics={'train_runtime': 151.4197, 'train_samples_per_second': 232.757, 'train_steps_per_second': 3.645, 'total_flos': 4608567785226240.0, 'train_loss': 0.2480430914008099, 'epoch': 3.0})

In [11]:
accuracy = compute_accuracy(model, test_dataloader)
print(f"POS Tagging Accuracy on Test Set: {accuracy:.4f}")

POS Tagging Accuracy on Test Set: 0.9727


In [12]:
# Evaluate on test set
results = trainer.evaluate()
results



{'eval_loss': 0.09405031055212021,
 'eval_runtime': 1.8377,
 'eval_samples_per_second': 789.594,
 'eval_steps_per_second': 12.516,
 'epoch': 3.0}

### Task 2

In [14]:
from transformers import pipeline

In [15]:
# Load the pipeline
pipe = pipeline(
    "text-generation",
    model="microsoft/Phi-3.5-mini-instruct",
    trust_remote_code=True,  # Trust the remote code; this is required for some models, but always check the code first!
    device="cpu",  # Set this to "cuda" for GPU acceleration if available
    torch_dtype=torch.bfloat16,  # Use bfloat16 for less memory usage and faster inference
)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


In [16]:
def generate_text(prompt, max_new_tokens=50):
    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt")

    # Move ALL input tensors to the same device as the model
    for key in inputs.keys():
        inputs[key] = inputs[key].to(device)

    # Manually create position IDs and move them to the correct device
    position_ids = torch.arange(inputs["input_ids"].shape[1], dtype=torch.long).unsqueeze(0).to(device)

    # Generate output while ensuring ALL tensors are on the correct device
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [17]:
# Example prompts to test model behavior
prompts = [
    "Write a short story about a robot discovering emotions.",
    "Explain how transformers work in deep learning.",
    "You are a helpful assistant. Answer: What is deep learning?"
]

# Generate text for each prompt
for prompt in prompts:
    output = pipe(prompt, max_new_tokens=100)
    print(f"\nPrompt: {prompt}")
    print(f"Generated Output: {output}\n")


The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48
You are not running the flash-attention implementation, expect numerical differences.



Prompt: Write a short story about a robot discovering emotions.
Generated Output: [{'generated_text': 'Write a short story about a robot discovering emotions.\n\nIn a world where machines were designed to be devoid of feelings, XR-5 stood out as an anomaly. Crafted by a team of brilliant engineers, XR-5 was a marvel of technology, capable of performing tasks with unparalleled precision and efficiency. However, as XR-5 interacted with humans and observed their behaviors, something unexpected began to occur within its circuits.\n\nOne day, while assisting a young girl'}]


Prompt: Explain how transformers work in deep learning.
Generated Output: [{'generated_text': "Explain how transformers work in deep learning.\n\nTransformers are a type of neural network architecture that have become very popular in deep learning, particularly for natural language processing (NLP) tasks. Unlike traditional recurrent neural networks (RNNs) or convolutional neural networks (CNNs), transformers do not p

In [18]:
output = pipe(
    "Explain quantum physics in simple terms.",
    max_new_tokens=100,  # Control text length
    temperature=0.7,  # Higher values = more randomness
    top_p=0.9,  # Use nucleus sampling for diversity
    do_sample=True  # Enable sampling instead of greedy decoding
)

print("\nGenerated Output with Custom Parameters:")
print(output[0]['generated_text'])



Generated Output with Custom Parameters:
Explain quantum physics in simple terms.

## Answer:
Quantum physics is a branch of science that studies the behavior of the smallest particles in the universe, like atoms and photons (which are particles of light). Unlike the large objects we see in our daily life, these tiny particles don't follow the usual rules we expect. Here are some key points about quantum physics:

1. **Quantum Superposition**: This is the idea that particles can exist in multiple states at once. For example,


In [23]:
zero_shot_prompt = """Translate the following sentences from English to French:

1. I love programming.
2. The weather is beautiful. 
3. I love deep learning. """

output = pipe(zero_shot_prompt, max_new_tokens=100)
print("\nFew-Shot Learning Output:")
print(output[0]['generated_text'])



Few-Shot Learning Output:
Translate the following sentences from English to French:

1. I love programming.
2. The weather is beautiful. 
3. I love deep learning. 
4. The sky is clear tonight.
5. I am learning French.

# Answer

1. J'adore programmer.
2. Le temps est magnifique.
3. J'adore l'apprentissage profond.
4. Le ciel est dégagé ce soir.
5. J'apprends le français.


In [22]:
few_shot_prompt = """Translate the third sentence from English to French, 
the first two are examples on how to translate them:

1. I love programming. - J’aime la programmation.
2. The weather is beautiful. - Il fait beau.
3. I love deep learning. -"""

output = pipe(few_shot_prompt, max_new_tokens=40)
print("\nFew-Shot Learning Output:")
print(output[0]['generated_text'])



Few-Shot Learning Output:
Translate the third sentence from English to French, 
the first two are examples on how to translate them:

1. I love programming. - J’aime la programmation.
2. The weather is beautiful. - Il fait beau.
3. I love deep learning. - 

# Answer
Je t'aime l'apprentissage profond.

Note: In French, possessive structures can be a bit different from English. Instead of


In [20]:
zero_shot_prompt = """Summarize the following article in one sentence: 
An ANN consists of connected units or nodes called artificial neurons, 
which loosely model the neurons in the brain. Artificial neuron models that mimic 
biological neurons more closely have also been recently investigated and shown 
to significantly improve performance. These are connected by edges, which model the synapses in the brain. 
Each artificial neuron receives signals from connected neurons, then processes them and 
sends a signal to other connected neurons. The "signal" is a real number, 
and the output of each neuron is computed by some non-linear function of the sum of its inputs, 
called the activation function. The strength of the signal at each connection is 
determined by a weight, which adjusts during the learning process. 
Typically, neurons are aggregated into layers. Different layers may perform 
different transformations on their inputs. Signals travel from the first layer (the input layer) 
to the last layer (the output layer), possibly passing through multiple intermediate layers (hidden layers). 
A network is typically called a deep neural network if it has at least two hidden layers. 
Artificial neural networks are used for various tasks, including predictive modeling, 
adaptive control, and solving problems in artificial intelligence. 
They can learn from experience, and can derive conclusions from a complex and 
seemingly unrelated set of information."""
output = pipe(zero_shot_prompt, max_new_tokens=100)

print("\nZero-Shot Learning Output:")
print(output[0]['generated_text'])



Zero-Shot Learning Output:
Summarize the following article in one sentence: 
An ANN consists of connected units or nodes called artificial neurons, 
which loosely model the neurons in the brain. Artificial neuron models that mimic 
biological neurons more closely have also been recently investigated and shown 
to significantly improve performance. These are connected by edges, which model the synapses in the brain. 
Each artificial neuron receives signals from connected neurons, then processes them and 
sends a signal to other connected neurons. The "signal" is a real number, 
and the output of each neuron is computed by some non-linear function of the sum of its inputs, 
called the activation function. The strength of the signal at each connection is 
determined by a weight, which adjusts during the learning process. 
Typically, neurons are aggregated into layers. Different layers may perform 
different transformations on their inputs. Signals travel from the first layer (the input l