In [None]:
import collections
import numpy as np
from transformers import GPT2Tokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from nltk.corpus import gutenberg
from collections import Counter
import nltk
from datasets import load_dataset
import deepspeed
from transformers import AdamW, get_linear_schedule_with_warmup
from deepspeed.ops.adam import FusedAdam

import torch
import torch.nn.functional as F
import torch.nn as nn

[2023-12-01 13:57:22,098] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [None]:
class DilatedAttention(nn.Module):
    """Multi-head scaled dot-product attention over a dilated (strided) sequence.

    Values, keys and queries are linearly projected, split into heads and then
    thinned out by keeping every ``dilation_rate``-th position. Attention is
    computed between the remaining positions, so the output has
    ``ceil(query_length / dilation_rate)`` positions.

    Args:
        embed_size: Size of the last dimension of the inputs and of the output.
        heads: Number of attention heads; must divide ``embed_size``.
        segment_length: Stored on the module; the computation does not use it.
        dilation_rate: Stride (>= 1) used when selecting sequence positions.
    """

    def __init__(self, embed_size, heads, segment_length, dilation_rate):
        super(DilatedAttention, self).__init__()

        assert embed_size % heads == 0, "Embed size must be divisible by number of heads"
        assert dilation_rate >= 1, "Dilation rate must be a positive integer"

        self.heads = heads
        self.head_dim = embed_size // heads
        self.segment_length = segment_length
        self.dilation_rate = dilation_rate

        # The projections run on the full embedding *before* it is split into
        # heads, so they must be embed_size wide (head_dim only matched when
        # heads == 1).
        self.values = nn.Linear(embed_size, embed_size)
        self.keys = nn.Linear(embed_size, embed_size)
        self.queries = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, values, keys, query):
        """Attend from ``query`` to ``keys``/``values``.

        Args:
            values, keys, query: Tensors of shape (batch, seq_len, embed_size).
                ``values`` and ``keys`` must share their sequence length.

        Returns:
            Tensor of shape (batch, ceil(query_len / dilation_rate), embed_size).
        """
        N = query.shape[0]

        # Project, split into heads -> (N, heads, seq, head_dim), then dilate.
        values = self.dilate_segments(self.split_heads(self.values(values), N))
        keys = self.dilate_segments(self.split_heads(self.keys(keys), N))
        queries = self.dilate_segments(self.split_heads(self.queries(query), N))

        # Scaled dot-product attention. The einsum labels follow the
        # (N, heads, seq, head_dim) layout produced by split_heads, so the
        # softmax runs over key positions.
        scores = torch.einsum("nhqd,nhkd->nhqk", queries, keys)
        attention = F.softmax(scores / (self.head_dim ** 0.5), dim=-1)

        out = torch.einsum("nhqk,nhkd->nhqd", attention, values)

        # (N, heads, q, head_dim) -> (N, q, heads * head_dim)
        out = out.transpose(1, 2).reshape(N, -1, self.heads * self.head_dim)

        return self.fc_out(out)

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq, embed_size) into (batch, heads, seq, head_dim)."""
        return x.view(batch_size, -1, self.heads, self.head_dim).transpose(1, 2)

    def dilate_segments(self, tensor):
        """Keep every ``dilation_rate``-th position along the sequence axis (dim 2).

        Strided slicing keeps the input's dtype and device and also works when
        the sequence length is not a multiple of the dilation rate.
        """
        return tensor[:, :, ::self.dilation_rate, :]

class MultiHeadDilatedAttention(nn.Module):
    """Runs one single-head DilatedAttention per dilation rate and merges them.

    Args:
        embed_size: Size of the last dimension of the inputs and of the output.
        num_heads: Stored on the module; the number of attention modules is
            determined by ``len(dilation_rates)``.
        segment_length: Forwarded to every DilatedAttention.
        dilation_rates: One dilation rate per attention module.
    """

    def __init__(self, embed_size, num_heads, segment_length, dilation_rates):
        super(MultiHeadDilatedAttention, self).__init__()
        self.heads = nn.ModuleList([])
        self.embed_size = embed_size
        self.num_heads = num_heads

        for dilation_rate in dilation_rates:
            # Each head will have its own unique dilation rate
            self.heads.append(DilatedAttention(embed_size, 1, segment_length, dilation_rate))

        # Output linear layer to combine the heads
        self.fc_out = nn.Linear(embed_size * len(dilation_rates), embed_size)

    def forward(self, values, keys, query):
        """Return a tensor of shape (batch, query_len, embed_size).

        A head with dilation rate r only produces outputs for query positions
        0, r, 2r, ...; those are written back to their original positions (the
        skipped positions stay zero) so that heads with different rates can be
        concatenated along the feature axis.
        """
        query_length = query.shape[1]
        attention_outs = []

        for head in self.heads:
            head_out = head(values, keys, query)

            if head_out.shape[1] != query_length:
                positions = torch.arange(head_out.shape[1], device=head_out.device) * head.dilation_rate
                restored = head_out.new_zeros(head_out.shape[0], query_length, head_out.shape[2])
                restored[:, positions, :] = head_out
                head_out = restored

            attention_outs.append(head_out)

        # Concatenate outputs from different heads
        out = torch.cat(attention_outs, dim=-1)

        # Final linear layer to project back to the original embedding size
        return self.fc_out(out)
#my added cachedMemory bank class here
class CachedMemoryBank(nn.Module):
    """Token-indexed memory: embeds token ids and projects them to keys and values.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each token embedding.
        memory_dim: Width of the returned key and value vectors.
    """

    def __init__(self, vocab_size, embedding_dim, memory_dim):
        super().__init__()

        # Lookup table followed by two independent projections of the same embedding.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.memory_key = nn.Linear(embedding_dim, memory_dim)
        self.memory_value = nn.Linear(embedding_dim, memory_dim)

    def forward(self, input_ids):
        """Return ``(keys, values)``, each shaped ``input_ids.shape + (memory_dim,)``."""
        token_vectors = self.embedding(input_ids)
        return self.memory_key(token_vectors), self.memory_value(token_vectors)
#ended here

class TransformerBlock(nn.Module):
    """Transformer layer: dilated attention into a token memory bank, then a feed-forward net.

    Args:
        hidden_size: Width of the hidden states entering and leaving the block.
        num_heads: Forwarded to MultiHeadDilatedAttention.
        feedforward_size: Inner width of the position-wise feed-forward network.
        memory_bank: Module mapping token ids to ``(keys, values)``; its output
            width must equal ``hidden_size``. May be None, in which case the
            block always falls back to self-attention over ``inputs``.
        dilation: Forwarded to MultiHeadDilatedAttention as its segment length.
        dropout_rate: Dropout probability applied before each residual add.
        dilation_rate: Dilation rates, one attention module per entry. The
            default list is only read, never mutated.
    """

    def __init__(self, hidden_size, num_heads, feedforward_size, memory_bank, dilation=1, dropout_rate=0.1, dilation_rate=[1,2,3,4]):
        super(TransformerBlock, self).__init__()

        # Multi-head dilated self-attention layer
        self.dilated_attention = MultiHeadDilatedAttention(hidden_size, num_heads, dilation, dilation_rate)

        # Layer normalization
        self.norm1 = nn.LayerNorm(hidden_size)

        # Position-wise feedforward network
        self.feedforward = nn.Sequential(
            nn.Linear(hidden_size, feedforward_size),
            nn.ReLU(),
            nn.Linear(feedforward_size, hidden_size)
        )

        # Layer normalization
        self.norm2 = nn.LayerNorm(hidden_size)

        # Dropout
        self.dropout = nn.Dropout(dropout_rate)

        # Use the memory bank handed in by the caller. It is no longer replaced
        # by one built from notebook-level globals with a hard-coded width.
        self.memory_bank = memory_bank

    def forward(self, inputs, attention_mask=None, input_ids=None):
        """Apply attention and feed-forward sub-layers with residual connections.

        Args:
            inputs: Hidden states of shape (batch, seq_len, hidden_size).
            attention_mask: Accepted for interface compatibility; not used.
            input_ids: Token ids of shape (batch, seq_len) used to look up keys
                and values in the memory bank. When omitted (or when the block
                has no memory bank) the block attends over ``inputs`` itself.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        if input_ids is None or self.memory_bank is None:
            keys = values = inputs
        else:
            keys, values = self.memory_bank(input_ids)

        # Pass by keyword: the attention signature is (values, keys, query).
        attention_output = self.dilated_attention(values=values, keys=keys, query=inputs)
        attention_output = self.dropout(attention_output) + inputs
        attention_output = self.norm1(attention_output)

        # Position-wise feedforward
        ff_output = self.feedforward(attention_output)
        ff_output = self.dropout(ff_output) + attention_output
        ff_output = self.norm2(ff_output)

        return ff_output



class LargeLanguageModel(nn.Module):
    """Token embedding, a stack of TransformerBlocks and a linear vocabulary head.

    ``embedding_dim`` must equal ``hidden_dim`` because the embeddings are fed
    straight into blocks that are built for ``hidden_dim``-wide inputs.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, num_heads, feedforward_size, dilation=1, dropout_rate=0.1, max_sequence_length=512):
        super(LargeLanguageModel, self).__init__()

        self.max_sequence_length = max_sequence_length
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Every block owns its own memory bank.
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(hidden_dim, num_heads, feedforward_size, memory_bank=CachedMemoryBank(vocab_size, embedding_dim, hidden_dim), dilation=dilation, dropout_rate=dropout_rate)
            for _ in range(num_layers)
        ])

        self.linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_ids, attention_mask=None):
        """Return logits of shape (batch * seq_len, vocab_size) for ``input_ids`` of shape (batch, seq_len)."""
        transformer_output = self.embedding(input_ids)

        for block in self.transformer_blocks:
            # The token ids are passed explicitly so each block can query its memory bank.
            transformer_output = block(transformer_output, attention_mask, input_ids=input_ids)

        # Flatten batch and sequence axes before the vocabulary projection
        batch_size, seq_length, hidden_dim = transformer_output.size()
        transformer_output = transformer_output.reshape(batch_size * seq_length, hidden_dim)

        return self.linear(transformer_output)

    def generate_text(self, input_ids, max_length, temperature=1.0, top_k=None, top_p=None):
        """Sample up to ``max_length`` new tokens for a single sequence.

        Args:
            input_ids: Prompt token ids of shape (1, prompt_len).
            max_length: Maximum number of tokens to append.
            temperature: Divisor applied to the logits before sampling.
            top_k: If set, sample only among the ``top_k`` most likely tokens.
            top_p: If set (and ``top_k`` is None), nucleus sampling threshold.

        Returns:
            Tensor of shape (1, prompt_len + generated) with the prompt followed
            by the sampled tokens. Generation stops early once the token with id
            ``vocab_size - 1`` is produced.

        Raises:
            ValueError: If ``input_ids`` is not a single 2-D sequence.

        The module's train/eval mode is left untouched; call ``eval()`` first to
        disable dropout while generating.
        """
        if input_ids.dim() != 2 or input_ids.size(0) != 1:
            raise ValueError("generate_text expects input_ids of shape (1, sequence_length)")

        # Clone the input_ids to avoid modifying the original
        generated_ids = input_ids.clone()

        with torch.no_grad():
            for _ in range(max_length):
                # The last row of the flattened logits belongs to the last position.
                logits = self.forward(generated_ids)
                logits = logits[-1, :] / temperature

                if top_k is not None:
                    # Sample inside the top-k subset, then map back to vocabulary ids.
                    top_logits, top_indices = torch.topk(logits, min(top_k, logits.size(-1)))
                    probs = torch.softmax(top_logits, dim=-1)
                    choice = torch.multinomial(probs, num_samples=1)
                    predicted_id = top_indices[choice].squeeze()
                elif top_p is not None:
                    # Nucleus sampling: drop the tail whose cumulative probability exceeds top_p.
                    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                    cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
                    sorted_indices_to_remove = cumulative_probs > top_p
                    # Shift right so the first token crossing the threshold is kept.
                    sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].clone()
                    sorted_indices_to_remove[0] = False
                    logits[sorted_indices[sorted_indices_to_remove]] = float('-inf')
                    probs = torch.softmax(logits, dim=-1)
                    predicted_id = torch.multinomial(probs, num_samples=1).squeeze()
                else:
                    # Regular softmax-based sampling
                    probs = torch.softmax(logits, dim=-1)
                    predicted_id = torch.multinomial(probs, num_samples=1).squeeze()

                # Append the predicted_id to generated_ids
                generated_ids = torch.cat((generated_ids, predicted_id.unsqueeze(0).unsqueeze(0)), dim=1)

                # Check if the generated token is the end token
                if predicted_id == self.vocab_size - 1:
                    break

        return generated_ids


class CustomTokenizer:
    """Whitespace tokenizer with a frequency-based vocabulary built from a text file.

    Special tokens occupy the first ids (in the order given); the most frequent
    corpus words follow.

    Args:
        corpus_file: Path of a UTF-8 text file used to build the vocabulary.
        max_vocab_size: Upper bound on the vocabulary size, special tokens included.
        special_tokens: Tokens placed at ids 0..k-1. Defaults to
            ``['<PAD>', '<UNK>', '<START>', '<END>']``. ``encode`` and
            ``pad_sequences`` look up ``'<UNK>'``, so it must be present.
    """

    def __init__(self, corpus_file, max_vocab_size=5000, special_tokens=None):
        self.corpus_file = corpus_file
        self.max_vocab_size = max_vocab_size
        self.special_tokens = special_tokens or ['<PAD>', '<UNK>', '<START>', '<END>']
        self.vocab = {}
        self.reverse_vocab = {}
        self.build_vocab()

    def build_vocab(self):
        """Count whitespace-separated words in the corpus and assign ids."""
        word_counter = collections.Counter()
        special_tokens = self.special_tokens

        with open(self.corpus_file, 'r', encoding='utf-8') as file:
            for line in file:
                word_counter.update(line.strip().split())

        most_common = word_counter.most_common(self.max_vocab_size - len(special_tokens))

        # Regular words start right after the special-token ids.
        self.vocab = {word: idx + len(special_tokens) for idx, (word, _) in enumerate(most_common)}
        self.reverse_vocab = {idx: word for word, idx in self.vocab.items()}

        for idx, token in enumerate(special_tokens):
            self.vocab[token] = idx
            self.reverse_vocab[idx] = token

    def encode(self, text, add_special_tokens=True):
        """Convert text to ids; unknown words map to ``<UNK>``.

        With ``add_special_tokens`` the sequence is wrapped in ``<START>`` / ``<END>``.
        """
        tokens = text.strip().split()
        if add_special_tokens:
            tokens = ['<START>'] + tokens + ['<END>']
        unknown_id = self.vocab['<UNK>']
        return [self.vocab.get(token, unknown_id) for token in tokens]

    def decode(self, token_ids, skip_special_tokens=True):
        """Convert ids back to a space-joined string; unknown ids become ``<UNK>``.

        With ``skip_special_tokens`` the ``<PAD>``, ``<START>`` and ``<END>``
        tokens are dropped (``<UNK>`` is kept).
        """
        tokens = [self.reverse_vocab.get(token_id, '<UNK>') for token_id in token_ids]
        if skip_special_tokens:
            tokens = [token for token in tokens if token not in ['<PAD>', '<START>', '<END>']]
        return ' '.join(tokens)

    def pad_sequences(self, sequences, max_length, padding_token='<PAD>'):
        """Right-pad every sequence shorter than ``max_length``.

        Longer sequences are returned at their full length (no truncation).
        New lists are returned; the caller's sequences are left unmodified.
        """
        pad_id = self.vocab.get(padding_token, self.vocab['<UNK>'])
        return [list(seq) + [pad_id] * (max_length - len(seq)) for seq in sequences]



# Instantiate the pretrained GPT-2 tokenizer used for all dataset tokenization below.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token for padding so that `pad_token_id`, which
# LanguageModelDataset.DataTokenization pads with, is defined.
tokenizer.pad_token = tokenizer.eos_token

class LanguageModelDataset(Dataset):
    """Turns an instruction dataset into fixed-length (input_ids, response) tensors.

    Args:
        dataset: Mapping of split name ('train', 'test') to a table exposing the
            text columns 'system_prompt', 'question' and 'response'.
        tokenizer: Object with ``encode(text, add_special_tokens=False)`` and
            a ``pad_token_id`` attribute.
        max_sequence_length: Stored on the instance.
        batch_size: Batch size used by the DataLoaders built in DataDivision.
        sample_size: Stored on the instance.
    """

    def __init__(self, dataset, tokenizer, max_sequence_length, batch_size, sample_size):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_sequence_length = max_sequence_length
        self.batch_size = batch_size
        self.sample_size = sample_size

    def DataTokenization(self, text, max_sequence_length):
        """Tokenize ``text`` and truncate or right-pad it to exactly ``max_sequence_length`` ids."""
        value = self.tokenizer.encode(text, add_special_tokens=False)[:max_sequence_length]
        return value + [self.tokenizer.pad_token_id] * (max_sequence_length - len(value))

    def _preprocess_split(self, split, max_sequence_length, sample_size):
        """Build the TensorDataset for one split (shared by the train and test entry points)."""
        columns = self.dataset[split]
        system_prompts = columns['system_prompt']
        questions = columns['question']
        responses = columns['response']

        # Never read past the end of the split.
        sample_size = min(sample_size, len(responses))
        # Prompt and question each get half of the input budget.
        submax_sequence_length = max_sequence_length // 2

        input_rows = []
        response_rows = []
        for i in range(sample_size):
            padded_system_prompt = self.DataTokenization(system_prompts[i], submax_sequence_length)
            padded_question = self.DataTokenization(questions[i], submax_sequence_length)
            input_rows.append(padded_system_prompt + padded_question)
            response_rows.append(self.DataTokenization(responses[i], max_sequence_length))

        input_ids = torch.tensor(input_rows, dtype=torch.long)
        response_tensors = torch.tensor(response_rows, dtype=torch.long)

        return torch.utils.data.TensorDataset(input_ids, response_tensors)

    def TrainDataPreprocess(self, max_sequence_length, sample_size):
        """Return a TensorDataset of (input_ids, response) built from the 'train' split.

        ``input_ids`` is the padded system prompt followed by the padded
        question (``max_sequence_length // 2`` ids each); ``response`` holds
        ``max_sequence_length`` ids. At most ``sample_size`` rows are used.
        """
        return self._preprocess_split('train', max_sequence_length, sample_size)

    def TestDataPreprocess(self, max_sequence_length, sample_size):
        """Same as TrainDataPreprocess, but reads the 'test' split."""
        return self._preprocess_split('test', max_sequence_length, sample_size)

    def DataDivision(self, data, test_size=0.2, random_state=42):
        """Split ``data`` and wrap both parts in DataLoaders.

        Returns:
            ``(train_dataloader, validation_dataloader)``; only the training
            loader shuffles.
        """
        train_data, val_data = self.DataDivision2(data, test_size=test_size, random_state=random_state)

        train_dataloader = DataLoader(train_data, batch_size=self.batch_size, shuffle=True)
        validation_dataloader = DataLoader(val_data, batch_size=self.batch_size, shuffle=False)
        return train_dataloader, validation_dataloader

    def DataDivision2(self, data, test_size=0.2, random_state=42):
        """Split ``data`` into training and validation parts without building DataLoaders."""
        train_data, val_data = train_test_split(data, test_size=test_size, random_state=random_state)
        return train_data, val_data


# Instruction dataset from the Hugging Face Hub. LanguageModelDataset reads its
# 'train' / 'test' splits and the 'system_prompt', 'question' and 'response' columns.
dataset = load_dataset("shirsh10mall/LLM_Instruct_Learning_Project_Preprocessed_Tokenized_Open_Orca_Dataset_Flan_T5")

In [None]:
# Standalone construction check for MultiHeadDilatedAttention. The model built
# further down creates its own attention modules inside each TransformerBlock,
# so this instance is not part of the trained model.
embed_size = 768  # Example embedding size
num_heads = 4     # Number of heads (note: reassigned in the model-construction cell)
segment_length = 16  # Segment length; DilatedAttention stores it without using it
dilation_rates = [1, 2, 3, 4]  # One DilatedAttention module is created per entry

# MultiHeadDilatedAttention itself does not enforce this; the check documents the
# intended one-rate-per-head pairing.
assert len(dilation_rates) == num_heads, "Length of dilation_rates must be equal to num_heads"

# Create an instance of MultiHeadDilatedAttention
multi_head_dilated_attention = MultiHeadDilatedAttention(embed_size, num_heads, segment_length, dilation_rates)

In [None]:
# Tokenize the training split and divide it into training and validation parts.
# Each input row is system prompt + question (max_sequence_length // 2 ids each),
# and each response row has max_sequence_length ids.
max_sequence_length = 4096
sample_size = 230318  # number of training rows to preprocess
batch_size = 3  # same value as "train_micro_batch_size_per_gpu" in the DeepSpeed config
LDS = LanguageModelDataset(dataset, tokenizer, max_sequence_length, batch_size, sample_size)

# Preprocess the data. The tokenizer warns about sequences longer than 1024
# tokens here; DataTokenization truncates them afterwards.
data = LDS.TrainDataPreprocess(max_sequence_length, sample_size)
# DataDivision2 returns the raw splits (no DataLoader); DeepSpeed builds the loader.
train_dataset, validation_dataset = LDS.DataDivision2(data, test_size=0.2, random_state=42)

Token indices sequence length is longer than the specified maximum sequence length for this model (1435 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# Create an instance of the model.
# The embedding tables must cover every id the tokenizer can produce. With a
# smaller vocab_size (previously 2048) the embedding lookup indexes out of range,
# which on GPU surfaces as the `srcIndex < srcSelectDimSize` device-side assert.
vocab_size = len(tokenizer)
embedding_dim = 768  # must equal hidden_dim: embeddings feed the blocks directly
hidden_dim = 768
num_layers = 12
num_heads = 12
feedforward_size = 2048
dilation = 4
dropout_rate = 0.1
# NOTE: this rebinds the notebook-level max_sequence_length that the data
# preparation cell set to 4096; the model only stores the value.
max_sequence_length = 2048
model = LargeLanguageModel(vocab_size, embedding_dim, hidden_dim, num_layers, num_heads, feedforward_size, dilation, dropout_rate, max_sequence_length)

In [None]:
# DeepSpeed setup: ZeRO stage 3 with optimizer state and parameters offloaded to CPU.
deepspeed_config = {
    # NOTE(review): the loss-scaling keys below are the ones DeepSpeed documents
    # for its "fp16" section; confirm that an "fp32" section is read at all.
    "fp32": {
        "enabled": True,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": 0.001,
            "betas": (0.9, 0.99),
            "eps": 1e-8,
            "weight_decay": 3e-7
        }
    },
    # Learning-rate warmup from 0 to the optimizer's lr over the first 1000 steps.
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_max_lr": 0.001,
            "warmup_num_steps": 1000
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": True
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": True,
            "buffer_count": 5,
            "buffer_size": 1e8,
            "max_in_cpu": 1e9
        },
        "overlap_comm": True,
        "contiguous_gradients": True,
        "sub_group_size": 1e8,
        "reduce_bucket_size": 2e8,
        "stage3_prefetch_bucket_size": 1e8,
        "stage3_param_persistence_threshold": 1e7,
        "stage3_max_live_parameters": 3e8,
        "stage3_max_reuse_distance": 2e7,
        "stage3_gather_16bit_weights_on_model_save": True
    },

    # DeepSpeed expects train_batch_size == micro batch per GPU * accumulation
    # steps * number of processes; 9 == 3 * 3 * 1, i.e. a single process.
    # Adjust train_batch_size when launching on more GPUs.
    "gradient_accumulation_steps": 3,
    "gradient_clipping": 1.0,
    "steps_per_print": 2000,
    "train_batch_size": 9,
    "train_micro_batch_size_per_gpu": 3,
    "wall_clock_breakdown": False,

    # Optional data_types configuration (disabled):
#     "data_types": {
#         "grad_accum_dtype": ["fp32", "fp16", "bf16"]
#     }
}

# Initialize DeepSpeed.
# Returns a tuple of ``engine``, ``optimizer``, ``training_dataloader``, ``lr_scheduler``;
# the scheduler is discarded here. Passing `training_data` makes DeepSpeed build
# the training DataLoader.
# NOTE(review): the recorded "Address already in use" error for port 29500 looks
# like a leftover process/kernel still holding the rendezvous port — confirm and
# restart the kernel or pick another MASTER_PORT before re-running.
deep_model,deep_optimizer, deep_train_loader, _ = deepspeed.initialize(model=model, training_data=train_dataset, config=deepspeed_config,)

[2023-12-01 14:04:35,086] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.12.3, git-hash=unknown, git-branch=unknown
[2023-12-01 14:04:35,088] [INFO] [comm.py:637:init_distributed] cdb=None
[2023-12-01 14:04:35,089] [INFO] [comm.py:652:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2023-12-01 14:04:36,227] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=172.17.0.2, master_port=29500
[2023-12-01 14:04:36,230] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


[W socket.cpp:426] [c10d] The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use).
[W socket.cpp:426] [c10d] The server socket has failed to bind to 0.0.0.0:29500 (errno: 98 - Address already in use).
[E socket.cpp:462] [c10d] The server socket has failed to listen on any local network address.


RuntimeError: The server socket has failed to listen on any local network address. The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use). The server socket has failed to bind to 0.0.0.0:29500 (errno: 98 - Address already in use).

In [None]:
len(train_dataset),len(deep_train_loader)

In [None]:
deep_model,deep_train_loader,deep_optimizer

In [None]:
len(deep_train_loader)

In [None]:
for iteration, batch in enumerate(deep_train_loader):
    print(batch)
    print('----tttt----')
    if iteration>2:
        break

In [None]:
batch2=next(deep_train_loader)
batch2

In [None]:
batch2=next(deep_train_loader)
batch2[0]

In [None]:
len(batch2)

In [None]:
# Loss function
criterion = nn.CrossEntropyLoss()

In [None]:
### Training loop driven by the DeepSpeed engine
# `deep_model` is the engine returned by deepspeed.initialize and
# `deep_train_loader` the DataLoader it built from `train_dataset`.
num_epochs = 2
# Device that each batch is moved to (the model itself is managed by the engine).
device = "cuda" if torch.cuda.is_available() else "cpu"

for epoch in range(num_epochs):

    # Each batch is an (input_ids, response) pair of token-id tensors.
    for iteration, batch in enumerate(deep_train_loader):
        print("We are in the training loop !!!!!!!!!!!!!!!!!!!!!!!!!")
        input_ids, response_tensor= batch
        input_ids=input_ids.to(device)
        response_tensor=response_tensor.to(device)

        # Forward pass. Every token id must be smaller than the model's
        # vocab_size, otherwise the embedding lookup fails with a CUDA
        # device-side assert.
        logits = deep_model(input_ids)

        print("WEEEEEEEEEEEEEEEEEEEEEEEE ARE HEREEEEEEEE!!!!!!!!!!!!!!!!!!")
        # The model returns (batch * seq_len, vocab) logits; flatten the targets to match.
        loss = criterion(logits.view(-1, logits.size(-1)), response_tensor.view(-1))
        print(f'Epoch {epoch+1} Step {iteration+1} | Loss Calculated : {loss}')

        # With DeepSpeed the engine runs the backward pass (instead of loss.backward()).
        deep_model.backward(loss)

        # Use the engine's step() instead of optimizer.step().
        deep_model.step()

    # TODO: validation after each epoch is not implemented yet
    # (validation_dataset is available from the data preparation cell).

# Save the final model
# filepath = 'trained_model.pth'
# torch.save(model.state_dict(), filepath)


We are in the training loop !!!!!!!!!!!!!!!!!!!!!!!!!


../aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [118,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [118,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [118,0,0], thread: [66,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [118,0,0], thread: [67,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [118,0,0], thread: [68,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [118,0,0], thread: [69,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [118,

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
# Plain-PyTorch fallback training loop (currently not used). Run it *instead of*
# the DeepSpeed cells: `model` is a regular nn.Module here, so the backward pass
# and the parameter update go through a torch optimizer rather than the
# engine-only `backward(loss)` / `step()` methods.
num_epochs = 10
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Same optimizer hyper-parameters as the DeepSpeed configuration.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, betas=(0.9, 0.99), eps=1e-8, weight_decay=3e-7)

# Iterating `train_dataset` directly yields single, unbatched samples, while the
# model expects (batch, seq_len) inputs, so batch them with a DataLoader first.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        input_ids = batch[0].to(device)
        response_tensor = batch[1].to(device)

        # Forward pass: logits are (batch * seq_len, vocab); flatten the targets to match.
        logits = model(input_ids)
        loss = criterion(logits.view(-1, logits.size(-1)), response_tensor.view(-1))

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # TODO: validation is not implemented yet, so only the last training loss is reported.
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {loss.item()}")

# Save the final model
# filepath = 'trained_model.pth'
# torch.save(model.state_dict(), filepath)


In [None]:
print(input_ids)

In [None]:
response_tensor

In [None]:
!pip install --upgrade torch
