In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bhagawat-training-data-ploy-features/training_data_ploy_features.csv
/kaggle/input/final/transformers/default/1/config.json
/kaggle/input/final/transformers/default/1/spiece.model
/kaggle/input/final/transformers/default/1/tokenizer_config.json
/kaggle/input/final/transformers/default/1/model.safetensors
/kaggle/input/final/transformers/default/1/special_tokens_map.json
/kaggle/input/final/transformers/default/1/added_tokens.json
/kaggle/input/final/transformers/default/1/generation_config.json
/kaggle/input/prediction/data_for_prediction_ploy_features.csv


In [3]:
# Here I used google-t5/t5-base model from hugging face
# training for 2 epochs on data training_data_ploy_features.csv

In [None]:
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from torch.nn.utils import clip_grad_norm_
import time

st = time.time()

# Define a dataset class
class CustomDataset(Dataset):
    def __init__(self, tokenizer, data, max_len=128):
        self.tokenizer = tokenizer
        self.data = data
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        # Construct input text using all columns except 'field'
        input_columns = [col for col in self.data.columns if col != 'field']
        input_text = ' '.join([f"{col}: {row[col]}" for col in input_columns])  # Example input construction

        # Target text is the 'field' column
        target_text = row['field']  # Output is the 'field' column

        # Tokenize inputs and targets
        inputs = self.tokenizer.encode_plus(
            input_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        targets = self.tokenizer.encode_plus(
            target_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)
        labels = targets["input_ids"].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Load your dataset using pandas
data = pd.read_csv("/kaggle/input/training-data-ploy-infrrd/training_data_ploy_features.csv")

# Load the tokenizer and model for conditional generation
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Create dataset and dataloader with reduced batch size
train_dataset = CustomDataset(tokenizer, data)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)  # Reduced batch size

# Optimizer and training settings
optimizer = AdamW(model.parameters(), lr=5e-4)  # Learning rate adjusted
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Mixed precision training setup
scaler = GradScaler()

# Gradient accumulation settings
gradient_accumulation_steps = 2  # Accumulate gradients over 2 batches
accumulated_loss = 0

# Training loop
num_epochs = 2
model.train()

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    total_loss = 0
    num_batches = len(train_loader)

    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Ignore padding tokens in the labels
        labels[labels == tokenizer.pad_token_id] = -100

        with autocast():  # Mixed precision context
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss / gradient_accumulation_steps  # Divide by accumulation steps
            accumulated_loss += loss.item()
        
        # Check for NaN loss and skip the batch if encountered
        if torch.isnan(loss):
            print(f"Skipping batch {batch_idx + 1} due to NaN loss")
            continue

        # Scale the loss and call backward
        scaler.scale(loss).backward()

        # Unscale the gradients and apply gradient clipping after accumulation
        if (batch_idx + 1) % gradient_accumulation_steps == 0:
            scaler.unscale_(optimizer)
            clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
            scaler.step(optimizer)
            scaler.update()

        # Clear the CUDA cache periodically to free up memory
        if (batch_idx + 1) % 100 == 0:
            torch.cuda.empty_cache()

        # Print progress
        print(f"Batch {batch_idx + 1}/{num_batches}, Loss: {accumulated_loss}")
        accumulated_loss = 0

    # Print average loss for the epoch
    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch + 1} Average Loss: {avg_loss}")

    # Save the model and optimizer state after each epoch
    model.save_pretrained(f"trained_t5_model_epoch_{epoch + 1}")
    tokenizer.save_pretrained(f"trained_t5_model_epoch_{epoch + 1}")
    torch.save(optimizer.state_dict(), f"optimizer_state_epoch_{epoch + 1}.pth")
    print(f"Model and optimizer state saved after epoch {epoch + 1}")

# Final save if all epochs complete
model.save_pretrained("trained_t5_model_final")
tokenizer.save_pretrained("trained_t5_model_final")
torch.save(optimizer.state_dict(), "optimizer_state_final.pth")
print("Final model and optimizer state saved.")
end = time.time()
print(f"Training time: {end - st} seconds")


In [None]:
# The result of this is present in eval_metrics_for_2epochs_ploy_features_model.csv this file.

In [None]:
# This is prediction code for this.

In [None]:
import time
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
st = time.time()
# Load the trained model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("/kaggle/input/model_v4/transformers/default/1")
model = T5ForConditionalGeneration.from_pretrained("/kaggle/input/model_v4/transformers/default/1")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# Load the test data
test_data = pd.read_csv("/kaggle/input/testing-data-ploy/testing_data_ploy_features.csv")

# Prepare a function for prediction
def predict_field(row, tokenizer, model, max_len=128):
    # Construct input text using all columns except 'field'
    input_columns = [col for col in test_data.columns if col != 'field']
    input_text = ' '.join([f"{col}: {row[col]}" for col in input_columns])
    
    # Tokenize the input text
    inputs = tokenizer.encode_plus(
        input_text,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    
    # Generate prediction
    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_len)
    
    # Decode the generated prediction
    predicted_field = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return predicted_field

# Apply the function to each row of the test data and create a new column for the predictions
test_data['predicted_field'] = test_data.apply(lambda row: predict_field(row, tokenizer, model), axis=1)

# Save the result to a new CSV file (optional)
test_data.to_csv("test_data_with_ploy_predictions.csv", index=False)

end = time.time()
print(end-st)


In [None]:
df11 = pd.read_csv("/kaggle/working/test_data_with_ploy_predictions.csv")


In [None]:
df11.head()

In [None]:
# After that I again trained my already train model for 3 epochs

In [1]:
# training for 3rd epoch

In [None]:
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from torch.nn.utils import clip_grad_norm_
import time

st = time.time()

# Define a dataset class (same as before)
class CustomDataset(Dataset):
    def __init__(self, tokenizer, data, max_len=128):
        self.tokenizer = tokenizer
        self.data = data
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        # Construct input text using all columns except 'field'
        input_columns = [col for col in self.data.columns if col != 'field']
        input_text = ' '.join([f"{col}: {row[col]}" for col in input_columns])  # Example input construction

        # Target text is the 'field' column
        target_text = row['field']  # Output is the 'field' column

        # Tokenize inputs and targets
        inputs = self.tokenizer.encode_plus(
            input_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        targets = self.tokenizer.encode_plus(
            target_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)
        labels = targets["input_ids"].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Load your dataset
data = pd.read_csv("/kaggle/input/bhagawat-training-data-ploy-features/training_data_ploy_features.csv")

# Load the saved model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("/kaggle/input/model_4a/transformers/default/1")
model = T5ForConditionalGeneration.from_pretrained("/kaggle/input/model_4a/transformers/default/1")

# Create dataset and dataloader
train_dataset = CustomDataset(tokenizer, data)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)  # Same batch size

# Set up a new optimizer
optimizer = AdamW(model.parameters(), lr=5e-4)  # Same learning rate as before
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Mixed precision training setup
scaler = GradScaler()

# Gradient accumulation settings
gradient_accumulation_steps = 2
accumulated_loss = 0

# Continue training for 1 more epoch
num_epochs = 1  # One additional epoch
model.train()

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    total_loss = 0
    num_batches = len(train_loader)

    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Ignore padding tokens in the labels
        labels[labels == tokenizer.pad_token_id] = -100

        with autocast():  # Mixed precision context
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss / gradient_accumulation_steps  # Divide by accumulation steps
            accumulated_loss += loss.item()

        # Check for NaN loss and skip the batch if encountered
        if torch.isnan(loss):
            print(f"Skipping batch {batch_idx + 1} due to NaN loss")
            continue

        # Scale the loss and call backward
        scaler.scale(loss).backward()

        # Unscale the gradients and apply gradient clipping after accumulation
        if (batch_idx + 1) % gradient_accumulation_steps == 0:
            scaler.unscale_(optimizer)
            clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
            scaler.step(optimizer)
            scaler.update()

        # Clear the CUDA cache periodically to free up memory
        if (batch_idx + 1) % 100 == 0:
            torch.cuda.empty_cache()

        # Print progress
        print(f"Batch {batch_idx + 1}/{num_batches}, Loss: {accumulated_loss}")
        accumulated_loss = 0

    # Print average loss for the epoch
    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch + 1} Average Loss: {avg_loss}")

    # Save the model and optimizer state after this epoch
    model.save_pretrained(f"trained_t5_model_epoch_{epoch + 3}")  # Save as epoch 3 (continue from previous)
    tokenizer.save_pretrained(f"trained_t5_model_epoch_{epoch + 3}")
    torch.save(optimizer.state_dict(), f"optimizer_state_epoch_{epoch + 3}.pth")
    print(f"Model and optimizer state saved after epoch {epoch + 3}")

# Final save after the additional epoch
model.save_pretrained("trained_t5_model_final_v2")
tokenizer.save_pretrained("trained_t5_model_final_v2")
torch.save(optimizer.state_dict(), "optimizer_state_final_v2.pth")
print("Final model and optimizer state saved.")
end = time.time()
print(f"Training time: {end - st} seconds")


  scaler = GradScaler()
  with autocast():  # Mixed precision context


Epoch 1/1
Batch 1/14851, Loss: 0.004114020615816116
Batch 2/14851, Loss: 0.0600995272397995
Batch 3/14851, Loss: 0.0009119971073232591
Batch 4/14851, Loss: 0.001110713928937912
Batch 5/14851, Loss: 0.005726331379264593
Batch 6/14851, Loss: 0.007934951223433018
Batch 7/14851, Loss: 0.001703263376839459
Batch 8/14851, Loss: 0.0009576889569871128
Batch 9/14851, Loss: 0.003997235093265772
Batch 10/14851, Loss: 5.385403710533865e-05
Batch 11/14851, Loss: 0.0013376415008679032
Batch 12/14851, Loss: 0.006538925226777792
Batch 13/14851, Loss: 0.03696290776133537
Batch 14/14851, Loss: 0.007530923932790756
Batch 15/14851, Loss: 0.008077082224190235
Batch 16/14851, Loss: 0.0008829523576423526
Batch 17/14851, Loss: 0.03663649782538414
Batch 18/14851, Loss: 0.013496718369424343
Batch 19/14851, Loss: 0.025220854207873344
Batch 20/14851, Loss: 0.011424766853451729
Batch 21/14851, Loss: 0.0006425242172554135
Batch 22/14851, Loss: 0.003217418910935521
Batch 23/14851, Loss: 0.002208355814218521
Batch 24

In [None]:
# The result of this varient of model is in eval_metrics_after_runing_3_epoch.csv this file

In [None]:
import time
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Set up timing
st = time.time()

# Load the trained model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("/kaggle/input/final/transformers/default/1")
model = T5ForConditionalGeneration.from_pretrained("/kaggle/input/final/transformers/default/1")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# Load the test data
test_data = pd.read_csv("/kaggle/input/prediction/data_for_prediction_ploy_features.csv")

# Prepare the input text
def prepare_input_text(row, input_columns):
    return ' '.join([f"{col}: {row[col]}" for col in input_columns])

# Batch predictions for efficiency
def predict_batch(input_texts, tokenizer, model, max_len=128):
    inputs = tokenizer(
        input_texts,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    ).to(device)
    
    # Generate predictions
    with torch.no_grad():
        outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=max_len)
    
    # Decode predictions
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

# Define the columns to use for input text
input_columns = [col for col in test_data.columns if col != 'field']

# Batch size for predictions
batch_size = 1024
predictions = []

for i in range(0, len(test_data), batch_size):
    batch_data = test_data.iloc[i:i+batch_size]
    input_texts = [prepare_input_text(row, input_columns) for _, row in batch_data.iterrows()]
    
    # Predict for the current batch
    batch_predictions = predict_batch(input_texts, tokenizer, model)
    predictions.extend(batch_predictions)

# Add predictions to DataFrame
test_data['predicted_field'] = predictions

# Save the result to a new CSV file (optional)
test_data.to_csv("test_data_with_ploy_predictions.csv", index=False)

end = time.time()
print(f"Time taken: {end - st} seconds")

In [None]:
# Due internet disconnect output of cell in this file got erased