# Initial code

In [None]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import (
    BertTokenizer, 
    BertForMaskedLM, 
    BertForQuestionAnswering,
    Trainer, 
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import Dataset



In [None]:
def load_articles(directory_path):
    """
    Load all text files from a directory
    """
    articles = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.txt'):
            with open(os.path.join(directory_path, filename), 'r', encoding='utf-8') as file:
                articles.append(file.read())
    
    return articles



In [None]:
def prepare_mlm_dataset(chunks, tokenizer, max_length=512):
    """
    Prepare dataset for Masked Language Model (MLM) training.
    """
    encodings = tokenizer(
        chunks,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt"
    )
    
    # Convert to Dataset
    dataset = Dataset.from_dict({
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": encodings["input_ids"].clone()
    })
    return dataset



In [None]:
from transformers import BertTokenizer
from datasets import Dataset
import pandas as pd

def prepare_qa_dataset(qa_csv_path, tokenizer, max_length=512):
    """
    Prepare dataset for Question Answering training.
    """
    # Load QA data
    qa_data = pd.read_csv(qa_csv_path)

    # Ensure no empty question or answer
    qa_data = qa_data.dropna(subset=['Question', 'Answer'])
    
    # Tokenize questions and answers
    encodings = tokenizer(
        qa_data['Question'].tolist(),  # list of questions
        qa_data['Answer'].tolist(),  # list of answers
        truncation=True,
        max_length=max_length,
        padding='max_length',
        return_tensors='pt',  # Returns PyTorch tensors
        return_token_type_ids=True,
        return_attention_mask=True
    )

    # Prepare start and end positions for answers
    start_positions = []
    end_positions = []

    for i, (question, answer) in enumerate(zip(qa_data['Question'], qa_data['Answer'])):
        # Tokenize question and answer together
        question_answer = tokenizer.encode(question + " " + answer, add_special_tokens=True)
        
        # Find the position of the answer in the question + answer tokens
        try:
            start_idx = question_answer.index(tokenizer.encode(answer, add_special_tokens=False)[0])
            end_idx = start_idx + len(tokenizer.encode(answer, add_special_tokens=False)) - 1
        except ValueError:
            # In case the answer is not found, fallback (e.g., padding or empty tokens)
            start_idx = 0
            end_idx = 0
        
        start_positions.append(start_idx)
        end_positions.append(end_idx)

    # Create dataset
    dataset = Dataset.from_dict({
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'start_positions': start_positions,
        'end_positions': end_positions
    })
    
    return dataset


In [None]:
def train_mlm(model, train_dataset, tokenizer):
    """
    Train Masked Language Model
    """
    # Data collator for MLM
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=True, 
        mlm_probability=0.15
    )
    
    # Training arguments
    training_args = TrainingArguments(
        output_dir="./mlm_results",
        num_train_epochs=10,
        per_device_train_batch_size=16,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./mlm_logs'
    )
    
    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator
    )
    
    # Train the model
    trainer.train()
    
    return model



In [None]:
def fine_tune_qa(mlm_model, train_dataset):
    """
    Fine-tune Question Answering model using MLM-trained weights as initialization
    """
    # Convert MLM model to QA model
    qa_model = BertForQuestionAnswering.from_pretrained(
        "nlpaueb/legal-bert-base-uncased", 
        state_dict=mlm_model.state_dict()
    )
    
    # Training arguments for QA
    training_args = TrainingArguments(
        output_dir="./qa_results",
        num_train_epochs=10,
        per_device_train_batch_size=16,
        save_steps=10_000,
        save_total_limit=2,
        learning_rate=2e-5, 
        logging_dir='./qa_logs'
    )
    
    # Initialize Trainer
    trainer = Trainer(
        model=qa_model,
        args=training_args,
        train_dataset=train_dataset
    )
    
    # Fine-tune the model
    trainer.train()
    
    return qa_model


In [None]:
from transformers import BertTokenizerFast

In [None]:
# Initialize tokenizer and MLM model
tokenizer = BertTokenizer.from_pretrained("/kaggle/input/output-100mb/trained_tokenizer")
mlm_model = BertForMaskedLM.from_pretrained("/kaggle/input/output-100mb/trained_mlm_model")

# Load and prepare articles for MLM
# articles_dir = '/kaggle/input/legal-data-article/d08'

with open('/kaggle/input/legal-data-article/d08/output_93.txt', 'r', encoding='utf-8') as file:
    articles = file.read()

size=int(len(articles)/5)
articles=articles[size:3*size]
chunk_size = 10_000
chunks = [articles[i:i+chunk_size] for i in range(0, len(articles), chunk_size)]

    

In [None]:
len(articles)

In [None]:
len(chunks)

In [None]:
import gc
del articles
gc.collect()

In [None]:
    # Prepare MLM dataset
mlm_dataset = prepare_mlm_dataset(chunks, tokenizer)
del chunks
gc.collect()


In [None]:
    # Train MLM model
trained_mlm_model = train_mlm(mlm_model, mlm_dataset, tokenizer)

In [None]:
trained_mlm_model.save_pretrained("trained_mlm_model_1")

In [None]:
tokenizer.save_pretrained("trained_tokenizer_1")

In [None]:
import torch
from math import exp
from transformers import DataCollatorForLanguageModeling



In [None]:
test_files = ["/kaggle/input/legal-data-article/d08/output_15.txt", "/kaggle/input/legal-data-article/d08/output_25.txt","/kaggle/input/legal-data-article/d08/output_14.txt"]  # Replace with your test file paths

# Initialize results dictionary
results = {"file": [], "mlm_model": [], "trained_mlm_model": []}

In [None]:
import torch
import torch
from math import exp
from transformers import DataCollatorForLanguageModeling



# Ensure that the device is set properly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the models to the selected device (GPU if available, otherwise CPU)
mlm_model.to(device)
trained_mlm_model.to(device)

# You can check if the model is successfully moved to the device like this:
print(f"Model moved to: {device}")


In [None]:
def evaluate_model_on_txt(model, tokenizer, test_file_path, device):
    """
    Evaluates a model on a given test file for MLM task and computes perplexity.
    :param model: MLM model to evaluate
    :param tokenizer: Tokenizer for tokenizing the input
    :param test_file_path: Path to the test file (.txt)
    :param device: Device ('cuda' or 'cpu') where the model should run
    :return: Perplexity of the model on the test file
    """
    model.eval()
    with open(test_file_path, 'r', encoding='utf-8') as file:
        test_text = file.read()

    # Tokenize the text
    tokens = tokenizer(test_text, return_tensors="pt", truncation=True, max_length=512, padding="max_length")
    
    # Move the tokenized tensors to the specified device (either 'cuda' or 'cpu')
    input_ids = tokens["input_ids"].to(device)
    attention_mask = tokens["attention_mask"].to(device)

    # Mask tokens for MLM
    labels = input_ids.clone()
    masked_indices = torch.bernoulli(torch.full(labels.shape, 0.15)).bool()  # Mask 15% of tokens
    labels[~masked_indices] = -100  # Only compute loss on masked tokens

    # Compute loss
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss.item()

    # Calculate perplexity
    perplexity = exp(loss)
    return perplexity

# Loop through each test file and calculate perplexity for both models
for test_file in test_files:
    print(f"Evaluating on {test_file}...")
    
    # Pass 'cuda' instead of 'gpu'
    fine_tuned_perplexity = evaluate_model_on_txt(mlm_model, tokenizer, test_file, "cuda")
    pretrained_perplexity = evaluate_model_on_txt(trained_mlm_model, tokenizer, test_file, "cuda")
    
    # Append results to the dictionary
    results["file"].append(test_file)
    results["mlm_model"].append(fine_tuned_perplexity)
    results["trained_mlm_model"].append(pretrained_perplexity)
    
    # Print the perplexity values for each model
    print(f"  mlm_model perplexity: {fine_tuned_perplexity}")
    print(f"  trained_mlm_model perplexity: {pretrained_perplexity}")

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Display results
print("\nComparison Results:")
display(results_df)


In [None]:

    # Prepare QA dataset
    qa_csv_path = '/kaggle/input/ques-ans2/question_answers2.csv'
    qa_dataset = prepare_qa_dataset(qa_csv_path, tokenizer)
    
    # Fine-tune QA model using MLM model weights
    trained_qa_model = fine_tune_qa(trained_mlm_model, qa_dataset)
    
    # Save models





In [None]:
    trained_qa_model.save_pretrained("trained_qa_model_1")
    tokenizer.save_pretrained("trained_tokenizer_1")

In [None]:
def evaluate_model(question):
    """
    Predict the answer for a given question without requiring any explicit context.
    """
    # Use an empty string as the context
    context = ""
    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)
    inputs = {key: val.to("cuda") for key, val in inputs.items()}  # Move inputs to GPU

    with torch.no_grad():
        outputs = trained_qa_model(**inputs)

    # Get start and end logits
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Get predicted start and end indices
    start_idx = torch.argmax(start_logits, dim=1).item()
    end_idx = torch.argmax(end_logits, dim=1).item()

    # Decode the predicted answer
    tokens = inputs["input_ids"][0][start_idx:end_idx + 1]
    predicted_answer = tokenizer.decode(tokens, skip_special_tokens=True).strip()

    if not predicted_answer:
        return "No valid answer found."
    return predicted_answer

# Example usage
question = "What business laws"
predicted_answer = evaluate_model(question)
print("Predicted Answer:", predicted_answer)


In [36]:
from transformers import BertTokenizer, BertForMaskedLM

In [37]:
tokenizer = BertTokenizer.from_pretrained("/kaggle/input/final-capstone/trained_tokenizer_1")
mlm_model = BertForMaskedLM.from_pretrained("/kaggle/input/final-capstone/trained_mlm_model_1")

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [37]:
from transformers import BertForQuestionAnswering, BertTokenizer

# Load pre-trained BERT model for QA
model = BertForQuestionAnswering.from_pretrained('/kaggle/input/final-capstone/trained_mlm_model_1')
tokenizer = BertTokenizer.from_pretrained('/kaggle/input/final-capstone/trained_tokenizer_1')

def generate_answer(model, tokenizer, question, context, max_length=200):
    # Encode the question and context
    inputs = tokenizer(
        question, 
        context, 
        return_tensors="pt", 
        truncation=True, 
        max_length=512
    )
    
    # Get the answer
    outputs = model(**inputs)
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)
    
    # Extract the answer from the context
    answer = tokenizer.decode(inputs["input_ids"][0][start_index:end_index+1])
    return answer

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /kaggle/input/final-capstone/trained_mlm_model_1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
%pip install sentence_transformers

  pid, fd = os.forkpty()


Collecting sentence_transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.3.1
Note: you may need to restart the kernel to use updated packages.


In [54]:
from sentence_transformers import SentenceTransformer
import numpy as np

def find_most_relevant_contexts(df, question, top_k=3):
    # Load a sentence embedding model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # Embed the question
    question_embedding = model.encode(question)
    
    answer_embeddings = model.encode(df['Answer'].tolist())
    
    # Calculate cosine similarities
    similarities = np.dot(answer_embeddings, question_embedding)
    
    top_indices = similarities.argsort()[-top_k:][::-1]
    
    return df.iloc[top_indices]['Answer'].tolist()


def prepare_context(df, question):
    # Find most relevant contexts based on semantic similarity
    contexts = find_most_relevant_contexts(df, question)
    return " ".join(contexts)

In [55]:
def generate_answer(model, tokenizer, question, context, max_length=512):
    # Encode the question and context
    inputs = tokenizer(
        question, 
        context, 
        return_tensors="pt", 
        truncation=True, 
        max_length=max_length
    )
    
    # Get the answer
    with torch.no_grad():
        outputs = model(**inputs)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits
    
    start_index = torch.argmax(start_logits)
    end_index = torch.argmax(end_logits)
    
    if end_index < start_index:
        start_index, end_index = end_index, start_index
    
    answer_ids = inputs["input_ids"][0][start_index:end_index+1]
    answer = tokenizer.decode(answer_ids)
    
    return answer

In [57]:
# Usage
question = qna_data['Question'][0]
context = prepare_context(qna_data, question)
answer = generate_answer(model, tokenizer, question, context)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/384 [00:00<?, ?it/s]

In [58]:
answer

"leveraging the existing brand's reputation. brand extension is when a"

Lets try finetuning

In [86]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForQuestionAnswering, BertTokenizerFast, AdamW
from sklearn.model_selection import train_test_split

class QADataset(Dataset):
    def __init__(self, questions, contexts, answers, tokenizer, max_length=512):
        self.encodings = tokenizer(
            questions, 
            contexts, 
            truncation=True, 
            max_length=max_length, 
            padding=True,
            return_offsets_mapping=True
        )
        
        self.start_positions = []
        self.end_positions = []
        
        for i, (question, context) in enumerate(zip(questions, contexts)):
            answer = answers[i]
            
            # Simple span finding
            try:
                answer_start = context.lower().find(answer.lower())
                if answer_start == -1:
                    self.start_positions.append(0)
                    self.end_positions.append(0)
                    continue
                
                answer_end = answer_start + len(answer)
                
                # Tokenize context to find token positions
                context_tokens = tokenizer.encode(context, add_special_tokens=False)
                answer_tokens = tokenizer.encode(answer, add_special_tokens=False)
                
                # Find token positions
                start_token = len(tokenizer.encode(context[:answer_start], add_special_tokens=False))
                end_token = start_token + len(answer_tokens) - 1
                
                self.start_positions.append(start_token)
                self.end_positions.append(end_token)
            
            except Exception as e:
                print(f"Error processing sample: {e}")
                self.start_positions.append(0)
                self.end_positions.append(0)
    
    def __len__(self):
        return len(self.encodings['input_ids'])
    
    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'start_positions': torch.tensor(self.start_positions[idx]),
            'end_positions': torch.tensor(self.end_positions[idx])
        }

def fine_tune_qa_model(
    df,
    epochs=3, 
    batch_size=8, 
    learning_rate=5e-5
):
    # Prepare model and tokenizer
    model = BertForQuestionAnswering.from_pretrained('/kaggle/input/final-capstone/trained_mlm_model_1')
    tokenizer = BertTokenizerFast.from_pretrained('/kaggle/input/final-capstone/trained_tokenizer_1')

    # Split data
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

    # Create datasets
    train_dataset = QADataset(
        train_df['Question'].tolist(), 
        train_df['Context'].tolist(), 
        train_df['Answer'].tolist(), 
        tokenizer
    )
    val_dataset = QADataset(
        val_df['Question'].tolist(), 
        val_df['Context'].tolist(), 
        val_df['Answer'].tolist(), 
        tokenizer
    )

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Prepare optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Move model to device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Training loop
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}
            
            # Zero grad
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            
            # Backward pass
            loss.backward()
            optimizer.step()
        
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")
    
    # Save the model
    model.save_pretrained('./qa_model')
    tokenizer.save_pretrained('./qa_model')
    
    return model, tokenizer



In [87]:
qna_data = qna_data.head(10)
# Prepare DataFrame
qna_data['Context'] = qna_data.apply(
    lambda row: prepare_context(qna_data, row['Question']), 
    axis=1
)

# Fine-tune the model
finetuned_model, finetuned_tokenizer = fine_tune_qa_model(qna_data)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /kaggle/input/final-capstone/trained_mlm_model_1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Loss: 4.769453525543213
Epoch 2/3, Loss: 4.353943347930908
Epoch 3/3, Loss: 4.319149017333984


In [88]:
def generate_answer(model, tokenizer, question, context, max_length=512):
    # Determine the device (CUDA if available, otherwise CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Move the model to the appropriate device
    model.to(device)
    
    # Encode the question and context
    inputs = tokenizer(
        question, 
        context, 
        return_tensors="pt", 
        truncation=True, 
        max_length=max_length
    ).to(device)  # Move inputs to the same device
    
    # Get the answer
    with torch.no_grad():
        outputs = model(**inputs)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits
    
    # Find the best start and end indices
    start_index = torch.argmax(start_logits)
    end_index = torch.argmax(end_logits)
    
    # Ensure end_index is after start_index
    if end_index < start_index:
        start_index, end_index = end_index, start_index
    
    # Extract the answer from the context
    answer_ids = inputs["input_ids"][0][start_index:end_index+1]
    answer = tokenizer.decode(answer_ids)
    
    return answer


In [92]:
# Usage
question = qna_data['Question'][0]
context = prepare_context(qna_data, question)
finetuned_model.to(device)
answer = generate_answer(finetuned_model, finetuned_tokenizer, question, context)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [94]:
answer

"brand extensions. [SEP] brand extensions involve using an established brand to introduce new products or enter new markets, leveraging the existing brand's reputation. 1. establish brand meaning in the minds of the customer2. covert the brand responses to a loyal relationship between the customer and the company3. elicit a customer response brand differentiation is the process of making a brand stand out from its competitors. it's important to attract and retain customers in a crowded marketplace. [SEP]"

In [98]:
context

"Brand extensions involve using an established brand to introduce new products or enter new markets, leveraging the existing brand's reputation. 1. Establish brand meaning in the minds of the customer2. Covert the brand responses to a loyal relationship between the customer and the company3. Elicit a customer response Brand differentiation is the process of making a brand stand out from its competitors. It's important to attract and retain customers in a crowded marketplace."

In [96]:
print(question, '\nAnswer'+answer.split('[SEP]')[1])

Q.1Explain the concept of brand extensions. 
Answer brand extensions involve using an established brand to introduce new products or enter new markets, leveraging the existing brand's reputation. 1. establish brand meaning in the minds of the customer2. covert the brand responses to a loyal relationship between the customer and the company3. elicit a customer response brand differentiation is the process of making a brand stand out from its competitors. it's important to attract and retain customers in a crowded marketplace. 


# ****Trying T5 as bert doesn't do text generation****  

In [1]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset

# Load pre-trained T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Read and prepare chunks
with open('/kaggle/input/legal-data-article/d08/output_93.txt', 'r', encoding='utf-8') as file:
    articles = file.read()

# Process the articles
# size = int(len(articles) / 5)
# articles = articles[2 * size:]  # Taking a subset of the articles
chunk_size = 10_000
chunks = [articles[i:i + chunk_size] for i in range(0, len(articles), chunk_size)]


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [2]:
# # Prepare MLM dataset
def prepare_mlm_dataset(chunks, tokenizer, max_length=512):
    encodings = tokenizer(
        chunks,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt"
    )
    
    # Create labels
    labels = encodings["input_ids"].clone()
    # Mask some tokens randomly 
    mask_prob = 0.15  
    mask_indices = torch.rand(labels.shape).lt(mask_prob)
    labels[mask_indices] = -100
    
    dataset = Dataset.from_dict({
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": labels
    })
    return dataset


# Prepare dataset for MLM training
mlm_dataset = prepare_mlm_dataset(chunks, tokenizer)



In [3]:

del articles
del chunks



In [4]:

import gc
gc.collect()

5

In [5]:

# Define MLM training function
def train_mlm(model, train_dataset, tokenizer):
    """
    Train Masked Language Model
    """
    # Training arguments
    training_args = TrainingArguments(
        output_dir="./t5_mlm_results",
        num_train_epochs=10,
        per_device_train_batch_size=16,
        logging_steps=5000,
        save_steps=5000,
        save_total_limit=2,
        logging_dir='./t5_mlm_logs',
        save_strategy="steps",
        save_only_model=True,
        
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer
    )
    
    # Train the model
    trainer.train()
    
    return model

# Train the model on MLM task
trained_mlm_model = train_mlm(model, mlm_dataset, tokenizer)

# Save the trained model and tokenizer
trained_mlm_model.save_pretrained("t5_trained_mlm_model_2")
tokenizer.save_pretrained("t5_trained_tokenizer_2")

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
5000,0.1863
10000,0.0279
15000,0.0161
20000,0.0117
25000,0.0097
30000,0.0085
35000,0.0078


('t5_trained_tokenizer_2/tokenizer_config.json',
 't5_trained_tokenizer_2/special_tokens_map.json',
 't5_trained_tokenizer_2/spiece.model',
 't5_trained_tokenizer_2/added_tokens.json')

In [70]:
import torch

torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()


In [13]:
import torch
torch.cuda.empty_cache()
gc.collect()

394

In [14]:
import torch
import gc

del trained_mlm_model  # Delete the model
torch.cuda.empty_cache() 
gc.collect() 



0

In [11]:
from transformers import T5ForConditionalGeneration, T5Tokenizer


In [12]:
import pandas as pd
qna_data = pd.read_csv('/kaggle/input/ques-ans2/question_answers2.csv')

In [13]:
qna_data['Question'].dtype
qna_data['Question'] = qna_data['Question'].astype(str)
qna_data['Answer'] = qna_data['Answer'].astype(str)


In [14]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import Dataset

# Tokenizer and model loading
model = T5ForConditionalGeneration.from_pretrained('t5_trained_mlm_model_2')
tokenizer = T5Tokenizer.from_pretrained('t5_trained_tokenizer_2')

def tokenize_data(dataset, tokenizer):
    def tokenize_function(examples):
        model_inputs = tokenizer(examples['Question'], max_length=512, truncation=True, padding="max_length")
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(examples['Answer'], max_length=512, truncation=True, padding="max_length")
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    return dataset.map(tokenize_function, batched=True)

# Prepare dataset 
train_dataset = Dataset.from_pandas(qna_data)

# Tokenize the dataset
tokenized_train_dataset = tokenize_data(train_dataset, tokenizer)



Map:   0%|          | 0/12263 [00:00<?, ? examples/s]



In [15]:


# Define training function
def train_t5_model(model, train_dataset, tokenizer):
    # Training configuration
    training_args = TrainingArguments(
        output_dir="./t5_qa_model",
        num_train_epochs=10,
        per_device_train_batch_size=16,
        warmup_steps=100,
        learning_rate=5e-5,
        logging_dir='./logs',
        logging_steps=500,
        save_steps=1000,
        save_total_limit=2,
        save_strategy="steps",
        save_only_model=True,
    )
    
    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer
    )
    
    try:
        trainer.train()
    except RuntimeError as e:
        print(f"Training error: {e}")
        print("Trying to reset CUDA memory...")
        torch.cuda.empty_cache()
        raise
    
    return model



# Train the model
trained_model = train_t5_model(model, tokenized_train_dataset, tokenizer)

# Save the model and tokenizer
trained_model.save_pretrained("./t5_qa_finetuned_2")
tokenizer.save_pretrained("./t5_qa_finetuned_2")


  trainer = Trainer(


Step,Training Loss
500,2.0811
1000,0.3214
1500,0.3043
2000,0.2987
2500,0.2966
3000,0.29
3500,0.2905
4000,0.2847
4500,0.2836
5000,0.2822


('./t5_qa_finetuned_2/tokenizer_config.json',
 './t5_qa_finetuned_2/special_tokens_map.json',
 './t5_qa_finetuned_2/spiece.model',
 './t5_qa_finetuned_2/added_tokens.json')

In [16]:
import torch

def generate_answer(model, tokenizer, input_text, max_length=2048):
    # Check if GPU is available and set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=max_length, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Generate the output from the model
    outputs = model.generate(
        inputs["input_ids"], 
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.9
    )
    
    # Decode the generated answer
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Example usage:
question = qna_data['Question'][2]
answer = generate_answer(trained_model, tokenizer, question, 2048)
print('Question: ', question)
print('Answer as per data: ', qna_data['Answer'][2])
print('Answer by model: ', answer)

Question:  Q.3How can social media be used in Brand Management?
Answer as per data:  Social media platforms can be used to engage with customers, build brand awareness, and gather valuable feedback.
Answer by model:  Social media is the most powerful marketing tool to communicate branding and brand behaviors, creating effective campaigns, and optimizing brand exposure.


In [18]:
ques='what are negative effect of social media'
answer=generate_answer(trained_model, tokenizer, ques, 2048)
print(answer)

Negative effect of social media, are positive effect as social workers are gaining social services and online and offline based marketing.


# Trying next word prediciton to get answer

In [51]:
model = T5ForConditionalGeneration.from_pretrained('t5_trained_mlm_model_1')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [55]:
import torch

# Ensure the device is consistent
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the device
model = model.to(device)


In [56]:
import pandas as pd
qna_data = pd.read_csv('/kaggle/input/ques-ans2/question_answers2.csv')

In [57]:
qna_data['Question'][0]

'Q.1Explain the concept of brand extensions.'

In [58]:
from datasets import Dataset
qna_data["formatted"] = qna_data.apply(
    lambda row: f"<Question> {row['Question']} <Answer> {row['Answer']} <END_ANS>", axis=1
)

# Add special tokens
special_tokens = {"additional_special_tokens": ["<Question>", "<Answer>", "<END_ANS>"]}
tokenizer.add_special_tokens(special_tokens)
# Resize the model's embedding layer 
model.resize_token_embeddings(len(tokenizer))


# Convert to a Dataset
dataset = Dataset.from_pandas(qna_data)

In [59]:
dataset

Dataset({
    features: ['Question', 'Answer', 'formatted'],
    num_rows: 12263
})

In [60]:
# Tokenize the dataset
def tokenize(batch):
    return tokenizer(batch["formatted"], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize, batched=True)


Map:   0%|          | 0/12263 [00:00<?, ? examples/s]

In [61]:
import os

os.environ['CUDA_LAUNCH_BLOCKING'] ='1' 
os.environ['TORCH_USE_CUDA_DSA'] = '1'

In [64]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, Trainer, TrainingArguments
from datasets import Dataset

def train_qa(model, train_dataset, tokenizer):
    # Data collator for sequence-to-sequence tasks
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        padding=True
    )

    # Modify training arguments for T5 fine-tuning
    training_args = TrainingArguments(
        output_dir="t5_qa_results",       # Directory to save checkpoints
        num_train_epochs=10,              # Number of epochs
        per_device_train_batch_size=16,   # Batch size
        learning_rate=5e-5,               # Learning rate
        weight_decay=0.01,                # Regularization
        warmup_steps=500,                 # Warm-up steps
        logging_steps=1000,               # Log progress
        save_steps=1000,                  # Save model every 1000 steps
        eval_steps=500,                   # Evaluate every 500 steps
        save_total_limit=2,               # Keep only the latest 2 checkpoints
        logging_dir="./logs",             # Logging directory
        push_to_hub=False
    )

    # Initialize Trainer for T5
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    # Train the model
    try:
        trainer.train()
    except RuntimeError as e:
        print(f"Training error: {e}")
        print("Trying to reset CUDA memory...")
        torch.cuda.empty_cache()
        raise

    return model

# Load pretrained model and tokenizer
checkpoint = "t5-base"  # Change to your model checkpoint if needed
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

# Train the model
trained_model = train_qa(model, tokenized_dataset, tokenizer)

# Save the model and tokenizer
trained_model.save_pretrained("./t5_qa_finetuned_gen")
tokenizer.save_pretrained("./t5_qa_finetuned_gen")


  trainer = Trainer(


ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds

In [46]:
def generate_answer(model, tokenizer, question, max_length=200):
    input_text = f"<Question> {question} <Answer>"
    inputs = tokenizer(input_text, return_tensors="pt", add_special_tokens=True)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    # Improve generation parameters
    output_ids = model.generate(
        inputs["input_ids"],
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        no_repeat_ngram_size=2,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id
    )
    
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=False)
    # Extract the answer part
    answer = output_text
    return answer



In [49]:
# Example
question = "Q.1Explain the concept of brand extensions."
generated_answer = generate_answer(trained_model, tokenizer, question,200)
print(generated_answer)


[CLS] <Question> q. 1explain the concept of brand extensions. <Answer> [SEP] q - 1's -'a'or'+ '.'options'mean 2. options used. 3. xting.. 4. 5. or. + / - or 6. = + +. 7 + or + -. ]. [ or ] 7. ( + ) [ + ] [ - ). 6 5 4 6 6 7 ( ) 7 6 9 9 6 8. 9 8 9 7 5 8 8 7 8 5 7 9 5 5 6 4 4 9 3 7 7 1 1 9 4 5 9 1 4 8 4 7 4 3 3 9 2 6 3 5 2 4 2 3 4 1 7 2 8 3 2 9 13 13 14. 13. 14 1313 14 12 13 12. 11 13 11 12 12 11 14 14 11. 12 sales. 17. 18. 19. 16 16. 15. 94 14 15 11 15 16 - 13 23. 88


In [50]:
def generate_answer(model, tokenizer, question, context="", max_length=200):
    # T5 input formatting
    input_text = f"question: {question} context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    # Generation with controlled parameters
    output_ids = model.generate(
        inputs["input_ids"],
        max_length=max_length,
        num_return_sequences=1,
        do_sample=False,  # Greedy decoding
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode and clean up output
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text

# Example usage
question = "Explain the concept of brand extensions."
context = "Brand extensions occur when a company uses an existing brand name to introduce new products or categories. This strategy leverages brand equity to attract customers."
generated_answer = generate_answer(model, tokenizer, question, context)
print("Generated Answer:", generated_answer)


Generated Answer: 1980. [unused8] hazard [unused56] [unused2] [unused8] located [unused44].. [unused6] 422je [unused6] [unused5] [unused41] [unused8] hazard brand [unused5]. strategy leverages brand equity to provisions. [unused6] k [unused17]. [unused4] under 982 leverages brand equity to [unused5]. [unused8]s [unused17] [unused59]. [unused6] 422 [unused56] [unused5]..nono [unused6] [unused52]...no [unused6] 2015 [unused5]..no [unused6] k [unused5] [unused11]...... [unused6] [unused6] [unused52].... assembly [unused6] [unused11] [unused9] [unused9]. sense sense \ [unused4] [unused20] [unused5].. sense sense [unused9] [unused9] [unused9] [unused4] [unused4]... [unused4]......... $... with... [unused6] preventing. [unused6] $ [unused44] [unused44] he [unused5]. [unused4] brand with.. with : brand brand name [unused6] introduce new [unused15]
