## **Import Libraries**

In [3]:
import pandas as pd
from spacy.lang.en import English
from tqdm import tqdm
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch.nn.functional as F
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import zipfile
import os
from IPython.display import FileLink  # For Jupyter environments

# **Custom Dataset class for grammar correction**

In [4]:
# Custom Dataset class for grammar correction
class GrammarCorrectionDataset(Dataset):
    def __init__(self, inputs, targets, tokenizer, max_length=128):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        input_text = "fix grammar: " + str(self.inputs[idx])
        target_text = str(self.targets[idx])
        
        # Tokenize inputs
        input_encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        
        # Tokenize targets
        target_encoding = self.tokenizer(
            target_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }

In [5]:
# Load dataset
n_rows = 10000
df = pd.read_csv('/kaggle/input/c4200m/C4_200M.tsv-00007-of-00010', sep='\t', nrows=n_rows)
df.columns = ["input", "target"]

In [6]:
# Display sample
print("Sample input:", df["input"][150])
print("Sample target:", df["target"][150])


Sample input: Fiber Optic Cable Cleaning Kits The complete solution for precision end face - Fiber optic cable cleaning.
Sample target: Fiber Optic Cable Cleaning Kits The complete solution for precision end-face fiber optic cable cleaning.


# **Initialize tokenizer and model**

In [7]:
# Initialize tokenizer and model
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

# **Prepare dataset for fine-tuning**

In [8]:
# Prepare dataset for fine-tuning
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

train_dataset = GrammarCorrectionDataset(
    inputs=train_df["input"].tolist(),
    targets=train_df["target"].tolist(),
    tokenizer=tokenizer
)

val_dataset = GrammarCorrectionDataset(
    inputs=val_df["input"].tolist(),
    targets=val_df["target"].tolist(),
    tokenizer=tokenizer
)

In [9]:
# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# **Training setup**

In [10]:
# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 1

In [11]:
# Function to calculate cosine similarity
def calculate_cosine_similarity(predicted, target, tokenizer):
    pred_emb = model.get_encoder()(input_ids=predicted.unsqueeze(0).to(device)).last_hidden_state.mean(dim=1).detach().cpu().numpy()
    target_emb = model.get_encoder()(input_ids=target.unsqueeze(0).to(device)).last_hidden_state.mean(dim=1).detach().cpu().numpy()
    return cosine_similarity(target_emb, pred_emb)


In [12]:
# Lists to store metrics for plotting
train_losses, val_losses = [], []
train_accs, val_accs = [], []
train_precisions, val_precisions = [], []
train_recalls, val_recalls = [], []
train_f1s, val_f1s = [], []

# Fine-tuning loop
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    total_train_acc = 0
    all_preds, all_labels = [], []
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        train_loss += loss.item()
        
        # Calculate accuracy
        preds = torch.argmax(outputs.logits, dim=-1)
        total_train_acc += (preds == labels).sum().item() / torch.numel(labels)
        
        all_preds.extend(preds.cpu().numpy().flatten())
        all_labels.extend(labels.cpu().numpy().flatten())
        
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
    
    avg_train_loss = train_loss / len(train_loader)
    avg_train_acc = total_train_acc / len(train_loader)
    train_precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    train_recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
    train_f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)
    
    train_losses.append(avg_train_loss)
    train_accs.append(avg_train_acc)
    train_precisions.append(train_precision)
    train_recalls.append(train_recall)
    train_f1s.append(train_f1)
    
    print(f"Epoch {epoch+1} Training Loss: {avg_train_loss:.4f}")
    print(f"Epoch {epoch+1} Training Accuracy: {avg_train_acc:.4f}")
    print(f"Epoch {epoch+1} Training Precision: {train_precision:.4f}")
    print(f"Epoch {epoch+1} Training Recall: {train_recall:.4f}")
    print(f"Epoch {epoch+1} Training F1: {train_f1:.4f}")
    
    # Validation
    model.eval()
    val_loss = 0
    total_val_acc = 0
    val_similarities = []
    all_val_preds, all_val_labels = [], []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            val_loss += outputs.loss.item()
            
            # Calculate accuracy
            preds = torch.argmax(outputs.logits, dim=-1)
            total_val_acc += (preds == labels).sum().item() / torch.numel(labels)
            
            all_val_preds.extend(preds.cpu().numpy().flatten())
            all_val_labels.extend(labels.cpu().numpy().flatten())
            
            # Calculate cosine similarity for each sample in batch
            for i in range(input_ids.size(0)):
                pred_ids = model.generate(input_ids[i].unsqueeze(0), max_length=128, num_beams=5, early_stopping=True, no_repeat_ngram_size=2)
                similarity = calculate_cosine_similarity(pred_ids[0], labels[i].unsqueeze(0), tokenizer)
                val_similarities.append(similarity)
    
    avg_val_loss = val_loss / len(val_loader)
    avg_val_acc = total_val_acc / len(val_loader)
    avg_val_similarity = np.mean(val_similarities) if val_similarities else 0
    val_precision = precision_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
    val_recall = recall_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
    val_f1 = f1_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
    
    val_losses.append(avg_val_loss)
    val_accs.append(avg_val_acc)
    val_precisions.append(val_precision)
    val_recalls.append(val_recall)
    val_f1s.append(val_f1)
    
    print(f"Epoch {epoch+1} Validation Loss: {avg_val_loss:.4f}")
    print(f"Epoch {epoch+1} Validation Accuracy: {avg_val_acc:.4f}")
    print(f"Epoch {epoch+1} Validation Precision: {val_precision:.4f}")
    print(f"Epoch {epoch+1} Validation Recall: {val_recall:.4f}")
    print(f"Epoch {epoch+1} Validation F1: {val_f1:.4f}")
    print(f"Epoch {epoch+1} Validation Cosine Similarity: {avg_val_similarity:.4f}")


Training Epoch 1:   0%|          | 0/1125 [00:00<?, ?it/s][A
Training Epoch 1:   0%|          | 1/1125 [00:01<21:30,  1.15s/it][A
Training Epoch 1:   0%|          | 2/1125 [00:01<11:50,  1.58it/s][A
Training Epoch 1:   0%|          | 3/1125 [00:01<10:51,  1.72it/s][A
Training Epoch 1:   0%|          | 4/1125 [00:02<10:24,  1.80it/s][A
Training Epoch 1:   0%|          | 5/1125 [00:02<10:08,  1.84it/s][A
Training Epoch 1:   1%|          | 6/1125 [00:03<09:59,  1.87it/s][A
Training Epoch 1:   1%|          | 7/1125 [00:04<09:52,  1.89it/s][A
Training Epoch 1:   1%|          | 8/1125 [00:04<09:48,  1.90it/s][A
Training Epoch 1:   1%|          | 9/1125 [00:05<09:45,  1.91it/s][A
Training Epoch 1:   1%|          | 10/1125 [00:05<09:43,  1.91it/s][A
Training Epoch 1:   1%|          | 11/1125 [00:06<09:42,  1.91it/s][A
Training Epoch 1:   1%|          | 12/1125 [00:06<09:41,  1.92it/s][A
Training Epoch 1:   1%|          | 13/1125 [00:07<09:40,  1.92it/s][A
Training Epoch 1:   1%|

Epoch 1 Training Loss: 0.8219
Epoch 1 Training Accuracy: 0.9125
Epoch 1 Training Precision: 0.9547
Epoch 1 Training Recall: 0.9125
Epoch 1 Training F1: 0.9297


Validation Epoch 1: 100%|██████████| 125/125 [06:51<00:00,  3.29s/it]


Epoch 1 Validation Loss: 0.1665
Epoch 1 Validation Accuracy: 0.9634
Epoch 1 Validation Precision: 0.9657
Epoch 1 Validation Recall: 0.9634
Epoch 1 Validation F1: 0.9617
Epoch 1 Validation Cosine Similarity: -0.1715


In [13]:
# Save fine-tuned model
model.save_pretrained("./fine_tuned_bart")
tokenizer.save_pretrained("./fine_tuned_bart")



('./fine_tuned_bart/tokenizer_config.json',
 './fine_tuned_bart/special_tokens_map.json',
 './fine_tuned_bart/vocab.json',
 './fine_tuned_bart/merges.txt',
 './fine_tuned_bart/added_tokens.json')

In [14]:
# Create a zip file of the fine-tuned model directory
zip_filename = "fine_tuned_bart.zip"
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk("./fine_tuned_bart"):
        for file in files:
            file_path = os.path.join(root, file)
            arcname = os.path.relpath(file_path, start="./fine_tuned_bart")
            zipf.write(file_path, os.path.join("fine_tuned_bart", arcname))

print(f"Model and tokenizer saved and archived as {zip_filename}")

# Provide download link for Jupyter environments (e.g., Kaggle, Colab)
try:
    display(FileLink(zip_filename))
except NameError:
    print(f"Please download the file '{zip_filename}' from the current working directory.")

Model and tokenizer saved and archived as fine_tuned_bart.zip


# **Test the fine-tuned model**

In [15]:
model.eval()
test_indices = [550, 123, 789, 42, 987]
print("\nTest Examples:")
for i, idx in enumerate(test_indices, 1):
    text = "fix grammar: " + df["input"][idx]
    input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
    outputs = model.generate(input_ids, max_length=128, num_beams=5, early_stopping=True, no_repeat_ngram_size=2)
    corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    print(f"\nExample {i}:")
    print(f"Index: {idx}")
    print("Input:", df["input"][idx])
    print("Corrected Output:", corrected)
    print("Target:", df["target"][idx])
    print("-"*80)


Test Examples:

Example 1:
Index: 550
Input: Zapier, the serbice which helps to sync data between web apps through trigger and actions has announced the launch of developer platform.
Corrected Output: Zapier, the service which helps to sync data between web apps through triggers and actions, has announced the launch of its developer platform.
Target: Zapier, the service that helps sync data between web apps through trigger and actions has announced the launch of its developer platform.
--------------------------------------------------------------------------------

Example 2:
Index: 123
Input: Homeland Security Secretary Kirstjen Nielsen said she works with governors of southwest border states to develop agreemenes on where and how many Guardsmen will be deployed.
Corrected Output: Homeland Security Secretary Kirstjen Nielsen said she works with governors of southwest border states to develop agreements on where and how many Guardsmen will be deployed.
Target: Homeland Security Secre