# AI Assignment from Vijayi WFH Technologies Pvt Ltd 

## Task 2 - Data prep & model fine-tuning

In [1]:
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import re




In [None]:
# Load the `Abirate/english_quotes` dataset and turn its `train` split into a
# pandas DataFrame for the preprocessing cells below.
dataset = load_dataset(path="Abirate/english_quotes")
df = dataset["train"].to_pandas()

In [2]:
# Data preprocessing
def preprocess_text(text: object) -> str:
    """Normalise a quote before it is used as training text.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, and leading/trailing whitespace is trimmed.

    Args:
        text: The raw quote. Non-string values (for example ``None`` or
            ``NaN`` coming from a missing cell) are treated as empty.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    # A missing cell is not a str; calling `.lower()` on it would raise, so
    # map it to an empty string instead of crashing the whole `.apply`.
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    # `[^\w\s]` matches anything that is not a word character or whitespace.
    without_symbols = re.sub(r'[^\w\s]', '', lowered)
    return without_symbols.strip()

In [3]:
# Drop incomplete rows first: the transforms below call `.lower()` on the quote
# and iterate over the tags, both of which fail on a missing value.
df = df.dropna(subset=['quote', 'author', 'tags']).reset_index(drop=True)

# Apply the text normalisation to every quote and lowercase every tag.
df['processed_quote'] = df['quote'].apply(preprocess_text)
df['tags'] = df['tags'].apply(lambda x: [tag.lower() for tag in x])

In [4]:
# Build one (query, quote) training pair per row: the query is a natural-language
# request assembled from the row's tags and author, the answer is the cleaned quote.
train_examples = [
    InputExample(texts=[f"Quotes about {', '.join(tags)} by {author}", quote_text])
    for tags, author, quote_text in zip(df['tags'], df['author'], df['processed_quote'])
]

In [5]:
# Start from the pretrained MiniLM sentence-embedding checkpoint.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Loss over the (query, quote) pairs, and a shuffled loader feeding them in
# batches of 32.
train_loss = losses.MultipleNegativesRankingLoss(model)
train_dataloader = DataLoader(train_examples, batch_size=32, shuffle=True)

# Fine-tuning settings gathered in one place, then handed to `fit`.
fit_settings = {
    'train_objectives': [(train_dataloader, train_loss)],
    'epochs': 3,
    'warmup_steps': 100,
    'output_path': './fine_tuned_model',
    'show_progress_bar': True,
}
model.fit(**fit_settings)

print("Model fine-tuning completed!")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss


Model fine-tuning completed!


In [6]:
# Save the fine-tuned model relative to the working directory rather than to a
# user-specific absolute Windows path, so the cell runs on any machine.
model.save('./fine_tuned_model')

In [7]:
# Export the processed quotes to CSV without the positional index column.
df.to_csv(path_or_buf='processed_quotes.csv', index=False)