In [1]:
!pip install faker
!pip install contractions 
!pip install simalign 
!pip install ipywidgets
!pip install transformers[torch] datasets



In [2]:
import pandas as pd
from faker import Faker
import random

fake = Faker()

def generate_policy_entry():
    return {
        "Policy_ID": fake.uuid4(),
        "Policy_Text_EN": fake.text(max_nb_chars=500),
        "Policy_Text_FR": fake.text(max_nb_chars=500),
        "Policy_Text_ES": fake.text(max_nb_chars=500),
        "Summarized_Text": fake.text(max_nb_chars=100)
    }

data = [generate_policy_entry() for _ in range(500)]

In [3]:
df = pd.DataFrame(data)
df.to_csv(r"D:\Data science\Projects\Final Project\Csv\multilingual_insurance_policy_dataset.csv", index=False)

In [4]:
df.head()

Unnamed: 0,Policy_ID,Policy_Text_EN,Policy_Text_FR,Policy_Text_ES,Summarized_Text
0,58124751-fe4e-4174-8ab6-50112f1a9cc2,Political meeting garden. Material sense littl...,Begin move risk situation film ahead. Civil co...,Standard building husband sign police bring. R...,Official sense participant return evening. Rem...
1,44ba98db-1289-4b03-b6fe-18d3dfe1db17,However candidate issue career out.\nPolice po...,Benefit ago now good. Team prove position unde...,Best present guess business. Well station evid...,Quality page share truth international. Leave ...
2,a9736519-4549-4a48-a8cf-335d65cb6cae,Full authority industry beyond. Project campai...,Something one fly he believe article few. Wear...,After learn result meet against meeting. Word ...,Popular miss magazine radio trip. Least send l...
3,6cb7c904-f8ae-4d9b-a8ed-9e0ebb01dec6,Agent coach shoulder. Per state leader then en...,Strategy author environmental level. Keep walk...,Side the better kitchen assume certain until. ...,Car parent southern follow. President try meet...
4,f68bde33-328e-4d67-a53a-1689dd4c4160,Congress five eat structure my lose scene. Mov...,This under nothing feel better believe. Role b...,Country stuff fill technology seem effort abou...,Option answer within. Its night class admit bu...


In [5]:
import re
import contractions

In [6]:
def normalize_text(text):
    if pd.isna(text): return ""
    text = text.lower()
    text = contractions.fix(text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [7]:
df = pd.read_csv(r"D:\Data science\Projects\Final Project\Csv\multilingual_insurance_policy_dataset.csv")
df['Policy_Text_EN'] = df['Policy_Text_EN'].apply(normalize_text)
df['Policy_Text_FR'] = df['Policy_Text_FR'].apply(normalize_text)
df['Policy_Text_ES'] = df['Policy_Text_ES'].apply(normalize_text)
df['Summarized_Text'] = df['Summarized_Text'].apply(normalize_text)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_en = TfidfVectorizer(max_features=500)
tfidf_fr = TfidfVectorizer(max_features=500)
tfidf_es = TfidfVectorizer(max_features=500)

tfidf_features_en = tfidf_en.fit_transform(df['Policy_Text_EN'])
tfidf_features_fr = tfidf_fr.fit_transform(df['Policy_Text_FR'])
tfidf_features_es = tfidf_es.fit_transform(df['Policy_Text_ES'])

In [9]:
df_policy_processed = pd.concat([
    df[['Policy_ID', 'Summarized_Text']],
    pd.DataFrame(tfidf_features_en.toarray(), columns=[f'EN_{x}' for x in tfidf_en.get_feature_names_out()]),
    pd.DataFrame(tfidf_features_fr.toarray(), columns=[f'FR_{x}' for x in tfidf_fr.get_feature_names_out()]),
    pd.DataFrame(tfidf_features_es.toarray(), columns=[f'ES_{x}' for x in tfidf_es.get_feature_names_out()])
], axis=1)

In [10]:
df.to_csv(r"D:\Data science\Projects\Final Project\Csv\Preprocessed Csv\Preprocessed_multilingual_insurance_policy_dataset.csv", index=False)
df.head()

Unnamed: 0,Policy_ID,Policy_Text_EN,Policy_Text_FR,Policy_Text_ES,Summarized_Text
0,58124751-fe4e-4174-8ab6-50112f1a9cc2,political meeting garden material sense little...,begin move risk situation film ahead civil cou...,standard building husband sign police bring re...,official sense participant return evening rema...
1,44ba98db-1289-4b03-b6fe-18d3dfe1db17,however candidate issue career out police poss...,benefit ago now good team prove position under...,best present guess business well station evide...,quality page share truth international leave a...
2,a9736519-4549-4a48-a8cf-335d65cb6cae,full authority industry beyond project campaig...,something one fly he believe article few wear ...,after learn result meet against meeting word t...,popular miss magazine radio trip least send lo...
3,6cb7c904-f8ae-4d9b-a8ed-9e0ebb01dec6,agent coach shoulder per state leader then env...,strategy author environmental level keep walk ...,side the better kitchen assume certain until f...,car parent southern follow president try meet ...
4,f68bde33-328e-4d67-a53a-1689dd4c4160,congress five eat structure my lose scene movi...,this under nothing feel better believe role bl...,country stuff fill technology seem effort abou...,option answer within its night class admit bui...


In [11]:
from transformers import MBartTokenizer, MBartForConditionalGeneration
from datasets import Dataset

model_name = "sshleifer/tiny-mbart"
tokenizer = MBartTokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

In [12]:
def preprocess_function(examples):
    inputs = tokenizer(examples["Policy_Text_EN"], truncation=True, padding="max_length", max_length=512)
    targets = tokenizer(examples["Summarized_Text"], truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

In [13]:
dataset = Dataset.from_pandas(df)
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [14]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./mbart_policy_finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="no"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets
)

In [15]:
trainer.train()

Step,Training Loss
10,12.4297
20,12.4294
30,12.4296
40,12.4295
50,12.4292
60,12.429
70,12.4287
80,12.4283
90,12.428
100,12.4276


TrainOutput(global_step=250, training_loss=12.426639678955079, metrics={'train_runtime': 176.1829, 'train_samples_per_second': 2.838, 'train_steps_per_second': 1.419, 'total_flos': 442368000.0, 'train_loss': 12.426639678955079, 'epoch': 1.0})

In [None]:
import pickle
with open(r"D:\Data science\Projects\Final Project\Pkl\Multilingual_Insurance_model.pkl", "wb") as f:
    pickle.dump(model, f)
print("✅ Fine-tuned model saved as Multilingual_Insurance_model.pkl")