# **Data Preparation**


In [9]:
# Load dataset (replace with your dataset path)
import pandas as pd
dataset_path = "../Data/sensible_complex_simplified_dataset.csv"
data = pd.read_csv(dataset_path)

# Display the data
data.head()

Unnamed: 0,Complex_Text,Simplified_Text
0,A sedentary lifestyle combined with poor dieta...,Sitting too much and eating poorly increase di...
1,Blockchain technology provides a secure way to...,Blockchain secures digital transactions withou...
2,"With the advent of 5G networks, the speed and ...",5G networks make the internet faster and more ...
3,Online learning platforms provide opportunitie...,Online platforms give remote learners access t...
4,"With the advent of 5G networks, the speed and ...",5G networks make the internet faster and more ...


**Data Preprocessing**


In [10]:
import re
from nltk.tokenize import word_tokenize
import nltk

# Download the 'punkt_tab' data
nltk.download('punkt_tab')  # This line downloads the necessary data
nltk.download('punkt')


def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower()


data['Complex_Text'] = data['Complex_Text'].apply(clean_text)
data['Simplified_Text'] = data['Simplified_Text'].apply(clean_text)

# Tokenize
data['Complex_Text_Tokens'] = data['Complex_Text'].apply(word_tokenize)
data['Simplified_Text_Tokens'] = data['Simplified_Text'].apply(word_tokenize)

data.head()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dipan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dipan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Complex_Text,Simplified_Text,Complex_Text_Tokens,Simplified_Text_Tokens
0,a sedentary lifestyle combined with poor dieta...,sitting too much and eating poorly increase di...,"[a, sedentary, lifestyle, combined, with, poor...","[sitting, too, much, and, eating, poorly, incr..."
1,blockchain technology provides a secure way to...,blockchain secures digital transactions withou...,"[blockchain, technology, provides, a, secure, ...","[blockchain, secures, digital, transactions, w..."
2,with the advent of 5g networks the speed and r...,5g networks make the internet faster and more ...,"[with, the, advent, of, 5g, networks, the, spe...","[5g, networks, make, the, internet, faster, an..."
3,online learning platforms provide opportunitie...,online platforms give remote learners access t...,"[online, learning, platforms, provide, opportu...","[online, platforms, give, remote, learners, ac..."
4,with the advent of 5g networks the speed and r...,5g networks make the internet faster and more ...,"[with, the, advent, of, 5g, networks, the, spe...","[5g, networks, make, the, internet, faster, an..."


# **Model Building**


In [11]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load T5 model and tokenizer
model_name = "t5-small"  # You can use "t5-base" or "t5-large" for better results
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Example input
example = "summarize: The economic impact of inflation on a nation's GDP is multifaceted and significant."
inputs = tokenizer.encode(example, return_tensors="pt",
                          max_length=512, truncation=True)

# Generate simplified text
outputs = model.generate(inputs, max_length=50,
                         num_beams=4, early_stopping=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

the economic impact of inflation on a nation's GDP is multifaceted.


**Model Training**


**Fine-Tune the Model**


In [12]:
from datasets import Dataset

# Convert DataFrame to Hugging Face Dataset
train_data = Dataset.from_pandas(data[['Complex_Text', 'Simplified_Text']])

# Tokenize dataset
def preprocess_function(examples):
    inputs = ["highlight: " + text for text in examples["Complex_Text"]]
    targets = [text for text in examples["Simplified_Text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    labels = tokenizer(targets, max_length=512, truncation=True).input_ids
    model_inputs["labels"] = labels
    return model_inputs


tokenized_datasets = train_data.map(preprocess_function, batched=True)

Map: 100%|██████████| 10033/10033 [00:01<00:00, 5237.16 examples/s]


In [13]:
from transformers import Seq2SeqTrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Split your dataset (90% training, 10% validation)
train_df, eval_df = train_test_split(data, test_size=0.1)

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)


def preprocess_function(examples):
    inputs = ["summarize: " + text for text in examples["Complex_Text"]]
    targets = [text for text in examples["Simplified_Text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    labels = tokenizer(targets, max_length=512, truncation=True).input_ids
    model_inputs["labels"] = labels
    return model_inputs


# Tokenize the datasets
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)


training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    predict_with_generate=True,
    logging_dir="./logs",
)

Map: 100%|██████████| 9029/9029 [00:01<00:00, 5200.45 examples/s]
Map: 100%|██████████| 1004/1004 [00:00<00:00, 5219.15 examples/s]


**Train Model**


In [None]:
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq

# Before creating the Trainer, define a data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,  # Pass the model to the data collator
    padding=True,  # Enable padding
    return_tensors="pt",  # Specify the return type as PyTorch tensors
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator,  # Use the data collator
)
trainer.train()

**Model Evaluation**


In [None]:
from evaluate import load

# Load BLEU metric
bleu_metric = load("bleu")

# Generate predictions


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = bleu_metric.compute(predictions=decoded_preds, references=[
                                 [l] for l in decoded_labels])
    return {"bleu": result["bleu"]}


# Evaluate
results = trainer.evaluate()
print(results)

Downloading builder script: 100%|██████████| 5.94k/5.94k [00:00<?, ?B/s]
Downloading extra modules: 4.07kB [00:00, 3.39MB/s]                   
Downloading extra modules: 100%|██████████| 3.34k/3.34k [00:00<?, ?B/s]


{'eval_loss': 0.007204011548310518, 'eval_runtime': 13.0655, 'eval_samples_per_second': 76.844, 'eval_steps_per_second': 9.644, 'epoch': 3.0}


# **Deploy as an App**


In [None]:
model.save_pretrained("./summarization_model")
tokenizer.save_pretrained("./summarization_model")

('./summarization_model\\tokenizer_config.json',
 './summarization_model\\special_tokens_map.json',
 './summarization_model\\spiece.model',
 './summarization_model\\added_tokens.json')

**Output**


In [1]:
import gradio as gr
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("summarization_model")
tokenizer = T5Tokenizer.from_pretrained("summarization_model")

# Define the prediction function


def summarize_text(input_text):
    inputs = tokenizer.encode(
        f"summarize: {input_text}", return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs, max_length=50,
                             num_beams=4, early_stopping=True)
    Simplified_Text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return Simplified_Text


# Gradio Interface
interface = gr.Interface(
    fn=summarize_text,
    inputs="text",
    outputs="text",
    title="Text summarization_model",
    description="Enter text and receive a summarized version."
)

# Launch the app
interface.launch()

  from .autonotebook import tqdm as notebook_tqdm


* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


