In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch
import pandas as pd

In [None]:
class PegasusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.labels['input_ids'])

In [None]:
def prepare_data(model_name, train_texts, train_labels, val_texts=None, val_labels=None):
    tokenizer = PegasusTokenizer.from_pretrained(model_name)

    prepare_val = False if val_texts is None or val_labels is None else True

    def tokenize_data(texts, labels):
    # Ensure texts and labels are lists
        if not isinstance(texts, list):
            texts = [str(texts)]
            labels = [str(labels)]
        else:
            texts = [str(text) for text in texts]
            labels = [str(label) for label in labels]

    # Tokenize texts and labels
        encodings = tokenizer(texts, truncation=True, padding=True)
        decodings = tokenizer(labels, truncation=True, padding=True)

        dataset_tokenized = PegasusDataset(encodings, decodings)
        return dataset_tokenized


    train_dataset = tokenize_data(train_texts, train_labels)
    val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None

    return train_dataset, val_dataset, tokenizer

In [None]:
def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='/content/drive/MyDrive/results'):
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    if freeze_encoder:
        for param in model.model.encoder.parameters():
            param.requires_grad = False

    if val_dataset is not None:
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=300,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            save_steps=5000,
            save_total_limit=2,
            evaluation_strategy='steps',
            eval_steps=100,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='/content/drive/MyDrive/logs',
            logging_steps=10,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer
        )

    else:
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=300,
            per_device_train_batch_size=1,
            save_steps=5000,
            save_total_limit=2,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='/content/drive/MyDrive/logs',
            logging_steps=10,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            tokenizer=tokenizer
        )

    return trainer

In [None]:
if __name__ == '__main__':
    # Load your modified dataset
    df = pd.read_csv("/content/drive/MyDrive/modified_dataset.csv", nrows=10000)

    # Define train and validation sets
    train_size = int(len(df) * 0.8)
    train_texts, train_labels = df['text'][:train_size], df['headline'][:train_size]
    val_texts, val_labels = df['text'][train_size:], df['headline'][train_size:]

    # Use Pegasus Large model as base for fine-tuning
    model_name = 'google/pegasus-large'
    train_dataset, val_dataset, tokenizer = prepare_data(model_name, train_texts, train_labels, val_texts, val_labels)

    # Prepare and start fine-tuning
    trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset)
    trainer.train()

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
100,2.9476,3.056865
200,1.5694,3.555429
300,0.011,5.151013


In [None]:
# After fine-tuning
output_dir = "/content/drive/MyDrive/fine_tuned_model"
trainer.save_model(output_dir)

Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


In [None]:
pip install datasets
!pip install rouge_score


In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from datasets import load_metric
import pandas as pd

# Load the fine-tuned model
fine_tuned_model = PegasusForConditionalGeneration.from_pretrained(output_dir)
tokenizer = PegasusTokenizer.from_pretrained(output_dir)

# Load the validation dataset
val_df = pd.read_csv("/content/drive/MyDrive/modified_dataset.csv", skiprows=range(1, 10000))
val_texts, val_labels = val_df['text'][train_size:], val_df['headline'][train_size:]

# Initialize Rouge metric
rouge_metric = load_metric("rouge")

# Choose a specific index from the validation dataset
index = 0

# Generate summary using the fine-tuned model
input_text_tokens = tokenizer(val_texts.iloc[index], truncation=True, padding=True, return_tensors="pt")
summary_ids = fine_tuned_model.generate(input_text_tokens.input_ids)
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Compute Rouge metrics
references = [val_labels.iloc[index]]
predictions = [generated_summary]
rouge_output = rouge_metric.compute(predictions=predictions, references=references)

# Print Rouge metrics
print(f"Rouge-1: {rouge_output['rouge1'].mid.fmeasure}")
print(f"Rouge-2: {rouge_output['rouge2'].mid.fmeasure}")
print(f"Rouge-L: {rouge_output['rougeL'].mid.fmeasure}")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Rouge-1: 0.3125
Rouge-2: 0.09677419354838708
Rouge-L: 0.28125


In [None]:
# To load the fine-tuned model for inference
fine_tuned_model = PegasusForConditionalGeneration.from_pretrained(output_dir)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

In [None]:
print("Enter the text you want to summarize (press Enter twice to finish):")

user_input_lines = []
while True:
    line = input()
    if line:
        user_input_lines.append(line)
    else:
        # If the
        user enters an empty line, stop taking input
        break

# Combine the lines into a single multiline string
user_input_text = "\n".join(user_input_lines)

# Tokenize the multiline input
input_text_tokens = tokenizer(user_input_text, truncation=True, padding=True, return_tensors="pt")

# Generate summary using the fine-tuned model
summary_ids = fine_tuned_model.generate(input_text_tokens.input_ids)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Generated Summary:", summary)

Enter the text you want to summarize (press Enter twice to finish):
Living a healthy lifestyle is all about choosing to make healthy choices each day. Adding more nutritious foods to your diet, making an effort to exercise throughout the week, and prioritizing your mental health are just a few of the things you can do to begin living a healthier and happier life. We’ve compiled an extensive list of tips and advice from professionals about what you can do to improve your physical, mental, and emotional health. By the end of this article, you’ll be ready to start changing your life for the better.
Come up with specific, actionable goals to improve your health. When you have specific goals you want to achieve, you’ll be more motivated to implement healthy changes in your life. Identify what parts of your lifestyle you want to improve (physical fitness, food, mental health, etc.) and set SMART goals that are specific, measurable, attainable, relevant, and time-bound to better guarantee you

In [None]:
# Save generated summary to a text file
with open("generated_summary.txt", "w") as file:
    file.write(summary)

print("Generated Summary saved to generated_summary.txt")

Generated Summary saved to generated_summary.txt


In [None]:
import shutil

# Path to the folder you want to zip
folder_path = '/content/sample_data'

# Path for the zip file
zip_file_path = '/content/sample_data'

# Zip the folder
shutil.make_archive(zip_file_path, 'zip', folder_path)

'/content/sample_data.zip'

In [None]:
!pip install transformers[torch]
!pip install accelerate -U

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
summary = "Adding more nutritious foods to your diet, making an effort to exercise throughout the week, and prioritizing your mental health are just a few of the things you can do to begin living a healthier and happier life. Identify what parts of your lifestyle you want to improve (physical fitness, food, mental health, etc.) and set SMART goals that are specific, measurable, attainable, relevant, and time-bound to better guarantee your success. Vitamin C and D supplements are essential to stay hydrated (3.1 and 4), but vitamins E and K are also important to your overall health.[5] Keep your water intake as low as possible."

In [3]:
user_preference = input("What is your preferred learning style? (audio/visual): ").lower()

if user_preference == "audio":
    # Generate audio summary
    # Install gTTS library if not already installed
    !pip install gTTS
    # Import the gTTS module
    from gtts import gTTS
    import IPython.display as ipd
    from IPython.display import display

    # Create gTTS object
    tts = gTTS(summary)

    # Save the audio file
    tts.save("output_summary.mp3")

    # Display the audio file
    display(ipd.Audio("output_summary.mp3"))

elif user_preference == "visual":
    # Generate visual summary
    # Keyword extraction
    !pip install yake
    import matplotlib.pyplot as plt
    import yake

    # Initializing the YAKE instance
    yake_kw = yake.KeywordExtractor()

    # Extracting keywords
    KeyWords = yake_kw.extract_keywords(summary)

    # Displaying the keywords
    print(KeyWords)

    # Extracting keywords
    keywords = [kw for kw, _ in KeyWords]
    prompt = " ".join(keywords)
    prompt1 = "nutritious foods, exercise, physical activity, mental health, vitamins or supplements, hydration"

    # Image generation using stable-diffusion-2
    !pip install --upgrade diffusers transformers -q
    from pathlib import Path
    import tqdm
    import torch
    import pandas as pd
    import numpy as np
    from diffusers import StableDiffusionPipeline
    from transformers import pipeline, set_seed

    class CFG:
        device = "cuda"
        seed = 42
        generator = torch.Generator(device).manual_seed(seed)
        image_gen_steps = 35
        image_gen_model_id = "stabilityai/stable-diffusion-2"
        image_gen_size = (400,400)
        image_gen_guidance_scale = 9
        prompt_gen_model_id = "gpt2"
        prompt_dataset_size = 6
        prompt_max_length = 12

    image_gen_model = StableDiffusionPipeline.from_pretrained(
        CFG.image_gen_model_id, torch_dtype=torch.float16,
        revision="fp16", use_auth_token='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', guidance_scale=9
    )
    image_gen_model = image_gen_model.to(CFG.device)

    def generate_image(prompt, model):
        image = model(
            prompt, num_inference_steps=CFG.image_gen_steps,
            generator=CFG.generator,
            guidance_scale=CFG.image_gen_guidance_scale
        ).images[0]
        image = image.resize(CFG.image_gen_size)
        return image, summary

    generate_image(prompt, image_gen_model)

else:
    print("Invalid input. Please choose either 'audio' or 'visual'.")

What is your preferred learning style? (audio/visual): audio
