In [27]:
import pandas as pd

# Load the datasets
train_dataset = pd.read_csv('train.csv')
val_dataset = pd.read_csv('validation.csv')
test_dataset = pd.read_csv('test.csv')

# Display the first few rows of the datasets to understand their structure
print(train_dataset.head())
print(val_dataset.head())
print(test_dataset.head())


                                         id  \
0  0001d1afc246a7964130f43ae940af6bc6c57f01   
1  0002095e55fcbd3a2f366d9bf92a95433dc305ef   
2  00027e965c8264c35cc1bc55556db388da82b07f   
3  0002c17436637c4fe1837c935c04de47adb18e9a   
4  0003ad6ef0c37534f80b55b4235108024b407f0b   

                                             article  \
0  By . Associated Press . PUBLISHED: . 14:11 EST...   
1  (CNN) -- Ralph Mata was an internal affairs li...   
2  A drunk driver who killed a young woman in a h...   
3  (CNN) -- With a breezy sweep of his pen Presid...   
4  Fleetwood are the only team still to have a 10...   

                                          highlights  
0  Bishop John Folda, of North Dakota, is taking ...  
1  Criminal complaint: Cop used his role to help ...  
2  Craig Eccleston-Todd, 27, had drunk at least t...  
3  Nina dos Santos says Europe must be ready to a...  
4  Fleetwood top of League One after 2-0 win at S...  
                                         id  \
0  

In [29]:
#Let's clean the article and highlights columns by removing unnecessary characters, converting to lowercase, etc.
import re

# Text cleaning function
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply cleaning on the article and highlights columns
train_dataset['article'] = train_dataset['article'].apply(clean_text)
train_dataset['highlights'] = train_dataset['highlights'].apply(clean_text)

val_dataset['article'] = val_dataset['article'].apply(clean_text)
val_dataset['highlights'] = val_dataset['highlights'].apply(clean_text)

test_dataset['article'] = test_dataset['article'].apply(clean_text)
test_dataset['highlights'] = test_dataset['highlights'].apply(clean_text)


In [31]:
#Next, we'll segment the sentences in the articles, although this step may be optional depending on the dataset and model requirements.
import nltk
nltk.download('punkt')

def segment_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return ' '.join(sentences)

# Apply sentence segmentation on the dataset
train_dataset['article'] = train_dataset['article'].apply(segment_sentences)
val_dataset['article'] = val_dataset['article'].apply(segment_sentences)
test_dataset['article'] = test_dataset['article'].apply(segment_sentences)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [33]:
#handling outliars
# Remove articles that are too short or too long
def filter_outliers(text, min_len=50, max_len=1000):
    if len(text.split()) < min_len or len(text.split()) > max_len:
        return False
    return True

# Filter the datasets
train_dataset = train_dataset[train_dataset['article'].apply(filter_outliers)]
val_dataset = val_dataset[val_dataset['article'].apply(filter_outliers)]
test_dataset = test_dataset[test_dataset['article'].apply(filter_outliers)]

In [5]:
from datasets import load_from_disk

# Load the tokenized datasets
train_dataset = load_from_disk('data/train_tokenized')
val_dataset = load_from_disk('data/val_tokenized')
test_dataset = load_from_disk('data/test_tokenized')

print("Tokenized datasets loaded successfully!")


Tokenized datasets loaded successfully!


In [9]:
from transformers import PegasusForConditionalGeneration, Trainer, TrainingArguments, PegasusTokenizer
from datasets import load_from_disk

# Load the tokenizer and model
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')

def tokenize_data(example):
    inputs = tokenizer(
        example['article'],
        padding='max_length',  # Pad to the maximum length
        truncation=True,       # Truncate sequences to the max length
        max_length=512         # Set the maximum length (adjust as needed)
    )
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example['highlights'],
            padding='max_length',
            truncation=True,
            max_length=128       # Adjust as needed for the target summary length
        )
    
    inputs['labels'] = labels['input_ids']
    return inputs

# Apply tokenization to the datasets
train_dataset = train_dataset.map(tokenize_data, batched=True, batch_size=32)
val_dataset = val_dataset.map(tokenize_data, batched=True, batch_size=32)
test_dataset = test_dataset.map(tokenize_data, batched=True, batch_size=32)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/241457 [00:00<?, ? examples/s]



Map:   0%|          | 0/11199 [00:00<?, ? examples/s]

Map:   0%|          | 0/9567 [00:00<?, ? examples/s]

In [1]:
# Load the processed datasets
from datasets import load_from_disk
train_dataset = load_from_disk('./final_datasets/train_dataset')
val_dataset = load_from_disk('./final_datasets/val_dataset')
test_dataset = load_from_disk('./final_datasets/test_dataset')

print("Datasets loaded successfully!")


Datasets loaded successfully!


In [15]:
train_dataset = train_dataset.map(tokenize_data, batched=True, batch_size=32)
val_dataset = val_dataset.map(tokenize_data, batched=True, batch_size=32)
test_dataset = test_dataset.map(tokenize_data, batched=True, batch_size=32)


Map:   0%|          | 0/241457 [00:00<?, ? examples/s]

Map:   0%|          | 0/11199 [00:00<?, ? examples/s]

Map:   0%|          | 0/9567 [00:00<?, ? examples/s]

In [3]:
print(train_dataset.column_names)


['id', 'article', 'highlights', '__index_level_0__', 'input_ids', 'attention_mask']


In [19]:
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [None]:
from transformers import PegasusForConditionalGeneration, Trainer, TrainingArguments, PegasusTokenizer

# Load the tokenizer and model
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,  # Updated batch size
    per_device_eval_batch_size=16,   # Updated batch size
    num_train_epochs=2,              # Updated number of epochs
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

print("Trainer initialized!")
trainer.train()
print("Training complete!")

# Save the model and tokenizer
model.save_pretrained('./results/pegasus-summarizer')
tokenizer.save_pretrained('./results/pegasus-summarizer')

# Evaluate the model on the test dataset
eval_results = trainer.evaluate(test_dataset)

# Print the evaluation results
print(f"Test Loss: {eval_results['eval_loss']}")


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainer initialized!


Epoch,Training Loss,Validation Loss


In [7]:
# Sample 700 examples from each dataset
train_sample = train_dataset.shuffle(seed=42).select(range(700))
val_sample = val_dataset.shuffle(seed=42).select(range(700))
test_sample = test_dataset.shuffle(seed=42).select(range(700))

# Now use these samples for training and evaluation


In [5]:
import datasets

def preprocess_data(example):
    # Tokenize the input text (article) and the target text (highlights)
    inputs = tokenizer(example['article'], max_length=512, padding='max_length', truncation=True)
    labels = tokenizer(example['highlights'], max_length=128, padding='max_length', truncation=True)

    # Set the labels (decoder input ids)
    inputs['labels'] = labels['input_ids']

    return inputs

# Apply the preprocessing to the datasets
train_dataset = train_dataset.map(preprocess_data, batched=True, batch_size=32)
val_dataset = val_dataset.map(preprocess_data, batched=True, batch_size=32)
test_dataset = test_dataset.map(preprocess_data, batched=True, batch_size=32)

# Save the preprocessed datasets
train_dataset.save_to_disk('preprocessed_train_dataset')
val_dataset.save_to_disk('preprocessed_val_dataset')
test_dataset.save_to_disk('preprocessed_test_dataset')

print("Datasets have been tokenized, preprocessed, and saved.")


Map:   0%|          | 0/241457 [00:00<?, ? examples/s]

NameError: name 'tokenizer' is not defined

In [1]:
from datasets import load_from_disk

# Load the preprocessed datasets
train_dataset = load_from_disk('preprocessed_train_dataset')
val_dataset = load_from_disk('preprocessed_val_dataset')
test_dataset = load_from_disk('preprocessed_test_dataset')

# Reduce the size of the datasets to 1000 samples
train_sample = train_dataset.select(range(1000))
val_sample = val_dataset.select(range(100))
test_sample = test_dataset.select(range(100))

# Now you can use these smaller datasets for training


In [3]:
from transformers import PegasusForConditionalGeneration, Trainer, TrainingArguments

# Load the Pegasus model
model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_sample,
    eval_dataset=val_sample,
)

# Start the training process
trainer.train()

# Save the model and tokenizer
model.save_pretrained('./results/pegasus-small')


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,7.9798,6.77064
2,7.5948,6.676665


Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}
Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}
Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


In [13]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Specify the model path if you saved the model earlier
model_path = './results/pegasus-small'

# Reload the tokenizer and model
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
model = PegasusForConditionalGeneration.from_pretrained(model_path)


In [15]:
# Specify the path to save the model and tokenizer
save_directory = './results/pegasus-small'

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

# Save the model
model.save_pretrained(save_directory)

print("Model and tokenizer saved successfully!")


Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


Model and tokenizer saved successfully!


In [17]:
# Example text to summarize
text_to_summarize = """
Your input text goes here. This should be a long paragraph or a couple of paragraphs that you want to summarize.
"""

# Tokenize the input text
inputs = tokenizer(text_to_summarize, max_length=512, return_tensors='pt', truncation=True)

print("Input text tokenized successfully!")


Input text tokenized successfully!


In [19]:
# Generate the summary
summary_ids = model.generate(
    inputs['input_ids'], 
    max_length=64, 
    num_beams=8, 
    length_penalty=0.6, 
    early_stopping=True
)

# Decode the generated summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Summary generated successfully!")
print("\nSummary:\n", summary)


Summary generated successfully!

Summary:
 Do you have a short paragraph or two of paragraphs you want to include in this story?


In [23]:
summary_ids = model.generate(
    inputs['input_ids'], 
    max_length=128,  # Increase max length for more detailed summaries
    num_beams=5,     # Experiment with different numbers of beams
    length_penalty=1.0,  # Adjust length penalty to control summary length
    early_stopping=True
)


In [27]:
import gradio as gr
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Load the tokenizer and model
model_path = './results/pegasus-small'
tokenizer = PegasusTokenizer.from_pretrained(model_path)
model = PegasusForConditionalGeneration.from_pretrained(model_path)

def summarize(text):
    inputs = tokenizer(text, max_length=1024, return_tensors='pt', truncation=True)
    summary_ids = model.generate(
        inputs['input_ids'],
        max_length=150,
        num_beams=10,
        length_penalty=1.0,
        early_stopping=True,
        temperature=0.7
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

iface = gr.Interface(
    fn=summarize,
    inputs="text",
    outputs="text",
    title="Enhanced Pegasus Text Summarizer",
    description="Enter text to generate a summary using the fine-tuned Pegasus model with improved parameters."
)

iface.launch()


Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




In [55]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import gradio as gr
import re

# Load Pegasus model and tokenizer
pegasus_model_path = './results/pegasus-small'
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_path)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_path)

def preprocess_text(text):
    # Remove extra spaces, newlines, and tabs
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def generate_pegasus_summary(text):
    try:
        # Preprocess the text
        text = preprocess_text(text)

        # Add a lead-in prompt to the text
        prompt = "Summarize the following: "
        text = prompt + text

        # Tokenize the input text
        inputs = pegasus_tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)

        # Generate the summary
        summary_ids = pegasus_model.generate(
            **inputs,
            max_length=150,  # Adjust this if necessary
            min_length=30,   # Set a minimum length to encourage longer summaries
            num_beams=6,    # Reduce the beams to encourage diversity
            length_penalty=1.0,  # You can adjust this
            no_repeat_ngram_size=3,  # Prevents repetition of 3-grams
            early_stopping=False,  # Allow the model to continue until it generates a sufficient summary
            temperature=0.9,  # Increase temperature for more randomness
            top_k=50,  # Consider adding top_k for better diversity
            top_p=0.9,  # Consider adding top_p for better diversity
        )

        # Decode the summary
        summary = pegasus_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Post-process to remove the first line if it closely matches the input
        input_first_sentence = text.split('.')[0]
        if summary.startswith(input_first_sentence):
            summary = summary[len(input_first_sentence):].strip()

        return summary

    except Exception as e:
        return f"An error occurred: {str(e)}"

# Interface using Gradio
iface = gr.Interface(
    fn=generate_pegasus_summary, 
    inputs=gr.Textbox(lines=10, label="Input Text"),
    outputs="text",
    title="Dynamic Pegasus Summarizer",
    description="Generate a summary using the fine-tuned Pegasus model with improved dynamicity."
)

iface.launch()

Running on local URL:  http://127.0.0.1:7875

To create a public link, set `share=True` in `launch()`.


