# Importing dependencies

In [38]:
!pip install accelerate --upgrade

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [39]:
!pip install datasets --upgrade

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [40]:
!pip install torch torchvision torchaudio --upgrade

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [41]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import pandas as pd
import numpy as np

# Data

In [42]:
from datasets import load_dataset

# Load every available split of the XSum dataset from the Hugging Face Hub.
dataset = load_dataset("EdinburghNLP/xsum")

In [43]:
dataset.shape

{'train': (204045, 3), 'validation': (11332, 3), 'test': (11334, 3)}

In [44]:
dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [45]:
from datasets import concatenate_datasets

# Merge the train, validation and test splits (in that order) into one dataset
split_names = ('train', 'validation', 'test')
combined_dataset = concatenate_datasets([dataset[name] for name in split_names])

# Report the (rows, columns) shape of the merged dataset
print(combined_dataset.shape)


(226711, 3)


In [46]:
combined_dataset

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 226711
})

In [47]:
import random

# Fixed seed so the same subset is drawn on every run
random.seed(42)
sample_size = 22000

# Pick `sample_size` distinct row indices without replacement and keep only those rows
sample_indices = random.sample(range(combined_dataset.num_rows), sample_size)
sampled_dataset = combined_dataset.select(sample_indices)

In [48]:
sampled_dataset

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 22000
})

# Model

In [49]:
# Checkpoint name shared by the tokenizer and the model
model_name = 'google/flan-t5-base'

tokenizer = AutoTokenizer.from_pretrained(model_name)
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [50]:
import torch

# Report whether CUDA is available. When it is, `device` holds the GPU's
# name (a string), not a torch.device.
# NOTE(review): this cell only checks availability; it does not move the model
# to any device.
if not torch.cuda.is_available():
    print('T5 model is running on CPU')
else:
    device = torch.cuda.get_device_name(0)
    print(f'T5 model is running on GPU: {device}')


T5 model is running on GPU: Tesla P100-PCIE-16GB


In [51]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Despite its name, this function prints nothing: it returns the report as
    a string and leaves displaying it to the caller.

    Args:
        model: Any object whose ``named_parameters()`` yields
            ``(name, parameter)`` pairs, where each parameter exposes
            ``numel()`` and ``requires_grad``.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable share as a percentage with two
        decimals. A model without parameters is reported as ``0.00%``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count

    # Guard the division: a parameter-free model would otherwise raise
    # ZeroDivisionError.
    if all_model_params:
        percentage = 100 * trainable_model_params / all_model_params
    else:
        percentage = 0.0

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


# Tokenize Data

In [52]:
def tokenize_function(example):
    """Tokenize a batch of rows for summary -> article generation.

    The model input is each row's ``summary`` wrapped in a fixed prompt, and
    the target is the row's ``document``. Uses the notebook-level
    ``tokenizer``.

    Args:
        example: A batch mapping with list-valued ``"summary"`` and
            ``"document"`` entries (as supplied by ``Dataset.map`` with
            ``batched=True``).

    Returns:
        The same mapping with ``input_ids``, ``attention_mask`` and ``labels``
        added. Padding positions in ``labels`` are set to -100.
    """
    start_prompt = 'Generate news article about the following Text.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + summary + end_prompt for summary in example["summary"]]

    model_inputs = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt")
    example['input_ids'] = model_inputs.input_ids
    # Inputs are padded to a fixed length; without the mask the encoder would
    # attend to the padding positions as if they were real tokens.
    example['attention_mask'] = model_inputs.attention_mask

    labels = tokenizer(example["document"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    # -100 is the label value the loss ignores; leaving the pad id in place
    # would train the model to predict padding.
    example['labels'] = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

    return example

In [None]:
# Tokenize in batches, then drop the raw id/text columns
raw_columns = ['id', 'document', 'summary']
tokenized_datasets = sampled_dataset.map(tokenize_function, batched=True).remove_columns(raw_columns)

Map:   0%|          | 0/22000 [00:00<?, ? examples/s]

# Hugging Face

In [None]:
from huggingface_hub import notebook_login
# Ask for a Hugging Face access token inside the notebook.
# Presumably the stored credentials are what the later Hub uploads rely on —
# verify against the huggingface_hub documentation.
notebook_login()


# Training Arguments

In [None]:
import os

# Directory passed to the Trainer as its output_dir
path = "/kaggle/working/t5"

try:
    # exist_ok=True keeps this cell idempotent: re-running it no longer reports
    # a failure just because the directory already exists. makedirs also
    # creates any missing parent directories.
    os.makedirs(path, exist_ok=True)
except OSError as error:
    print(f"Creation of the directory {path} failed: {error}")
else:
    print(f"Directory {path} is ready")


In [None]:
import os

training_args = TrainingArguments(
    output_dir=path,
    push_to_hub=True,
    push_to_hub_model_id='Fake-news-gen',
    # Never hardcode an access token in a notebook: anyone who can read the
    # file can push to the account. The token is read from the HF_TOKEN
    # environment variable instead; when it is unset, None is passed, which is
    # expected to fall back to the credentials stored by notebook_login().
    # Any token that was ever saved in a notebook should be revoked on the Hub.
    push_to_hub_token=os.environ.get('HF_TOKEN'),
    logging_strategy="epoch",
    #evaluation_strategy="steps",
    num_train_epochs=10,
    auto_find_batch_size=True,
    #bf16=True,
    #eval_steps=500,
    save_total_limit=3,
)

# No eval_dataset is supplied, so this Trainer is configured for training only.
trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets

)

In [None]:
trainer.train()

In [None]:
trainer.push_to_hub()

In [None]:
# Write the tokenizer files to the local directory "Fake-news-gen" ...
# (The Trainer above is constructed without a tokenizer, so presumably its own
# push does not include these files — verify.)
tokenizer.save_pretrained("Fake-news-gen")
# ... and upload the tokenizer to the Hub repository named "Fake-news-gen".
tokenizer.push_to_hub("Fake-news-gen")

In [None]:
# Reload the fine-tuned model and tokenizer from the Hub repository.
# NOTE: this rebinds the notebook-level `tokenizer`. generate_article() below
# loads its own model and tokenizer from the name it is given, so it does not
# use these two objects.
model = AutoModelForSeq2SeqLM.from_pretrained('Ahmedhany216/Fake-news-gen')
tokenizer = AutoTokenizer.from_pretrained('Ahmedhany216/Fake-news-gen')

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def generate_article(input_text, model_name, max_length=500):
    """Generate an article from a short text with a seq2seq checkpoint.

    ``input_text`` is wrapped in the same start/end prompt strings that
    ``tokenize_function`` uses, so inference matches the training format.
    The model and tokenizer are loaded from ``model_name`` on every call.

    Args:
        input_text: The short text to expand into an article.
        model_name: Identifier passed to ``from_pretrained`` for both the
            model and the tokenizer.
        max_length: Token limit applied both when truncating the prompt and
            to the generated sequence.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    start_prompt = 'Generate news article about the following Text.\n\n'
    end_prompt = '\n\nSummary: '
    # Load pre-trained model and tokenizer
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Concatenate the prompts
    prompt = f'{start_prompt}{input_text}{end_prompt}'

    # Tokenize the input text
    input_ids = tokenizer.encode(prompt, return_tensors='pt', max_length=max_length, truncation=True)

    # Generate article. do_sample must be the boolean True: an undefined name
    # here raises NameError, and top_k / top_p / temperature only take effect
    # when sampling is enabled.
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=5,
        do_sample=True,
        length_penalty=0.6,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
    )

    # Decode and return the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Example usage
input_text = "the persident was killed in the conference"
generated_article = generate_article(input_text, 'Ahmedhany216/Fake-news-gen')
print(generated_article)
