In [2]:
!pip install transformers
!pip install Flask  




In [11]:
from datasets import load_dataset
raw_datasets = load_dataset("glue", "mrpc")

Downloading readme:   0%|          | 0.00/31.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/649k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [4]:
pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=8.0.0 (from datasets)
  Downloading pyarrow-14.0.2-cp311-cp311-win_amd64.whl.metadata (3.1 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp311-cp311-win_amd64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.9.1-cp311-cp311-win_amd64.whl.metadata (7.6 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp->datasets)
  Downloading multidict-6.0.4-cp311-cp311-win_am

In [5]:
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", "3.0.0")  # Load CNN/DailyMail dataset


Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [6]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

# Load your dataset (e.g., CNN/DailyMail)
dataset = load_dataset("cnn_dailymail", "3.0.0")
train_texts = dataset['train']['article'][:1000]  # Adjust the dataset size as needed

# Load pre-trained model and tokenizer
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize the dataset
tokenized_dataset = tokenizer(train_texts, truncation=True, padding=True)

# Define a custom collate function
def collate_fn(batch):
    return tokenizer(batch['input_ids'], truncation=True, padding=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./custom_summarizer',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
)

# Fine-tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=collate_fn,
)

#trainer.train()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
from flask import Flask, render_template, request
from transformers import T5ForConditionalGeneration, T5Tokenizer

app = Flask(__name__)

# Load your trained summarization model and tokenizer
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Define personalized prompt templates
prompt_templates = {
    'objective': 'In an objective manner, summarize the article: {}',
    'factual': 'Provide a factual summary of the article: {}',
    'humorous': 'Summarize the article in a humorous way: {}',
}

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/summarize', methods=['POST'])
def summarize():
    article = request.form['article']
    summary_length = int(request.form['summary_length'])
    style = request.form['style']

    # Design personalized prompt based on user choices
    prompt = prompt_templates.get(style, 'Summarize the article: {}').format(article)

    # Generate personalized summary
    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(**inputs, max_length=summary_length, length_penalty=2.0, num_beams=4, early_stopping=True)
    personalized_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return render_template('index.html', article=article, personalized_summary=personalized_summary)

if __name__ == '__main__':
    app.run(debug=True)


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [13]:
pip install accelerate -U


Collecting accelerateNote: you may need to restart the kernel to use updated packages.

  Downloading accelerate-0.26.1-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
   ---------------------------------------- 0.0/270.9 kB ? eta -:--:--
   - -------------------------------------- 10.2/270.9 kB ? eta -:--:--
   - -------------------------------------- 10.2/270.9 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/270.9 kB 325.1 kB/s eta 0:00:01
   ----- --------------------------------- 41.0/270.9 kB 217.9 kB/s eta 0:00:02
   -------- ------------------------------ 61.4/270.9 kB 297.7 kB/s eta 0:00:01
   ---------- ---------------------------- 71.7/270.9 kB 280.5 kB/s eta 0:00:01
   ------------- ------------------------- 92.2/270.9 kB 348.6 kB/s eta 0:00:01
   --------------- ---------------------- 112.6/270.9 kB 327.2 kB/s eta 0:00:01
   --------------- ---------------------- 112.6/270.9 kB 327.2 kB/s eta 0:00:01
   ------------