In [None]:
# -*- coding: utf-8 -*-
"""FineTuning.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1Fj-pVzk0N1yxTdZ75sNJ-DTKaloNDEyO
"""

from google.colab import files
uploaded = files.upload()

# Load the full text of skincare.txt from the working directory as UTF-8.
with open('skincare.txt', mode='r', encoding='utf-8') as fh:
    content = fh.read()

# Sanity check: show only the leading 500 characters of the file.
print(content[0:500])

# IPython shell escape (not plain Python): install the packages imported below.
# `--quiet` suppresses most of pip's output in the notebook.
!pip install transformers torch accelerate sentencepiece --quiet

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Pick the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and seq2seq model both come from the same FLAN-T5 checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained(
    "google/flan-t5-base",
    device_map='auto',
    # bfloat16 weights on GPU, full float32 precision on CPU.
    torch_dtype=torch.float32 if device != "cuda" else torch.bfloat16,
)

def extract_keywords(text, max_keywords=15):
    """Ask the module-level seq2seq model for keywords describing ``text``.

    A prompt requesting ``max_keywords`` keywords is built around ``text``,
    tokenized (truncated to 512 tokens), and passed to ``model.generate`` with
    sampling enabled, so repeated calls may return different results.

    Args:
        text: Passage to extract keywords from. Because the prompt is
            truncated to 512 tokens, the tail of a long passage is dropped.
        max_keywords: Number of keywords requested in the prompt and the
            upper bound on the length of the returned list.

    Returns:
        A list of distinct, whitespace-stripped keyword strings in the order
        the model produced them. It holds at most ``max_keywords`` entries and
        may hold fewer (or none) if the model generated less.
    """
    prompt = (f"Extract exactly {max_keywords} distinct keywords or key phrases from the following text:\n\n{text}\n\nKeywords:")
    # Move the inputs to wherever the *current* model lives. The global
    # `model` is rebound by later cells, so a separately tracked device
    # string can go stale and cause a device-mismatch error.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=200,
        do_sample=True,
        temperature=0.7,
        top_k=60,
        top_p=0.95,
        repetition_penalty=1.2,
        num_return_sequences=1
    )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # The model is asked for a comma-separated list; drop blanks and
    # case-insensitive repeats so the result matches the "distinct" request.
    keywords_list = []
    seen = set()
    for candidate in decoded.split(","):
        keyword = candidate.strip()
        folded = keyword.casefold()
        if keyword and folded not in seen:
            seen.add(folded)
            keywords_list.append(keyword)

    # The prompt cannot force the model to stop at max_keywords, so cap here.
    return keywords_list[:max(max_keywords, 0)]


# Smoke test: run the extractor over the uploaded text and display the result.
keywords = extract_keywords(text=content, max_keywords=15)
print("Extracted Keywords:", keywords)

# IPython shell escape (not plain Python): install the packages for the
# fine-tuning cells. Compared with the earlier install line, this adds
# `datasets`; pip output is not suppressed here.
!pip install transformers datasets torch accelerate

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset

from google.colab import files
uploaded = files.upload()  # Upload the keyword dataset as a JSON Lines (.jsonl) file

from datasets import load_dataset
if not uploaded:
    raise ValueError("No dataset file was uploaded; cannot build the training set.")
# Read the file under the name it was actually uploaded as, rather than a
# hard-coded name that may not match what the user selected.
dataset_file = next(iter(uploaded))
# load the dataset from your uploaded file
dataset = load_dataset('json', data_files=dataset_file, split='train')
# split dataset into train and validation sets; a fixed seed keeps the split
# (and therefore the evaluation numbers) reproducible across runs
dataset = dataset.train_test_split(test_size=0.1, seed=42)

model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load full-precision (float32) weights for fine-tuning: the optimizer then
# updates float32 parameters, and any reduced precision is applied by the
# Trainer's mixed-precision setting rather than baked into the weights, where
# small updates would be rounded away. No device_map is passed because the
# Trainer places the model on its own training device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Token budgets: prompts are cut at 256 tokens, target keyword strings at 64.
max_input_length = 256
max_output_length = 64


def preprocess(examples):
    """Tokenize one batch of records into model inputs plus label ids.

    Reads the ``'text'`` and ``'keywords'`` columns of ``examples``. Each text
    is prefixed with the instruction ``"Extract keywords: "`` before encoding;
    the token ids of the keyword strings are stored under ``"labels"``.
    """
    prefix = "Extract keywords: "
    prompts = [prefix + passage for passage in examples['text']]
    encoded = tokenizer(prompts, truncation=True, max_length=max_input_length)
    target_ids = tokenizer(
        examples['keywords'],
        truncation=True,
        max_length=max_output_length,
    )["input_ids"]
    encoded["labels"] = target_ids
    return encoded


# Apply the tokenization batch-wise to every split of the dataset.
tokenized_dataset = dataset.map(preprocess, batched=True)

import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and dropped the old spelling; pick whichever the installed version accepts
# so this cell works regardless of which release pip resolved.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Use bfloat16 mixed precision instead of float16: T5-family models are prone
# to overflow in float16, and bfloat16 is the dtype this notebook already uses
# for the model on GPU. Fall back to full precision when bf16 is unavailable.
_use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-keywords",  # checkpoints are written here
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,  # keep only the two most recent checkpoints
    num_train_epochs=5,
    predict_with_generate=True,  # evaluate with generate() instead of teacher forcing
    bf16=_use_bf16,
    logging_steps=10,
    **{_eval_strategy_arg: "epoch"},  # run evaluation at the end of every epoch
)

import inspect

# Pads inputs and labels per batch for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Recent transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Choose the keyword from the installed signature so both keep working.
_tokenizer_arg = (
    "processing_class"
    if "processing_class" in inspect.signature(Seq2SeqTrainer.__init__).parameters
    else "tokenizer"
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    **{_tokenizer_arg: tokenizer},
)

trainer.train()

trainer.save_model("./flan-t5-keywords-finetuned")
# The inference cell reloads the tokenizer from this same directory, so write
# it explicitly rather than relying on the Trainer to have saved it.
tokenizer.save_pretrained("./flan-t5-keywords-finetuned")

from transformers import pipeline
# Reload the fine-tuned weights and tokenizer from the directory written after training.
model = AutoModelForSeq2SeqLM.from_pretrained("./flan-t5-keywords-finetuned")
tokenizer = AutoTokenizer.from_pretrained("./flan-t5-keywords-finetuned")
# from_pretrained leaves the model on the CPU; pass an explicit device so the
# pipeline runs on the GPU when one is available (-1 selects the CPU).
keyword_extractor = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

from google.colab import files
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; nothing to extract keywords from.")
# Extract filename from uploaded file (the first one if several were selected)
filename = next(iter(uploaded))
# Read the file content; decode explicitly as UTF-8 so the result does not
# depend on the platform's default encoding
with open(filename, "r", encoding="utf-8") as file:
    text = file.read()
# Run inference with the loaded text. The prompt prefix matches the one used
# during fine-tuning; truncation keeps an arbitrarily long file within the
# tokenizer's maximum input length instead of feeding it through whole.
result = keyword_extractor(f"Extract keywords: {text}", max_length=64, truncation=True)
print("Extracted Keywords:", result[0]['generated_text'])