In [1]:
#!pip install accelerate -U
#!pip install transformers -U
import pandas as pd
from sklearn.model_selection import train_test_split
#!pip install datasets
from datasets import load_dataset

In [2]:
datapath = "/content/drive/MyDrive/NewsDataset.csv"

# The CSV is exposed as a single "train" split; carve it into three contiguous
# slices: first 70% for training, next 20% for testing, last 10% for validation.
train_dataset, test_dataset, valid_dataset = (
    load_dataset("csv", data_files=datapath, split=slice_spec)
    for slice_spec in ("train[:70%]", "train[70%:90%]", "train[90%:]")
)

In [3]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the distilgpt2 checkpoint.
checkpoint_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
def preprocess_function(examples):
    """Run the module-level ``tokenizer`` over the ``"text"`` column of a batch.

    Args:
        examples: batch mapping that contains a ``"text"`` entry.

    Returns:
        Whatever ``tokenizer`` returns for that batch of texts.
    """
    batch_texts = examples["text"]
    return tokenizer(batch_texts)

def tokenize(dataset):
    """Apply ``preprocess_function`` to ``dataset`` and drop its original columns.

    The mapping runs in batched mode with two worker processes. Every column
    present on the input dataset is removed, so only the columns produced by
    ``preprocess_function`` remain.

    Args:
        dataset: object exposing ``map`` and ``column_names``.

    Returns:
        The result of ``dataset.map``.
    """
    map_options = {
        "batched": True,
        "num_proc": 2,
        "remove_columns": dataset.column_names,
    }
    return dataset.map(preprocess_function, **map_options)

# Tokenize the three splits in the same order they were loaded.
train_dataset_tokenized, test_dataset_tokenized, valid_dataset_tokenized = (
    tokenize(split) for split in (train_dataset, test_dataset, valid_dataset)
)

In [5]:
block_size = 128
def group_texts(examples):
    """Concatenate a batch of tokenized texts and cut it into fixed-size blocks.

    Args:
        examples: mapping from column name to a list of per-example lists
            (one inner list per text). Must contain ``"input_ids"``.

    Returns:
        dict with the same keys as ``examples`` plus ``"labels"``. Each value
        is a list of chunks of ``block_size`` items. When the concatenated
        batch is at least ``block_size`` long, the trailing remainder that does
        not fill a whole block is dropped; a shorter batch is returned as one
        short chunk. ``"labels"`` is a shallow copy of the ``"input_ids"``
        chunk list.
    """
    # Local import keeps the function self-contained.
    from itertools import chain

    # Flatten every column in linear time. The previous `sum(lists, [])` built a
    # brand-new list at each step, which is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # All columns are assumed to have the same flattened length, so the first
    # one is used as the reference.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        # Round down to a whole number of blocks.
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Re-chunk each tokenized split into block_size-long language-modeling examples.
lm_dataset_train, lm_dataset_test, lm_dataset_valid = (
    tokenized_split.map(group_texts, batched=True, num_proc=6)
    for tokenized_split in (
        train_dataset_tokenized,
        test_dataset_tokenized,
        valid_dataset_tokenized,
    )
)

In [6]:
from transformers import DataCollatorForLanguageModeling

# Pad with the end-of-sequence token, and disable masked-LM mode (mlm=False)
# when building the collator.
tokenizer.pad_token = tokenizer.eos_token
collator_options = {"tokenizer": tokenizer, "mlm": False}
data_collator = DataCollatorForLanguageModeling(**collator_options)

In [31]:
import os

from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# SECURITY: an access token used to be hard-coded on this line. Anything that
# was committed or shared must be treated as leaked and revoked in the Hugging
# Face account settings. The token is now read from the HF_TOKEN environment
# variable (or Colab secret exported under that name). When it is unset,
# `None` is passed, which is the parameter's default and means anonymous
# access.
access_token = os.environ.get("HF_TOKEN")
model = AutoModelForCausalLM.from_pretrained("distilgpt2", token=access_token)

In [32]:
# Run configuration: two epochs, with an evaluation pass at the end of each.
training_args = TrainingArguments(
    output_dir="news_clm-model",
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    eval_accumulation_steps=16,
)

# Training uses the train split; the test split serves as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_dataset_train,
    eval_dataset=lm_dataset_test,
)

# Training call is left disabled, as in the original cell.
# trainer.train()

In [33]:
import math

# Perplexity is the exponential of the evaluation loss reported by the trainer.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 44.26


In [None]:
 #trainer.save_model("/kaggle/working/model/news_clm-model")
#!tar cvf fine-tuned-model.tar.gz /kaggle/working/model/news_clm-model

tar: Removing leading `/' from member names
/kaggle/working/model/news_clm-model/
/kaggle/working/model/news_clm-model/config.json
/kaggle/working/model/news_clm-model/training_args.bin
/kaggle/working/model/news_clm-model/model.safetensors
/kaggle/working/model/news_clm-model/generation_config.json


## Examples

In [28]:
from transformers import AutoModelForCausalLM

# Reload the model from the directory it was saved to.
saved_model_dir = "/content/drive/MyDrive/news_clm-model"  # <path_to_saved_model>
model = AutoModelForCausalLM.from_pretrained(saved_model_dir)

In [19]:
from transformers import AutoTokenizer, pipeline

# Fresh distilgpt2 tokenizer, padded with its end-of-sequence token, wired into
# a text-generation pipeline together with the loaded model on the CUDA device.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token
generator = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device="cuda",
)

In [None]:
# Standard library
import re

# Third-party metric / tokenization helpers.
# If missing, install with: pip install rouge rouge_score
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from rouge_score import rouge_scorer

# Fetch the 'punkt' resource from NLTK.
nltk.download('punkt')

In [17]:
def validset_preparation(min_word_count=5, max_word_count=20):
  """Build (prompt, last-word) pairs from the validation texts.

  Every text in ``valid_dataset['text']`` is split into sentences with
  ``nltk.sent_tokenize``. Each sentence is reduced to its ``\\w+`` words, and
  sentences whose word count falls outside the inclusive range
  ``[min_word_count, max_word_count]`` are skipped.

  Args:
    min_word_count: smallest number of words a sentence may have.
    max_word_count: largest number of words a sentence may have.

  Returns:
    A pair ``(sentences, labels)``. ``sentences[i]`` is the sentence's words
    without the last one, joined by single spaces. ``labels[i]`` is a list
    holding that last word.
  """
  word_re = re.compile(r'\b\w+\b')
  prompts, targets = [], []
  for document in valid_dataset['text']:
    for candidate in nltk.sent_tokenize(document):
      words = word_re.findall(candidate)
      if len(words) < min_word_count or len(words) > max_word_count:
        continue
      prompts.append(' '.join(words[:-1]))
      targets.append(words[-1:])
  return prompts, targets

sentences, labels = validset_preparation()

In [None]:
# Accumulators for the predicted words and the per-sentence metric values.
predict= []
bleu_scores = []
rouge_1_scores= []
rouge_2_scores= []
rouge_l_scores= []

import logging
# Only show WARNING and above from the tokenizer base module during generation.
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.WARNING)

# Walk prompts and their held-out last words together: the label is the
# reference the prediction has to be scored against.
for sentence, label in zip(sentences, labels):
  text = generator(sentence, num_return_sequences=1, return_full_text=False)
  # The continuation can be empty or whitespace-only; fall back to an empty
  # prediction instead of raising IndexError on `[0]`.
  generated_words = text[0]['generated_text'].split()
  predict_word = generated_words[0] if generated_words else ""
  predict.append(predict_word)

  # sentence_bleu expects a list of tokenized references and a tokenized
  # hypothesis. Passing raw strings (as before) compared characters of the
  # prediction against characters of the prompt. `label` is already a token
  # list. Unigram-only weights are used because both sides are one token long,
  # so the default 4-gram weighting would score every pair as ~0.
  bleu_score = sentence_bleu([label], [predict_word], weights=(1.0,))
  bleu_score = round(bleu_score, 4)
  bleu_scores.append(bleu_score)