In [None]:
!pip install atlassian-python-api

from atlassian import Confluence
import os

# Set up the Confluence API connection. No credentials are passed here, so
# presumably only content readable anonymously at this URL can be fetched.
confluence = Confluence(
    url='https://advendio.atlassian.net',
)

space_key = "SO"
# NOTE(review): get_all_pages_from_space is called with its default paging
# arguments; it may return only the first batch of pages -- confirm against
# the atlassian-python-api documentation if the space is large.
pages = confluence.get_all_pages_from_space(space_key)

# Create the directory that stores the downloaded pages (no-op if it exists).
output_dir = 'advendio_pages'
os.makedirs(output_dir, exist_ok=True)

# Download each page's storage-format body into its own .html file.
for page in pages:
    page_id = page['id']
    page_title = page['title']
    # A path separator inside a title would point into a non-existent
    # sub-directory and make the write fail, so neutralise it as well.
    safe_title = page_title.replace(' ', '_').replace('/', '_').replace(os.sep, '_')
    page_filename = safe_title + '.html'
    page_content = confluence.get_page_by_id(page_id, expand='body.storage')['body']['storage']['value']
    try:
        # Explicit UTF-8 so non-ASCII page content does not depend on the
        # platform's default encoding.
        with open(os.path.join(output_dir, page_filename), 'w', encoding='utf-8') as f:
            f.write(page_content)
    except (OSError, UnicodeError) as exc:
        # Stay best-effort (keep going), but report the failure instead of
        # silently claiming the page was downloaded.
        print('Failed to write:', page_filename, '-', exc)
        continue
    print('Downloaded:', page_filename)


In [None]:
!pip install transformers
!pip install datasets

from datasets import load_dataset
# Load every downloaded page through the "text" builder into one "train" split.
html_glob = "./advendio_pages/*.html"
datasets = load_dataset("text", data_files={"train": html_glob})
datasets

In [None]:
# Carve an 80/20 split out of the loaded "train" data.
# The guard keeps this cell idempotent: once a "validation" split exists,
# re-running the cell no longer re-splits (and shrinks) the training subset.
if 'validation' not in datasets:
    # Fixed seed so the shuffle/split is reproducible across kernel restarts.
    datasets = datasets['train'].train_test_split(test_size=0.2, seed=42)
    # The held-out part is exposed under "validation" too; the "test" key is
    # kept so anything relying on it keeps working.
    datasets['validation'] = datasets['test']
from transformers import AutoTokenizer


In [None]:
# Checkpoint name used to load the tokenizer.
model_checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def tokenize_function(examples):
    """Run the module-level tokenizer over the ``text`` column of a batch."""
    encoded = tokenizer(examples["text"])
    return encoded


# Tokenize every split in batches and drop the raw "text" column.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)


In [None]:
# Number of tokens per training example produced by group_texts.
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized texts and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name (must include ``"input_ids"``) to a
            list of per-example lists.

    Returns:
        A dict with the same keys, where each value is a list of chunks of
        exactly ``block_size`` items, plus a ``"labels"`` entry that is a
        (shallow) copy of the ``"input_ids"`` chunks. The trailing remainder
        shorter than ``block_size`` is dropped.
    """
    # Local import keeps the cell self-contained.
    from itertools import chain

    # Concatenate all texts. chain.from_iterable is linear, whereas
    # sum(list_of_lists, []) re-copies the accumulated list on every step.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; padding could be added instead if the
    # model supported it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


# Regroup the tokenized splits into fixed-length blocks, 1000 rows per batch,
# using four worker processes.
map_options = dict(batched=True, batch_size=1000, num_proc=4)
lm_datasets = tokenized_datasets.map(group_texts, **map_options)


In [None]:
from transformers import AutoModelForCausalLM
from transformers import Trainer, TrainingArguments

# Disable Weights & Biases logging. This is set BEFORE TrainingArguments and
# Trainer are constructed: reporting integrations are presumably resolved when
# those objects are built, so setting it afterwards may have no effect
# (confirm against the installed transformers version).
os.environ["WANDB_DISABLED"] = "true"

# Load the causal-LM weights for the same checkpoint the tokenizer came from.
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

# Output directory is derived from the last path component of the checkpoint.
model_name = model_checkpoint.split("/")[-1]
# NOTE(review): `evaluation_strategy` may be named `eval_strategy` in newer
# transformers releases -- verify against the installed version.
training_args = TrainingArguments(
    f"{model_name}-finetuned-confluence",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets['validation'],
)


In [None]:
# Start fine-tuning with the `trainer` object configured in the previous cell.
trainer.train()

