In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling

# One checkpoint id shared by the tokenizer and the model so the two always match.
MODEL_CHECKPOINT = "openai-community/gpt2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(MODEL_CHECKPOINT)

# mlm=False: batches are prepared for causal (next-token) language modeling
# instead of masked language modeling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)


In [None]:
import torch
import numpy as np
import logging
import warnings

# Keep cell output short: hide every Python warning and raise the root logger
# threshold to CRITICAL.
# NOTE(review): this also hides deprecation and training-time warnings.
warnings.filterwarnings('ignore')
logging.getLogger().setLevel(logging.CRITICAL)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

In [None]:
# Move the model to the device selected earlier ('cuda' if available, else 'cpu').
model = model.to(device)
# Mount Google Drive at /content/drive so the document folder can be read.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — this cell will fail elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Folder (on the mounted Drive) that holds one plain-text file per document.
docs_txt_folder = '/content/drive/MyDrive/docs_txt/'

# Parallel lists: index i of each list refers to the same .txt file.
filenames_without_extension = []  # document titles (file name minus extension)
file_contents = []                # raw text of each file
formatted_documents = []          # title + content wrapped in marker strings

# Directory listing, sorted by name so the document order is stable across runs.
files = sorted(os.listdir(docs_txt_folder))

for filename in files:
    # Only entries whose name ends in '.txt' are treated as documents.
    if not filename.endswith('.txt'):
        continue

    # The title is the file name with its extension stripped.
    basename, extension = os.path.splitext(filename)
    filenames_without_extension.append(basename)

    # Read the whole file as UTF-8 text.
    filepath = os.path.join(docs_txt_folder, filename)
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    file_contents.append(content)

    # Training text layout: title marker, body, then GPT-2's end-of-text marker.
    # NOTE(review): `<|title|>` / `</|title|>` are written as literal text; no cell
    # visible here registers them with the tokenizer, so they are presumably split
    # into ordinary sub-tokens — confirm whether that is intended.
    formatted_document = f"<|title|>{basename}</|title|>{content}<|endoftext|>"
    formatted_documents.append(formatted_document)


In [None]:
from torch.utils.data import Dataset, random_split

class TextDataset(Dataset):
    """Map-style dataset that serves each formatted document as ``{"text": ...}``."""

    def __init__(self, formatted_documents):
        """Store the sequence of document strings (kept by reference, not copied)."""
        self.formatted_documents = formatted_documents

    def __len__(self):
        """Return how many documents the dataset holds."""
        docs = self.formatted_documents
        return len(docs)

    def __getitem__(self, idx):
        """Return the document at ``idx`` wrapped in a single-key dict."""
        document = self.formatted_documents[idx]
        return dict(text=document)

# Wrap the formatted documents in the torch-style TextDataset defined above.
# NOTE(review): `documents` is not referenced by any later cell visible here
# (training uses `tokenized_dataset`) — confirm whether it is still needed.
documents = TextDataset(formatted_documents)


In [None]:
from datasets import Dataset
import pandas as pd
# Build a Hugging Face dataset with a single "text" column, one row per document.
df = pd.DataFrame(formatted_documents, columns=['text'])
dataset = Dataset.from_pandas(df)

# Every example is padded or truncated to exactly this many tokens.
MAX_LENGTH = 512

# The tokenizer needs a padding token for padding="max_length"; reuse EOS for it.
# This is done once here instead of inside the mapped function, so the shared
# tokenizer is not re-mutated on every batch and the setup does not depend on
# the callback actually being invoked.
# NOTE(review): with pad == EOS, the language-modeling collator presumably masks
# real <|endoftext|> positions out of the loss together with the padding — confirm
# this is acceptable for the intended generation behavior.
tokenizer.pad_token = tokenizer.eos_token


def preprocess_function(examples, max_length=MAX_LENGTH):
    """Tokenize a batch of documents to a fixed length.

    Args:
        examples: Batch mapping passed by ``dataset.map(..., batched=True)``;
            its ``"text"`` entry holds the document strings.
        max_length: Token count every example is padded/truncated to.
            Defaults to ``MAX_LENGTH`` (512).

    Returns:
        The tokenizer output for the batch.

    Note:
        ``truncation=True`` drops any tokens beyond ``max_length``, so the tail
        of longer documents is not seen during training.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


tokenized_dataset = dataset.map(preprocess_function, batched=True)



In [None]:
from transformers import TrainingArguments

# Training configuration collected in one mapping, then unpacked into
# TrainingArguments.
training_config = dict(
    output_dir="./results",        # where checkpoints / outputs are written
    overwrite_output_dir=True,     # reuse output_dir even if it already has content
    num_train_epochs=3,            # passes over the training set
    per_device_train_batch_size=2, # examples per device per step
    save_steps=10_000,             # checkpoint interval, in steps
    save_total_limit=2,            # cap on the number of checkpoints kept
    logging_dir="./logs",          # directory for training logs
)
training_args = TrainingArguments(**training_config)

In [None]:
from transformers import Trainer
# Assemble the Trainer from the model, the tokenized dataset, the collator and
# the training arguments prepared in the earlier cells.
trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    args=training_args,
)


In [None]:
# Run fine-tuning with the Trainer configured above.
# NOTE(review): checkpoints are only written every `save_steps` (10_000) steps and
# no cell visible here saves the final model explicitly — confirm the trained
# weights are persisted somewhere before the runtime is discarded.
trainer.train()