In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
import re
import string
import random
import torch
import transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

In [3]:
data = pd.read_csv("QA1.csv")

In [4]:
data = data.apply(lambda x: x.str.encode('utf-8').str.decode('utf-8'))

In [5]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [6]:

tokenized_data =  data["answers"].apply(lambda x: tokenizer.encode(x, truncation=True))

In [7]:
# Define padding function
def pad_sequence(seq, max_len):
    seq_len = len(seq)
    if seq_len < max_len:
        seq = seq + [0] * (max_len - seq_len)
    elif seq_len > max_len:
        seq = seq[:max_len]
    return seq


In [8]:
# Pad sequences
max_len = 200
padded_data = [pad_sequence(seq, max_len) for seq in tokenized_data.values]
# Convert integers to strings and pad with spaces
padded_strings = [[' ' + chr(i) for i in seq] for seq in padded_data]
padded_strings = [[s[1:] for s in seq] for seq in padded_strings]
padded_strings = [pad_sequence(seq, max_len) for seq in padded_strings]
# Vectorize padded data
vectorized_data = np.vectorize(lambda x: ord(x))(padded_strings)

In [9]:
# Flatten padded_data
padded_data = np.array([np.ravel(x) for x in padded_data]).astype(np.int64)

# Create attention masks
attention_masks = np.where(padded_data != 0, 1, 0)

In [10]:
from sklearn.model_selection import train_test_split

train_data, val_data, train_masks, val_masks = train_test_split(padded_data, attention_masks, test_size=0.1)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device)

train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_data), torch.tensor(train_masks))
val_dataset = torch.utils.data.TensorDataset(torch.tensor(val_data), torch.tensor(val_masks))

batch_size = 8
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*10)


Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



In [12]:
epochs = 6

for epoch in range(1, epochs+1):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        inputs, masks = (i.to(device) for i in batch)
        model.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=inputs)
        loss = outputs[0]
        loss.backward()
        total_loss += loss.item()
        optimizer.step()
        scheduler.step()
    
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    
    # Evaluate the model on the validation set
    model.eval()
    total_eval_loss = 0
    for batch in val_dataloader:
        inputs, masks = (i.to(device) for i in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=inputs)
            loss = outputs[0]
            total_eval_loss += loss.item()
    
    avg_eval_loss = total_eval_loss / len(val_dataloader)
    print("Average validation loss: {}".format(avg_eval_loss))

Average train loss: 4.956582762978294
Average validation loss: 2.825577139854431
Average train loss: 2.733716531233354
Average validation loss: 2.0184157490730286
Average train loss: 2.4163750518452036
Average validation loss: 1.9052411913871765
Average train loss: 2.2538670193065298
Average validation loss: 1.8454601764678955
Average train loss: 2.196279612454501
Average validation loss: 1.8090587258338928
Average train loss: 2.1663035587830977
Average validation loss: 1.7827946543693542


In [13]:
# Save the model
torch.save(model.state_dict(), 'fine_tune.pt')

In [14]:
# Download the model
from google.colab import files
files.download('fine_tune.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [23]:
# Set the device to use
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Set the model to use evaluation mode
model.eval()
# Set the seed for reproducibility
torch.manual_seed(42)
prompt = "Est-ce qu’une personne physique soumise à l’impôt sur le revenu selon le régime forfaitaire dans la catégorie des « bénéfices industriels et commerciaux » est soumise à la TFP?"

In [24]:
# Generate text
generated_text = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(device)
output_text = model.generate(
    generated_text,
    max_length=200,
    temperature=0.8,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.2,
    num_return_sequences=1,
)

# Decode the generated text
decoded_output = tokenizer.decode(output_text[0], skip_special_tokens=True)

print(decoded_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Est-ce qu’une personne physique soumise à l’impôt sur le revenu selon le régime forfaitaire dans la catégorie des « bénéfices industriels et commerciaux » est soumise à la TFP?!!!" exclaimed the crowd.
\r*^!'T'assure un peut jour du jour de définement pour résistance par une même économique, cœur les jeunes sont entreprises exerteurs aux chambres qui ne revoir que ce faut pas en français : il nous savant au vivant où pendant iciée autoutes recherches suprised toute comprends dispatchers ont voilà état demande prisitifilitérale?!,—avez nos filons recomm
