In [None]:
import torch
import os

# Prefer a CUDA GPU when one is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'Using device: {device}')


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m143.4/232.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, \
    TrainingArguments, Trainer, DataCollatorWithPadding
from torch.utils.data import Dataset

# GPT-2 checkpoints that can be used here (sizes as noted by the notebook author):
#   'gpt2'        (Small):  124 million parameters
#   'gpt2-medium' (Medium): 345 million parameters
#   'gpt2-large'  (Large):  774 million parameters
#   'gpt2-xl'     (XL):     1.5 billion parameters
#
# One constant keeps the tokenizer and the model on the same checkpoint;
# change it here to switch model size.
MODEL_NAME = "gpt2-medium"

# Load the pre-trained GPT-2 tokenizer and the language-model head.
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

# If the tokenizer defines no padding token, reuse end-of-sequence so that
# fixed-length padding (padding="max_length") is possible.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Custom line-per-example dataset for causal language-model fine-tuning
class CustomDataset(Dataset):
    """Text dataset that yields one fixed-length training example per line.

    Args:
        tokenizer: Callable tokenizer. It is invoked with ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors="pt"`` and must
            return a mapping containing ``input_ids`` and ``attention_mask``
            tensors with a leading batch dimension of 1.
        file_path: Path of the UTF-8 text file to read. Blank or
            whitespace-only lines are skipped because they would produce
            examples made entirely of padding.
        block_size: Number of tokens every example is padded or truncated to.
    """

    def __init__(self, tokenizer, file_path, block_size):
        self.tokenizer = tokenizer
        self.block_size = block_size
        with open(file_path, "r", encoding="utf-8") as f:
            self.text = [line for line in f.read().splitlines() if line.strip()]

    def __len__(self):
        """Return the number of non-blank lines loaded from the file."""
        return len(self.text)

    def __getitem__(self, idx):
        """Tokenize line ``idx`` and return a dict of 1-D tensors.

        The dict holds every tensor the tokenizer produced (with the batch
        dimension removed) plus ``labels``: a copy of ``input_ids`` in which
        padded positions are set to -100, the index the language-model loss
        ignores.
        """
        encoded = self.tokenizer(
            self.text[idx],
            truncation=True,
            padding="max_length",
            max_length=self.block_size,
            return_tensors="pt")
        # Drop the batch dimension of size 1; batching is the collator's job.
        example = {key: value.squeeze(0) for key, value in encoded.items()}
        # Clone so labels never alias input_ids. Mask by attention_mask rather
        # than by token id, because the pad token may be the same id as EOS.
        labels = example["input_ids"].clone()
        labels[example["attention_mask"] == 0] = -100
        example["labels"] = labels
        return example

# Load data: examples are built line by line from the manual's text file,
# each padded/truncated to 128 tokens.
data = CustomDataset(tokenizer, "data/Kenya-Rice-Cultivation-Manual.txt", 128)

# Collate individual examples into batches using the tokenizer's padding rules.
# The dataset already pads every example to a fixed length, so the collator
# mainly stacks the examples into batch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments and Trainer.
# No evaluation is configured: the evaluation strategy is left at its default
# ("no"). The older `evaluation_strategy=` keyword is deprecated in recent
# transformers releases in favour of `eval_strategy=`, so it is not passed
# explicitly here.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    num_train_epochs=5,  # Increase to train for more passes over the fine-tuning data
    learning_rate=1e-4,  # Consider lowering this when the fine-tuning set is small
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=False,
    remove_unused_columns=False,  # the dataset yields ready-made tensors; keep every key
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
    eval_dataset=None,  # You can specify an evaluation dataset here
    data_collator=data_collator,
)

trainer.train()


In [None]:
# Put the model in evaluation mode so that dropout layers are disabled.
model.eval()

# Prompt text for the model to complete
prompt_text = "Plants require at least 16 elements for normal growth and for completion of their life cycle."

# Tokenize the prompt once; input_ids and attention_mask come from the same encoding.
encoded_prompt = tokenizer(prompt_text, return_tensors="pt")

# Move the inputs to the device the model's weights are actually on. After
# training, that is wherever the Trainer placed the model, which is not
# guaranteed to match the notebook-level `device` variable.
input_ids = encoded_prompt.input_ids.to(model.device)
attention_mask = encoded_prompt.attention_mask.to(model.device)

# Generate text from the model (beam search combined with sampling).
output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    pad_token_id=tokenizer.pad_token_id,
    max_length=1000,  # upper bound on prompt + generated tokens
    num_beams=5,
    temperature=1.5,
    top_k=50,
    do_sample=True  # Enable sampling so the temperature/top_k settings take effect
)

# Decode the first returned sequence back to a string
generated_text = tokenizer.decode(output[0],
                                  skip_special_tokens=True)

print(generated_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Processing pdfs/Kenya Rice-Cultivation-Manual.pdf
Created 31 training examples

Dataset Statistics:
total_examples: 31.00
avg_text_length: 3125.06
avg_tokens: 473.45
min_tokens: 162.00
max_tokens: 512.00


In [None]:
## Retrieve embeddings
input_text = "Cuidad Acuna, Mexico"
input_tokens = tokenizer(input_text, return_tensors='pt')

# Move the tokens to the device the model's weights are actually on, rather
# than assuming it matches the notebook-level `device` variable.
input_tokens = {k: v.to(model.device) for k, v in input_tokens.items()}

# Forward pass without gradient tracking, requesting every layer's hidden states
with torch.no_grad():
    outputs = model(**input_tokens, output_hidden_states=True)

# Only take the hidden states (ignore other outputs)
hidden_states = outputs.hidden_states

# Embeddings from the last layer of the model.
# This tensor is 3D: (batch, sequence length, hidden size).
last_layer_embeddings = hidden_states[-1]

# Mean pool across the sequence-length dimension -> (batch, hidden size)
mean_pooled = last_layer_embeddings.mean(dim=1)

# Drop the batch dimension (a single input sentence) -> 1-D embedding vector
mean_pooled_embedding = mean_pooled.squeeze(dim=0)


In [None]:
# Show the mean-pooled embedding tensor computed in the previous cell.
print(mean_pooled_embedding)


In [None]:
# Size of the embedding's first dimension (for the 1-D pooled vector, its length).
print(mean_pooled_embedding.shape[0])
