In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Load pre-trained model and tokenizer with a different size
model_name = "gpt2-medium"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Load and preprocess the training data
train_file = r"C:\Users\inevi\Folder6= BusinessQuant task LLM fine tuning\sample_data.txt.txt"

train_dataset = TextDataset( tokenizer=tokenizer, file_path=train_file, block_size=256)

data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=False)

# Fine-tune the model
output_dir = "./fine-tuned-model"
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,  # Adjust batch size as needed
    save_steps=10_000,
    save_total_limit=2,
    gradient_accumulation_steps=2,  # Adjust as needed
    learning_rate=5e-5,  # Experiment with learning rate
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)
trainer.train()

# Save the fine-tuned model
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


***** Running training *****
  Num examples = 30
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 6
  Number of trainable parameters = 354823168


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in ./fine-tuned-model\config.json
Model weights saved in ./fine-tuned-model\pytorch_model.bin
tokenizer config file saved in ./fine-tuned-model\tokenizer_config.json
Special tokens file saved in ./fine-tuned-model\special_tokens_map.json


('./fine-tuned-model\\tokenizer_config.json',
 './fine-tuned-model\\special_tokens_map.json',
 './fine-tuned-model\\vocab.json',
 './fine-tuned-model\\merges.txt',
 './fine-tuned-model\\added_tokens.json')

In [13]:
import torch
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model_name = "./fine-tuned-model"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load your dataset into a pandas DataFrame (replace this with your actual data loading)
# For example, assuming you have a CSV file 'your_dataset.csv'
dataset_path = 'LLM-Sample-Input-File.csv'
df = pd.read_csv(dataset_path)

def generate_response(question, df, filter_attributes=None, generated_sequence=None):
    # Tokenize the input question
    input_ids = tokenizer.encode(question, return_tensors="pt").to(device)

    # Generate a response
    output = model.generate(
        input_ids,
        max_length=50,
        num_beams=5,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=0.95,
        temperature=0.7
    )

    # Decode the generated output
    generated_sequence = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the number from the generated sequence using your logic
    generated_number = extract_number(generated_sequence, df, question, filter_attributes)

    return generated_number

def extract_number(generated_sequence, df, question, filter_attributes=None):
    # Default conditions (e.g., quarter and year)
    conditions = [df['Period'] == '2019 Q1']

    # Additional conditions based on the question style
    if 'Potato Inc.' in question:
        conditions.append(df['Company Name'] == 'Potato Inc.')
    # Add more conditions based on other entities in your dataset

    # Additional conditions based on specified filter attributes
    if filter_attributes:
        for attribute, value in filter_attributes.items():
            # Assuming the attribute is present in the DataFrame
            conditions.append(df[attribute] == value)

    # Apply conditions to filter the DataFrame
    filtered_df = df
    for condition in conditions:
        filtered_df = filtered_df[condition]

    # Extract the specific number from the filtered DataFrame
    # This assumes 'Value - Randomized' is the column containing the desired numbers
    number = filtered_df['Value - Randomized'].values[0] if not filtered_df.empty else None

    return number

# Example query
question = "What is the revenue in quarter 1 of year 2019 for Potato Inc. in Rest of Asia Pacific?"
filter_attributes = {'Category': 'Revenue By Region', 'Sub Cat': 'Rest of Asia Pacific'}
generated_sequence = {'Category': 'Revenue By Region', 'Sub Cat': 'Rest of Asia Pacific'}
response = generate_response(question, df, filter_attributes, generated_sequence)
print(response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


20784000000.0


  filtered_df = filtered_df[condition]
