In [None]:
!pip install transformers datasets wandb

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
from huggingface_hub import login
#Removed login for security
login("")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer
from sklearn.model_selection import KFold
import pandas as pd
from datasets import Dataset
import wandb
import gc

# Load the dataset
data_path = 'relevant_questions_answers.xlsx'  # Your file path
df = pd.read_excel(data_path, nrows=500)  # Smaller subset to fit in memory
df = df[['Relevant_Question', 'Relevant_Answer']]

# Initialize W&B
wandb.login()
wandb.init(project="AutoProphet")

# Load the tokenizer and model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Define tokenization function
def tokenize_function(examples):
    inputs = tokenizer(
        [f"{q} {a}" for q, a in zip(examples["Relevant_Question"], examples["Relevant_Answer"])],
        padding="max_length",
        truncation=True,
        max_length=128
    )
    inputs["labels"] = inputs["input_ids"]
    return inputs

# Define cross-validation parameters
k = 5  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Prepare dataset for cross-validation
dataset = Dataset.from_pandas(df)
dataset = dataset.map(tokenize_function, batched=True, remove_columns=["Relevant_Question", "Relevant_Answer"])

# Cross-validation loop
results = []
for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
    print(f"Starting fold {fold + 1}/{k}")

    # Create train and validation datasets for this fold
    train_dataset = dataset.select(train_idx)
    eval_dataset = dataset.select(val_idx)

    # Reload the model for each fold
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f"./results_fold_{fold}",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir=f'./logs_fold_{fold}',
        load_best_model_at_end=True,
        save_strategy="epoch",
        report_to="wandb",
        gradient_checkpointing=True,
        fp16=True,
    )

    # Create Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Train and evaluate
    trainer.train()
    eval_result = trainer.evaluate()
    results.append(eval_result)

    # Log the evaluation results for each fold
    wandb.log({f"fold_{fold + 1}_eval": eval_result})

    # Clear memory
    del model, trainer, train_dataset, eval_dataset
    gc.collect()

# Calculate average metrics across all folds
average_metrics = {key: sum([result[key] for result in results]) / k for key in results[0]}
print("Average cross-validation results:", average_metrics)

# Finish W&B run
wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmsbah[0m ([33mmsbah-michigan-technological-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/267 [00:00<?, ? examples/s]

Starting fold 1/5


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,No log,2.821047
2,No log,2.748776
3,2.806100,2.738824


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Starting fold 2/5


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,No log,2.708591
2,No log,2.614606
3,2.835700,2.597106


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Starting fold 3/5


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,No log,2.652262
2,No log,2.566031
3,2.835300,2.55206


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Starting fold 4/5


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,No log,2.678909
2,No log,2.605318
3,2.845700,2.592556


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Starting fold 5/5


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,No log,2.681713
2,No log,2.587062
3,2.852600,2.568463


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Average cross-validation results: {'eval_loss': 2.6098018646240235, 'eval_runtime': 0.84108, 'eval_samples_per_second': 64.615, 'eval_steps_per_second': 64.615, 'epoch': 3.0}


VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▆▆▆▅▃▂▂▄▁▁▁▄▂▂▂▄▂▁▁
eval/runtime,▁▂▂▂▂██▁▆▁▂▁▂▂▂▁▁▂▂█
eval/samples_per_second,█▇█▆▇▁▁█▃█▇▇▇▇▇▇█▇▇▁
eval/steps_per_second,█▇█▆▇▁▁█▃█▇▇▇▇▇▇█▇▇▁
train/epoch,▁▅▆███▁▅▆███▁▅▆███▁▅▆███▁▅▆███
train/global_step,▁▄▆████▁▄▆████▁▅▆████▁▅▆████▁▅▆████
train/grad_norm,▄▃▁▇█
train/learning_rate,▁▃█▆█
train/loss,▁▅▅▇█

0,1
eval/loss,2.56846
eval/runtime,1.0779
eval/samples_per_second,49.168
eval/steps_per_second,49.168
total_flos,41937371136000.0
train/epoch,3.0
train/global_step,642.0
train/grad_norm,25.00816
train/learning_rate,0.0
train/loss,2.8526


In [None]:
import torch

# Reload the model after cross-validation for generating answers
model = GPT2LMHeadModel.from_pretrained(model_name)
model.to("cuda" if torch.cuda.is_available() else "cpu")  # Ensure model is on the correct device

# Function to generate answers
def generate_answer(question):
    # Tokenize the question
    inputs = tokenizer(question, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")

    # Generate an answer using the model
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=200,  # Set the maximum length for the generated text
            min_length=100,  # Set the minimum length for the generated text
            do_sample=True,   # Use sampling to generate more diverse responses
            top_k=50,        # Limit the sampling to the top 50 words
            top_p=0.95,      # Use nucleus sampling
            num_return_sequences=1,  # Generate one sequence
        )

    # Decode the generated tokens
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Generate an answer
question = "Is financial information limited solely to figures that can be stated in monetary terms?"
answer = generate_answer(question)
print("Generated answer:", answer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated answer: Is financial information limited solely to figures that can be stated in monetary terms? A recent report from CNP Group, for instance, reveals that the Canadian dollar is valued at about $1.30, not far behind the US dollar. It's a surprise to see a US dollar as close as $1.19 in this example.

That said, if the US dollar is too similar to the Canadian dollar to be considered the true US dollar, the value of each Canadian dollar has to be interpreted in a different way than the US dollar is valued today. If the Canadian dollar is valued more like the US dollar, for instance, then Canada has less of a chance of rising to the top of the market.

This suggests that if the Canadian dollar was valued as such today it should be able to stand on its own, rather than being divided on its own basis by foreign investors.

The value of Canadian currency can also be influenced by factors other than market sentiment. It can


In [None]:
# T5 Model

In [None]:
# Install necessary libraries
!pip install transformers datasets

import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
import torch

# Step 1: Load and prepare the data
data_path = 'demo.xlsx'  # Path to your finance data file
df = pd.read_excel(data_path)  # Load data

# Assuming the data has columns 'Question' and 'Answer'
df = df[['Question', 'Answer']]

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Step 2: Load the tokenizer and model
model_name = "t5-small"  # Use "t5-base" for a larger model if resources allow
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Define tokenization function
def preprocess_data(examples):
    # Format input text for T5 as "question: <question>"
    inputs = ["question: " + q for q in examples['Question']]
    targets = [a for a in examples['Answer']]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the dataset
dataset = dataset.map(preprocess_data, batched=True, remove_columns=["Question", "Answer"])

# Split into train and validation sets
train_test_split = dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Step 3: Set up training arguments
training_args = TrainingArguments(
    output_dir="./finance_t5",
    evaluation_strategy="epoch",       # Evaluate at the end of each epoch
    save_strategy="epoch",             # Save at the end of each epoch
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,                # Keep only the best checkpoint
    load_best_model_at_end=True,
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Step 4: Train the model
trainer.train()

# Save the model and tokenizer
trainer.save_model("./finance_t5_model")
tokenizer.save_pretrained("./finance_t5_model")





Map:   0%|          | 0/868 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,No log,2.046807
2,No log,1.41072
3,2.761800,1.239169


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


('./finance_t5_model/tokenizer_config.json',
 './finance_t5_model/special_tokens_map.json',
 './finance_t5_model/spiece.model',
 './finance_t5_model/added_tokens.json')

In [None]:
# Test the trained model on a sample question after training
def generate_answer(question):
    # Format the input as T5 expects
    input_text = "question: " + question
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")

    # Generate answer
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=50,
            num_beams=3,
            early_stopping=True
        )

    # Decode and return answer
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Example question to test
question = "Can the Financial Markets index be used globally?"
answer = generate_answer(question)
print("Generated Answer:", answer)


Generated Answer: Financial Markets Index (FMC) is an aggregate of the Financial Markets index (FMC), which compiles data on the financial markets of the United States and Canada. Financial Markets index (FMC), which compiles
