In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
2024-12-11 06:33:37.118217: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-11 06:33:37.131507: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733916817.153360   12328 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733916817.160028   12328 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-11 06:33:37.184783: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in o

In [2]:
df = pd.read_csv("data/reminiscences_of_a_stock_operator_qa.csv", sep="\t")

In [3]:
df.head()

Unnamed: 0,question,answer
0,What was your first job in finance?,My first job was as a quotation-board boy at a...
1,What key lessons did you learn from your first...,I learned the importance of quick mental calcu...
2,How did your early skills in math affect your ...,"My strong math skills, especially mental arith..."
3,Describe your first experience making money in...,My first profitable trade was on Burlington. ...
4,What was your initial trading strategy?,My initial strategy focused on recognizing pat...


In [4]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
class QADataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Set the pad_token to eos_token if not already set
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        question = row['question']
        answer = row['answer']
        
        # Prepare the input for GPT-2
        input_text = f"Question: {question}\nAnswer:"
        target_text = answer
        
        # Tokenize the input and the target (GPT-2 is a causal language model)
        encoding = self.tokenizer(
            input_text,
            target_text,
            truncation=True,
            padding="max_length",  # Enable padding here
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        return {
            "input_ids": encoding['input_ids'].squeeze(0),
            "attention_mask": encoding['attention_mask'].squeeze(0),
            "labels": encoding['input_ids'].squeeze(0)  # GPT-2 uses causal language modeling
        }


In [6]:
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [7]:
MAX_LEN = 128
BATCH_SIZE = 4
EPOCHS = 50
LEARNING_RATE = 2e-5
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [8]:
train_dataset = QADataset(train_df, tokenizer, MAX_LEN)
test_dataset = QADataset(test_df, tokenizer, MAX_LEN)

In [9]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [10]:
model.to(DEVICE)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [11]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)



In [12]:
def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)

In [13]:
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

    return total_loss / len(dataloader)

In [14]:
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_loss = train(model, train_loader, optimizer, DEVICE)
    val_loss = evaluate(model, test_loader, DEVICE)
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

Epoch 1/50


Training: 100%|██████████| 119/119 [01:17<00:00,  1.53it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.23it/s]


Train Loss: 1.4724, Validation Loss: 1.0485
Epoch 2/50


Training: 100%|██████████| 119/119 [01:12<00:00,  1.64it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.10it/s]


Train Loss: 1.0392, Validation Loss: 0.9554
Epoch 3/50


Training: 100%|██████████| 119/119 [01:12<00:00,  1.64it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.25it/s]


Train Loss: 0.9262, Validation Loss: 0.9101
Epoch 4/50


Training: 100%|██████████| 119/119 [01:12<00:00,  1.65it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.48it/s]


Train Loss: 0.8451, Validation Loss: 0.8808
Epoch 5/50


Training: 100%|██████████| 119/119 [01:12<00:00,  1.64it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.22it/s]


Train Loss: 0.7770, Validation Loss: 0.8623
Epoch 6/50


Training: 100%|██████████| 119/119 [01:12<00:00,  1.64it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.19it/s]


Train Loss: 0.7257, Validation Loss: 0.8477
Epoch 7/50


Training: 100%|██████████| 119/119 [01:12<00:00,  1.64it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.11it/s]


Train Loss: 0.6775, Validation Loss: 0.8439
Epoch 8/50


Training: 100%|██████████| 119/119 [01:12<00:00,  1.65it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.05it/s]


Train Loss: 0.6323, Validation Loss: 0.8423
Epoch 9/50


Training: 100%|██████████| 119/119 [01:12<00:00,  1.64it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.10it/s]


Train Loss: 0.5869, Validation Loss: 0.8453
Epoch 10/50


Training: 100%|██████████| 119/119 [01:12<00:00,  1.63it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.30it/s]


Train Loss: 0.5473, Validation Loss: 0.8483
Epoch 11/50


Training: 100%|██████████| 119/119 [01:16<00:00,  1.55it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.23it/s]


Train Loss: 0.5112, Validation Loss: 0.8560
Epoch 12/50


Training: 100%|██████████| 119/119 [01:16<00:00,  1.56it/s]
Evaluating: 100%|██████████| 30/30 [00:17<00:00,  1.75it/s]


Train Loss: 0.4774, Validation Loss: 0.8642
Epoch 13/50


Training: 100%|██████████| 119/119 [04:04<00:00,  2.06s/it]
Evaluating: 100%|██████████| 30/30 [00:16<00:00,  1.80it/s]


Train Loss: 0.4432, Validation Loss: 0.8848
Epoch 14/50


Training: 100%|██████████| 119/119 [02:34<00:00,  1.30s/it]
Evaluating: 100%|██████████| 30/30 [00:05<00:00,  5.70it/s]


Train Loss: 0.4167, Validation Loss: 0.8906
Epoch 15/50


Training: 100%|██████████| 119/119 [03:36<00:00,  1.82s/it]
Evaluating: 100%|██████████| 30/30 [00:17<00:00,  1.76it/s]


Train Loss: 0.3867, Validation Loss: 0.9097
Epoch 16/50


Training: 100%|██████████| 119/119 [07:38<00:00,  3.86s/it]
Evaluating: 100%|██████████| 30/30 [00:29<00:00,  1.02it/s]


Train Loss: 0.3616, Validation Loss: 0.9124
Epoch 17/50


Training: 100%|██████████| 119/119 [09:21<00:00,  4.72s/it]
Evaluating: 100%|██████████| 30/30 [00:31<00:00,  1.03s/it]


Train Loss: 0.3338, Validation Loss: 0.9476
Epoch 18/50


Training: 100%|██████████| 119/119 [08:49<00:00,  4.45s/it]
Evaluating: 100%|██████████| 30/30 [00:30<00:00,  1.03s/it]


Train Loss: 0.3153, Validation Loss: 0.9515
Epoch 19/50


Training: 100%|██████████| 119/119 [09:04<00:00,  4.58s/it]
Evaluating: 100%|██████████| 30/30 [00:29<00:00,  1.01it/s]


Train Loss: 0.2939, Validation Loss: 0.9688
Epoch 20/50


Training: 100%|██████████| 119/119 [09:22<00:00,  4.73s/it]
Evaluating: 100%|██████████| 30/30 [00:36<00:00,  1.20s/it]


Train Loss: 0.2727, Validation Loss: 0.9858
Epoch 21/50


Training: 100%|██████████| 119/119 [09:22<00:00,  4.72s/it]
Evaluating: 100%|██████████| 30/30 [00:35<00:00,  1.18s/it]


Train Loss: 0.2607, Validation Loss: 0.9976
Epoch 22/50


Training: 100%|██████████| 119/119 [08:10<00:00,  4.12s/it]
Evaluating: 100%|██████████| 30/30 [00:33<00:00,  1.10s/it]


Train Loss: 0.2454, Validation Loss: 1.0096
Epoch 23/50


Training: 100%|██████████| 119/119 [08:50<00:00,  4.46s/it]
Evaluating: 100%|██████████| 30/30 [00:32<00:00,  1.07s/it]


Train Loss: 0.2264, Validation Loss: 1.0241
Epoch 24/50


Training: 100%|██████████| 119/119 [08:44<00:00,  4.41s/it]
Evaluating: 100%|██████████| 30/30 [00:36<00:00,  1.22s/it]


Train Loss: 0.2162, Validation Loss: 1.0233
Epoch 25/50


Training: 100%|██████████| 119/119 [08:31<00:00,  4.30s/it]
Evaluating: 100%|██████████| 30/30 [00:29<00:00,  1.01it/s]


Train Loss: 0.2083, Validation Loss: 1.0336
Epoch 26/50


Training: 100%|██████████| 119/119 [01:47<00:00,  1.11it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.08it/s]


Train Loss: 0.1958, Validation Loss: 1.0593
Epoch 27/50


Training: 100%|██████████| 119/119 [01:15<00:00,  1.58it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.11it/s]


Train Loss: 0.1894, Validation Loss: 1.0515
Epoch 28/50


Training: 100%|██████████| 119/119 [01:14<00:00,  1.60it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.10it/s]


Train Loss: 0.1784, Validation Loss: 1.0654
Epoch 29/50


Training: 100%|██████████| 119/119 [01:14<00:00,  1.60it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.38it/s]


Train Loss: 0.1739, Validation Loss: 1.0732
Epoch 30/50


Training: 100%|██████████| 119/119 [01:14<00:00,  1.59it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.15it/s]


Train Loss: 0.1632, Validation Loss: 1.0918
Epoch 31/50


Training: 100%|██████████| 119/119 [01:14<00:00,  1.60it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.15it/s]


Train Loss: 0.1558, Validation Loss: 1.0911
Epoch 32/50


Training: 100%|██████████| 119/119 [01:14<00:00,  1.60it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.43it/s]


Train Loss: 0.1542, Validation Loss: 1.0946
Epoch 33/50


Training: 100%|██████████| 119/119 [01:14<00:00,  1.60it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.16it/s]


Train Loss: 0.1479, Validation Loss: 1.1152
Epoch 34/50


Training: 100%|██████████| 119/119 [01:14<00:00,  1.60it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.43it/s]


Train Loss: 0.1426, Validation Loss: 1.1120
Epoch 35/50


Training: 100%|██████████| 119/119 [01:14<00:00,  1.60it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.41it/s]


Train Loss: 0.1371, Validation Loss: 1.1336
Epoch 36/50


Training: 100%|██████████| 119/119 [01:14<00:00,  1.60it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.20it/s]


Train Loss: 0.1333, Validation Loss: 1.1353
Epoch 37/50


Training: 100%|██████████| 119/119 [01:15<00:00,  1.59it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.19it/s]


Train Loss: 0.1294, Validation Loss: 1.1432
Epoch 38/50


Training: 100%|██████████| 119/119 [01:13<00:00,  1.61it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.44it/s]


Train Loss: 0.1249, Validation Loss: 1.1460
Epoch 39/50


Training: 100%|██████████| 119/119 [01:13<00:00,  1.61it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.16it/s]


Train Loss: 0.1227, Validation Loss: 1.1475
Epoch 40/50


Training: 100%|██████████| 119/119 [01:14<00:00,  1.59it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.18it/s]


Train Loss: 0.1195, Validation Loss: 1.1508
Epoch 41/50


Training: 100%|██████████| 119/119 [01:14<00:00,  1.61it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.42it/s]


Train Loss: 0.1148, Validation Loss: 1.1657
Epoch 42/50


Training: 100%|██████████| 119/119 [01:13<00:00,  1.61it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.47it/s]


Train Loss: 0.1141, Validation Loss: 1.1638
Epoch 43/50


Training: 100%|██████████| 119/119 [01:13<00:00,  1.61it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.19it/s]


Train Loss: 0.1107, Validation Loss: 1.1816
Epoch 44/50


Training: 100%|██████████| 119/119 [01:13<00:00,  1.61it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.46it/s]


Train Loss: 0.1077, Validation Loss: 1.1816
Epoch 45/50


Training: 100%|██████████| 119/119 [01:13<00:00,  1.61it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.20it/s]


Train Loss: 0.1073, Validation Loss: 1.1982
Epoch 46/50


Training: 100%|██████████| 119/119 [01:14<00:00,  1.60it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.20it/s]


Train Loss: 0.1034, Validation Loss: 1.2077
Epoch 47/50


Training: 100%|██████████| 119/119 [01:13<00:00,  1.61it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.19it/s]


Train Loss: 0.1012, Validation Loss: 1.1951
Epoch 48/50


Training: 100%|██████████| 119/119 [01:14<00:00,  1.61it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.15it/s]


Train Loss: 0.1020, Validation Loss: 1.2127
Epoch 49/50


Training: 100%|██████████| 119/119 [01:14<00:00,  1.60it/s]
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.14it/s]


Train Loss: 0.0989, Validation Loss: 1.2086
Epoch 50/50


Training: 100%|██████████| 119/119 [01:14<00:00,  1.60it/s]
Evaluating: 100%|██████████| 30/30 [00:05<00:00,  5.77it/s]

Train Loss: 0.0973, Validation Loss: 1.2172





In [21]:
# Save the fine-tuned model
model.save_pretrained("./gpt2-chatbot")
tokenizer.save_pretrained("./gpt2-chatbot")

print("Model fine-tuned and saved!")

Model fine-tuned and saved!


In [22]:
def chat_with_gpt2(model, tokenizer, question, device="cpu"):
    model.eval()
    
    # Prepare the input prompt for the model
    input_text = f"Question: {question}\nAnswer:"
    inputs = tokenizer(input_text, return_tensors="pt").to(device)

    # Generate the answer
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"], 
            max_length=128,  # You can adjust this to control the length of the answer
            num_beams=5,     # Beam search for better quality
            early_stopping=True
        )

    # Decode and return the answer
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

In [23]:
# Example usage:
question = "What is the main theme of the book?"
answer = chat_with_gpt2(model, tokenizer, question, device=DEVICE)
print(f"Answer: {answer}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Answer: Question: What is the main theme of the book?
Answer:The primary theme is greed and fear, the need to avoid overtrading and impulsive decision-making, and the need to maintain discipline and composure in the face of market manipulation.  It's about understanding the combination of greed and fear with rational analysis and decisive action.


In [24]:
# Example usage:
question = "What is the correct time to enter the market?"
answer = chat_with_gpt2(model, tokenizer, question, device=DEVICE)
print(f"Answer: {answer}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Answer: Question: What is the correct time to enter the market?
Answer:The correct time to enter the market is when you have a good understanding of market trends and a good understanding of the fundamental reasons behind price movements.  You must also be prepared to make rational, well-timed trades to capitalize on the market's movements.


In [25]:
# Example usage:
question = "What was your first job in finance?"
answer = chat_with_gpt2(model, tokenizer, question, device=DEVICE)
print(f"Answer: {answer}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Answer: Question: What was your first job in finance?
Answer:My first job was as a quotation-board boy at a stock brokerage firm. I was quick with numbers and excelled at mental arithmetic, skills that proved invaluable later in my career.
