In [2]:
import yfinance as yf
import numpy as np
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
from datasets import Dataset

In [24]:
stock_data = yf.download("SMCI", start = "2010-01-01", end = "2024-01-01", interval = '1d')

[*********************100%%**********************]  1 of 1 completed


In [25]:
sequences = [f"Day {i+1}: {price}" for i, price in enumerate(stock_data['Close'])]
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [26]:
# Fine tuning:

texts = "\n".join(sequences)
data = Dataset.from_dict({"text": [texts]})

tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    return tokenizer(examples["text"], return_tensors = "pt", truncation = True, padding = "max_length", max_length = 512)
tokenized_data = data.map(tokenize_function, batched = True, remove_columns = ["text"])
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False,)

training_args = TrainingArguments(
    output_dir = "./results",
    overwrite_output_dir = True,
    num_train_epochs = 10,
    per_device_train_batch_size = 1,
    save_steps = 10000,
    save_total_limit = 2,
)

trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = tokenized_data,
)

trainer.train()

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Step,Training Loss


TrainOutput(global_step=10, training_loss=1.9364513397216796, metrics={'train_runtime': 168.6329, 'train_samples_per_second': 0.059, 'train_steps_per_second': 0.059, 'total_flos': 2612920320000.0, 'train_loss': 1.9364513397216796, 'epoch': 10.0})

In [28]:
# Generate a sequence to predict the trend
new_data = yf.download("SMCI", start = "2024-01-01", end = "2024-01-30", interval = '1d')
#test_sequence = "Day 1: 150\nDay 2: 152\nDay 3: 151\nDay 4: 153\nDay 5: 155\n"
sequence = [f"Day {i+1}: {price}" for i, price in enumerate(new_data['Close'])]
inputs = tokenizer(sequence, return_tensors = 'pt', truncation = True, padding = True)
outputs = model.generate(**inputs, max_length = 100)
prediction = tokenizer.decode(outputs[0], skip_special_tokens = True)

print(f"The predicted trend sequence for SMCI is: \n{prediction}")

[*********************100%%**********************]  1 of 1 completed
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The predicted trend sequence for SMCI is: 
Day 1: 285.45001220703125

2: 285.451220703

3: 285.4512703

4: 285.4512703

5: 285.4512703


6: 285.4512703

7: 285.4512703

8: 285.4512703

9: 285.4512703

10: 285.4512703

11: 285
