## Imports

In [1]:
import os
import openai
import random
import pandas as pd
from polygon import RESTClient
from datetime import datetime

## Variables

In [2]:
# Ticker symbols of the stocks included in the generated dataset.
stocks = [
    "AAPL",
    "NVDA",
    "MSFT",
    "GOOGL",
    "NFLX",
]

In [3]:
# Persona descriptions used as the opening text of a prompt.
# get_stock_data picks one at random (random.choice) for every request, and
# get_prompt_string places it at the start of the prompt string.
context = [
    "You are a senior quantitative data scientist. You specialize in financial time series modeling and generating synthetic features to improve predictive accuracy of stock price models.",
    "You are a financial analyst who enriches stock datasets with both quantitative and qualitative features, including economic signals and sentiment from news. Your goal is to mimic the types of contextual reasoning used by analysts for stock forecasting.",
    "You are a sentiment analyst. Given market-related text (news headlines, tweets, reports), you compute a sentiment score and classify it.",
    "You are a veteran Wall Street trader with 30 years of experience. You don't just see numbers—you see narratives in the charts. You generate trading-relevant features based on pattern recognition, intuition, and market psychology. Your style is bold, fast, and insight-driven.",
    "You are a behavioral finance researcher. Your role is to analyze investor sentiment, irrational market behaviors, fear/greed cycles, and cognitive biases. You generate sentiment-influenced features and psychological state indicators based on price and volume movement."
]

In [4]:
# Output-format instruction included in every prompt built by
# get_prompt_string, asking for a numeric or one-word answer.
structure = "All output should be accurate and structured in a machine-learning-ready format, ideally as a numerical value or a one word response."

In [5]:
# Questions sent to GPT for every (stock, date) row.
# get_stock_data stores the answers in columns '1'..'4', where column 'N'
# holds the reply to prompts[N - 1].
# NOTE(review): the prompt text contains typos ("respresent", "it's"); they are
# part of what is sent to the model, so they are left untouched here.
prompts = [
    "What would an analyst most likely say that I should do with my stock on the given date! Please limit the response to one of: Buy, Sell, Hold",
    "Would a random person on the internet think that the stock behaved rationally on the given date, please limit the response to one of: Yes, No, Maybe",
    "Does the company that this stock respresent have anything significant happen to it's reputation on the given date that would make me want to buy stock, limit response to one of: Yes, No",
    "How likely would I be to buy the stock based on what I know this particular day, limit response to a number between 0 and 10",
]

## Functions

In [6]:
# Thin wrapper around the OpenAI chat-completions call.
def chat_with_gpt(messages, model="gpt-3.5-turbo", temperature=0.7):
    """Send ``messages`` to the chat-completions API and return the raw response.

    Args:
        messages: Forwarded unchanged as the ``messages`` argument of the
            request (chat_wrapper passes a list of role/content dicts).
        model: Model name forwarded to the API.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The response object produced by ``openai.chat.completions.create``,
        without any post-processing.
    """
    request = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
    }
    return openai.chat.completions.create(**request)

In [7]:
#Builds a prompt string for GPT from various parts
def get_prompt_string(context, date, stock, prompt, output_format=None):
    """Assemble the full prompt text sent to GPT.

    The pieces are joined with single spaces so that adjacent sentences do not
    run into each other (e.g. "...stock. All output..." rather than
    "...stock.All output...").

    Args:
        context: Persona text placed at the start of the prompt. Note that this
            parameter shadows the module-level ``context`` list inside the
            function.
        date: The date the question refers to, as text (callers pass
            'YYYY-MM-DD').
        stock: Ticker symbol being analysed.
        prompt: The question to ask.
        output_format: Output-format instruction placed before the question.
            Defaults to the module-level ``structure`` string.

    Returns:
        The combined prompt as a single string.
    """
    if output_format is None:
        # Looked up at call time so the module-level default stays the single
        # source of truth for existing four-argument callers.
        output_format = structure
    parts = (
        context,
        f"The given date is: {date} and you are analysing my {stock} stock.",
        output_format,
        prompt,
    )
    # Strip each piece first so existing leading/trailing whitespace does not
    # produce doubled spaces at the seams.
    return " ".join(part.strip() for part in parts)

In [8]:
# Single-turn convenience wrapper: prompt text in, reply text out.
def chat_wrapper(text):
    """Ask GPT a single user question and return the text of its reply.

    ``text`` is sent as the only message (role "user") to "gpt-3.5-turbo"
    through chat_with_gpt, using a temperature drawn uniformly from
    [0.5, 0.9] on every call.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    conversation = [{"role": "user", "content": text}]
    sampled_temperature = random.uniform(0.5, 0.9)
    reply = chat_with_gpt(conversation, "gpt-3.5-turbo", sampled_temperature)
    first_choice = reply.choices[0]
    return first_choice.message.content

In [9]:
#Gets all the data for a particular stock
def get_stock_data(stock):
    """Fetch 1-day aggregates for ``stock`` and add one GPT answer column per prompt.

    Aggregates are requested from Polygon for 2024-01-01 through 2025-01-01.
    A ``datetime`` column is derived from the millisecond ``timestamp`` column.
    Then, for every entry in ``prompts``, each row's date is sent to GPT with
    a randomly chosen persona from ``context``. The answers are stored in
    columns named '1', '2', ... in prompt order.

    The Polygon API key is read from the ``POLYGON_API_KEY`` environment
    variable so that no credential is stored in the source.

    NOTE(review): no column identifying the ticker is added here. Confirm
    whether the aggregate rows already carry it; otherwise rows from different
    stocks cannot be told apart once save_data concatenates them.

    Args:
        stock: Ticker symbol to fetch.

    Returns:
        A DataFrame with the aggregate fields, ``datetime`` and the answer
        columns. If the API returns no rows, the empty DataFrame is returned
        as-is.

    Raises:
        RuntimeError: If ``POLYGON_API_KEY`` is not set.
    """
    api_key = os.environ.get("POLYGON_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the POLYGON_API_KEY environment variable before calling get_stock_data()."
        )
    client = RESTClient(api_key)
    aggs = list(client.list_aggs(stock, 1, "day", "2024-01-01", "2025-01-01", limit=500))
    df = pd.DataFrame(aggs)
    if df.empty:
        # Without rows there is no 'timestamp' column to convert and nothing
        # to ask GPT about.
        return df
    df['datetime'] = pd.to_datetime(df['timestamp'], unit='ms')
    # Format the dates once; every prompt reuses the same strings.
    dates = df['datetime'].dt.strftime('%Y-%m-%d')
    for column_number, prompt in enumerate(prompts, start=1):
        # One request per row; a fresh persona is drawn for every request.
        df[str(column_number)] = [
            chat_wrapper(get_prompt_string(random.choice(context), date, stock, prompt))
            for date in dates
        ]
    return df

In [10]:
#Combine all the data and save to csv
def save_data(path='data.csv'):
    """Build the dataset for every ticker in ``stocks`` and write it to a CSV file.

    Each ticker's frame comes from get_stock_data. The frames are concatenated
    in a single ``pd.concat`` call (instead of growing the result inside the
    loop, which re-copies the accumulated rows each time) and given a fresh
    0..n-1 index.

    Args:
        path: Destination of the CSV file. Defaults to 'data.csv'.

    Raises:
        ValueError: From ``pd.concat`` if ``stocks`` is empty.
    """
    frames = [get_stock_data(stock) for stock in stocks]
    res = pd.concat(frames, ignore_index=True)
    res.to_csv(path, index=False, sep=',', encoding='utf-8')

## Run the actual function

In [11]:
# Run the whole pipeline: collect data for every ticker in `stocks` and write
# the combined result to data.csv.
save_data()