In [6]:
#%%writefile __init__.py
###### -- Python Script in Jupyter to access, create and test financial data and models
###### -- By Ahmed Asiliskender, initial write date 25 June 2024
###### -- May also access MATLAB scripts through here and .py files.

### Here we initialise important libraries and variables.

## To download packages using pip
import sys #! allows to use command terminal code in here
#!{sys.executable} --version
#!pip install html5lib
#!pip install bs4
#!pip install yfinance
#!pip install tradingview-scraper
#!pip install --upgrade --no-cache tradingview-scraper
#!pip install selenium
#!pip install sqlalchemy
#!pip install python-dotenv
#!pip install pandas-ta
#!pip install pytest
#!pip install python-on-whales
# Security testing
#!pip install bandit

# Cmd terminal environment install (psycopg2)
#!pip install psycopg2-binary 
# Conda environment install
#!conda install -c anaconda psycopg2 



# Import pandas (python data analysis lib) and data analysis packages
%matplotlib inline
import numpy as np
from scipy import stats
import pandas as pd
import pandas_ta as ta
import matplotlib.pyplot as plt
import statsmodels.api as sm


#user_header = {'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
#                                AppleWebKit/537.36 (KHTML, like Gecko) \
#                                Chrome/122.0.0.0 Safari/537.36'}

# Webscrape libs
from bs4 import BeautifulSoup
import yfinance as yf
import tradingview_scraper as tvs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from tradingview_scraper.symbols.ideas import Ideas

# Other libs (system, graphical or time-compute analysis)
import os
from io import StringIO
import psycopg2
from dotenv import load_dotenv, find_dotenv
from dotenv.main import set_key
from sqlalchemy import create_engine, text
from sqlalchemy import types as sqltype
import sqlalchemy.exc as sqlexc
from colorama import Fore, Back, Style
import copy
from pathlib import Path
import subprocess
import time
from datetime import datetime
import requests
from ast import literal_eval
import json
import unittest
import pytest

#import warnings


# My libs
#from arcanequant.quantlib import *

from arcanequant.quantlib import DataManifest
from arcanequant.quantlib import DownloadIntraday # NEED TO MOVE THIS TO DATA MANAGER
from arcanequant.quantlib import SetKeysQuery, DropKeysQuery, ExecuteSQL

# Paid APIs, (not used, left here)

# Bloomberg (not free)
#!pip install blpapi --index-url=https://blpapi.bloomberg.com/repository/releases/python/simple/ blpapi
#!pip install xbbg
#from xbbg import blp
#blp.bdh( tickers='SPX Index', flds=['High', 'Low', 'Last_Price'], start_date='2018-10-10', end_date='2018-10-20')
#blp.bdp('AAPL US Equity', 'Eqy_Weighted_Avg_Px', VWAP_Dt='20181224')
#blp.bdp(tickers='NVDA US Equity', flds=['Security_Name', 'GICS_Sector_Name'])


### RAPID API FOR INTRADAYS (not free)
#import http.client


In [7]:

#conn = http.client.HTTPSConnection("yahoo-finance127.p.rapidapi.com")

#headers = {
#    'x-rapidapi-key': "2e7bf1e71cmsh8f7a5babc8f5197p1f02bejsn72107f046a86",
#    'x-rapidapi-host': "yahoo-finance127.p.rapidapi.com"
#}

#conn.request("GET", "/finance-analytics/nvda", headers=headers)

#res = conn.getresponse()
#data = res.read()

#print(data.decode("utf-8"))

In [8]:
#%%writefile main.py

def main():
    print('main')

if __name__ == "__main__":
    main()



main


In [9]:
## Here we create settings for the database building/development:
# - Tickers to download
# - Months of intraday data to request
# - Time intervals (granularity/resolution)
# - API request key (taken from file)

savepath = r'data/StockHistData/'

# Tickers to request, make sure it is the correct one (there is a search API call to check)
dltickers = ["GOOG"]

# Months to request (string, "year-month") i.e. 2020-02
dlmonths = ["2025-06"] # Need to add functionality to take whole years

# Options for time resolution: 1, 5, 15, 30, 60
dlintervals = [1]

##### Alphavantage API key for data acquisition
# .env file method
env_path = Path(".") / "APIkey.env" # Environment variables file must be same folder as this code (#REGEXSEARCHFOR API IF NO FILE)
load_dotenv(dotenv_path=env_path)
alphaAPIkey = os.getenv("ALPHA_API_KEY") #MAYBE TRY .TXT IF NO .ENV
# .txt file method
#with open('APIkey.txt',) as keyfile:
#    alphaAPIkey = keyfile.read()


##### Setting environment variables for the SQL login details
SQLloginfilename = "SQLlogin" # Default filename "SQLlogin" 
#SQLdetails_path = Path(".") / f"{SQLloginfilename}.env"
#set_key(SQLdetails_path, 'DRIVER', 'hello') #dotenv.set_key
#set_key(SQLdetails_path, 'DIALECT', 'mate')
#set_key(SQLdetails_path, 'ENV_USER', 'put')
#set_key(SQLdetails_path, 'PASSWORD', 'your')
#set_key(SQLdetails_path, 'HOST_MACHINE', 'own')
#set_key(SQLdetails_path, 'PORT', 'stuff')
#set_key(SQLdetails_path, 'DBNAME', 'here')


checkManifest = DataManifest()
checkManifest.loadManifest(loadFrom = 'direct', path = savepath)
checkManifest.connectSQL(SQLloginfilename)

##### Sourcing market data
DownloadIntraday(savepath, dltickers, dlintervals, dlmonths, alphaAPIkey, saveMode='both', verbose=True) # TODO: ADD SAVING TO BOTH STORAGE

#checkManifest = DataManifest()
#checkManifest.loadManifest(path = savepath)
#checkManifest.connectSQL(SQLloginfilename)

#checkManifest.validateManifest(fastValidate = False, show = False)
#print(checkManifest.DF) # Maybe I can add method to show manifest in a more compact form
#checkManifest.saveManifest(saveTo='direct',savePath=checkManifest.directory)

# ADD CODE TO CONVERT SQL TABLE TO THE DF FORM (MULTIINDEX)c

# PSEUDOCODE FOR MANIFEST:
# DROP MANIFESTID
# MAKE MULTIINDEX ONCE AGAIN FROM STOCKS, INTERVAL COLUMNS
# RENAME COLUMN NAME TO MONTH


Data Manifest Initialised
Loading Manifest Data
Load path/name: data/StockHistData/dataManifest.json
Connecting to engine: Engine(postgresql+psycopg2://postgres:***@localhost:5432/marketdata)


TypeError: DownloadIntraday() got multiple values for argument 'saveMode'

In [None]:
import pytest
from python_on_whales import docker
import time


# Run a container
#docker.run("hello-world")

# Pull an image
#docker.pull("postgres:15")


@pytest.fixture(scope="session")
def test_postgres():
    container = docker.run(
        "postgres:15",
        detach=True,
        name="test_pg",
        envs={"POSTGRES_PASSWORD": "test", "POSTGRES_USER": "test", "POSTGRES_DB": "testdb"},
        publish=["5433:5432"]
    )
    time.sleep(3)  # wait for the DB to boot

    yield "postgresql://test:test@localhost:5433/testdb"

    docker.container.remove("test_pg", force=True)

In [None]:
#%%writefile DockerInstallWrapper.py
# Docker Installation Process
import os

def install_docker():
    import subprocess
    import shutil

    docker_path = shutil.which("docker")
    if docker_path:
        print(f"Docker already installed, located at: {docker_path}")
        return
    else:
        print("Docker not found in PATH")

    # Windows installation procedure
    if os.name == "nt":
        dockerInstaller_path = shutil.which("dockerinstaller")    
        if dockerInstaller_path:
            print(f"Docker installer found at: {dockerInstaller_path}")
        else:
            print("Downloading docker installer...")
            subprocess.run(["powershell", "-Command", "Invoke-WebRequest -Uri 'https://desktop.docker.com/win/main/amd64/Docker Desktop Installer.exe' -OutFile 'DockerInstaller.exe'"], shell=True)
        
        quietChoice = input("Do you want to install quietly (without GUI, automatically)? (y/n): ").strip().lower()
        if quietChoice == 'y':
            print("Installing docker quietly...")
            subprocess.run(["powershell", "-Command", 'Start-Process -FilePath "DockerInstaller.exe" -ArgumentList "install --quiet" -Verb RunAs -Wait'], shell=True)
        else:
            print("Installing docker...")
            subprocess.run(["powershell", "-Command", 'Start-Process -FilePath "DockerInstaller.exe" -ArgumentList "install" -Verb RunAs -Wait'], shell=True)

    # Linux/macOS installation procedure
    elif os.name == "posix":
        print("Installing docker...")
        subprocess.run("curl -fsSL https://get.docker.com | sh", shell=True)
        subprocess.run("sudo usermod -aG docker $USER", shell=True)

    prompt_restart()

def prompt_restart():
    choice = input("Docker installation complete. Do you want to restart now? (y/n): ").strip().lower()
    if choice == 'y':
        force_restart()
    else:
        print("Restart skipped. You may need to restart manually for changes to take full effect.")

def force_restart():
    print("Restarting device.")
    if os.name == "nt":  # Windows
        subprocess.run(["powershell", "Restart-Computer -Force"], shell=True)
    elif os.name == "posix":  # Linux/macOS
        subprocess.run("sudo shutdown -r now", shell=True)

if __name__ == "__main__":
    installChoice = input("Do you want to install Docker to gain access to SQL functionality? (y/n): ").strip().lower()
    
    if installChoice == 'y':
        install_docker()
    else:
        print("Install skipped. You need to install docker to be able to use SQL functionality.")


In [None]:
# TEST SQL SYNC
# TEST SQL SAVE
import pandas as pd
### INITIAL STATE
x = pd.read_csv(r"C:\Users\a_asi\FinanceProject\ArcaneQuant\data\StockHistData\MSFT\MSFT_1_2022-01_meta.csv", index_col = 0) #index_col = 0, 
print('========================= CSV FORM =========================')
print(x)

### CONVERTING TO SQL FORM
y=x
y.rename(index={'2. Symbol':'Ticker', '3. Last Refreshed':'DateTime','4. Interval':'Interval'},inplace=True)
y.loc['Interval'] = 1



y.loc['7. Month'] = '2022-01'
y = y.T
print('========================= INPUTTABLE SQL FORM =========================')
# This gets put into SQLSave, but I have to add code to SQLSave that if saving to metaTable, convert name of Ticker, Interval to 2. Symbol, 4. Interval
# , and also create a view combining DateID/TimeID and call it 3. Last Refreshed. This way, when loading, just call the view with the month, ticker, interval filter, and drop month column to get original .csv format
print(y) 
### NOTE: DONT NEED TO MAKE MULTIINDEX, 


# CHANGING SQL ACCEPTABLE TO META
xx = y
yy = xx.reset_index()

# CHANGE ROW TO META DATA AND TRANSPOSE
zz = yy
zz.drop(columns=['7. Month'], inplace=True)
zz = zz.T
zz.loc['Interval'] = zz.loc['Interval'].to_string()+'min'
zz.rename(index={'Ticker':'2. Symbol', 'DateTime':'3. Last Refreshed','Interval':'4. Interval'},inplace=True)
zz.rename(columns={0:"Meta Data"},inplace=True)
print('========================= REVERTED =========================')
print(zz.sort_index())

                                                           Meta Data
1. Information     Intraday (1min) open, high, low, close prices ...
2. Symbol                                                       MSFT
3. Last Refreshed                                2022-01-31 19:58:00
4. Interval                                                     1min
5. Output Size                                             Full size
6. Time Zone                                              US/Eastern
                                              1. Information Ticker  \
Meta Data  Intraday (1min) open, high, low, close prices ...   MSFT   

                      DateTime Interval 5. Output Size 6. Time Zone 7. Month  
Meta Data  2022-01-31 19:58:00        1      Full size   US/Eastern  2022-01  
                                                           Meta Data
1. Information     Intraday (1min) open, high, low, close prices ...
2. Symbol                                                       MSFT
3. Last R

In [1]:
from time import time
#from quantlib import SQLSave
# Testing manifest saving
savepath = r'data/StockHistData/'
SQLloginfilename = "SQLlogin" # Default filename "SQLlogin" 

checkManifest = DataManifest()
checkManifest.loadManifest(loadFrom = 'direct', path = savepath)
checkManifest.validateManifest()
checkManifest.saveManifest()
#checkManifest.connectSQL(SQLloginfilename)
# Saving market data from csv needs tagging of Ticker and Interval
df1 = checkManifest.loadData_fromcsv('KO',5,'2024-04')
df1['Ticker'] = 'KO'
df1['Interval'] = 5

SQLSave(df1, checkManifest.SQLengine, 'marketTable', echo = True)


#SQLEstablish(checkManifest.SQLengine)
#SQLRepair(dataManifest = checkManifest)

t0 = time()
# Testing manifest loading
print('manifest/DM')
zz = ExtractData(checkManifest,  'manifest',  start = 'all', end = '2022-11', fromSQL = False, condition = None)
print(zz)
t1 = time()
print(f"Total time elapsed : {t1-t0} seconds")

print('manifest/SQL')
zz2 = ExtractData(checkManifest, 'manifest',  start = '2000-01', end = '2000-05', fromSQL = True, condition = None)
print(zz2)
t2 = time()
print(f"Total time elapsed : {t2-t1} seconds")

print('market/DM')
# Testing stock data loading
xx = ExtractData(checkManifest, 'market',  start = 'all', end = '2022-11', fromSQL = False, condition = None)
print(xx)
t3 = time()
print(f"Total time elapsed : {t3-t2} seconds")

print('manifest/SQL')
yy = ExtractData(checkManifest, 'market',  start = 'all', end = '2022-11', fromSQL = True, condition = None)
print(yy)
t4 = time()
print(f"Total time elapsed : {t4-t3} seconds")

# Testing stock data saving
#SQLSave(xx, checkManifest.SQLengine, 'marketTable', echo = True)



NameError: name 'DataManifest' is not defined

In [None]:
import pandas as pd
from arcanequant.quantlib import *

savepath = r'data/StockHistData/'
SQLloginfilename = "SQLlogin" # Default filename "SQLlogin" 

checkManifest = DataManifest()
checkManifest.connectSQL(SQLloginfilename)
checkManifest.loadManifest(loadFrom='direct', path = savepath)





Data Manifest Initialised
Connecting to engine: Engine(postgresql+psycopg2://postgres:***@localhost:5432/marketdata)
Loading Manifest Data
Load path/name: data/StockHistData/dataManifest.json
load csv
Loading data file: KO_15_2022-01.csv
Loading meta data file: KO_15_2022-01_meta.csv
load sql
Extracting data from SQL database
Direct Test: True, True (meta)
load DF/SQLDF
Loading Manifest Data
Load path/name: data/StockHistData/dataManifest.json
load metaDF/metaSQLDF
Loading Manifest Data
Load path/name: data/StockHistData/dataManifest.json
Inputted DataFrame has no ticker, interval and/or month column. From .csv or already processed. Fixing index and returning input.
ExtractDF Test: True, True (meta)


  return df[condition(df)]


In [4]:
print(checkManifest.DF[checkManifest.DF['2022-01']==0])

Month            2022-01  2022-02  2022-03  2022-04  2022-05  2022-06  \
Ticker Interval                                                         
ARM    1               0        0        0        0        0        0   
       5               0        0        0        0        0        0   
       15              0        0        0        0        0        0   
       30              0        0        0        0        0        0   
       60              0        0        0        0        0        0   

Month            2022-07  2022-08  2022-09  2022-10  ...  2024-09  2024-10  \
Ticker Interval                                      ...                     
ARM    1               0        0        0        0  ...        1        1   
       5               0        0        0        0  ...        1        1   
       15              0        0        0        0  ...        1        1   
       30              0        0        0        0  ...        1        1   
       60           

In [None]:
from arcanequant.quantlib import *


# Testing manifest load
savepath = r'data/StockHistData/'
SQLloginfilename = "SQLlogin" # Default filename "SQLlogin" 

checkManifest = DataManifest()
checkManifest.connectSQL(SQLloginfilename)
checkManifest.loadManifest(loadFrom='database', path = savepath)
print(checkManifest.DF)
checkManifest.loadManifest(loadFrom='direct', path = savepath)


Data Manifest Initialised
Connecting to engine: Engine(postgresql+psycopg2://postgres:***@localhost:5432/marketdata)
Loading Manifest Data
Loading manifest view from engine: Engine(postgresql+psycopg2://postgres:***@localhost:5432/marketdata)
                 2022-01  2022-02  2022-03  2022-04  2022-05  2022-06  \
Ticker Interval                                                         
ARM    1               0        0        0        0        0        0   
       5               0        0        0        0        0        0   
       15              0        0        0        0        0        0   
       30              0        0        0        0        0        0   
       60              0        0        0        0        0        0   
GOOG   1               1        1        1        1        1        1   
       5               1        1        1        1        1        1   
       15              1        1        1        1        1        1   
       30              1   

In [None]:

print(checkManifest.DF)

In [None]:
## Here we read and post-process the .csv data and do a time-series analysis (rolling mean and std. deviation)
# Intention is to figure out a good window size by assessing the variance and if it follows a chi-sq. distribution
# The assumption here is the stock values are normally distributed
# The issue is the stock mean/variance may change over time, how do I account for this?

#### TO DO:
# We are supposed to assess the % change of stock values
# I will calculate this, and also its variance, I will do this for different resolutions and in a rolling window (I am thinking of sizes, 5, 10, 20, 50, 100)
# Should I calculate a global (or yearly or monthly) variance and compare this to the window to see if theres a match for specific sizes (which correspond to same timeframe)?
# Once I do this I can really go into the trade models I made previously

##### THOUGHTS:
# Is there a point in hypothesis testing a 'true' variance or mean for growth stocks? perhaps other than to test for a non-constant mean model?
# There is point in hypothesis testing dividend stocks/ETFs (i.e. FTSE 100 Vanguard ETF)
# Is it reasonable to model the mean (or exp. stock value) as fct of time in context of a fourier transform
# (as many growth+dividend stocks have cyclical behaviour around quarterlies, typically goes up then down later) 





data_MSFT_test = checkManifest.loadData_fromcsv("MSFT", 15, "2024-08")
data_MSFT_test.set_index(pd.to_datetime(data_MSFT_test['DateTime']), inplace=True)


############
############
# Modelling prices:
# Get nominal price for each time, use to get percent change from last instance
# Convert to histogram/empirical distribution and model the distribution

# TO DO:
# OBTAIN HISTOGRAM
# I EXPECT OUTLIERS TO PROVIDE INFORMATION (PERHAPS W.R.T. INTER-DAY PRICE OR HIGHER VOLATILITY AT TRADING DAY START/END)

# THE BIN RESOLUTION FOR HISTOGRAMS ARE IMPORTANT IN ASSESSING HOW CLOSELY THE DISTRIBUTION MATCHES THE DATA, MAYBE.
# IF TOO COARSE, THE BIN CENTRE WILL BE TOO HIGH, OTHERWISE IT WILL BE TOO FLAT (AND UNEVEN)
# IS THE BIN REPRESENTATION A BETTER ASSESSMENT OF THE RESIDUAL OF THE DATA <-> DISTRIBUTION FIT, OR CAN THE RAW DATA BE COMPARED DIRECTLY TO THE DISTRIBUTION?
# THE LATTER SOUNDS LIKE IT DOESNT MAKE SENSE AS THE PDF IS TO DO WITH "PROBABILITY DENSITY" SO WE MUST USE BINS TO COLLECT REGIONS IN THE PRICE (X) DOMAIN
# IF SO I MUST STUDY WHAT BIN SIZE IS OPTIMAL FOR RESIDUAL, IF THE OPTIMAL BIN SIZE CHANGES FOR DIFFERENT DISTRIBUTIONS (PROBABLY YES) OR FOR OUTLIER EFFECTS (ALSO YES)

# TO CONSIDER:
# MODELLING EACH TIME INSTANCE WITH ITS OWN LIMITED-DOMAIN DISTRIBUTION BASED ON MAX-MIN PRICE AND VOLUME
# PRESUME EACH TRADE IS A RANDOM VARIABLE FOLLOWING (WHICH??) DISTRIBUTION
# THEREFORE THE COLLECTIVE VOLUME FOLLOWS (WHICH? T-DISTRI.?) DISTRIBUTION

# Obtain nominal price as mean of high and low prices (as we have no idea where the mean trade of each time instance may be so our most 'accurate' is probably the center of the limits
# Note: can possibly assume a finite domain bell distribution for the nominal value
data_MSFT_test['Nominal'] = (data_MSFT_test['High'] + data_MSFT_test['Low']) /2
data_MSFT_test['PctChange_N'] = data_MSFT_test['Nominal'].diff()/data_MSFT_test['Nominal']*100
#data_MSFT_test['PctChange_H'] = data_MSFT_test['High'].diff()/data_MSFT_test['High']*100
#data_MSFT_test['PctChange_L'] = data_MSFT_test['Low'].diff()/data_MSFT_test['Low']*100


# Bin count for histogram
bincount_list = [5,10,20,40,80,160,320] # Note if too high, worse bell-curve representation
# Interval to exclude outliers from
exclusion_int = 0.01
# Exclude outlier (in display)
no_outliers = True

# [min, max] ranges of the histogram
rangemin = data_MSFT_test['PctChange_N'].min()
rangemax = data_MSFT_test['PctChange_N'].max()
# Exclusive interval range for histogram (excludes outliers)
excl_min = data_MSFT_test['PctChange_N'].quantile(exclusion_int)
excl_max = data_MSFT_test['PctChange_N'].quantile(1-exclusion_int)

# Outlier data filtering
data_filtered = data_MSFT_test[(data_MSFT_test['PctChange_N'] < excl_max) & (data_MSFT_test['PctChange_N'] > excl_min)]

xmin = None
xmax = None
if no_outliers:
    xmin = excl_min
    xmax = excl_max
else:
    xmin = rangemin
    xmax = rangemax

# Bin sequence list for dataset histogram (for the range of bincounts)
binseq = []
for bincount in bincount_list:
    binseq += [[xmin + i*(xmax-xmin)/bincount for i in range(bincount+1)]] # Can truncate for efficiency

# CREATE MODEL TO FIT/SHOW ON PLOT AND ASSESS ACCURACY
# CREATE CURVE AND EVALUATE CURVE VALUE ON HIST LOCATION, GET ITS RESIDUAL
# JUST USE STATS TO FIND WHAT RESIDUAL WOULD BE FOR CURVES GIVEN DATAPOINTS
# TEST CONFIDENCE INTERVALS?
# WHAT ABOUT KERNEL DENSITY ESTIMATION?

# Distribution testing
mean_out = data_MSFT_test['PctChange_N'].mean()
std_out = data_MSFT_test['PctChange_N'].std()
mean_nout = data_filtered['PctChange_N'].mean()
std_nout = data_filtered['PctChange_N'].std()

pdf = stats.norm.pdf(data_MSFT_test['PctChange_N'].sort_values(), mean_out, std_out) # Values sorted as iterated through index
pdf2 = stats.norm.pdf(data_MSFT_test['PctChange_N'].sort_values(), mean_nout, std_nout) # Values sorted as iterated through index
plt.plot(data_MSFT_test['PctChange_N'].sort_values(), pdf, label = 'Inc. Outliers')
plt.plot(data_MSFT_test['PctChange_N'].sort_values(), pdf2, label = 'Excl. Outliers')
plt.xlim([xmin, xmax])



# Histogram (density) plot
fig, ax = plt.subplots(figsize=(12, 6))
for bins in binseq:
    data_MSFT_test['PctChange_N'].hist(density = True, bins=bins)
plt.plot(data_MSFT_test['PctChange_N'].sort_values(), pdf, label = 'Inc. Outliers')
plt.plot(data_MSFT_test['PctChange_N'].sort_values(), pdf2, label = 'Excl. Outliers')
plt.xlim([xmin, xmax])

# Histogram - Residual relation
# CALCULATE CENTER OF BIN, ASSESS VALUE OF PDF AT THAT LOCATION, (X-Y)^2



# Plot of percentage change vs. time
plt.title('Microsoft Data Test Plot')
plt.figure()
fig, ax = plt.subplots(figsize=(12, 6))
plt.title('PctChange_N')
plt.plot(data_MSFT_test['PctChange_N'], label='PctChange_N', marker = 'o',markersize=3,linestyle='dashed')





cols_to_plot = ['Open', 'Close', 'SMA', 'Standard Deviation (O, 10)']

data_MSFT_test['SMA'] = data_MSFT_test['Open'].rolling(10).mean()
data_MSFT_test['SMAC'] = data_MSFT_test['Close'].rolling(10).mean()
data_MSFT_test['Standard Deviation (O, 10)'] = data_MSFT_test['Open'].rolling(10).std()
data_MSFT_test['Standard Deviation (O, 5)'] = data_MSFT_test['Open'].rolling(5).std()
data_MSFT_test['Standard Deviation (O, 3)'] = data_MSFT_test['Open'].rolling(3).std()
data_MSFT_test['Standard Deviation (O, 20)'] = data_MSFT_test['Open'].rolling(20).std()
data_MSFT_test['Standard Deviation (O, 40)'] = data_MSFT_test['Open'].rolling(40).std()
data_MSFT_test['Standard Deviation (C, 10)'] = data_MSFT_test['Close'].rolling(10).std()
# TO DO: MAKE HISTOGRAM OF STD.D.

## PLOTTING DATA
data_MSFT_test[cols_to_plot].plot()
plt.title('Microsoft Data Test Plot')

plt.figure()
plt.title('Volume')
plt.plot(data_MSFT_test['Volume'], label='Volume')

fig, ax = plt.subplots(figsize=(12, 6))
plt.title('SMAO/C')
plt.plot(data_MSFT_test['SMA'], label='SMA (Open)', marker = 'o',markersize=3,linestyle='dashed')
plt.plot(data_MSFT_test['SMAC'], label='SMA (Close)', marker = 'x',markersize=3,linestyle='dashed')

fig, ax = plt.subplots(figsize=(12, 6))
plt.title('Standard Deviation')
plt.plot(data_MSFT_test['Standard Deviation (O, 10)'], label='Standard Deviation (Open)', marker = 'o',markersize=3,linestyle='dashed')
plt.plot(data_MSFT_test['Standard Deviation (C, 10)'], label='Standard Deviation (Close)', marker = 'o',markersize=3,linestyle='dashed')

std_dev_cols = ['Standard Deviation (O, 3)', 'Standard Deviation (O, 5)', 'Standard Deviation (O, 10)', 'Standard Deviation (O, 20)', 'Standard Deviation (O, 40)']

fig, ax = plt.subplots(figsize=(12, 6))
plt.title('Standard Deviation')
plt.plot(data_MSFT_test['Standard Deviation (O, 3)'], label='Standard Deviation (Open)', marker = 'o',markersize=3,linestyle='dashed')
plt.plot(data_MSFT_test['Standard Deviation (O, 5)'], label='Standard Deviation (Open)', marker = 'o',markersize=3,linestyle='dashed')
plt.plot(data_MSFT_test['Standard Deviation (O, 10)'], label='Standard Deviation (Open)', marker = 'o',markersize=3,linestyle='dashed')
plt.plot(data_MSFT_test['Standard Deviation (O, 20)'], label='Standard Deviation (Open)', marker = 'o',markersize=3,linestyle='dashed')
plt.plot(data_MSFT_test['Standard Deviation (O, 40)'], label='Standard Deviation (Open)', marker = 'o',markersize=3,linestyle='dashed')

plt.title('Standard Deviation')
data_MSFT_test[std_dev_cols].plot()

## Here we also do a auto-correlation test
#fig, ax = plt.subplots(figsize=(12, 6))
#sm.graphics.tsa.plot_acf(data_MSFT_test['Close'], lags=195, ax=ax)
#plt.title('Autocorrelation Function (ACF)')
#plt.xlabel('Lags')
#plt.ylabel('ACF')
plt.show()


In [None]:

#### TO DO:
# We are supposed to assess the % change of stock values
# I will calculate this, and also its variance, I will do this for different resolutions and in a rolling window (I am thinking of sizes, 5, 10, 20, 50, 100)
# Should I calculate a global (or yearly or monthly) variance and compare this to the window to see if theres a match for specific sizes (which correspond to same timeframe)?
# Once I do this I can really go into the trade models I made previously

##### THOUGHTS:
# Is there a point in hypothesis testing a 'true' variance or mean for growth stocks? perhaps other than to test for a non-constant mean model?
# There is point in hypothesis testing dividend stocks/ETFs (i.e. FTSE 100 Vanguard ETF)
# Is it reasonable to model the mean (or exp. stock value) as fct of time in context of a fourier transform
# (as many growth+dividend stocks have cyclical behaviour around quarterlies, typically goes up then down later) 





data_MSFT_test = checkManifest.loadData_fromcsv("MSFT", 15, "2024-08")
data_MSFT_test.set_index(pd.to_datetime(data_MSFT_test['DateTime']), inplace=True)


############
############
# Modelling prices:
# Get nominal price for each time, use to get percent change from last instance
# Convert to histogram/empirical distribution and model the distribution

# TO DO:
# OBTAIN HISTOGRAM
# I EXPECT OUTLIERS TO PROVIDE INFORMATION (PERHAPS W.R.T. INTER-DAY PRICE OR HIGHER VOLATILITY AT TRADING DAY START/END)

# THE BIN RESOLUTION FOR HISTOGRAMS ARE IMPORTANT IN ASSESSING HOW CLOSELY THE DISTRIBUTION MATCHES THE DATA, MAYBE.
# IF TOO COARSE, THE BIN CENTRE WILL BE TOO HIGH, OTHERWISE IT WILL BE TOO FLAT (AND UNEVEN)
# IS THE BIN REPRESENTATION A BETTER ASSESSMENT OF THE RESIDUAL OF THE DATA <-> DISTRIBUTION FIT, OR CAN THE RAW DATA BE COMPARED DIRECTLY TO THE DISTRIBUTION?
# THE LATTER SOUNDS LIKE IT DOESNT MAKE SENSE AS THE PDF IS TO DO WITH "PROBABILITY DENSITY" SO WE MUST USE BINS TO COLLECT REGIONS IN THE PRICE (X) DOMAIN
# IF SO I MUST STUDY WHAT BIN SIZE IS OPTIMAL FOR RESIDUAL, IF THE OPTIMAL BIN SIZE CHANGES FOR DIFFERENT DISTRIBUTIONS (PROBABLY YES) OR FOR OUTLIER EFFECTS (ALSO YES)

# TO CONSIDER:
# MODELLING EACH TIME INSTANCE WITH ITS OWN LIMITED-DOMAIN DISTRIBUTION BASED ON MAX-MIN PRICE AND VOLUME
# PRESUME EACH TRADE IS A RANDOM VARIABLE FOLLOWING (WHICH??) DISTRIBUTION
# THEREFORE THE COLLECTIVE VOLUME FOLLOWS (WHICH? T-DISTRI.?) DISTRIBUTION

# Obtain nominal price as mean of high and low prices (as we have no idea where the mean trade of each time instance may be so our most 'accurate' is probably the center of the limits
# Note: can possibly assume a finite domain bell distribution for the nominal value
data_MSFT_test['Nominal'] = (data_MSFT_test['High'] + data_MSFT_test['Low']) /2
data_MSFT_test['PctChange_N'] = data_MSFT_test['Nominal'].diff()/data_MSFT_test['Nominal']*100
#data_MSFT_test['PctChange_H'] = data_MSFT_test['High'].diff()/data_MSFT_test['High']*100
#data_MSFT_test['PctChange_L'] = data_MSFT_test['Low'].diff()/data_MSFT_test['Low']*100


# Bin count for histogram
bincount_list = [5,10,20,40,80,160,320] # Note if too high, worse bell-curve representation
# Interval to exclude outliers from
exclusion_int = 0.01
# Exclude outlier (in display)
no_outliers = True

# [min, max] ranges of the histogram
rangemin = data_MSFT_test['PctChange_N'].min()
rangemax = data_MSFT_test['PctChange_N'].max()
# Exclusive interval range for histogram (excludes outliers)
excl_min = data_MSFT_test['PctChange_N'].quantile(exclusion_int)
excl_max = data_MSFT_test['PctChange_N'].quantile(1-exclusion_int)

# Outlier data filtering
data_filtered = data_MSFT_test[(data_MSFT_test['PctChange_N'] < excl_max) & (data_MSFT_test['PctChange_N'] > excl_min)]

xmin = None
xmax = None
if no_outliers:
    xmin = excl_min
    xmax = excl_max
else:
    xmin = rangemin
    xmax = rangemax

# Bin sequence list for dataset histogram (for the range of bincounts)
binseq = []
for bincount in bincount_list:
    binseq += [[xmin + i*(xmax-xmin)/bincount for i in range(bincount+1)]] # Can truncate for efficiency

# CREATE MODEL TO FIT/SHOW ON PLOT AND ASSESS ACCURACY
# CREATE CURVE AND EVALUATE CURVE VALUE ON HIST LOCATION, GET ITS RESIDUAL
# JUST USE STATS TO FIND WHAT RESIDUAL WOULD BE FOR CURVES GIVEN DATAPOINTS
# TEST CONFIDENCE INTERVALS?
# WHAT ABOUT KERNEL DENSITY ESTIMATION?

# Distribution testing
mean_out = data_MSFT_test['PctChange_N'].mean()
std_out = data_MSFT_test['PctChange_N'].std()
mean_nout = data_filtered['PctChange_N'].mean()
std_nout = data_filtered['PctChange_N'].std()

pdf = stats.norm.pdf(data_MSFT_test['PctChange_N'].sort_values(), mean_out, std_out) # Values sorted as iterated through index
pdf2 = stats.norm.pdf(data_MSFT_test['PctChange_N'].sort_values(), mean_nout, std_nout) # Values sorted as iterated through index
plt.plot(data_MSFT_test['PctChange_N'].sort_values(), pdf, label = 'Inc. Outliers')
plt.plot(data_MSFT_test['PctChange_N'].sort_values(), pdf2, label = 'Excl. Outliers')
plt.xlim([xmin, xmax])



# Histogram (density) plot
fig, ax = plt.subplots(figsize=(12, 6))
for bins in binseq:
    data_MSFT_test['PctChange_N'].hist(density = True, bins=bins)
plt.plot(data_MSFT_test['PctChange_N'].sort_values(), pdf, label = 'Inc. Outliers')
plt.plot(data_MSFT_test['PctChange_N'].sort_values(), pdf2, label = 'Excl. Outliers')
plt.xlim([xmin, xmax])

# Histogram - Residual relation
# CALCULATE CENTER OF BIN, ASSESS VALUE OF PDF AT THAT LOCATION, (X-Y)^2



# Plot of percentage change vs. time
plt.title('Microsoft Data Test Plot')
plt.figure()
fig, ax = plt.subplots(figsize=(12, 6))
plt.title('PctChange_N')
plt.plot(data_MSFT_test['PctChange_N'], label='PctChange_N', marker = 'o',markersize=3,linestyle='dashed')





cols_to_plot = ['Open', 'Close', 'SMA', 'Standard Deviation (O, 10)']

data_MSFT_test['SMA'] = data_MSFT_test['Open'].rolling(10).mean()
data_MSFT_test['SMAC'] = data_MSFT_test['Close'].rolling(10).mean()
data_MSFT_test['Standard Deviation (O, 10)'] = data_MSFT_test['Open'].rolling(10).std()
data_MSFT_test['Standard Deviation (O, 5)'] = data_MSFT_test['Open'].rolling(5).std()
data_MSFT_test['Standard Deviation (O, 3)'] = data_MSFT_test['Open'].rolling(3).std()
data_MSFT_test['Standard Deviation (O, 20)'] = data_MSFT_test['Open'].rolling(20).std()
data_MSFT_test['Standard Deviation (O, 40)'] = data_MSFT_test['Open'].rolling(40).std()
data_MSFT_test['Standard Deviation (C, 10)'] = data_MSFT_test['Close'].rolling(10).std()
# TO DO: MAKE HISTOGRAM OF STD.D.

## PLOTTING DATA
data_MSFT_test[cols_to_plot].plot()
plt.title('Microsoft Data Test Plot')

plt.figure()
plt.title('Volume')
plt.plot(data_MSFT_test['Volume'], label='Volume')

fig, ax = plt.subplots(figsize=(12, 6))
plt.title('SMAO/C')
plt.plot(data_MSFT_test['SMA'], label='SMA (Open)', marker = 'o',markersize=3,linestyle='dashed')
plt.plot(data_MSFT_test['SMAC'], label='SMA (Close)', marker = 'x',markersize=3,linestyle='dashed')

fig, ax = plt.subplots(figsize=(12, 6))
plt.title('Standard Deviation')
plt.plot(data_MSFT_test['Standard Deviation (O, 10)'], label='Standard Deviation (Open)', marker = 'o',markersize=3,linestyle='dashed')
plt.plot(data_MSFT_test['Standard Deviation (C, 10)'], label='Standard Deviation (Close)', marker = 'o',markersize=3,linestyle='dashed')

std_dev_cols = ['Standard Deviation (O, 3)', 'Standard Deviation (O, 5)', 'Standard Deviation (O, 10)', 'Standard Deviation (O, 20)', 'Standard Deviation (O, 40)']

fig, ax = plt.subplots(figsize=(12, 6))
plt.title('Standard Deviation')
plt.plot(data_MSFT_test['Standard Deviation (O, 3)'], label='Standard Deviation (Open)', marker = 'o',markersize=3,linestyle='dashed')
plt.plot(data_MSFT_test['Standard Deviation (O, 5)'], label='Standard Deviation (Open)', marker = 'o',markersize=3,linestyle='dashed')
plt.plot(data_MSFT_test['Standard Deviation (O, 10)'], label='Standard Deviation (Open)', marker = 'o',markersize=3,linestyle='dashed')
plt.plot(data_MSFT_test['Standard Deviation (O, 20)'], label='Standard Deviation (Open)', marker = 'o',markersize=3,linestyle='dashed')
plt.plot(data_MSFT_test['Standard Deviation (O, 40)'], label='Standard Deviation (Open)', marker = 'o',markersize=3,linestyle='dashed')

plt.title('Standard Deviation')
data_MSFT_test[std_dev_cols].plot()

## Here we also do a auto-correlation test
#fig, ax = plt.subplots(figsize=(12, 6))
#sm.graphics.tsa.plot_acf(data_MSFT_test['Close'], lags=195, ax=ax)
#plt.title('Autocorrelation Function (ACF)')
#plt.xlabel('Lags')
#plt.ylabel('ACF')
plt.show()


In [None]:

## Here we read the .csv data and do a time-series analysis (rolling mean)

data_ARM = pd.read_csv('StockHistData\ARM.csv', index_col='Date', parse_dates=['Date'])

#cols_to_plot = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'SMA', 'Standard Deviation']
cols_to_plot = ['Close', 'SMA', 'Standard Deviation']
print(data_ARM)
data_ARM['SMA'] = data_ARM['Open'].rolling(10).mean()
data_ARM['Standard Deviation'] = data_ARM['Open'].rolling(10).std()

df_ARM = pd.DataFrame(data_ARM)

data_ARM[cols_to_plot].plot()
plt.title('ARM Technologies')

plt.figure()
plt.title('Volume')
plt.plot(data_ARM['Volume'], label='Volume')

## Here we also do a auto-correlation test
fig, ax = plt.subplots(figsize=(12, 6))
sm.graphics.tsa.plot_acf(data_ARM['Close'], lags=195, ax=ax)
plt.title('Autocorrelation Function (ACF)')
plt.xlabel('Lags')
plt.ylabel('ACF')
plt.show()

In [None]:
## Same for MSFT
data_MSFT = pd.read_csv('StockHistData\MSFT.csv', index_col='Date', parse_dates=['Date'])

#cols_to_plot = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'SMA', 'Standard Deviation']
cols_to_plot = ['Close', 'SMA', 'Standard Deviation']

data_MSFT['SMA'] = data_MSFT['Open'].rolling(10).mean()
data_MSFT['Standard Deviation'] = data_MSFT['Open'].rolling(10).std()

df_MSFT = pd.DataFrame(data_MSFT)

data_MSFT[cols_to_plot].plot()
plt.title('Microsoft')

plt.figure()
plt.title('Volume')
plt.plot(data_MSFT['Volume'], label='Volume')

## Here we also do a auto-correlation test
fig, ax = plt.subplots(figsize=(12, 6))
sm.graphics.tsa.plot_acf(data_MSFT['Close'], lags=1257, ax=ax)
plt.title('Autocorrelation Function (ACF)')
plt.xlabel('Lags')
plt.ylabel('ACF')
plt.show()

In [None]:
## Then NVDA
data_NVDA = pd.read_csv('StockHistData\\NVDA.csv', index_col='Date', parse_dates=['Date'])

#cols_to_plot = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'SMA', 'Standard Deviation']
cols_to_plot = ['Close', 'SMA', 'Standard Deviation']

data_NVDA['SMA'] = data_NVDA['Open'].rolling(10).mean()
data_NVDA['Standard Deviation'] = data_NVDA['Open'].rolling(10).std()

df_NVDA = pd.DataFrame(data_NVDA)

data_NVDA[cols_to_plot].plot()
plt.title('Nvidia')

plt.figure()
plt.title('Volume')
plt.plot(data_NVDA['Volume'], label='Volume')

## Here we also do a auto-correlation test
fig, ax = plt.subplots(figsize=(12, 6))
sm.graphics.tsa.plot_acf(data_NVDA['Close'], lags=1257, ax=ax)
plt.title('Autocorrelation Function (ACF)')
plt.xlabel('Lags')
plt.ylabel('ACF')
plt.show()

In [None]:
## Lets try for KO
data_KO = pd.read_csv('StockHistData\\KO.csv', index_col='Date', parse_dates=['Date'])

#cols_to_plot = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'SMA', 'Standard Deviation']
cols_to_plot = ['Close', 'SMA', 'Standard Deviation']

data_KO['SMA'] = data_KO['Open'].rolling(10).mean()
data_KO['Standard Deviation'] = data_KO['Open'].rolling(10).std()

df_KO = pd.DataFrame(data_KO)

data_KO[cols_to_plot].plot()
plt.title('Coca-cola')

plt.figure()
plt.title('Volume')
plt.plot(data_KO['Volume'], label='Volume')

## Here we also do a auto-correlation test
fig, ax = plt.subplots(figsize=(12, 6))
sm.graphics.tsa.plot_acf(data_KO['Close'], lags=1257, ax=ax)
plt.title('Autocorrelation Function (ACF)')
plt.xlabel('Lags')
plt.ylabel('ACF')
plt.show()

In [None]:
# Compute correlations between the different stocks
correlation1 = df_MSFT['Close'][-1257:].corr(df_NVDA['Close'][-1257:])
correlation1k = df_MSFT['Close'][-1257:].corr(df_NVDA['Close'][-1257:], method='kendall')
correlation1s = df_MSFT['Close'][-1257:].corr(df_NVDA['Close'][-1257:], method='spearman')

print(f"Correlation coefficient between MSFT and NVDA: {correlation1} ({correlation1k} with Kendall and {correlation1s} with Spearman)")

correlation2 = df_MSFT['Close'][-195:].corr(df_ARM['Close'][-195:])
correlation2k = df_MSFT['Close'][-195:].corr(df_ARM['Close'][-195:], method='kendall')
correlation2s = df_MSFT['Close'][-195:].corr(df_ARM['Close'][-195:], method='spearman')

print(f"Correlation coefficient between MSFT and ARM: {correlation2} ({correlation2k} with Kendall and {correlation2s} with Spearman)")

correlation3 = df_ARM['Close'][-195:].corr(df_NVDA['Close'][-195:])
correlation3k = df_ARM['Close'][-195:].corr(df_NVDA['Close'][-195:], method='kendall')
correlation3s = df_ARM['Close'][-195:].corr(df_NVDA['Close'][-195:], method='spearman')

print(f"Correlation coefficient between ARM and NVDA: {correlation3} ({correlation3k} with Kendall and {correlation3s} with Spearman)")

correlation4 = df_KO['Close'][-1257:].corr(df_NVDA['Close'][-1257:])
correlation4k = df_KO['Close'][-1257:].corr(df_NVDA['Close'][-1257:], method='kendall')
correlation4s = df_KO['Close'][-1257:].corr(df_NVDA['Close'][-1257:], method='spearman')

print(f"Correlation coefficient between KO and NVDA: {correlation4} ({correlation4k} with Kendall and {correlation4s} with Spearman)")

correlation5 = df_KO['Close'][-1257:].corr(df_MSFT['Close'][-1257:])
correlation5k = df_KO['Close'][-1257:].corr(df_MSFT['Close'][-1257:], method='kendall')
correlation5s = df_KO['Close'][-1257:].corr(df_MSFT['Close'][-1257:], method='spearman')

print(f"Correlation coefficient between ARM and NVDA: {correlation5} ({correlation5k} with Kendall and {correlation5s} with Spearman)")


In [None]:
## Here we do simply Quantitative analysis (mean, std)
import numpy as np

# Sample Data
returns = np.array([0.01, 0.02, -0.01, 0.03, 0.005])

# Calculate Mean and Standard Deviation
mean_return = np.mean(returns)
std_deviation = np.std(returns)
print(f"Mean Return: {mean_return}, Standard Deviation: {std_deviation}")


In [None]:
## Here we have code to do hypothesis testing
from scipy import stats

group_A = [0.01, 0.02, 0.015, 0.023, 0.016]
group_B = [0.02, 0.025, 0.03, 0.019, 0.021]
sig_value = 0.05

## We are doing a t-test so see if the mean of group A and group B are the same
## The test is bayesian, we assume they are until proven otherwise with a significance value of 0.05 (5%)
t_statistic, p_value = stats.ttest_ind(group_A, group_B)
print(f"t-statistic: {t_statistic}, p-value: {p_value}")

trunc_p = '%.3f'%(100*p_value)

if p_value >= sig_value:
    print(f"Hypothesis of Group A and Group B means being equal is NOT REJECTED (significance value of {100*sig_value}%, p-value of {trunc_p}%)")
else:
    print(f"Hypothesis of Group A and Group B means being equal is REJECTED (significance value of {100*sig_value}%)")


In [None]:
## Here we do some more time-series analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Seed for reproducibility (we will create synthetic data)
np.random.seed(99)

# Generate synthetic data
n_samples = 100
dates = pd.date_range(start='2024-01-01', periods=n_samples, freq='B')  # Business days
price_changes = np.random.normal(loc=0, scale=1, size=n_samples)  # Random price changes
prices = np.cumsum(price_changes) + 100  # Simulated stock prices (random walk)

# Create DataFrame
df = pd.DataFrame({
    'Date': dates,
    'Close': prices
})

# Plot the time series
plt.figure(figsize=(12, 6))
plt.plot(df['Date'], df['Close'], label='Close Price')
plt.title('Synthetic Stock Closing Prices')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.show()

# Calculate and plot ACF
fig, ax = plt.subplots(figsize=(12, 6))
sm.graphics.tsa.plot_acf(df['Close'], lags=30, ax=ax)
plt.title('Autocorrelation Function (ACF)')
plt.xlabel('Lags')
plt.ylabel('ACF')
plt.show()


In [None]:
## Here we do some predictive modelling

import numpy as np
from sklearn.linear_model import LinearRegression

# Sample Data
X = np.array([[1], [2], [3], [4], [5]])
y = np.array([2, 3, 2.5, 4, 4.5])

# Train Model
model = LinearRegression().fit(X, y)
print(f"Coefficient: {model.coef_}, Intercept: {model.intercept_}")


In [None]:
## Here we have some code for option pricing (black-scholes)

import scipy.stats as si
import numpy as np

def black_scholes(S, K, T, r, sigma, option_type='call'):
    d1 = (np.log(S / K) + (r + 0.5 * sigma ** 2) * T) / (sigma * np.sqrt(T))
    d2 = d1 - sigma * np.sqrt(T)
    if option_type == 'call':
        option_price = (S * si.norm.cdf(d1, 0, 1) - K * np.exp(-r * T) * si.norm.cdf(d2, 0, 1))
    elif option_type == 'put':
        option_price = (K * np.exp(-r * T) * si.norm.cdf(-d2, 0, 1) - S * si.norm.cdf(-d1, 0, 1))
    return option_price

# Example Parameters
S = 100  # Current stock price
K = 105  # Strike price
T = 1    # Time to maturity in years
r = 0.05 # Risk-free rate
sigma = 0.2 # Volatility

call_price = black_scholes(S, K, T, r, sigma, option_type='call')
print(f"Call Option Price: {call_price}")

In [None]:
## Here's some code of logistic regression

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Sample data
data = {
    'Open': [100.0, 101.0, 100.5, 101.2],
    'High': [102.0, 103.0, 102.5, 101.5],
    'Low': [98.0, 99.5, 99.0, 99.8],
    'Close': [101.0, 100.5, 101.5, 100.8],
    'Volume': [1500000, 1700000, 1800000, 1300000],
    'SMA_10': [99.5, 100.0, 100.2, 100.8],
    'RSI': [55.0, 52.5, 58.0, 59],
    'Label': [1, 0, 1, 0]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Features and target
X = df[['Open', 'High', 'Low', 'Close', 'Volume', 'SMA_10', 'RSI']]
y = df['Label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Scale the features
scaler = StandardScaler(Y) # type: ignore
X_train_scaled = scaler.it_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
model = LogisticRegression()
model.fit(X_train_scaled,y_train)
# Predictions
y_pred = model.predict(X_test_scaled)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

# Seed for reproducibility
np.random.seed(10)

# Generate synthetic data
n_samples = 1000
dates = pd.date_range(start='2024-01-01', periods=n_samples, freq='B')
open_prices = np.random.uniform(low=100, high=200, size=n_samples)
high_prices = open_prices + np.random.uniform(low=0, high=10, size=n_samples)
low_prices = open_prices - np.random.uniform(low=0, high=10, size=n_samples)
close_prices = open_prices + np.random.uniform(low=-5, high=5, size=n_samples)
volume = np.random.randint(low=100000, high=5000000, size=n_samples)

# Simple Moving Average (SMA) with window of 10
sma_10 = pd.Series(close_prices).rolling(window=10).mean().ffill()

# Relative Strength Index (RSI)
def calculate_rsi(series, window=14):
    delta = series.diff().ffill()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

rsi = calculate_rsi(pd.Series(close_prices))

# Label: 1 if next day's close price is higher, else 0
labels = np.where(np.roll(close_prices, -1) > close_prices, 1, 0)
labels[-1] = 0  # Last label cannot be determined

# Create DataFrame
df = pd.DataFrame({
    'Date': dates,
    'Open': open_prices,
    'High': high_prices,
    'Low': low_prices,
    'Close': close_prices,
    'Volume': volume,
    'SMA_10': sma_10,
    'RSI': rsi,
    'Label': labels
})

# Remove the last row as it doesn't have a valid label
df = df[:-1]

# Features and target
X = df[['Open', 'High', 'Low', 'Close', 'Volume', 'SMA_10', 'RSI']]
y = df['Label']
t = df['Date']

# Split the data
X_train, X_test, y_train, y_test, t_train, t_test = train_test_split(X, y, t, test_size=0.33, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Data type: ",X_train_scaled.dtype )
print("Data type: ",y_train.dtype )
print("Dimensions: ",np.shape(X_train_scaled))
print("Dimensions: ",np.shape(y_train))

# Train the model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Predictions
y_pred = model.predict(X_test_scaled)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Plot some of the data to visualize
plt.figure(figsize=(12, 6))
plt.plot(dates, close_prices, label='Close Price')
plt.plot(dates, sma_10, label='SMA 10')
plt.title('Stock Prices and SMA 10')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.show()

In [None]:
## Information store of how we run python script elsewhere

%run "D:\Finance Study\Python and MATLAB Code\Core.py"


In [None]:
# Here we try to make a signal indicator from the SMIIO (Stochastic Momentum Index ergodic Indicator plus SMI ergodic Oscillator) of a stock
# The code makes an SMI of the stock by calculating the True Strength Index (TSI) of the stock using a slow and a fast period. Also makes an
# (ergodic) Indicator by using an Exponential Moving Average (EMA) using the signal period. Finally, makes (ergodic) Oscillator signal by
# subtracting the Indicator from the TSI.



fast_period = 3 # In the freq of file (days)
slow_period = 15
signal_period = 3


df_smi = ta.momentum.smi(df_NVDA['Close'],fast_period,slow_period,signal_period)


smi_name = "_{}_{}_{}".format(fast_period,slow_period,signal_period)
#SMI + smi_name is the SMI of the stock
#SMIs + smi_name is the indicator made from signal line
#SMIo + smi_name is the oscillator made by SMI - SMIs


cols_plot = ["SMI"+smi_name,"SMIs"+smi_name,"SMIo"+smi_name]

df_smi[cols_plot].plot()

df_smi.fillna(0, inplace = True) # Removing NaN values

plt.figure(figsize=(12,6))
plt.plot(df_smi)

subtrac=df_smi.iloc[:,0] - df_smi.iloc[:,1] # This is the same as SMIo
#plt.figure(figsize=(12,6))
#plt.plot(subtrac)


df_smi = df_smi.reset_index() # Get date as a non index col to use for bar plot

plt.figure(figsize=(14,6))
plt.bar(df_smi["Date"],df_smi["SMIo"+smi_name],2,12,color=np.where(df_smi["SMIo"+smi_name] < 0, 'crimson', 'green'))


## BEAUTIFULSOUP FOR WEBSCRAPING IN PYTHON
## SELENIUM FOR dynamic html


In [None]:
# Here we try to make a simple backtesting simulation model where we use an indicator to test approximate success in trading over course of a day
# or longer. We use the SMIIO model and simulate a trade whenever a condition is reached (e.g. change from negative to positive).
# The trade is initially done as 1 stock trade per instance and we assess the percentage of success (profit made), and size of success.

fast = 3
slow = 15
sig = 4

df_smiio = ta.momentum.smi(df_NVDA['Close'],fast,slow,sig)

smiio_name = "_{}_{}_{}".format(fast,slow,sig)

df_smiio = df_smiio.reset_index()

#df_smiio.fillna(0, inplace = True) # Removing NaN values


for i in range(0,len(df_smiio['SMIo'+smiio_name])):

    if np.isnan(df_smiio['SMIo'+smiio_name][i]):
        continue

    ind_diff = df_smiio['SMIo'+smiio_name][i] - df_smiio['SMIo'+smiio_name][i-1]

    last_val = df_smiio['SMIo'+smiio_name][i-1]
    
    #print(df_smiio['SMIo'+smiio_name][i])
    #print(ind_diff)


In [None]:
## Here we will write basic functions to 'purchase' and 'sell' stocks, and a function to sum up the transactions for a profit/loss measurement
histCols = ['Date','Ticker','Volume','Value','TimingState']
global dfTradeHist
dfTradeHist = pd.DataFrame(columns = histCols)

#######
def PrepDataFrame(dataFrame) -> pd.DataFrame:
    """
    Make all necessary preparation to use the dataframe in model (NOT FULLY COMPLETE). Currently this is:
    - Moving any date/time outside of dataframe index
    - Changing date/time to a full date-time style (i.e. dates-only will also include time (00:00:00))
    """
    # Guard function to get indexData (usually date) as a non index col to use (if as index) 
    if 'Date' not in dataFrame.columns: 
        dataFrame = dataFrame.reset_index()
    
    # Converts possible date-only to date-time
    dataFrame['Date'] = pd.to_datetime(dataFrame.Date, format='%Y-%m-%d %H:%M:%S')

    # Convert date-time to string (seems to create more bugs)
    #dataFrame['Date'] = dataFrame['Date'].dt.strftime('%Y-%m-%d %H:%M:%S')
    
    return dataFrame
#######

#######
## A FUNCTION TO REMOVE DATA? PROB NOT NEEDED
#######



#######
## A SET OF FUNCTIONS TO ANALYSE STOCK DATA, TRAJECTORY, VOLATILITY ETC.
#######


#######
## FUNCTION TO MODIFY AN ACCURACY/ANALYTICS MATRIX DIRECTLY (INSTEAD OF DOING IT WITHIN THE MODEL ITSELF) FOR INTEROPERABILITY WITH MODEL EVALUATOR
#######



#######
def ResetTradeHist(name = 'dfTradeHist', cols = ['DateTime','Ticker','Volume','Value','TimingState']):
    """ Reset the trade history dataframe and remakes its columns (cols, if specified). """    
    globals()[name] = pd.DataFrame(columns = cols)
    return
#######

#######
def MakeTrade (dateTime, volume, ticker, history = None, buy = True, timingState = 'Close', declare = False) -> [[float, str, float, float, str], pd.DataFrame]:
    """
    Simulates making a specified trade and records it (using a sub-routine). Ticker price dataframe must have 'df_' prefix
    """
    # Default history input save status (take dfTradeHist as default storage dataframe)
    isDefaultHist = False
    if history is None:
        global dfTradeHist
        history = dfTradeHist.copy()
        isDefaultHist = True
    
    
    datasetName = 'df_'+ticker

    # The 1st part gets the global dataset, 2nd searches the datetime and 3rd gets the timings state price
    unitValue = globals()[datasetName][ globals()[datasetName]['Date'] == dateTime ][timingState].values[0]
    
    totalValue = -unitValue * volume
    transaction = 'bought'
    
    if not buy:
        transaction = 'sold'

    if declare == True:
        print(f"{volume} {ticker} stock(s) {transaction} at total price {-totalValue} (unit price {unitValue}) at {dateTime}")

    if not buy:
        totalValue = -totalValue

    result = [dateTime, ticker, volume, totalValue, timingState]
    
    ## Store in history
    history = RecordAction(result, history)

    # Default history location (if None then it won't update the original dataframe otherwise)
    if isDefaultHist:
        dfTradeHist = history.copy()
    
    return [[dateTime, ticker, volume, totalValue, timingState], history]
#######
#######
def RecordAction(action, dataFrame) -> pd.DataFrame:
    """
    Records action taken into a history dataframe.
    Note: Make sure the action row size is the same as the column size in the dataFrame.
    For each column take action indices and add to the existing history dataFrame.
    """
    
    # A slightly different process for first action recorded to avoid python warning
    if dataFrame.size == 0: # If no data
        
        print('Recording first input into trade history')
        dataFrame = pd.DataFrame(columns = dataFrame.columns)

    # Adding all actions into a dictionary to append to dataFrame
    appendDict = {}
    
    for i in range(0,len(action)):
        appendDict[dataFrame.columns[i]] = action[i]
    appendSeries = pd.Series(appendDict)    
    
    return pd.concat([dataFrame,appendSeries.to_frame().T], ignore_index=True)
#######
#######
def EvaluateModel(dataFrame, *arguments, **keywords) -> float:
    """
    This function is supposed to take the trade/call history and evaluate its properties,
    such as success (profit/loss) and other properties (e.g. profit vs. loss freq. etc.).
    """
    # Additional things to implement:
    # Exchange rate of currency effects
    # Trading costs
    
    
    print('------------------')
    print('Evaluating current algo model...')
    
    # Stratifying data based on ticker
    stratDF = dataFrame.groupby('Ticker').apply(lambda x: x)
    stratDF = stratDF.drop(columns=['Ticker'])

    # Extended output
    if any(val == 'Extended' for val in keywords.values()):
        print('List of trades to be evaluated:')
        print(stratDF)


    # Make a DF of compiled stats
    basicStatsCols = ['LongVol','ShortVol','RemainVol','Cost','Income','Profit']
    compileDF = pd.DataFrame(columns = basicStatsCols)
    

    # Takes list of the (multi-level) index of DF (tuple) and converts to
    # dictionary for unique 'Ticker' key 
    print('List of tickers traded:')
    for key in dict(stratDF.index.tolist()).keys():
        print(key)
        # key is the different ticker names

    # Implement the evaluation for each ticker traded here
    for key in dict(stratDF.index.tolist()).keys():
        # stratDF.loc[key] is the different ticker trades each in own DF
        tickerDF = stratDF.loc[key]

        # Extended output
        for value in keywords.values():
            if value == 'Verbose':
                print('Trade history of ticker ' + key +':')
                print(tickerDF)

        sumVal = tickerDF['Value'].sum()
        
        ## Basic stats
        sumShortVol = 0
        sumCost = 0 # Cost is for purchasing the share etc.
        sumLongVol = 0
        sumIncome = 0 # Income is obtained by selling
        for i in range(0,len(tickerDF['Value'])):
            if tickerDF['Value'].iloc[i] > 0:
                sumShortVol = sumShortVol + tickerDF['Volume'].iloc[i]
                sumIncome = sumIncome + tickerDF['Value'].iloc[i]
            else:
                sumLongVol = sumLongVol + tickerDF['Volume'].iloc[i]
                sumCost = sumCost - tickerDF['Value'].iloc[i]
        profit = sumIncome - sumCost
        specProfit = profit*2/(sumShortVol+sumLongVol)
    
        # To implement a existing position closing system (to evaluate the model more accurately)
        remainVol = sumLongVol - sumShortVol
        if remainVol != 0:
            print(Fore.RED)
            print(' !!!!!!!!!!')
            print('There is an existing open position in ' + key + '! This may impact the accuracy of the model evaluation.')
            if value == 'Extended':
                if remainVol > 0:
                    print('Open position size: ' + str(remainVol) + ' shares long.')
                else:
                    print('Open position size: ' + str(-remainVol) + ' shares short.')
            
            print('Existing positions will be closed.')
            print(' !!!!!!!!!!')
            print(Fore.BLACK)
        
            # Closing existing positions using last unit price
            datasetName = 'df_' + key
            
            # Portfolio value is positive if long (remain vol > 0)
            portValue = remainVol * globals()[datasetName]['Close'][len(globals()[datasetName]['Close'])-1]
            
            finalProfit = profit + portValue
            if portValue > 0:
                finalCost = sumCost
                finalIncome = sumIncome + portValue
            else:
                finalCost = sumCost + portValue
                finalIncome = sumIncome
    
        ## Advanced stats 



        # Adding all basic stats for each ticker into an array then converting into dictionary to append to dataFrame
        # basicStats uses 'final' position if unclosed     
        basicStats = []        
        if remainVol != 0:
            basicStats = [sumLongVol, sumShortVol,remainVol,finalCost,finalIncome,finalProfit]
        else:
            basicStats = [sumLongVol, sumShortVol,remainVol,sumCost,sumIncome,profit]

        # Append dictionary for dataFrame
        appendDict = {}
        for i in range(0,len(basicStats)):
            appendDict[compileDF.columns[i]] = basicStats[i]

        compileDF = compileDF._append(appendDict, ignore_index=True)
    
        # Currently only evaluates the profit/loss levels (not including closing existing positions)
        print('===================\n\nCalculating basic evaluation stats in ticker '+key+':')
        
        print('Current profit/loss stats (not closing existing positions):')
        print('Total profit from all trades: ' + str(sumVal))
        print('Total cost: ' + str(sumCost))
        print('Total income: ' + str(sumIncome))
        print('Total profit: ' + str(profit))
        
        print('Profit Margin: ' + str(profit/sumCost))
        print('Specific Profit (per volume traded): ' + str(specProfit)) # Calculated as avg. of long and short vol. (if shares outstanding)
    
        if remainVol != 0:
            print(Fore.BLUE)
            print('Current profit/loss stats (after closing existing positions):')
            print('Remaining portfolio value (before close): ' + str(portValue))
            print('Final cost: ' + str(compileDF['Cost'].sum()))
            print('Final income: ' + str(compileDF['Income'].sum()))
            print('Final profit from all trades: ' + str(compileDF['Profit'].sum()))
            
            print('Final Profit Margin: ' + str(compileDF['Profit'].sum()/compileDF['Cost'].sum()))
            print(Fore.BLACK)
        
        print('===================')


    if any(val == 'AddEval' for val in keywords.values()):
        for input in arguments: # Do not need to use *args this way but I chose to, its fine until I want to use more than 1 args dataframe
            if isinstance(input, pd.DataFrame):

                ## TO ADD TRADING ANALYTICS HERE, THIS SHOULD ONLY TAKE IN OUTPUT FROM ELSEWHERE
                ## SHOULD CREATE FUNCTIONS TO DIRECTLY MODIFY ACCURACY MATRIX INSTEAD OF MAKING WITHIN EACH MODEL
                ## SHOULD ALSO CREATE STOCK ANALYSER METHOD SEPARATELY

            
                print(Fore.GREEN)
                print('#################################\nCalculating additional model evaluations from given accuracy dataFrame:')
                print('Model efficiency stats:')
                print('Trade accuracy (trades being incorrect by instance, not volume?): (NOT IMPLEMENTED YET)')
                print('Trade efficiency (how many trades are not correct?): (NOT IMPLEMENTED YET)')
                print('Model Loss (how far are trades from actual optimal points): (NOT IMPLEMENTED YET)')
                print('#################################')
                print(Fore.BLACK)





    print('------------------')
    return 1

df_NVDA = PrepDataFrame(df_NVDA)
df_MSFT = PrepDataFrame(df_MSFT)
df_KO = PrepDataFrame(df_KO)

[NVDA_L1,_] = MakeTrade('2019-07-01 00:00:00', 10.5, 'NVDA')
[MSFT_L1,_] = MakeTrade('2019-10-01 00:00:00', 6, 'MSFT')
[NVDA_S1,_] = MakeTrade('2022-07-01 00:00:00', 10.5, 'NVDA', buy = False)
[NVDA_S2,_] = MakeTrade('2023-06-01 00:00:00', 100, 'NVDA', buy = False)
[NVDA_L2,_] = MakeTrade('2023-08-01 00:00:00', 99, 'NVDA', buy = True)
[MSFT_S1,_] = MakeTrade('2022-07-01 00:00:00', 6, 'MSFT', history = None, buy = False)

num = EvaluateModel(dfTradeHist, depth = 'AddEval', debug = 'Simple')


In [None]:
# Creating sub-routines for models here

# Here we will create a sub-routine that provides conditions for specified trades when a condition is met.
# The basic one will be when a set of values becomes positive (from a negative/zero value).


def WhenPositive(dataset,searchData,indexData):
    """
    This function returns set of independent variables (outData, e.g. date/time) in the selected data
    to be searched (indexData) from the dataset, when searchData (e.g. price) turns positive from negative.
    searchData and indexData are strings of the column names.
    """
    
    # Indexed (observed) dataset (independent var.)
    obsDataset = dataset[indexData]

    # Searched dataset (dependent var.)
    searchDataset = dataset[searchData]

    # Outputted dataset array (indep. var.)
    outDataset = []

    # Goes through all points to get when turn positive
    for i in range(1,len(obsDataset)):
        # range starts from 1 because need the i-1'th datapoint
        # If negative, irrelevant so skip step
        if searchDataset.iloc[i] <= 0:
            continue

        # If positive but last step negative, record
        if searchDataset.iloc[i-1] < 0:
            outDataset = outDataset + [obsDataset.iloc[i]]

    # Note output is an array for efficiency
    return outDataset

def WhenNegative(dataset,searchData,indexData):
    """
    This function returns set of independent variables (outData, e.g. date/time) in the selected data
    to be searched (indexData) from the dataset, when searchData (e.g. price) turns negative from positive.
    searchData and indexData are strings of the column names.
    Same as WhenPositive but reverse as you cant just use -dataset.
    """
    
    # Indexed (observed) dataset (independent var.)
    obsDataset = dataset[indexData]

    # Searched dataset (dependent var.)
    searchDataset = dataset[searchData]

    # Outputted dataset array (indep. var.)
    outDataset = []

    # Goes through all points to get when turn negative
    for i in range(1,len(obsDataset)-1):
        # range starts from 1 because need the i-1'th datapoint
        # range ends at len(obsDataset)-1 if last datapoint is 'live' and pending update
        # If positive, irrelevant so skip step
        if searchDataset.iloc[i] >= 0:
            continue

        # If negative but last step positive, record
        if searchDataset.iloc[i-1] > 0:
            outDataset = outDataset + [obsDataset.iloc[i]]

    # Note output is an array for efficiency
    return outDataset

def JumpChecker(dataset, searchData, indexData, jumpThresh, jumpPeriod = 1, signalPeriod = 1):
    """
    This function returns set of independent variables (outData, e.g. date/time) in the selected data
    to be searched (indexData) from the dataset, when searchData (e.g. price) jumps (or changes) a
    certain percentage (jumpThresh) on average of a (signalPeriod) time period, from its last (jumpPeriod)
    time periods ago.
    searchData and indexData are strings of the column names.
    """

    # Error input guard (fixes negative input values for real numbers)
    jumpThresh = abs(jumpThresh)
    jumpPeriod = abs(jumpPeriod)
    
    # Indexed (observed) dataset (independent var.) - dataset[indexData]
    # Searched dataset (dependent var.) - dataset[searchData]

    # Outputted dataset array pair (independent var., and bool) [['Time'],['Jump?']]
    outDataset = []
    jumpTime = []
    isJump = []
    
    # Get rolling simple moving average of dataset (SMA as signal safety vs instant drop/jumps or spread drop/jumps)
    copyDataset = dataset.copy() # Make shallow copy to not bloat original dataset
    copyDataset['SMA ' + searchData] = copyDataset[searchData].rolling(window = signalPeriod).mean()
    
    # Goes through all points to get when turn negative
    for i in range(jumpPeriod,len(dataset[indexData])-1):
        
        # Range starts from jumpPeriod because SMA starts from jumpPeriod-1'th datapoint (as index starts at 0),
        # but need the value before jumpPeriod (so +1) since that is the drop signal datapoint. 
        # Range ends at len(obsDataset)-1 if last datapoint is 'live' and pending update


        # Error guard function (0 val input), skip step
        if dataset[searchData][i-jumpPeriod] == 0:
            continue

        # Jump ratio is the [amount at time-index i] - [amount at i-jumpPeriod (pre-jump) time] / pre-jump value (normalisation) 
        jumpRatio = (copyDataset['SMA ' + searchData][i] - copyDataset['SMA ' + searchData][i-jumpPeriod])/copyDataset['SMA ' + searchData][i-jumpPeriod]
        #jumpRatio = (dataset[searchData][i] - dataset[searchData][i-jumpPeriod])/dataset[searchData][i-jumpPeriod]
        
        # Check for value jump (above jumpThresh) or drop (below -jumpThresh), if so, record point
        if jumpRatio > jumpThresh:
            #print('Value jump detected: ' + str(jumpRatio*100) + '%.')
            # Record data (as jump)
            jumpTime = jumpTime + [dataset[indexData][i]]
            isJump = isJump + [True]

            #outDataset = outDataset + [[dataset[indexData][i], True]]

        elif jumpRatio < -jumpThresh:
            #print('Value drop detected: ' + str(-jumpRatio*100) + '%.')
            # Record data (as drop)
            jumpTime = jumpTime + [dataset[indexData][i]]
            isJump = isJump + [False]
            
            #outDataset = outDataset + [[dataset[indexData][i], False]]

    outDataset = [jumpTime, isJump]
    # Note output is an array for efficiency
    return outDataset


In [None]:
## Testing playground to add into the model evaluator


# Note: using frozenset() for checks can make it faster for big datasets

In [None]:

def SMIFlipTradeModel(ticker, fast_period, slow_period, signal_period, timingState = 'Close'):
    """This trade model trades when SMIIO of ticker becomes ('flips') to positive (long) and negative (short)."""

    global histCols
    # Create trade history dataframe
    tradeHist = pd.DataFrame(columns = histCols)
    
    # Get and prep relevant dataFrame
    df_ticker = PrepDataFrame(globals()['df_'+ticker])

    # Apply SMI model to get result dataframe (and add date column)
    df_smi = ta.momentum.smi(df_ticker['Close'],fast_period,slow_period,signal_period)
    df_smi['Date'] = df_ticker['Date'].copy()
    
    smiConfigName = "_{}_{}_{}".format(fast_period,slow_period,signal_period)
    # SMI + smi_name is the SMI of the stock
    # SMIs + smi_name is the indicator made from signal line
    # SMIo + smi_name is the oscillator made by SMI - SMIs
    
    smiTypeNames = ["SMI"+smiConfigName,"SMIs"+smiConfigName,"SMIo"+smiConfigName]
    
    # Removing NaN values
    df_smi.fillna(0, inplace = True) 
    
    buyFlip = WhenPositive(df_smi,smiTypeNames[2],'Date')
    sellFlip = WhenNegative(df_smi,smiTypeNames[2],'Date')

    # Currently only trade one stock per instance, can be made variable.
    for i in range(0,len(buyFlip)):
        [maketrade, tradeHist] = MakeTrade(buyFlip[i].strftime('%Y-%m-%d %H:%M:%S'), 1, ticker, history = tradeHist)
    
    for i in range(0,len(sellFlip)):
        [maketrade, tradeHist] = MakeTrade(sellFlip[i].strftime('%Y-%m-%d %H:%M:%S'), 1, ticker, history = tradeHist, buy = False)
    
    eval = EvaluateModel(tradeHist, depth = 'AddEval', debug = 'Extended')
    
    return tradeHist

def SuddenChangeTradeModel(ticker, changeThresh = 0.05, changePeriod = 1, revertRatio = 0.9, safetyPeriod = 0, timingState = 'Close'):
    """
    This model trades when a ticker suddenly changes up (short) or down (long) by a certain amount
    (Thresh), and 'reverts' position when it goes back up.
    
    Details: Can revert the up or down jumps partially using a setting (revertRatio), and can add safety factors
    to the initation signal (safetyPeriod). Currently only takes one ticker.
    """
    
    # Create trade history (global, local seems to make issues)
    global histCols
    tradeHist = pd.DataFrame(columns = histCols)
    
    # Get relevant dataFrame and pre-process it
    df_ticker = globals()['df_'+ticker].copy()
    df_ticker = PrepDataFrame(df_ticker)

    # Obtain timings of significant price changes ([Date of change ending, bool if its jump (up)])
    changeTimings = JumpChecker(df_ticker, timingState, 'Date', changeThresh, jumpPeriod = changePeriod, signalPeriod = safetyPeriod)
    # Make dummy ticker to establish repurchase date
    df_dummy = copy.deepcopy(df_ticker)
    
    # Currently only trade one stock per instance, can be made variable.
    for i in range(0,len(changeTimings[0])):
        
        # If jump
        if changeTimings[1][i]:
            # Sell now, buy later
            [makeTrade, tradeHist] = MakeTrade(changeTimings[0][i].strftime('%Y-%m-%d %H:%M:%S'), 1, ticker, history = tradeHist, buy = False)

        else: # If drop
            # Buy now, sell later
            [makeTrade, tradeHist] = MakeTrade(changeTimings[0][i].strftime('%Y-%m-%d %H:%M:%S'), 1, ticker, history = tradeHist)

    
    # Get the time and value of stock pre-jump after all 'procedure starting' trades are done (to be used for revert trades)
    # Sudden change is generally referred to as Jump here for simplicity
    # This is done at end to not enlarge list during iterations

    # This is a list comprehension; result (preJumpIndex) is left (i - changePeriod) for output (i, _)
    # (i is index num, x/t is value itself) in input variable. Can also add boolean
    # condition after (i.e. if tradeHist['Date'] = 01-01-2023)
    jumpTime = [x for _, x in enumerate(tradeHist['Date'])] # Note x is in str not timestamp
    jumpIndex = [i for i, t in enumerate(df_ticker['Date']) if t.strftime('%Y-%m-%d %H:%M:%S') in jumpTime]
    jumpValue = [df_ticker[timingState][i] for i in jumpIndex]
    preJumpValue = [df_ticker[timingState][i - changePeriod] for i in jumpIndex]

    revertValue = [jumpValue[i] + revertRatio*(preJumpValue[i] - jumpValue[i]) for i in range(0,len(jumpValue))]

    # Get (spot - revert) values and check for negative/positive first swap point for each initiation trade and
    # make a reversion trade at that point.
    # We will put each reversion point of a trade on the initiation trade time in the dataframe
    df_dummy['ReversionVal'] = None
    df_dummy['ReversionDate'] = None
    df_dummy['UnclosedTrade'] = None # To highlight unclosed trades
    
    for i in range(0,len(jumpIndex)):
        # i is out of the number of jumps traded, j is the index (in the dataframe) where the trade is done.
        j = jumpIndex[i]

        # We use .loc to get j'th point in 'Reversion' column as it takes the variable in memory and not its mirror/copy.
        df_dummy.loc[j,('ReversionVal')] = revertValue[i]
        # df_dummy['Reversion'][i] is CHAINED INDEXING and won't work because it calls
        # df_dummy.__getitem__('Reversion).__setitem__(i) = ... which may not be applied to df_dummy location
        # in memory layout (as getitem) and be thrown out immediately. But, .loc dodges this by having __setitem__ only.
        # Note: this wouldnt be an issue if the chained indexing happened on the other side (unless doing assignment?).

        # The spot - reversion values (i'th value happens at index j within dataframe)
        df_dummy['DistToRev'] = None
        df_dummy['DistToRev'] = df_dummy['Close'][j:] - revertValue[i]
        
    
        # Distance (of value at first trade) to reversion point is positive if jump (as immediately greater than
        # reversion point), and negative if drop (immediately below reversion point).
        # Thus, depending on first value we know what if buy or sell first, then select if WhenPositive or WhenNegative
        if df_dummy['DistToRev'][j] > 0:
            # Buy back now
            # To scan and find the first point distance-to-reversion value changes sign (becomes negative)
            scanRange = range( j, len(df_dummy['DistToRev']) )
            reverseTrade = WhenNegative(df_dummy.iloc[scanRange],'DistToRev','Date')
            if len(reverseTrade) != 0:
                [makeTrade, tradeHist] = MakeTrade(reverseTrade[0].strftime('%Y-%m-%d %H:%M:%S'), 1, ticker, history = tradeHist)
            else:
                print('Unclosed short detected at time: ' + df_dummy['Date'][j].strftime('%Y-%m-%d %H:%M:%S'))
                print('Unclosed trades can cause significant losses! Initial trade elected to be cancelled.')
                df_dummy.loc[j,('UnclosedTrade')] = df_dummy['Close'][j]
                [makeTrade, tradeHist] = MakeTrade(changeTimings[0][i].strftime('%Y-%m-%d %H:%M:%S'), 1, ticker, history = tradeHist)


        elif df_dummy['DistToRev'][j] < 0:
            # Sell back now
            # To scan and find the first point distance-to-reversion value changes sign (becomes positive)
            scanRange = range( j, len(df_dummy['DistToRev']) )
            reverseTrade = WhenPositive(df_dummy.loc[scanRange],'DistToRev','Date')
            if len(reverseTrade) != 0:
                [makeTrade, tradeHist] = MakeTrade(reverseTrade[0].strftime('%Y-%m-%d %H:%M:%S'), 1, ticker, history = tradeHist, buy = False)
            else:
                print('Unclosed long detected at time: ' + df_dummy['Date'][j].strftime('%Y-%m-%d %H:%M:%S'))
                print('Unclosed trades can cause significant losses! Initial trade elected to be cancelled.')
                df_dummy.iloc[j,('UnclosedTrade')] = df_dummy['Close'][j]
                [makeTrade, tradeHist] = MakeTrade(changeTimings[0][i].strftime('%Y-%m-%d %H:%M:%S'), 1, ticker, history = tradeHist, buy = False)

        else:
            raise ValueError('The immediate distance of price to reversion trade point should not be 0 or NaN.')

    # Display graphical data of the model (when trades were done, size of trades). (Also try to display the reversion datasets.)
    print(df_dummy.to_string())
    plt.figure(figsize=(12,6))
    plt.plot(df_dummy['Date'],df_dummy['ReversionVal'])
    plt.plot(df_dummy['Date'],df_dummy['Close'])
    plt.plot(df_dummy['Date'],df_dummy['UnclosedTrade'])
    

    eval = EvaluateModel(tradeHist, df_dummy, depth = 'AddEval', debug = 'Extended')

    # AFTER THIS CREATE A MODEL CLASS AND USE IT TO CREATE A CUSTOM MODEL TYPE VARIABLE WITH CALLABLE OUTPUT VALUES
    # AND CUSTOMISABLE INPUTS
    
    return tradeHist




In [None]:
data = SuddenChangeTradeModel('NVDA', 0.1, 10, 0.75, 2)

#data = SMIFlipTradeModel('NVDA', 3, 10, 5)


###
###
# The code seems to work as intended but I will check again, but the method seems to not work. There could be at least 3-5 reasons.
# 1) Selling the buying (and vice versa) does not work if the stock trajectory long-term is upward as there will be cases where you cannot sell again
# (and when forced to close position, are at a loss).
# 2) Linked to 1) the periodicity of the model application means if applied at the wrong timeframe (granularity) i.e. each datapoint is day and
# not hour or 15 mins, it causes loss as the trajectory of the model is more refined and there will be more dips and peaks.
# 3) The code should be also tested for models of different trajectories (long-term, not related to granularity), as perhaps that influences the
# outcome more than the effect of granularity.
# 4) A general loss-stop missing, maybe the biggest losses are due to a lack of loss-stop method and a bleed in the earlier trades.
# 5) Perhaps the method itself is statistically bad/incorrect (i.e. when a jump happens the stock is likelier than not to keep going up and not
# reverting)
###
# Note that removing unclosed trades fixes all issues as hypothesized. Max. profit margin is approx. jumpRatio*reversionRatio. But issue is we cannot
# know in advance if there will be unclosed trades unless we know the trajectory of the stock or if we use a stop-loss
# need to record if trade is incomplete, add statistical modelling as well to assess shortfall?

In [None]:
##########
## THIS CODE IS SHELVED, NO NEED TO USE (CODE DISABLED, CAN REENABLE WITH THE BOOL SETTINGS BELOW)


## Here we scrape data from websites or from yahoo finance api

# Have them here at the end as they are not being used currently but may have some use
# Examples of future use are: sentiment analysis, or low temporal resolution results
# Also raw code so I do not need to immediately go through using data manifest class

#############
# User settings (to avoid looking for lines and changing manually)
# Enable various scraping mechanisms
enableSelenium = False
enableyFinance = False
enableTVS = False
enableAlphavantage = False

# Quit chrome after selenium use complete
chromeQuit = True

# Export data to a file?
export = True

###########
# Taking direct data using Alphavantage's API of intraday and daily values
if enableAlphavantage:
    # Alphavantage API key (intraday upto a month length each time, limited daily request, free), no scraping
    # Something like '69SFCX93J1H8V9K0'

    # Documentation for API here: https://www.alphavantage.co/documentation/

    ### Extract data manifest from folder and ignore existing datasets in API request
    # Load manifest file (if exists, if not, create empty file)
    try:
        dataManifestFile = open(r'StockHistData\dataManifest.txt',"r")
    except FileNotFoundError:
        dataManifestFile = open(r'StockHistData\dataManifest.txt',"w")

    # EXTRACT JSON AND 
    #textdata = dataManifestFile.read()

    
    #print(textdata) # NEED TO HAVE CODE TO CONVERT OUTPUT STRAIGHT INTO A DF OR
    dataManifestFile.close()
    
    # Here we scrape past intraday stock data (not interested in testing current prices as can test it later by making it past :D )
    dataManifestFile = open(r'StockHistData\dataManifest.txt',"a")
    
    for symbol in dltickers:
        for month in dlmonths:
            for interval in dlintervals:
                # Check data file here
                ## NEED TO CHECK FILE DATA HERE, IF NO FILE, NEED TO CATCH NOFILEERROR AND CREATE NEW ONE AND BLANK MIDF
                # READ FILE, IF DOESN'T EXIST, DOWNLOAD DATA AND SAVE AND ADD TO MANIFEST
                
                alphaURL = rf"https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol={symbol}&interval={interval}min&month={month}&outputsize=full&apikey={alphaAPIkey} "
                print(alphaURL)
                r = requests.get(alphaURL)
                data = r.json()

                # We add this dataset to the manifest, and save the data itself
                #dataManifestFile.write()
    
    dataManifestFile.close()
                
    scrapeDF = pd.DataFrame.from_dict(data, orient='columns')
    print(scrapeDF)
    # Check if dataset exists before making request (to avoid wasting limited daily calls)

    
    #dataManifestFile.close()
    



##########
# Selenium scraper to get intra-day prices (not completed so doesn't get intraday prices)
# May need higher level expertise to extract data from jscript objects 
if enableSelenium:
    DRIVERPATH = r"D:\Finance Study\chromedriver-win64\chromedriver-win64\ "
    
    # Set Chrome options
    options = Options()
    options.headless = False #True # Enable headless mode (no GUI)
    options.add_argument("--window-size=1920,1200")  # Set the window size
    
    
    # Init Chrome driver (I guess it's a semi-manual task?)
    driver = webdriver.Chrome()#executable_path = DRIVERPATH)
    
    # Navigate to the desired page
    for url in yUrls:
        print("==================")
        driver.get(r''+url)
        time.sleep(5)
    
    
    # Testing here (to develop interaction code here)
    #print(driver.page_source)
    
    
    # Good practice to quit when done
    if chromeQuit: driver.quit()

##########
# yFinance to scrape Yahoo Finance
if enableyFinance:
    # yFinance (Yahoo Finance Historical Data (daily))
    # Ticker object array
    tickObjArr = [yf.Ticker(ticker) for ticker in dltickers]
    
    # Fetch historical data
    tframe = "5d"#"1mo"#"1y"
    histData = [tickObj.history(period = tframe) for tickObj in tickObjArr]
    for i in range(len(histData)):
        print("Historical data for " + tickObjArr[i].ticker + ":")
        print(histData[i])
    
    
    # Fetch basic financial data
    finData = [tickObj.financials for tickObj in tickObjArr]
    for i in range(len(finData)):
        print("Basic Financial data for " + tickObjArr[i].ticker + ":")
        print(finData[i])
    
    # Fetch stock actions like dividends and splits
    actionData = [tickObj.actions for tickObj in tickObjArr]
    for i in range(len(actionData)):
        print("\nStock Actions for " + tickObjArr[i].ticker +  ":")
        print(actionData[i])
    
    # Using soup
    yUrls = [ f'https://finance.yahoo.com/quote/{ticker}/' for ticker in dltickers
    ]
    #print(urls)
    r = requests.get(url=yUrls[0], headers=user_header)
    #print(r.content)
    
    soup = BeautifulSoup(r.content, 'html5lib') # If this line causes an error, run 'pip install html5lib' or install html5lib
    #print(soup.prettify())
    #table = soup.find('div',)
    
    company = soup.find('h1', {'class': 'yf-xxbei9'}).text
    #print(company)
    closePrice = soup.find('div', {'class': 'stx-btn-panel stx-show'})
    print(closePrice)
    closePrice = soup.find('span', {'class': 'stx-ico-close'})
    print(closePrice)



##########
# Tradingview Scraper
if enableTVS:
    
    # Ideas scraper
    # Ideas are the tab in the webpage with articles of sorts
    # Initialize the Ideas scraper with default parameters
    
    # Default: export_result=False, export_type='json'
    ideas_scraper = Ideas(
      export_result=True,  # Set to True to save the results
      export_type='csv'    # Specify the export type (json or csv)
    )
    
    # Default symbol: 'BTCUSD'
    # Scrape ideas for the NVDA symbol, from page 1 to page 1
    ideas = ideas_scraper.scrape(
      symbol="NVDA",
      startPage=1,
      endPage=1,
      sort="popular"  #  Could be 'popular' or 'recent'
    )
    
    #print("Ideas:", ideas)
    
    
    
    ##########
    # Indicators
    from tradingview_scraper.symbols.technicals import Indicators
    
    # Scrape all indicators for the BTCUSD symbol
    indicators_scraper = Indicators(export_result=True, export_type='json')
    indicators = indicators_scraper.scrape(
        symbol="BTCUSD",
        timeframe="4h",
        allIndicators=True
    )
    #print("All Indicators:", indicators)
