# Imports

In [9]:
import kagglehub
import shutil
import os

# Data from TwelveData

First you have to create an account [here](https://twelvedata.com) and the generate a key [here](https://twelvedata.com/account/api-keys). Place the key in a file named .env and make sure there exists this field:

> ⚠️ **Important:** Do **not** share this key or commit the `.env` file to version control. It's already excluded in `.gitignore`.

> TD_API_KEY=<yllaertubyrtecinreadbackwards>

In [1]:
from twelvedata import TDClient
from dotenv import load_dotenv
import os

In [9]:
load_dotenv()
api_key = os.getenv("TD_API_KEY")
db_username = os.getenv("db_username")
db_password = os.getenv("db_password")

In [3]:
# Initialize client
td = TDClient(apikey=api_key)

# Fetch data
ts = td.time_series(
    symbol="AAPL",
    interval="1min",
    outputsize=100,
    timezone="America/New_York",
)

df = ts.as_pandas()
print(df.head())


                          open     high        low      close  volume
datetime                                                             
2025-05-09 15:59:00  198.45500  198.550  198.39999  198.53999  787957
2025-05-09 15:58:00  198.46880  198.500  198.38000  198.45010  199966
2025-05-09 15:57:00  198.38000  198.510  198.37869  198.47000  125862
2025-05-09 15:56:00  198.49500  198.495  198.34010  198.37981  111684
2025-05-09 15:55:00  198.75999  198.765  198.42999  198.50999  119482


In [4]:
df.reset_index().to_csv("data/raw/AAPL_1min.csv", index=False)

In [5]:
# Fetch daily time series data for SPY from 2024-01-01 to 2024-12-31
ts = td.time_series(
    symbol="SPY",
    interval="1day",
    start_date="2024-01-01",
    end_date="2024-12-31",
    timezone="America/New_York"
)

# Convert the time series data to a pandas DataFrame
df = ts.as_pandas()

# Display the first few rows of the DataFrame
print(df.head())

# Optional: Save the DataFrame to a CSV file
df.to_csv("data/raw/SPY_2024_daily.csv", index=True)

                 open       high        low      close    volume
datetime                                                        
2024-12-30  587.89001  591.73999  584.40997  588.21997  56578800
2024-12-27  597.53998  597.78003  590.76001  595.01001  64969300
2024-12-26  599.50000  602.47998  598.08002  601.34003  41219100
2024-12-24  596.06000  601.34003  595.46997  601.29999  33160100
2024-12-23  590.89001  595.29999  587.65997  594.69000  57635800


In [11]:
# Fetch daily time series data for SPY from 2024-01-01 to 2024-12-31
ts = td.time_series(
    symbol="SPY",
    interval="1month",
    start_date="2024-01-01",
    end_date="2024-12-31",
    timezone="America/New_York"
)

# Convert the time series data to a pandas DataFrame
df = ts.as_pandas()

# Display the first few rows of the DataFrame
print(df.head())

# Optional: Save the DataFrame to a CSV file
df.to_csv("data/raw/SPY_2024_monthly.csv", index=True)

                 open       high        low      close      volume
datetime                                                          
2024-12-01  602.96997  609.07001  580.90997  586.08002  1059516700
2024-11-01  571.32001  603.34998  567.89001  602.54999   901843000
2024-10-01  573.40002  586.12000  565.27002  568.64001   976068800
2024-09-01  560.46997  574.71002  539.44000  573.76001  1045061400
2024-08-01  552.57001  564.20001  510.26999  563.67999  1244599000


In [12]:
# Fetch daily time series data for SPY from 2024-01-01 to 2024-12-31
ts = td.time_series(
    symbol="BA",
    interval="1month",
    start_date="2024-01-01",
    end_date="2024-12-31",
    timezone="America/New_York"
)

# Convert the time series data to a pandas DataFrame
df = ts.as_pandas()

# Display the first few rows of the DataFrame
print(df.head())

# Optional: Save the DataFrame to a CSV file
df.to_csv("data/raw/BA_2024_monthly.csv", index=True)

                 open       high        low      close     volume
datetime                                                         
2024-12-01  155.91000  182.57001  153.37000  177.00000  238381100
2024-11-01  152.78000  157.66000  137.03000  155.44000  269611100
2024-10-01  151.46001  163.44000  146.02000  149.31000  339046200
2024-09-01  167.03000  169.60001  151.24001  152.03999  191886200
2024-08-01  190.00000  191.17000  162.50000  173.74001  128582400


In [13]:
# Fetch daily time series data for SPY from 2024-01-01 to 2024-12-31
ts = td.time_series(
    symbol="BA",
    interval="1day",
    start_date="2024-01-01",
    end_date="2024-12-31",
    timezone="America/New_York"
)

# Convert the time series data to a pandas DataFrame
df = ts.as_pandas()

# Display the first few rows of the DataFrame
print(df.head())

# Optional: Save the DataFrame to a CSV file
df.to_csv("data/raw/BA_2024_daily.csv", index=True)

                 open       high        low   close    volume
datetime                                                     
2024-12-30  173.72000  178.17000  170.14999  176.55  18082300
2024-12-27  180.00999  181.42999  179.39999  180.72   6806900
2024-12-26  178.98000  182.57001  178.33000  180.38   5905700
2024-12-24  177.69000  180.95000  177.50000  179.34   4317000
2024-12-23  178.12000  179.64999  174.28999  177.69   8486400


# Stockmarket data

# Connecting to the DB

In [6]:
df.columns

Index(['open', 'high', 'low', 'close', 'volume'], dtype='object')

In [7]:
pip install pandas psycopg2-binary sqlalchemy

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl.metadata (4.8 kB)
Collecting sqlalchemy
  Downloading sqlalchemy-2.0.40-cp313-cp313-win_amd64.whl.metadata (9.9 kB)
Collecting greenlet>=1 (from sqlalchemy)
  Downloading greenlet-3.2.2-cp313-cp313-win_amd64.whl.metadata (4.2 kB)
Collecting typing-extensions>=4.6.0 (from sqlalchemy)
  Using cached typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Downloading psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl (2.6 MB)
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ------------------------ --------------- 1.6/2.6 MB 15.5 MB/s eta 0:00:01
   ---------------------------------------- 2.6/2.6 MB 18.3 MB/s eta 0:00:00
Downloading sqlalchemy-2.0.40-cp313-cp313-win_amd64.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---------------------------------------- 2.1/2.1 MB 57.4 MB/s eta 0:00:00
Downloading greenlet-3.2.2-cp313-cp313-win_amd6

In [10]:
import pandas as pd
import psycopg2
from psycopg2.extras import execute_values

# Database connection parameters
db_config = {
    'host': '192.168.1.68',
    'port': 5432,
    'database': 'dwtestdb',
    'user': db_username,
    'password': db_password
}

# Add the datetime column to the DataFrame
df['datetime'] = df.index

# Connect to PostgreSQL
try:
    connection = psycopg2.connect(**db_config)
    cursor = connection.cursor()
    
    # Create table
    create_table_query = """
    CREATE TABLE IF NOT EXISTS stock_data (
        id SERIAL PRIMARY KEY,
        open NUMERIC(10, 5),
        high NUMERIC(10, 5),
        low NUMERIC(10, 5),
        close NUMERIC(10, 5),
        volume BIGINT,
        datetime DATE
    );
    """
    
    cursor.execute(create_table_query)
    connection.commit()
    print("Table created successfully or already exists.")
    
    # Prepare data for insertion
    insert_query = """
    INSERT INTO stock_data (open, high, low, close, volume, datetime)
    VALUES %s
    """
    
    # Convert DataFrame to list of tuples
    data_tuples = list(df.itertuples(index=False, name=None))
    
    # Execute batch insert
    execute_values(
        cursor,
        insert_query,
        data_tuples,
        template=None,
        page_size=100
    )
    
    connection.commit()
    print(f"Successfully inserted {len(data_tuples)} rows")
    
    # Verify insertion
    cursor.execute("SELECT COUNT(*) FROM stock_data")
    count = cursor.fetchone()[0]
    print(f"Total rows in table: {count}")
    
except Exception as e:
    print(f"Error: {e}")
    connection.rollback()
    
finally:
    if connection:
        cursor.close()
        connection.close()

Table created successfully or already exists.
Successfully inserted 30 rows
Total rows in table: 30


# Historical data

Due to the limitations of the free plan, we have to get historical data using other means, we decided Kaggle is the best way.

In [1]:
pip install kagglehub

Defaulting to user installation because normal site-packages is not writeable
Collecting kagglehub
  Downloading kagglehub-0.3.12-py3-none-any.whl.metadata (38 kB)
Collecting pyyaml (from kagglehub)
  Downloading PyYAML-6.0.2-cp313-cp313-win_amd64.whl.metadata (2.1 kB)
Downloading kagglehub-0.3.12-py3-none-any.whl (67 kB)
Downloading PyYAML-6.0.2-cp313-cp313-win_amd64.whl (156 kB)
Installing collected packages: pyyaml, kagglehub
Successfully installed kagglehub-0.3.12 pyyaml-6.0.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Inflation data

In [7]:
os.getcwd()

'c:\\nextcloud\\Studia - PW\\semestr 6\\hurtownie danych\\MarketMiner\\MarketMiner'

In [None]:


# Download latest version
dataset_path = kagglehub.dataset_download("sazidthe1/global-inflation-data")
kaggle_raw_dir = "data/raw/kaggle/inflation_world/"
print("Path to dataset files:", dataset_path)

if os.path.exists(kaggle_raw_dir):
    shutil.rmtree(kaggle_raw_dir)

# Copy the entire directory tree
shutil.copytree(dataset_path, kaggle_raw_dir)

print(f"Dataset downloaded and copied to: {kaggle_raw_dir}")

Path to dataset files: C:\Users\admin\.cache\kagglehub\datasets\sazidthe1\global-inflation-data\versions\1
Dataset downloaded and copied to: data/raw/kaggle/inflation_world/


In [5]:
import kagglehub
import shutil
import os

# Download latest version
dataset_path = kagglehub.dataset_download("varpit94/us-inflation-data-updated-till-may-2021")
kaggle_raw_dir = "/data/raw/kaggle/inflation/"
print("Path to dataset files:", dataset_path)

if os.path.exists(kaggle_raw_dir):
    shutil.rmtree(kaggle_raw_dir)

# Copy the entire directory tree
shutil.copytree(dataset_path, kaggle_raw_dir)

print(f"Dataset downloaded and copied to: {kaggle_raw_dir}")

Path to dataset files: C:\Users\admin\.cache\kagglehub\datasets\varpit94\us-inflation-data-updated-till-may-2021\versions\5
Dataset downloaded and copied to: /data/raw/kaggle/inflation/


## NYSE data

In [None]:
# Download the dataset to a specific directory
dataset_path = kagglehub.dataset_download("svaningelgem/nyse-100-daily-stock-prices")

# If you want to copy/move it to your desired location
kaggle_raw_dir = "/data/raw/kaggle/nyse/"

# Remove target directory if it exists, then copy everything
if os.path.exists(kaggle_raw_dir):
    shutil.rmtree(kaggle_raw_dir)

# Copy the entire directory tree
shutil.copytree(dataset_path, kaggle_raw_dir)

print(f"Dataset downloaded and copied to: {kaggle_raw_dir}")

Dataset downloaded and copied to: /data/raw/kaggle/


## Insert sample

In [41]:
files = os.listdir(kaggle_raw_dir)

print("There are", len(files), "files in the directory")

# for file in files:
#     file_path = os.path.join(kaggle_raw_dir, file)
#     print(file, file_path)

sample_file = files[0]
sample_file_path = os.path.join(kaggle_raw_dir, sample_file)
pd.read_csv(sample_file_path).head().columns


There are 101 files in the directory


Index(['ticker', 'date', 'open', 'high', 'low', 'close'], dtype='object')

In [42]:
print(f"Reading CSV file from: {sample_file_path}")
df = pd.read_csv(sample_file_path)

# Preview the data
print("DataFrame shape:", df.shape)
print("DataFrame columns:", df.columns)
print("First 5 rows:")
print(df.head())

Reading CSV file from: /data/raw/kaggle/ABBV.csv
DataFrame shape: (3077, 6)
DataFrame columns: Index(['ticker', 'date', 'open', 'high', 'low', 'close'], dtype='object')
First 5 rows:
  ticker        date     open     high      low    close
0   ABBV  2013-01-02  34.9200  35.4000  34.1000  35.1200
1   ABBV  2013-01-03  35.0000  35.0000  34.1600  34.8300
2   ABBV  2013-01-04  22.9947  23.1732  22.7505  22.8398
3   ABBV  2013-01-07  22.6809  23.5459  22.6809  22.8888
4   ABBV  2013-01-08  22.7760  23.0095  22.1581  22.3916


In [None]:


# Connect to the PostgreSQL database
try:
    conn = psycopg2.connect(**db_config)
    cursor = conn.cursor()
    print("Successfully connected to PostgreSQL database")

    # Create the FactStock table
    create_table_query = '''
    CREATE TABLE IF NOT EXISTS FactStock (
        id SERIAL PRIMARY KEY,
        ticker VARCHAR(10) NOT NULL,
        date DATE NOT NULL,
        open NUMERIC(10, 2),
        high NUMERIC(10, 2),
        low NUMERIC(10, 2),
        close NUMERIC(10, 2)
    );
    '''
    cursor.execute(create_table_query)
    conn.commit()
    print("FactStock table created or already exists")

    # Select only the required columns
    if set(['ticker', 'date', 'open', 'high', 'low', 'close']).issubset(set(df.columns)):
        df_subset = df[['ticker', 'date', 'open', 'high', 'low', 'close']]
        
        # Convert date column to datetime if it's not already
        if not pd.api.types.is_datetime64_any_dtype(df_subset['date']):
            df_subset['date'] = pd.to_datetime(df_subset['date'])
            
        # Convert dataframe to list of tuples for faster insertion
        data_tuples = list(df_subset.itertuples(index=False, name=None))
        
        # Insert the data in batches
        insert_query = '''
        INSERT INTO FactStock (ticker, date, open, high, low, close)
        VALUES %s
        '''
        
        # Use execute_values for faster insertion
        execute_values(cursor, insert_query, data_tuples)
        conn.commit()
        
        # Count rows to verify
        cursor.execute("SELECT COUNT(*) FROM FactStock")
        row_count = cursor.fetchone()[0]
        print(f"Successfully imported {row_count} rows into FactStock table")
    else:
        print("Error: Required columns not found in the CSV file")
        print(f"Expected: ['ticker', 'date', 'open', 'high', 'low', 'close']")
        print(f"Found: {df.columns.tolist()}")

except Exception as e:
    print(f"Database Error: {e}")
    
finally:
    # Close the connection
    if 'conn' in locals() and conn is not None:
        cursor.close()
        conn.close()
        print("PostgreSQL connection closed")

Successfully connected to PostgreSQL database
FactStock table created or already exists
Successfully imported 3077 rows into FactStock table
PostgreSQL connection closed


## Insert all

In [44]:
def insert_stock_data(filepath):
    df = pd.read_csv(filepath)
    try:
        conn = psycopg2.connect(**db_config)
        cursor = conn.cursor()
        print("Successfully connected to PostgreSQL database")

        # Create the FactStock table
        create_table_query = '''
        CREATE TABLE IF NOT EXISTS FactStock (
            id SERIAL PRIMARY KEY,
            ticker VARCHAR(10) NOT NULL,
            date DATE NOT NULL,
            open NUMERIC(10, 2),
            high NUMERIC(10, 2),
            low NUMERIC(10, 2),
            close NUMERIC(10, 2)
        );
        '''
        cursor.execute(create_table_query)
        conn.commit()
        print("FactStock table created or already exists")

        # Select only the required columns
        if set(['ticker', 'date', 'open', 'high', 'low', 'close']).issubset(set(df.columns)):
            df_subset = df[['ticker', 'date', 'open', 'high', 'low', 'close']]
            
            # Convert date column to datetime if it's not already
            if not pd.api.types.is_datetime64_any_dtype(df_subset['date']):
                df_subset['date'] = pd.to_datetime(df_subset['date'])
                
            # Convert dataframe to list of tuples for faster insertion
            data_tuples = list(df_subset.itertuples(index=False, name=None))
            
            # Insert the data in batches
            insert_query = '''
            INSERT INTO FactStock (ticker, date, open, high, low, close)
            VALUES %s
            '''
            
            # Use execute_values for faster insertion
            execute_values(cursor, insert_query, data_tuples)
            conn.commit()
            
            # Count rows to verify
            cursor.execute("SELECT COUNT(*) FROM FactStock")
            row_count = cursor.fetchone()[0]
            print(f"Successfully imported {row_count} rows into FactStock table")
        else:
            print("Error: Required columns not found in the CSV file")
            print(f"Expected: ['ticker', 'date', 'open', 'high', 'low', 'close']")
            print(f"Found: {df.columns.tolist()}")

    except Exception as e:
        print(f"Database Error: {e}")
        
    finally:
        # Close the connection
        if 'conn' in locals() and conn is not None:
            cursor.close()
            conn.close()
            print("PostgreSQL connection closed")

In [46]:
for file in files:
    if file.endswith(".csv"):
        # Check if the file is a CSV file
        print(f"Processing {file}...")
        file_path = os.path.join(kaggle_raw_dir, file)
        print(file, file_path)
        insert_stock_data(file_path)

Processing ABBV.csv...
ABBV.csv /data/raw/kaggle/ABBV.csv
Successfully connected to PostgreSQL database
FactStock table created or already exists
Successfully imported 225910 rows into FactStock table
PostgreSQL connection closed
Processing ABT.csv...
ABT.csv /data/raw/kaggle/ABT.csv
Successfully connected to PostgreSQL database
FactStock table created or already exists
Successfully imported 237261 rows into FactStock table
PostgreSQL connection closed
Processing ACN.csv...
ACN.csv /data/raw/kaggle/ACN.csv
Successfully connected to PostgreSQL database
FactStock table created or already exists
Successfully imported 243219 rows into FactStock table
PostgreSQL connection closed
Processing AMT.csv...
AMT.csv /data/raw/kaggle/AMT.csv
Successfully connected to PostgreSQL database
FactStock table created or already exists
Successfully imported 250032 rows into FactStock table
PostgreSQL connection closed
Processing ANET.csv...
ANET.csv /data/raw/kaggle/ANET.csv
Successfully connected to Postg

# Helper functions

## Get stock data for month

In [18]:
def get_stock_data(symbol, interval, start_date, end_date, custom_name=None):
    """
    Fetch stock data from Twelve Data API.
    
    Parameters:
    symbol (str): Stock symbol.
    interval (str): Time interval (e.g., '1min', '1day').
    start_date (str): Start date in 'YYYY-MM-DD' format.
    end_date (str): End date in 'YYYY-MM-DD' format.
    
    Returns:
    pd.DataFrame: DataFrame containing stock data.
    """
    ts = td.time_series(
        symbol=symbol,
        interval=interval,
        start_date=start_date,
        end_date=end_date,
        timezone="America/New_York"
    )

    df = ts.as_pandas()

    file_name = f"data/raw/{symbol}_{interval}_{start_date}_{end_date}.csv"
    if custom_name:
        file_name = f"data/raw/{custom_name}.csv"

    df.to_csv(file_name, index=True)
    
    return df

In [19]:
get_stock_data("BA", "1day", "2024-01-01", "2024-02-28")

Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-02-27,200.92999,202.0,200.03999,201.39999,3932900
2024-02-26,201.00999,202.75,200.0,200.53999,4807200
2024-02-23,200.99001,202.07001,197.14999,200.83,7433400
2024-02-22,202.0,204.10001,200.39999,201.5,6513500
2024-02-21,202.89999,203.63,201.21001,201.57001,4179800
2024-02-20,203.55,205.58,202.71001,203.37,5248400
2024-02-16,204.88,205.05,202.81,203.89,5975900
2024-02-15,204.12,206.55,203.39999,205.33,5309100
2024-02-14,205.95,206.58,202.21001,203.38,6503000
2024-02-13,206.89999,207.81,202.33,204.46001,8263600


In [None]:
# Fetch daily time series data for SPY from 2024-01-01 to 2024-12-31
ts = td.time_series(
    symbol="BA",
    interval="1day",
    start_date="2024-01-01",
    end_date="2024-12-31",
    timezone="America/New_York"
)

# Convert the time series data to a pandas DataFrame
df = ts.as_pandas()

# Display the first few rows of the DataFrame
print(df.head())

# Optional: Save the DataFrame to a CSV file
df.to_csv("data/raw/BA_2024_daily.csv", index=True)

## Insert stock data into db