<a href="https://colab.research.google.com/github/ChiraagNadig/Quantum-ML-Research/blob/main/QTS_QGAF_Dataset_Curator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install qiskit qiskit-aer yfinance matplotlib pandas numpy --quiet

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from qiskit import QuantumCircuit
from qiskit.quantum_info import Statevector, Operator
import os
import warnings
warnings.filterwarnings('ignore')

# Function to collect stock data using yfinance
def get_stock_data(ticker, start_date, end_date):
    """Download price history for one ticker and return it as a flat table.

    Args:
        ticker: Ticker symbol, passed straight to ``yf.download``.
        start_date: Start of the download window, forwarded as ``start``.
        end_date: End of the download window, forwarded as ``end``.

    Returns:
        The downloaded DataFrame with its index moved into a regular column
        via ``reset_index()``. Whatever yfinance returned for an unknown or
        empty ticker is passed through unchanged apart from that reset.
    """
    stock_data = yf.download(ticker, start=start_date, end=end_date)

    # Recent yfinance releases label columns with two levels (price field,
    # ticker) even for a single symbol. Drop the ticker level in that case so
    # that stock_data['Close'] is a plain Series, as it is with older releases.
    columns = stock_data.columns
    if (
        isinstance(columns, pd.MultiIndex)
        and 'Ticker' in columns.names
        and columns.get_level_values('Ticker').nunique() == 1
    ):
        stock_data.columns = columns.droplevel('Ticker')

    return stock_data.reset_index()

# Preprocessing function with corrections
def preprocess_stock_data(stock_data):
    """Forward-fill gaps and add a ``Daily_Return`` column.

    Args:
        stock_data: DataFrame that contains at least a ``Close`` column.

    Returns:
        A new DataFrame (the argument is not modified) in which missing values
        are forward-filled and ``Daily_Return`` holds the fractional change of
        ``Close`` from the previous row. Rows with no previous value to compare
        against (the first row in particular) get a return of 0.
    """
    # DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
    # deprecated in current pandas releases.
    stock_data = stock_data.ffill()

    # Gaps were already forward-filled above, so pct_change needs no implicit
    # padding of its own; fill_method=None also avoids its deprecated default.
    daily_return = stock_data['Close'].pct_change(fill_method=None)
    stock_data['Daily_Return'] = daily_return.fillna(0)

    # Additional features can be added here if needed
    return stock_data

# Function to compute phi for a data segment
def compute_phi(segment):
    """Turn a data segment into angles: min-max scale to [-1, 1], then arccos.

    A segment whose maximum equals its minimum cannot be scaled, so it is
    replaced by zeros before the arccos is taken.

    Args:
        segment: Array of values to encode.

    Returns:
        Array of angles in radians, one per input value.
    """
    lo = np.min(segment)
    hi = np.max(segment)
    span = hi - lo

    # Rescale linearly so that lo maps to -1 and hi maps to +1.
    scaled = np.zeros_like(segment) if span == 0 else 2 * (segment - lo) / span - 1

    # Clipping keeps rounding error from pushing values outside arccos' domain.
    return np.arccos(np.clip(scaled, -1, 1))

# Function to compute QGAF matrix using Statevector method
def compute_qgaf_matrix(phi, difference=False):
    """Build the angular-field matrix by simulating one-qubit circuits.

    For every pair (i, j) a single-qubit circuit applies RY(phi[i]) followed
    by RY(+phi[j]) (or RY(-phi[j]) when ``difference`` is true) to |0>, and
    the expectation value of Z on the resulting state is stored.

    RY(theta)|0> = cos(theta/2)|0> + sin(theta/2)|1>, hence <Z> = cos(theta).
    Successive RY rotations add their angles, so rotating by phi[i] and
    +/-phi[j] yields <Z> = cos(phi[i] +/- phi[j]). Rotating by twice those
    angles would instead produce cos(2 * (phi[i] +/- phi[j])).

    Args:
        phi: Sequence of angles in radians.
        difference: If true, use phi[i] - phi[j]; otherwise phi[i] + phi[j].

    Returns:
        An (n, n) float array with entry [i, j] = cos(phi[i] +/- phi[j]),
        where n = len(phi).
    """
    n = len(phi)
    qgaf_matrix = np.zeros((n, n))

    # The observable and the sign are the same for every pair; build them once.
    z_operator = Operator.from_label('Z')
    sign = -1 if difference else 1

    for i, a in enumerate(phi):
        for j, b in enumerate(phi):
            qc = QuantumCircuit(1)
            qc.ry(a, 0)
            qc.ry(sign * b, 0)
            state = Statevector.from_instruction(qc)
            expectation_Z = state.expectation_value(z_operator)
            # The expectation of a Hermitian observable is real; drop the
            # zero imaginary part of the complex return value.
            qgaf_matrix[i, j] = expectation_Z.real  # cos(a ± b)
    return qgaf_matrix

# Function to create and save QGAF images
def create_qgaf_image(qgaf_matrix, date, label, ticker, set_type, output_dir, difference=False):
    """Write one matrix to disk as a colour-mapped PNG.

    The file is saved as
    ``<output_dir>/<set_type>/<QGADF|QGASF>/<ticker>_<date>_<label>.png``,
    using the ``QGADF`` folder when ``difference`` is true and ``QGASF``
    otherwise. Missing directories are created.

    Args:
        qgaf_matrix: 2-D array to render.
        date: Date string used in the file name.
        label: Class label used in the file name.
        ticker: Ticker symbol used in the file name.
        set_type: Name of the dataset split sub-folder.
        output_dir: Root directory of the dataset.
        difference: Selects the ``QGADF`` folder instead of ``QGASF``.
    """
    dir_name = 'QGADF' if difference else 'QGASF'
    dir_path = os.path.join(output_dir, set_type, dir_name)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dir_path, exist_ok=True)

    file_name = f"{ticker}_{date}_{label}.png"
    file_path = os.path.join(dir_path, file_name)
    # Fixed vmin/vmax keep the colour scale identical across all images.
    plt.imsave(file_path, qgaf_matrix, cmap='rainbow', origin='lower', vmin=-1, vmax=1)
    # NOTE(review): imsave does not appear to open a pyplot figure, so this
    # close() is probably a no-op here - confirm before removing.
    plt.close()

# Parameters
# Alternative ticker list:
# tickers = ['AAPL', 'MSFT', 'GOOG', 'AMZN', 'TSLA']
tickers = ['BA', 'DOW', 'IBM', 'JPM', 'PG']
start_date = '2020-01-01'  # Shortened date range for efficiency
end_date = '2022-12-31'
n = 20  # Reduced length of time series segment for efficiency
output_dir = 'QGAF_images'

# Create output directory (exist_ok makes this safe to re-run and race-free)
os.makedirs(output_dir, exist_ok=True)

# Loop over each ticker: download, preprocess, then turn sliding windows of
# daily returns into labelled QGASF / QGADF images.
for ticker in tickers:
    print(f"Processing {ticker}...")
    # Get stock data
    stock_data = get_stock_data(ticker, start_date, end_date)
    # Check if data is available
    if stock_data.empty:
        print(f"No data found for {ticker}. Skipping.")
        continue
    # Preprocess data
    stock_data = preprocess_stock_data(stock_data)
    # Total number of data points
    total_points = len(stock_data)
    # These columns do not change inside the window loop, so extract them
    # once per ticker instead of once per window.
    daily_returns = stock_data['Daily_Return'].values
    dates = stock_data['Date']
    # Loop over segments (sample every 5th segment for efficiency); stopping
    # at total_points - 1 keeps idx + 1 a valid index for the label below.
    for idx in range(n, total_points - 1, 5):
        # Window of the n returns that precede row idx (row idx excluded)
        segment = daily_returns[idx - n:idx]
        # Compute phi
        phi = compute_phi(segment)
        # Labeling based on future return
        # NOTE(review): the window ends at row idx - 1 but the label uses
        # row idx + 1, so the return of row idx is neither input nor target.
        # Confirm this one-row gap is intended.
        future_return = daily_returns[idx + 1]
        label = 'up' if future_return > 0 else 'down'
        # Date for the file name
        timestamp = dates.iloc[idx]
        date = timestamp.strftime('%Y-%m-%d')
        # Split into train, validation, and test sets by calendar year
        year = timestamp.year
        if year == 2022:
            set_type = 'test'
        elif year == 2021:
            set_type = 'validation'
        else:
            set_type = 'train'
        # Compute QGASF matrix
        qgasf_matrix = compute_qgaf_matrix(phi, difference=False)
        # Compute QGADF matrix
        qgadf_matrix = compute_qgaf_matrix(phi, difference=True)
        # Create and save QGASF image
        create_qgaf_image(qgasf_matrix, date, label, ticker, set_type, output_dir, difference=False)
        # Create and save QGADF image
        create_qgaf_image(qgadf_matrix, date, label, ticker, set_type, output_dir, difference=True)
    print(f"Completed {ticker}.")

print("Dataset creation completed.")

# --- ADDITIONAL CODE TO ZIP AND DOWNLOAD THE FOLDER ---

# Zip the folder: packs the contents of output_dir into QGAF_images.zip in
# the current working directory.
print("Zipping the folder...")
import shutil
shutil.make_archive('QGAF_images', 'zip', output_dir)

# Download the zipped folder. google.colab only exists inside Colab, so
# outside of it keep the archive on disk instead of failing on the import.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; archive saved as QGAF_images.zip")
else:
    print("Downloading the zipped folder...")
    files.download('QGAF_images.zip')


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.4/119.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.7/49.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.7/49.7 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.5/108.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hProcessing BA...


[*********************100%***********************]  1 of 1 completed


Completed BA.
Processing DOW...


[*********************100%***********************]  1 of 1 completed


Completed DOW.
Processing IBM...


[*********************100%***********************]  1 of 1 completed


Completed IBM.
Processing JPM...


[*********************100%***********************]  1 of 1 completed


Completed JPM.
Processing PG...


[*********************100%***********************]  1 of 1 completed


Completed PG.
Dataset creation completed.
Zipping the folder...
Downloading the zipped folder...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>