In [1]:
!pip install ripser seaborn




[notice] A new release of pip is available: 23.3.1 -> 24.1.1
[notice] To update, run: C:\Users\buddh\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import yfinance as yf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ripser import Rips
import persim
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Define stock labels with their corresponding date ranges and stock tickers
stock_data = {
    "V shaped recovery": {
        "stocks": [ 'CI', 'DXCM', 'NFLX', 'CTLT', 'MRK', 'TMO', 'ABT', 'DHR', 'JNJ', 'SUNPHARMA.NS', 'TORNTPHARM.NS', 'CIPLA.NS', 'ZYDUSLIFE.NS', 'PFIZER.NS', 'LUPIN.NS', 'BRITANNIA.NS', 'DABUR.NS', 'DABUR.NS', 'ABBOTINDIA.NS', 'AUROPHARMA.NS', 'BIOCON.NS', 'DRREDDY.NS', 'GRANULES.NS', 'LAURUSLABS.NS', 'SYNGENE.NS', 'NATCOPHARM.NS', 'SANOFI.NS', 'AJANTPHARM.NS', 'APLLTD.NS', 'IGL.NS', 'RELIANCE.NS', 'MUTHOOTFIN.NS', 'DEN.NS', 'BALAJITELE.NS'],
        "start": "2020-01-01",
        "end": "2020-05-30"
    },
    "U shaped recovery": {
        "stocks": ['IFCI.NS', 'ADANIENT.NS', 'UBL.NS', 'SAREGAMA.NS', 'ASHOKLEY.NS', 'BOSCHLTD.NS', 'EICHERMOT.NS', 'GLENMARK.NS', 'AMBER.NS', 'STAR.NS', 'IPCALAB.NS', 'VGUARD.NS', 'WHIRLPOOL.NS', 'PFC.NS'],
        "start": "2020-01-01",
        "end": "2020-11-30"
    },
    "W shaped recovery": {
        "stocks": ['SOUTHBANK.NS', 'UCOBANK.NS', 'PSB.NS', 'CENTRALBK.NS', 'JBCHEPHARM.NS', 'SOLARA.NS', 'GTPL.NS', 'PFE'],
        "start": "2020-01-01",
        "end": "2020-06-30"
    },
    "Swoosh shaped recovery": {
        "stocks": [ 'HEROMOTOCO.NS', 'M&M.NS', 'TFCILTD.NS', 'BAJAJCON.NS', 'GODREJCP.NS', 'ICICIBANK.NS', 'HDFCBANK.NS', 'IDFCFIRSTB.NS', 'FEDERALBNK.NS', 'CANBK.NS', 'BALRAMCHIN.NS', 'COLPAL.NS', 'RADICO.NS', 'TATACONSUM.NS', 'VBL.NS', 'NETWORK18.NS', 'TV18BRDCST.NS', 'APOLLOTYRE.NS', 'BAJAJ-AUTO.NS', 'MARUTI.NS', 'CROMPTON.NS', 'VEDL.NS', 'KAJARIACER.NS', 'TITAN.NS', 'INFY.NS', 'HCLTECH.NS', 'TCS.NS', 'TECHM.NS'],
        "start": "2020-01-01",
        "end": "2021-01-01"
    }
}


In [34]:

# Extract stock indices, labels, and date ranges
index_names = []
labels = []
date_ranges = {}

for label, data in stock_data.items():
    index_names.extend(data["stocks"])
    labels.extend([label] * len(data["stocks"]))
    for stock in data["stocks"]:
        date_ranges[stock] = (data["start"], data["end"])

# Function to normalize a dataframe column
def normalize(df, column):
    return df[column] / df[column].max()

# Function to create delay embedding
def delay_embedding(data, dimension, lag):
    n_points = len(data) - (dimension - 1) * lag
    if n_points <= 0:
        raise ValueError("Time series is too short for the given dimension and lag")
    embedded_data = np.empty((n_points, dimension))
    for i in range(n_points):
        embedded_data[i] = data[i:i + dimension * lag:lag]
    return embedded_data

# Parameters for delay embedding
embedding_dimension = 3  
embedding_lag = 4  

# Instantiate Vietoris-Rips solver
rips = Rips(maxdim=2)

# Initialize a list to store diagrams for each stock
all_diagrams = []

# Iterate over each stock and download data based on label-specific date ranges
for stock, label in zip(index_names, labels):
    start_date_string, end_date_string = date_ranges[stock]

    # Pull data from Yahoo Finance
    raw_data = yf.download(stock, start=start_date_string, end=end_date_string)

    # Normalize 'Adj Close' column
    #raw_data['Adj Close'] = normalize(raw_data, 'Adj Close')

    # Prepare data using delay embedding
    embedded_data = delay_embedding(raw_data['Adj Close'].values, embedding_dimension, embedding_lag)

    # Compute the persistence diagram
    dgm = rips.fit_transform(embedded_data)
    all_diagrams.append(dgm)

# Compute Wasserstein distances between all pairs of persistence diagrams
num_stocks = len(index_names)
wasserstein_dists = np.zeros((num_stocks, num_stocks))

for i in range(num_stocks):
    for j in range(num_stocks):
        if i != j:
            wasserstein_dists[i, j] = persim.wasserstein(all_diagrams[i][0], all_diagrams[j][0], matching=False)



Rips(maxdim=2, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[******************



In [35]:
wasserstein_dists.shape

(84, 84)

In [36]:
embedded_data.shape

(243, 3)

In [37]:
from sklearn.preprocessing import scale
# Prepare data for kNN classifier
# Flatten the distance matrix into feature vectors
X = scale(wasserstein_dists)
y = labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Use GridSearchCV to find the optimal value of k
param_grid = {'n_neighbors': np.arange(1, 10)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv=5)
knn_cv.fit(X_train, y_train)

# Print the optimal value of k
optimal_k = knn_cv.best_params_['n_neighbors']
print(f"Optimal k: {optimal_k}")

# Use the optimal value of k to train the final model
knn = KNeighborsClassifier(n_neighbors=optimal_k)
knn.fit(X_train, y_train)
# Predict and evaluate the classifier on the test set
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Optimal k: 3
Accuracy: 0.65


In [39]:
# Function to predict labels for new stocks
def predict_new_stocks(new_stocks, start_date, end_date):
    new_diagrams = []
    for stock in new_stocks:
        raw_data = yf.download(stock, start=start_date, end=end_date)
        raw_data['Adj Close'] = normalize(raw_data, 'Adj Close')
        embedded_data = delay_embedding(raw_data['Adj Close'].values, embedding_dimension, embedding_lag)
        dgm = rips.fit_transform(embedded_data)
        new_diagrams.append(dgm)

    num_new_stocks = len(new_stocks)
    new_wasserstein_dists = np.zeros((num_new_stocks, num_stocks))

    for i in range(num_new_stocks):
        for j in range(num_stocks):
            new_wasserstein_dists[i, j] = persim.wasserstein(new_diagrams[i][0], all_diagrams[j][0], matching=False)
            new_wasserstein_dists = scale(new_wasserstein_dists)

    new_predictions = knn.predict(new_wasserstein_dists)
    return new_predictions

# Predicting the recovery profile of stocks based on earlier learning: the first five have V shaped recovery. The next five have U shaped recovery. The next 2 have W shaped and next five have swoosh shaped recovery.
test_stocks = ['VRTX', 'MANH', 'PODD', 'SRPT', 'ORCL','CELH', 'ZBRA', 'HATHWAY.NS', 'PFOCUS.NS', 'EROSMEDIA.NS', 'ALKEM.NS', 'HINDUNILVR.NS','ENPH', 'ADBE', 'AMAT', 'SSNC', 'BALKRISIND.NS' ] 
test_start_date = "2020-01-01"
test_end_date = "2020-05-30"
predictions = predict_new_stocks(test_stocks, test_start_date, test_end_date)
print(f"Predictions for test stocks: {predictions}")

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******



Predictions for test stocks: ['V shaped recovery' 'V shaped recovery' 'V shaped recovery'
 'V shaped recovery' 'V shaped recovery' 'V shaped recovery'
 'V shaped recovery' 'V shaped recovery' 'V shaped recovery'
 'V shaped recovery' 'V shaped recovery' 'V shaped recovery'
 'V shaped recovery' 'V shaped recovery' 'V shaped recovery'
 'V shaped recovery' 'Swoosh shaped recovery']




In [25]:
new_wasserstein_dists

NameError: name 'new_wasserstein_dists' is not defined

In [24]:
new_wasserstein_dists

NameError: name 'new_wasserstein_dists' is not defined