In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
start_date = "1980-01-01"
end_date = "2023-01-01"

# Daily history for the S&P 500 index ticker over the configured window.
stock_data = yf.download('^GSPC', start=start_date, end=end_date)

# Day-over-day fractional change of the close. The first row has no
# predecessor, so its return is NaN and the row is removed by dropna().
stock_data['Daily_Return'] = stock_data['Close'].pct_change()
stock_data = stock_data.dropna()

# Take an explicit copy: later cells add and overwrite columns on `df`, and
# doing that on a bare column selection of `stock_data` makes pandas emit
# SettingWithCopyWarning (and leaves it ambiguous which frame is modified).
df = stock_data[['Close', 'Daily_Return']].copy()


[*********************100%***********************]  1 of 1 completed


In [None]:
# Detach `df` from the frame it was selected from, so the column writes below
# hit a real DataFrame rather than a possible view (this is what raised
# SettingWithCopyWarning).
df = df.copy()

# Truncate closes to whole index points; moves that stay within the same
# integer level are therefore labelled 'Same' rather than 'Up'/'Down'.
df['Close'] = df['Close'].astype(int)

# Compare each close with the previous row's close. The first row has no
# predecessor (NaN after shift), matches neither comparison and stays 'Same'.
previous_close = df['Close'].shift()
df['State'] = 'Same'
df.loc[df['Close'] > previous_close, 'State'] = 'Up'
df.loc[df['Close'] < previous_close, 'State'] = 'Down'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['State'] = 'Same'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Close'] = df['Close'].astype(int)


In [None]:
# Number of days carrying each state label (displayed as the cell's output).
df.loc[:, 'State'].value_counts()

Up      5152
Down    4470
Same    1220
Name: State, dtype: int64

In [None]:
# Pair each day's state with the state of the following row. Building the
# frame with assign() instead of writing a column in place avoids pandas'
# SettingWithCopyWarning when `df` is still a slice of another frame. The last
# row has no successor, so its NaN Next_State is removed by dropna().
df = df.assign(Next_State=df['State'].shift(-1)).dropna()

# Row-normalised contingency table: entry (i, j) is the observed share of
# days in state i that were followed by state j, so every row sums to 1.
# NOTE(review): the matrix is estimated from every row of `df`, including the
# 2022 sessions that later cells use for evaluation -- consider fitting on
# pre-2022 rows only to avoid look-ahead.
transition_matrix = pd.crosstab(df['State'], df['Next_State'], normalize='index')

transition_matrix_np = transition_matrix.to_numpy()

print("Transition Matrix:")
print(transition_matrix)


Transition Matrix:
Next_State      Down      Same        Up
State                                   
Down        0.404565  0.094652  0.500783
Same        0.349180  0.214754  0.436066
Up          0.434006  0.103649  0.462345


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Next_State'] = df['State'].shift(-1)


In [None]:
def predict_next_states(current_state, transition_matrix, n=1):
    """Draw ``n`` candidate successor states for ``current_state``.

    Parameters
    ----------
    current_state : label
        Row label of ``transition_matrix`` to sample from.
    transition_matrix : pandas.DataFrame
        Rows are current states, columns are next states, and each row holds
        the probabilities passed to ``np.random.choice``.
    n : int, default 1
        Number of independent draws.

    Returns
    -------
    numpy.ndarray
        Array of length ``n`` with the sampled column labels.
    """
    # The selected row is a Series: its index holds the candidate states and
    # its values the matching probabilities.
    row = transition_matrix.loc[current_state]
    return np.random.choice(row.index, size=n, p=row.values)

In [None]:
# Example: sample five possible successors of an 'Up' day.
current_state = 'Up'
predicted_next_states = predict_next_states(
    current_state=current_state,
    transition_matrix=transition_matrix,
    n=5,
)

print(f"Current state: {current_state}, Predicted next states: {predicted_next_states}")


Current state: Up, Predicted next states: ['Same' 'Down' 'Up' 'Down' 'Down']


In [None]:
# Keep an independent snapshot of the labelled data for the later comparison.
actual_data = df.copy(deep=True)

In [None]:
# Round-trip the index through a 'Date' column so it is parsed as datetimes
# before being restored as the index.
df = (
    df.reset_index()
      .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
      .set_index('Date')
)

# State observed on 2022-01-03, used as the starting point of the simulation.
initial_state = df.loc['2022-01-03', 'State']

def predict_next_state(current_state, transition_matrix, rng=None):
    """Sample a single successor state from the transition matrix.

    Parameters
    ----------
    current_state : label
        Row label of ``transition_matrix`` to sample from.
    transition_matrix : pandas.DataFrame
        Rows are current states, columns are next states, and each row holds
        the probabilities used for the draw.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Random source providing ``choice``. When omitted, NumPy's global
        random state is used (the previous behaviour), so ``np.random.seed``
        still controls the draws. Passing a seeded generator makes a
        simulation reproducible without touching global state.

    Returns
    -------
    label
        One column label of ``transition_matrix``, drawn with the
        probabilities in the ``current_state`` row.

    Raises
    ------
    KeyError
        If ``current_state`` is not a row label of ``transition_matrix``.
    """
    current_state_row = transition_matrix.loc[current_state]
    # Plain arrays are accepted by both the legacy and the Generator API.
    next_states = current_state_row.index.to_numpy()
    probabilities = current_state_row.to_numpy()

    sampler = np.random if rng is None else rng
    return sampler.choice(next_states, p=probabilities)

# Fixed seed so the simulated path (and every number derived from it) is the
# same on each Restart & Run All.
SIMULATION_SEED = 42

# Give the untouched snapshot a parsed datetime index, then keep the 2022 rows.
actual_data = actual_data.reset_index()
actual_data['Date'] = pd.to_datetime(actual_data['Date'])
actual_data = actual_data.set_index('Date')

actual_data_2022 = actual_data.loc['2022-01-03':'2022-12-31']

# Step the chain once per row that actually exists in the data. Using a
# calendar-day range starting on 2022-01-01 would stamp the 2022-01-03 initial
# state on the wrong date and advance the chain on days with no data, so the
# joined predictions would not line up with the rows they are compared to.
date_range = actual_data_2022.index

np.random.seed(SIMULATION_SEED)

# Free-running simulation: the entry stored for a given row is the chain's
# draw for the *following* row, which is what the `Next_State` column holds.
# The first draw is conditioned on `initial_state`; each later draw is
# conditioned on the previous simulated state.
predictions = []
simulated_state = initial_state
for _ in date_range:
    simulated_state = predict_next_state(simulated_state, transition_matrix)
    predictions.append(simulated_state)

predictions_df = pd.DataFrame({'Predicted_State': predictions}, index=date_range)

comparison_df = actual_data_2022.join(predictions_df)

print(comparison_df)


            Close  Daily_Return State Next_State Predicted_State
Date                                                            
2022-01-03   4796      0.006374    Up       Down              Up
2022-01-04   4793     -0.000630  Down       Down              Up
2022-01-05   4700     -0.019393  Down       Down            Down
2022-01-06   4696     -0.000964  Down       Down            Down
2022-01-07   4677     -0.004050  Down       Down            Same
...           ...           ...   ...        ...             ...
2022-12-22   3822     -0.014452  Down         Up              Up
2022-12-23   3844      0.005868    Up       Down              Up
2022-12-27   3829     -0.004050  Down       Down            Down
2022-12-28   3783     -0.012021  Down         Up              Up
2022-12-29   3849      0.017461    Up       Down            Same

[250 rows x 5 columns]


In [None]:
# Inner-join the 2022 rows with the simulated states on their shared date
# index; the suffixes only come into play if both frames share a column name.
merged_data = actual_data_2022.merge(
    predictions_df,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=('_Actual', '_Predicted'),
)

In [None]:
# Flag the rows where the simulated state equals the recorded next state.
merged_data['Correct_Prediction'] = merged_data['Predicted_State'].eq(merged_data['Next_State'])

In [None]:
# Score the forecasts as a three-class problem on the state labels themselves.
# Previously y_pred compared Predicted_State with itself (always 1) and y_true
# was the "prediction was correct" flag, so precision merely repeated the
# accuracy and recall was 1.00 by construction.
y_true = merged_data['Next_State']
y_pred = merged_data['Predicted_State']

# Calculate and print metrics
# Macro averaging takes the unweighted mean of the per-class scores;
# zero_division=0 scores a class with no predicted/true samples as 0 instead
# of emitting an undefined-metric warning.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')

Accuracy: 0.45
Precision: 0.45
Recall: 1.00
F1-score: 0.62
