In [None]:
!pip install pandas-ta dcor pingouin


In [57]:
import yfinance as yf
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd
import pandas_ta as ta
import dcor
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import numpy as np
import pingouin as pg
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [114]:
ticker="AAPL"
stock = yf.Ticker(ticker)

# get historical market data
df = stock.history(period="max")
df.drop(['Dividends', 'Stock Splits'], axis=1, inplace=True)
# Create a Date column
df['Date'] = df.index
# Drop the Date as index
df.reset_index(drop=True, inplace=True)

# DF currently has open, high, low, close, volume, date columns,


In [115]:
df['RSI14'] = ta.rsi(df['Close'], length=14)
df['RSI6'] = ta.rsi(df['Close'], length=6)
df['RSI24'] = ta.rsi(df['Close'], length=24)

macd = ta.macd(df['Close'])
df = pd.concat([df, macd], axis=1)

df['EMA5'] = ta.ema(df['Close'], length=5)
df['EMA10'] = ta.ema(df['Close'], length=10)
df['EMA20'] = ta.ema(df['Close'], length=20)
df['EMA100'] = ta.ema(df['Close'], length=100)

supertrend = ta.supertrend(df['High'], df['Low'], df['Close'], length=10, multiplier=3)
df = pd.concat([df, supertrend], axis=1)


# create columns for rsi, macd, ema, supertrend

In [116]:

# removing all empty dates
# build complete timeline from start date to end date
dt_all = pd.date_range(start=df.index[0],end=df.index[-1])
# retrieve the dates that are in the original datset
dt_obs = [d.strftime("%Y-%m-%d") for d in pd.to_datetime(df.index)]
# define dates with missing values
dt_breaks = [d for d in dt_all.strftime("%Y-%m-%d").tolist() if not d in dt_obs]

In [117]:
df.drop(['SUPERTs_10_3.0','SUPERTl_10_3.0'], axis = 1, inplace=True)


Kendall Tau Correlation: Another non-parametric correlation measure, the Kendall Tau correlation coefficient, evaluates the strength and direction of association between two ranked variables. It's particularly useful when the data set is small, and it's less sensitive to errors in the data compared to Pearson and Spearman.

Distance Correlation: This measure assesses both linear and non-linear associations between two random variables or datasets. Unlike Pearson, which can only capture linear relationships, distance correlation is capable of detecting more complex relationships.

Partial Correlation: This measures the degree of association between two variables, with the effect of a set of controlling random variables removed. It's useful when you want to find the correlation between two variables while controlling for the effect of one or more other variables.

Pearson correlation coefficient is a correlation coefficient that measures linear correlation between two sets of data. It is the ratio between the covariance of two variables and the product of their standard deviations; thus, it is essentially a normalized measurement of the covariance, such that the result always has a value between −1 and 1. As with covariance itself, the measure can only reflect a linear correlation of variables, and ignores many other types of relationships or correlations.

Pearson correlation between indicators

In [129]:
features = ['Close', 'RSI14', 'RSI6', 'RSI24', 'MACD_12_26_9', 'EMA5', 'EMA10', 'EMA20', 'EMA100', 'SUPERT_10_3.0']

df = df.iloc[100:]
nan_in_df = df.isna().sum()

# Calculating the correlation matrix
pcc = df[features].corr()


# Extracting the correlations of all indicators with the 'Close' price
pcc = pcc['Close'].sort_values(ascending=False)

# Display the correlation values
print(pcc)



Open                0
High                0
Low                 0
Close               0
Volume              0
Date                0
RSI14               0
RSI6                0
RSI24               0
MACD_12_26_9        0
MACDh_12_26_9       0
MACDs_12_26_9       0
EMA5                0
EMA10               0
EMA20               0
EMA100              0
SUPERT_10_3.0       0
SUPERTd_10_3.0      0
Min_5day            0
Max_5day            0
Movement_Label      0
Price_Difference    0
Label               0
dtype: int64
Close            1.000000
EMA5             0.999852
EMA10            0.999633
EMA20            0.999200
SUPERT_10_3.0    0.997696
EMA100           0.996609
MACD_12_26_9     0.305459
RSI24            0.095733
RSI14            0.084348
RSI6             0.065776
Name: Close, dtype: float64


Kendal correlation between indicators

In [None]:
kendall = df[features].corr(method='kendall')
kendall = kendall['Close'].sort_values(ascending=False)
print(kendall)


Distance correlation between indicators

In [None]:

distance_correlations = {}

# Calculate distance correlation for each feature with 'Close'
for feature in features:
    if feature != 'Close':
        distance_corr = dcor.distance_correlation(df['Close'], df[feature])
        distance_correlations[feature] = distance_corr

# Sort the dictionary by correlation value
sorted_distance_correlations = dict(sorted(distance_correlations.items(), key=lambda item: item[1], reverse=True))

# Display the distance correlation values
print(sorted_distance_correlations)

In [None]:

partial = pd.DataFrame(columns=['Variable', 'Partial Correlation'])

# Calculate partial correlation for each feature with 'Close', controlling for the other features
for feature in features:
    if feature != 'Close':
        # Remaining features to control for
        control_features = [f for f in features if f not in ['Close', feature]]
        # Calculate partial correlation
        pcorr = pg.partial_corr(data=df, x='Close', y=feature, covar=control_features)
        partial = pd.concat([partial, pd.DataFrame({'Variable': [feature], 'Partial Correlation': [pcorr['r'].values[0]]})], ignore_index=True)

# Sort the DataFrame by correlation value
partial = partial.sort_values(by='Partial Correlation', ascending=False)

# Display the partial correlation values
print(partial)

In [122]:
def label_movement(row, pct_threshold=3):
    min_price = row['Min_5day']
    max_price = row['Max_5day']
    close_price = row['Close']

    # Calculate percentage change
    upper_bound = close_price * (1 + pct_threshold / 100)
    lower_bound = close_price * (1 - pct_threshold / 100)

    if max_price <= upper_bound and min_price >= lower_bound:
        return 0  # Sideways
    elif max_price > upper_bound:
        return 1  # Up
    else:
        return -1  # Down

# Calculate rolling min and max
df['Min_5day'] = df['Close'].rolling(window=5).min()
df['Max_5day'] = df['Close'].rolling(window=5).max()

# Apply the label function
df['Movement_Label'] = df.apply(label_movement, axis=1)

# Drop the first 4 rows as they don't have complete 5-day windows
df = df.iloc[4:]


In [124]:
threshold = 0.01  # Threshold to define 'sideways' movement

df['Price_Difference'] = df['Close'].diff()
df['Label'] = df['Price_Difference'].apply(lambda x: 1 if x > threshold else (-1 if x < -threshold else 0))


features = ['Open', 'High', 'Low', 'Close', 'Volume', 'Capital Gains',
       'RSI14', 'RSI6', 'RSI24', 'MACD_12_26_9', 'MACDh_12_26_9',
       'MACDs_12_26_9', 'EMA5', 'EMA10', 'EMA20', 'EMA100', 'SUPERT_10_3.0',
       'SUPERTd_10_3.0','Min_5day', 'Max_5day','Movement_Label']


df.dropna(subset=features + ['Label'], inplace=True)

# Normalize features
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])


In [125]:
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [126]:

X = df[features]
y = df['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model.fit(X_train, y_train)


In [128]:

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

          -1       0.65      0.63      0.64       737
           0       0.86      0.92      0.89      1578
           1       0.73      0.66      0.69       907

    accuracy                           0.78      3222
   macro avg       0.75      0.74      0.74      3222
weighted avg       0.78      0.78      0.78      3222

