In [1]:
import pandas as pd
import numpy as np
import yfinance as yf

from scripts.preparation import download_data, preprocess

# Prep data

In [118]:
def extract_ticker(df_row):
    capital = 0
    for letter in df_row:
        if letter.isupper():
            capital += 1
        else:
            break
    
    answer = df_row[:capital-1]
    
    special_case = {"NVDANVIDI": "NVDA",
                    "QCOMQUALCOM": "QCOM",
                    "RTXRT": "RTX",
                    "CVSCV": "CVS",
                    "IACIA": "IAC",
                    "GGEVG": "GEV",
                    "EOGEO": "EOG",
                    "NXPINX": "NXPI",
                    "JPMJ": "JPM"
                    }
    if answer in special_case.keys():
        return special_case[answer]
    
    return answer

In [119]:
top100 = pd.read_html("https://www.tradingview.com/markets/stocks-usa/market-movers-active/")
ticker_list = top100[0]["Symbol"].apply(extract_ticker)
print([i for i in ticker_list if len(i) > 4])

ticker_object = download_data(ticker_list)

['CMCSA']


In [120]:
# russell_table = pd.read_html("https://en.wikipedia.org/wiki/Russell_1000_Index")
# ticker_list = list(russell_table[2]["Ticker"])

# ticker_object = download_data(ticker_list)

In [123]:
data_set = pd.DataFrame()
counter = 0

for ticker in ticker_list:
    if "." in ticker:
        continue
    
    stock_data = ticker_object.tickers[ticker].history(period="500d")
    stock_data, train_features, target = preprocess(stock_data)
    stock_data["ticker"] = ticker

    # print(stock_data["volatility20"].tail(50).mean())

    # if (stock_data["volatility5"].tail(50).mean()) < 0.001:
    #     continue

    data_set = pd.concat([data_set, stock_data[train_features + target + ["ticker"]]])

    counter += 1
    # if counter > 50:
    #     break
    if counter % 10 == 0:
        print(counter)

data_set[target] = data_set[target].astype(int)
data_set = data_set.sort_index()

10
20
30
40
50
60
70
80
90
100


In [124]:
data_set["short_target"].value_counts()

short_target
1    37094
2     2988
0     2480
Name: count, dtype: int64

In [125]:
data_set["long_target"].value_counts()

long_target
1    18791
2    13337
0    10434
Name: count, dtype: int64

# lightgbm

In [None]:
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

In [135]:
def multiclass_eval(y_true, y_pred, label):
    instance = np.where(y_true == label)[0]
    predict = np.where(y_pred == label)[0]
    correct = np.intersect1d(instance, predict) 

    print(f"----- For Class {label} -----")
    # Recall: how many instances is detected
    print(f"Recall: {len(correct) / (len(instance))}")

    # Precision: how many of the predicted are correct
    print(f"Precision: {len(correct) / (len(predict))}")

    return


In [141]:
params = {
    # "device": "gpu",
    # "n_estimators": 3000,
    "objective": "multiclass",
    "num_class": 3,
    "metric": "multi_logloss",
    "num_leaves": 128,
    "learning_rate": 0.005,
    # "feature_fraction": 0.8,
    # "bagging_fraction": 0.8,
    # "bagging_freq": 5,
    "verbose": -1,
}


In [142]:
# label_encoder = LabelEncoder()
# data_set["ticker"] = label_encoder.fit_transform(data_set["ticker"])
# encoding_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# target_type = "short_target"
target_type = "long_target"

X = data_set[train_features]
y = data_set[target_type]

train_size = int(0.8 * len(data_set))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

In [None]:
if target_type == "short_target":
    params.update(
        {
            "class_weight": {
                0: 10,
                1: 1,
                2: 10,
            }
        }
    )
elif target_type == "long_target":
    params.update(
        {
            "class_weight": {
                0: 2,
                1: 1,
                2: 2,
            }
        }
    )



lgb_model = lgb.LGBMRegressor(**params)
lgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[lgb.callback.early_stopping(stopping_rounds=500),
            #    lgb.callback.log_evaluation(period=100)
               ],
    )



In [165]:
def strict_pred(data):
    data = np.where(data > 0.45, 1, 0)
    data = np.argmax(data, axis=1)
    return data


In [166]:
y_pred = lgb_model.predict(X_test)
y_pred = strict_pred(y_pred)
multiclass_eval(y_test, y_pred, 2)

In [167]:
multiclass_eval(y_test, y_pred, 2)

----- For Class 2 -----
Recall: 0.0996
Precision: 0.36403508771929827


In [147]:
lgb_model.predict(X_test)[:20]

array([[0.36549683, 0.24179164, 0.39271153],
       [0.3138228 , 0.28135975, 0.40481745],
       [0.32127913, 0.26463572, 0.41408515],
       [0.31007175, 0.3062086 , 0.38371965],
       [0.32398965, 0.32312197, 0.35288838],
       [0.32574828, 0.29687532, 0.37737639],
       [0.28007689, 0.33747685, 0.38244626],
       [0.31320268, 0.27407147, 0.41272585],
       [0.31391529, 0.29272603, 0.39335867],
       [0.27278309, 0.2949751 , 0.43224181],
       [0.30686561, 0.35411683, 0.33901756],
       [0.27517148, 0.2709992 , 0.45382932],
       [0.270396  , 0.21766226, 0.51194174],
       [0.31524485, 0.28340164, 0.40135351],
       [0.31431292, 0.19772681, 0.48796028],
       [0.30788626, 0.26441112, 0.42770262],
       [0.33005261, 0.30331235, 0.36663504],
       [0.32001527, 0.32362407, 0.35636067],
       [0.30788759, 0.31089237, 0.38122004],
       [0.3073687 , 0.21084566, 0.48178563]])

In [144]:
y_pred = lgb_model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)

for i in range(0, 3):
    multiclass_eval(y_test, y_pred, i)

----- For Class 0 -----
Recall: 0.19667590027700832
Precision: 0.29338842975206614
----- For Class 1 -----
Recall: 0.16017110266159695
Precision: 0.6217712177121771
----- For Class 2 -----
Recall: 0.754
Precision: 0.303103392828429


# tensorflow
https://www.tensorflow.org/tutorials/structured_data/time_series

In [221]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.metrics import Recall
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from sklearn.preprocessing import MinMaxScaler

In [264]:
# Preprocess data
# target_type = "short_target"
target_type = "long_target"
#

scaler = MinMaxScaler()
X = scaler.fit_transform(data_set[train_features])
y = data_set[target_type].values

if target_type == "short_target":
    class_weight = {0: 10, 1: 1, 2: 10}
elif target_type == "long_target":
    class_weight = {0: 2, 1: 1, 2: 2}



In [265]:
X.shape

(42562, 36)

In [266]:
# Reshape data for CNN
# X = X.reshape(1, X.shape[1], (X.shape[0]))

# Split data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


train_size = int(0.8 * len(data_set))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# X_train = X_train.reshape(1, X_train.shape[1], (X_train.shape[0]))
# X_test = X_test.reshape(1, X_test.shape[1], (X_test.shape[0]))


In [267]:
print("train x ", X_train.shape)
print("train label ", y_train.shape)
print("test x ", X_test.shape)
print("test label ", y_test.shape)

train x  (34049, 36)
train label  (34049,)
test x  (8513, 36)
test label  (8513,)


In [268]:
# Define the CNN model
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation="relu", input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=3, activation="relu"))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=64, kernel_size=3, activation="relu"))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(64, activation="relu"))
model.add(Dense(3, activation="softmax")) 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [269]:
# Compile the model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, 
          epochs=10, 
          batch_size=32, 
          validation_data=(X_test, y_test),
          class_weight=class_weight)

Epoch 1/10


[1m1065/1065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.3193 - loss: 1.7053 - val_accuracy: 0.2937 - val_loss: 1.1351
Epoch 2/10
[1m1065/1065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.3217 - loss: 1.6971 - val_accuracy: 0.3242 - val_loss: 1.1118
Epoch 3/10
[1m1065/1065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.3495 - loss: 1.6813 - val_accuracy: 0.3354 - val_loss: 1.1085
Epoch 4/10
[1m1065/1065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.3650 - loss: 1.6784 - val_accuracy: 0.3957 - val_loss: 1.0867
Epoch 5/10
[1m1065/1065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.3758 - loss: 1.6717 - val_accuracy: 0.3387 - val_loss: 1.1289
Epoch 6/10
[1m1065/1065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.3890 - loss: 1.6651 - val_accuracy: 0.4350 - val_loss: 1.0777
Epoch 7/10
[1m1065/1065[0

<keras.src.callbacks.history.History at 0x1c5095e7f70>

In [270]:
y_pred = model.predict(X_test)

[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [271]:
y_pred = model.predict(X_test)
y_pred = strict_pred(y_pred)
multiclass_eval(y_test, y_pred, 2)

[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
----- For Class 2 -----
Recall: 0.166
Precision: 0.35837651122625214


In [273]:
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)

multiclass_eval(y_test, y_pred, 2)

[1m  1/267[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5s[0m 19ms/step

[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
----- For Class 2 -----
Recall: 0.3772
Precision: 0.3036059240180296
