In [2]:
from dotenv import load_dotenv
from binance.client import Client
import os
import pandas as pd
import numpy as np

load_dotenv()
BINANCE_KEY = os.getenv("BINANCE_KEY")
BINANCE_SECRET = os.getenv("BINANCE_SECRET")
client = Client(
            api_key=BINANCE_KEY,
            api_secret=BINANCE_SECRET,
            requests_params={"timeout": 100},
        )

In [3]:
klines = client.futures_continous_klines(
    pair='ETHUSDT',
    contractType='PERPETUAL',
    interval='1m',
    limit=1500
)



In [4]:
df_klines = pd.DataFrame(
    klines,
    columns=[
        "open_time",
        "open",
        "high",
        "low",
        "close",
        "volume",
        "close_time",
        "quote_asset_volume",
        "number_of_trades",
        "taker_buy_base_asset_volume",
        "taker_buy_quote_asset_volume",
        "ignore",
    ],

)
df_klines['open_time'] = pd.to_datetime(df_klines['open_time'], unit='ms')
df_klines['close_time'] = pd.to_datetime(df_klines['close_time'], unit='ms')
df_klines = df_klines.astype(
    dtype={
        "open": "float",
        "high": "float",
        "low": "float",
        "close": "float",
        "volume": "float",
        "quote_asset_volume": "float",
        "number_of_trades": "int",
        "taker_buy_base_asset_volume": "float",
        "taker_buy_quote_asset_volume": "float",
    }
)
df_klines.dtypes
feature_extraction = df_klines[:700].copy()
final_df = df_klines[700:].copy()

In [5]:
feature_extraction.drop(columns=['ignore'], inplace=True)
final_df.drop(columns=['ignore'], inplace=True)

In [8]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

# KNN REGRESSION

In [None]:
# feature_extraction['high-low'] = feature_extraction['high'] - feature_extraction['low']
# feature_extraction['open-low'] = feature_extraction['open'] - feature_extraction['low']
# feature_extraction['open-high'] = feature_extraction['open'] - feature_extraction['high']
# feature_extraction['qav/vol'] = feature_extraction['quote_asset_volume'] / feature_extraction['volume']
# feature_extraction['open+high-low'] = feature_extraction['open'] + feature_extraction['high'] - feature_extraction['low']

In [21]:
df_klines

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore
0,2025-04-16 11:46:00,1582.84,1583.70,1582.84,1583.37,2564.178,2025-04-16 11:46:59.999,4.059957e+06,2382,1724.484,2.730446e+06,0
1,2025-04-16 11:47:00,1583.36,1583.62,1582.00,1583.61,1674.046,2025-04-16 11:47:59.999,2.650027e+06,1781,999.279,1.581876e+06,0
2,2025-04-16 11:48:00,1583.61,1584.69,1583.40,1584.28,6501.569,2025-04-16 11:48:59.999,1.029886e+07,3208,3658.679,5.795335e+06,0
3,2025-04-16 11:49:00,1584.28,1584.97,1583.90,1584.76,3588.736,2025-04-16 11:49:59.999,5.686246e+06,2265,2077.232,3.291293e+06,0
4,2025-04-16 11:50:00,1584.75,1584.88,1583.67,1584.32,3213.558,2025-04-16 11:50:59.999,5.090945e+06,2423,1596.558,2.529233e+06,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1495,2025-04-17 12:41:00,1597.83,1598.38,1597.36,1597.98,1243.022,2025-04-17 12:41:59.999,1.986153e+06,1424,776.745,1.241129e+06,0
1496,2025-04-17 12:42:00,1597.98,1598.67,1597.65,1598.00,748.556,2025-04-17 12:42:59.999,1.196375e+06,1214,352.711,5.636928e+05,0
1497,2025-04-17 12:43:00,1598.01,1598.20,1597.49,1598.20,705.848,2025-04-17 12:43:59.999,1.127880e+06,831,267.632,4.276262e+05,0
1498,2025-04-17 12:44:00,1598.20,1598.77,1598.11,1598.11,961.798,2025-04-17 12:44:59.999,1.537512e+06,829,349.586,5.588121e+05,0


In [18]:
knn_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])
grid_params = {
    # 'rfr__n_estimators': [100,200,300],
    # 'rfr__criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    # 'rfr__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    # 'rfr__leaf_size': [10, 20, 30],
    # 'rfr__p': [1, 2,3],
    'ridge__alpha': [0.1, 1, 10],
    'ridge__fit_intercept': [True, False],
    # 'SVR__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    # 'SVR__C': [0.1, 1, 10],
}
grid_search = GridSearchCV(
    knn_pipe,
    param_grid=grid_params,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1
)
X_train, X_test, y_train, y_test = train_test_split(
    df_klines.drop(columns=['open_time', 'close_time', 'close']),
    df_klines['close'],
    test_size=0.2,
    random_state=42
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {mse}")

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters: {'ridge__alpha': 0.1, 'ridge__fit_intercept': True}
Best score: -0.48944119277782966
Test MSE: 0.4411456144044452


In [22]:
df_klines.drop(columns=['open_time', 'close_time', 'close'])

Unnamed: 0,open,high,low,volume,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore
0,1582.84,1583.70,1582.84,2564.178,4.059957e+06,2382,1724.484,2.730446e+06,0
1,1583.36,1583.62,1582.00,1674.046,2.650027e+06,1781,999.279,1.581876e+06,0
2,1583.61,1584.69,1583.40,6501.569,1.029886e+07,3208,3658.679,5.795335e+06,0
3,1584.28,1584.97,1583.90,3588.736,5.686246e+06,2265,2077.232,3.291293e+06,0
4,1584.75,1584.88,1583.67,3213.558,5.090945e+06,2423,1596.558,2.529233e+06,0
...,...,...,...,...,...,...,...,...,...
1495,1597.83,1598.38,1597.36,1243.022,1.986153e+06,1424,776.745,1.241129e+06,0
1496,1597.98,1598.67,1597.65,748.556,1.196375e+06,1214,352.711,5.636928e+05,0
1497,1598.01,1598.20,1597.49,705.848,1.127880e+06,831,267.632,4.276262e+05,0
1498,1598.20,1598.77,1598.11,961.798,1.537512e+06,829,349.586,5.588121e+05,0


In [19]:
from joblib import dump
from google.cloud import storage
import io

storage_client = storage.Client() # Assumes credentials are set up (e.g., via GOOGLE_APPLICATION_CREDENTIALS env var)
bucket = storage_client.bucket('ctrading')
blob = bucket.blob('regressors/ridge_model.joblib')
model_data = io.BytesIO()
dump(best_model, model_data)
model_data.seek(0)  # Move to the beginning of the BytesIO buffer
blob.upload_from_file(model_data, content_type='application/octet-stream')
print("Model uploaded to Google Cloud Storage.")


Model uploaded to Google Cloud Storage.


In [11]:
!gcloud auth login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=Gl7X5dI0IOqBz44fuXOmAKdheW1AdG&access_type=offline&code_challenge=Qvad7DQMMjnqfjtOLWdix-4MbkFNgz50MpErIyrYU6Q&code_challenge_method=S256


You are now logged in as [dadadee02@gmail.com].
Your current project is [future-linker-456622-f8].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


In [None]:
final_df['svr_feature'] = best_model.predict(
    final_df.drop(columns=['open_time', 'close_time', 'close'])
)
final_df.head()

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,svr_feature
700,2025-04-16 17:53:00,1561.0,1563.68,1560.22,1561.36,14754.187,2025-04-16 17:53:59.999,23046480.0,9733,9011.269,14077290.0,1562.748821
701,2025-04-16 17:54:00,1561.35,1563.0,1556.0,1556.74,10103.077,2025-04-16 17:54:59.999,15755040.0,8423,3821.342,5959264.0,1558.461218
702,2025-04-16 17:55:00,1556.73,1558.29,1554.42,1555.33,22672.146,2025-04-16 17:55:59.999,35288130.0,10763,11878.494,18489100.0,1556.423779
703,2025-04-16 17:56:00,1555.34,1556.16,1552.04,1554.02,14903.318,2025-04-16 17:56:59.999,23161520.0,10039,5772.219,8970947.0,1553.654729
704,2025-04-16 17:57:00,1554.02,1554.77,1540.0,1541.25,75509.766,2025-04-16 17:57:59.999,116826600.0,33019,22027.376,34091160.0,1543.072928


# Calc'd features

In [None]:
final_df['high-low'] = final_df['high'] - final_df['low']
final_df['open-low'] = final_df['open'] - final_df['low']
final_df['open-high'] = final_df['open'] - final_df['high']
final_df['qav/vol'] = final_df['quote_asset_volume'] / final_df['volume']
final_df['open+high-low'] = final_df['open'] + final_df['high'] - final_df['low']
# final_df['high/low'] = final_df['high'] / final_df['low']
# final_df['open/low'] = final_df['open'] / final_df['low']
# final_df['open/high'] = final_df['open'] / final_df['high']

In [None]:
final_df["sma_20"] = final_df['close'].rolling(window=20).mean()
final_df["sma_50"] = final_df['close'].rolling(window=50).mean()

# CAT FEATURES

In [None]:
final_df['high_gt_1std_open50'] = (final_df['high'] > (final_df['open'] + 0.5*final_df['open'].std())).astype(int)

In [None]:


X_train, X_test, y_train, y_test = train_test_split(
    final_df.drop(columns=['open_time', 'close_time', 'close']),
    final_df['close'],
    test_size=0.2,
    random_state=42
)

pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('model', XGBRegressor(objective='reg:squarederror'))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

feature_importances = pipeline.named_steps['model'].feature_importances_

importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print(importance_df)

RMSE: 1.480913926715554
                        Feature  Importance
1                          high    0.906478
2                           low    0.091405
9                        sma_50    0.000450
8                        sma_20    0.000366
5              number_of_trades    0.000284
6   taker_buy_base_asset_volume    0.000257
0                          open    0.000237
3                        volume    0.000208
7  taker_buy_quote_asset_volume    0.000163
4            quote_asset_volume    0.000154


In [None]:
csv_path = "feature_importance_runs.csv"

# Determine the run number
if os.path.exists(csv_path):
    old = pd.read_csv(csv_path)
    run = old["run"].max() + 1
else:
    run = 0

# Add run column
importance_df["run"] = run

# Append to CSV
importance_df.to_csv(csv_path, mode="a", header=not os.path.exists(csv_path), index=False)

In [None]:
feat_df = pd.read_csv(csv_path)

feat_df.groupby('Feature').mean().sort_values('Importance', ascending=False)

Unnamed: 0_level_0,Importance,run
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1
high,0.652073,7.5
svr_feature,0.329859,15.0
ridge_feature,0.315698,13.5
rfr_feature,0.237851,10.5
qav/vol,0.203988,9.818182
low,0.060064,7.5
open+high-low,0.047676,11.142857
knn_feature,0.013293,4.571429
open/low,0.000828,5.0
open-low,0.000388,9.666667


additional features: svr, ridge, rfr, qav/vol

would require a pre-training df to not have data leakage

In [None]:
funding_params = {
            "symbol": 'ETHUSDT',
            "limit": 1000
            # "startTime": int(start_time.timestamp() * 1000),
            # "endTime": int(end_time.timestamp() * 1000),
            # "interval": interval,
        }
client.futures_funding_rate(
    funding_params
            # time_col="fundingTime",
            # resample_rule=formatted_resample_rule,
            # cols_to_keep=["fundingRate"],
            # value_cast={"fundingRate": float},
)

TypeError: Client.futures_funding_rate() takes 1 positional argument but 2 were given

In [None]:
client.futures_funding_rate(
symbol='BTCUSDT',
limit=1
)

[{'symbol': 'BTCUSDT',
  'fundingTime': 1744848000001,
  'fundingRate': '0.00001404',
  'markPrice': '83994.18360000'}]

In [None]:
client.futures_open_interest_hist(
    symbol='BTCUSDT', limit=1, period='5m'
)

[{'symbol': 'BTCUSDT',
  'sumOpenInterest': '76322.68800000',
  'sumOpenInterestValue': '6421200325.12909800',
  'timestamp': 1744851300000}]

In [None]:
import pandas as pd
btc_df = pd.read_csv("data/BTCUSDT_1m_with_metrics.csv")
unique_vals = {col: btc_df[col].unique() for col in btc_df.columns}
for col, vals in unique_vals.items():
    print(f"{col}: {len(vals)}")

open_time          : 1043
 open                  : 1043
 high                 : 1043
 low                  : 1043
 close                 : 1043
 volume  : 1040
 close_time   : 1043
 quote_asset_volume: 1043
 number_of_trades: 920
 taker_buy_base_asset_volume: 1042
 taker_buy_quote_asset_volume: 1043
 symbol : 1
 fundingRate: 3
 cumulative_funding    : 1043
 sumOpenInterest: 209
 sumOpenInterestValue: 209
 oi_change              : 210
 oi_volume_ratio      : 1043
 liquidation_value: 1
 origQty: 1
 liquidation_intensity: 1
 sma_20               : 1043
 sma_50               : 1043
 sma_7            : 1037
 rsi               : 1043
 bb_upper             : 1043
 bb_lower              : 1043
 macd                 : 1043
 macd_signal         : 1043
 lowest_low           : 1043
 highest_high         : 1043
 stoch_k               : 1003
 stoch_d             : 1042
 atr                : 1032
 plus_di            : 1043
 minus_di           : 1043
 adx               : 1043
 timely_return          :

In [20]:
from joblib import load
storage_client = storage.Client() # Assumes credentials are set up
bucket = storage_client.bucket('ctrading')
blob = bucket.blob('regressors/ridge_model.joblib')
model_data = io.BytesIO()
blob.download_to_file(model_data)
model_data.seek(0)  # Move to the beginning of the BytesIO buffer
mod = load(model_data)
print(mod)

Pipeline(steps=[('scaler', StandardScaler()), ('ridge', Ridge(alpha=0.1))])
