# 0. Setting

## 0.1. Installation

In [None]:
# !pip install python-dotenv
# !pip install python-binance

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import matplotlib.pyplot as plt

from dotenv import load_dotenv
from datetime import datetime

In [None]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# A missing or empty variable falls back to an empty string.
api_key = os.environ.get("API_KEY") or ""
api_secret = os.environ.get("API_SECRET") or ""

## 0.2. Connecting API

In [None]:
import requests
import json

url = "https://api.binance.com"
api_call = "/api/v3/ticker/price"
headers = {"content-type": "application/json", "X-MBX-APIKEY": api_key}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# failure when the payload is turned into a DataFrame.
http_response = requests.get(url + api_call, headers=headers, timeout=10)
http_response.raise_for_status()

# `response` holds the decoded JSON payload; the next cell builds a DataFrame from it.
response = http_response.json()

# Preview only the first few entries rather than dumping the whole ticker list.
print(response[:5])

In [None]:
df = pd.DataFrame.from_records(response)
df

# 1. Binance API

```text
Documentation: https://developers.binance.com/docs/binance-spot-api-docs
```

In [None]:
class BinanceAPI:
    """Small client for a handful of Binance Spot REST market-data endpoints.

    ``api_key`` is only sent as the ``X-MBX-APIKEY`` header by ``get_n_symbol``;
    ``api_secret`` is stored but not used by any method of this class.
    """

    # Seconds to wait for a response before requests gives up.
    REQUEST_TIMEOUT = 10

    def __init__(self, api_key=None, api_secret=None):
        self.base_url = "https://api.binance.com"
        self.api_key = api_key
        self.api_secret = api_secret

    def _get(self, endpoint, params=None, headers=None):
        """GET ``base_url + endpoint`` and return the decoded JSON body.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        response = requests.get(
            self.base_url + endpoint,
            params=params,
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    # Historical candlestick (kline) data for the trading pair given by `symbol`.
    def get_klines(self, symbol, interval, limit=1000, start_time=None, end_time=None):
        """Return the raw kline rows for ``symbol`` at the given ``interval``.

        ``start_time`` / ``end_time`` are forwarded as ``startTime`` / ``endTime``
        only when they are not None, so a timestamp of 0 is still sent.
        """
        endpoint = "/api/v3/klines"
        params = {
            'symbol': symbol,
            'interval': interval,
            'limit': limit
        }

        if start_time is not None:
            params['startTime'] = start_time
        if end_time is not None:
            params['endTime'] = end_time

        return self._get(endpoint, params=params)

    def get_n_symbol(self, n):
        """Return the first ``n`` symbols of the price ticker as a pandas Series."""
        endpoint = "/api/v3/ticker/price"
        headers = {"content-type": "application/json", "X-MBX-APIKEY": self.api_key}
        tickers = self._get(endpoint, headers=headers)
        df = pd.DataFrame.from_records(tickers)
        # head(n) is positional; the previous label slice `.loc[:n]` is inclusive
        # on both ends and therefore returned n + 1 rows.
        return df["symbol"].head(n)

    def get_server_time(self, as_timestamp=False):
        """Return the server time.

        With ``as_timestamp=True`` the raw ``serverTime`` value (milliseconds) is
        returned; otherwise it is formatted as ``YYYY-MM-DD HH:MM:SS`` in the
        local timezone of the machine running this code.
        """
        endpoint = "/api/v3/time"
        ts = self._get(endpoint)["serverTime"]
        if as_timestamp:
            return ts
        time = datetime.fromtimestamp(ts / 1000)
        return time.strftime("%Y-%m-%d %H:%M:%S")

    # 24-hour price change statistics for the trading pair given by `symbol`.
    def get_24hr_ticker(self, symbol):
        """Return the decoded JSON of the 24hr ticker endpoint for ``symbol``."""
        endpoint = "/api/v3/ticker/24hr"
        params = {'symbol': symbol}
        return self._get(endpoint, params=params)

    # Current order book for the trading pair given by `symbol`.
    def get_orderbook(self, symbol, limit=100):
        """Return the decoded JSON of the depth endpoint for ``symbol``."""
        endpoint = "/api/v3/depth"
        params = {'symbol': symbol, 'limit': limit}
        return self._get(endpoint, params=params)

In [None]:
api = BinanceAPI(api_key, api_secret)
api

In [None]:
api.get_server_time()

In [None]:
api.get_n_symbol(10)

In [None]:
api.get_klines("ETHBTC", "1m", limit=5)

# 2. Collect Data

In [None]:
def collect_historical_data(api: "BinanceAPI", symbol, interval='1m', days=1, end_time=None,
                            fetch_all=False):
    """Download klines for ``symbol`` and return them as a typed DataFrame.

    Args:
        api: client exposing ``get_server_time(as_timestamp=True)`` and ``get_klines(...)``.
        symbol: trading pair, e.g. ``"BNBBTC"``.
        interval: kline interval string passed through to the API.
        days: length of the requested window, counted back from ``end_time``.
        end_time: window end in milliseconds; defaults to the server time.
        fetch_all: when False (default) a single request is made, so at most 1000
            candles (the oldest ones in the window) are returned even if the window
            holds more. When True, follow-up requests are issued until the window
            is exhausted.

    Returns:
        DataFrame with one row per candle; price/volume columns are numeric,
        ``timestamp`` and ``close_time`` are datetimes, and ``ignore`` is dropped.

    Raises:
        RuntimeError: if the API returns something other than a list of klines.
    """
    if end_time is None:
        end_time = int(api.get_server_time(as_timestamp=True))
    start_time = end_time - (days * 24 * 60 * 60 * 1000)

    page_size = 1000
    klines = []
    cursor = start_time
    while True:
        batch = api.get_klines(
            symbol=symbol,
            interval=interval,
            limit=page_size,
            start_time=cursor,
            end_time=end_time,
        )
        if not isinstance(batch, list):
            # Error payloads are not a list of rows; fail with the payload visible.
            raise RuntimeError(f"Unexpected klines response: {batch!r}")
        klines.extend(batch)
        if not fetch_all or len(batch) < page_size:
            break
        # Column 6 is close_time; resume right after the last candle received.
        cursor = batch[-1][6] + 1
        if cursor > end_time:
            break

    # Convert to DataFrame
    df = pd.DataFrame(klines, columns=[
        'timestamp', 'open', 'high', 'low', 'close', 'volume',
        'close_time', 'quote_asset_volume', 'number_of_trades',
        'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume', 'ignore'
    ])

    # Convert data types (the taker-buy volumes are included so every
    # price/volume column comes back numeric)
    numeric_columns = [
        'open', 'high', 'low', 'close', 'volume', 'quote_asset_volume',
        'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume',
    ]
    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col])

    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    df['close_time'] = pd.to_datetime(df['close_time'], unit='ms')

    return df.drop(columns=["ignore"])

In [None]:
api = BinanceAPI(api_key, api_secret)
symbol = "BNBBTC"

# 5-minute candles for a 4-day window ending at the current server time.
# NOTE(review): collect_historical_data issues one get_klines request, whose
# default limit is 1000 rows, so the frame may not span the whole window.
train_df = collect_historical_data(api, symbol, interval="5m", days=4)

# 1-minute candles for a 1-day window, keeping only candles that open after the
# newest training candle.
test_df = collect_historical_data(api, symbol, interval="1m", days=1, end_time=None)
# .copy() detaches the filtered rows so later column assignments on test_df do
# not operate on a slice of the unfiltered frame (SettingWithCopyWarning).
test_df = test_df[test_df['timestamp'] > train_df['timestamp'].max()].copy()

In [None]:
train_df

In [None]:
test_df

<b>Columns</b>

1) <b>open</b>: ราคา *แรกสุด* ที่มีการซื้อขายในช่วงเวลา t
2) <b>high</b>: ราคา *สูงสุด* ที่มีการซื้อขายในช่วงเวลา t
3) <b>low</b>: ราคา *ต่ำสุด* ที่มีการซื้อขายในช่วงเวลา t
4) <b>close</b>: ราคา *สุดท้าย* ที่มีการซื้อขายในช่วงเวลา t

``` 
4 Columns นี้มีการพิจารณาค่าตัวเลขเหมือนกัน เช่น 0.2389 คือ 1 ETH แลกได้ 0.2389 BTC 
```

---

5. <b>volume</b>: จำนวนเหรียญหลักรวมที่มีการซื้อขายในช่วงเวลา t
6. <b>quote_asset_volume</b>: จำนวนเหรียญคู่รวมที่มีการซื้อขายในช่วงเวลา t BTC รวม
7. <b>number_of_trades</b>: จำนวนครั้งที่มีการซื้อขายในช่วงเวลา t
8. <b>taker_buy_base_asset_volume</b>: จำนวนเหรียญหลักรวมที่มีการรีบซื้อในทันทีในช่วงเวลา t
9. <b>taker_buy_quote_asset_volume</b>: จำนวนเหรียญคู่รวมที่มีการรีบซื้อในทันทีในช่วงเวลา t

---

10. <b>timestamp</b>: เวลาเริ่มต้นของการซื้อขาย
11. <b>close_time</b>: เวลาสิ้นสุดของการซื้อขาย

---

In [None]:
train_df.info()

In [None]:
# Cast both taker-buy volume columns to float, then re-check the dtypes.
for taker_col in ("taker_buy_base_asset_volume", "taker_buy_quote_asset_volume"):
    train_df[taker_col] = train_df[taker_col].astype(float)

train_df.info()

In [None]:
test_df.info()

In [None]:
# Cast both taker-buy volume columns to float, then re-check the dtypes.
for taker_col in ("taker_buy_base_asset_volume", "taker_buy_quote_asset_volume"):
    test_df[taker_col] = test_df[taker_col].astype(float)

test_df.info()

# 3. Create Indicators

## 3.1. Moving Average

คำนวณค่าเฉลี่ยแบบเคลื่อนที่ทุกๆ n จุด แล้วดูแนวโน้มค่าเฉลี่ยเหล่านั้น

- Bullish: มีแนวโน้มว่าในอนาคต ราคาสูงขึ้น -> 1 ETH มีแนวโน้มจะได้ BTC มากขึ้น
    - ถ้าเรามี ETH อยู่ เราควรถือไว้ หรือซื้อ ETH เพิ่มเติม
    - ถ้าเรามี BTC อยู่ เราควรขายเพื่อซื้อ ETH
     
- Bearish: มีแนวโน้มว่าในอนาคต ราคาลดลง -> 1 ETH มีแนวโน้มจะได้ BTC น้อยลง
    - ถ้าเรามี ETH อยู่ เราควรขายเพื่อซื้อ BTC
    - ถ้าเรามี BTC อยู่ เราควรถือไว้ หรือซื้อ BTC เพิ่มเติม

In [None]:
def add_moving_average(df: pd.DataFrame):
    """Return a copy of ``df`` with moving-average features derived from ``close``.

    Added columns, in order: ``SMA_20/50/200``, ``EMA_12/26``, six 0/1 crossover
    flags, ``trend_strength`` (0-5), five relative price-distance columns and
    ``bullish_alignment``. Periods are counted in rows (candles), not days.
    """
    out = df.copy()
    close = out["close"]

    def crossed_above(fast, slow):
        # 1 where `fast` is above `slow` on this row but was at or below it one row earlier
        now_above = out[fast] > out[slow]
        was_not_above = out[fast].shift(1) <= out[slow].shift(1)
        return (now_above & was_not_above).astype(int)

    def crossed_below(fast, slow):
        # 1 where `fast` is below `slow` on this row but was at or above it one row earlier
        now_below = out[fast] < out[slow]
        was_not_below = out[fast].shift(1) >= out[slow].shift(1)
        return (now_below & was_not_below).astype(int)

    # Simple moving averages; min_periods=1 means early rows average what is available
    for window in (20, 50, 200):
        out[f'SMA_{window}'] = close.rolling(window, min_periods=1).mean()

    # Exponential moving averages
    for span in (12, 26):
        out[f'EMA_{span}'] = close.ewm(span=span, adjust=False).mean()

    # Golden / death cross: 50-period SMA vs 200-period SMA (long-term signal)
    out['golden_cross'] = crossed_above('SMA_50', 'SMA_200')
    out['death_cross'] = crossed_below('SMA_50', 'SMA_200')

    # Bullish / bearish cross: 20-period SMA vs 50-period SMA (short-term signal)
    out['bullish_cross'] = crossed_above('SMA_20', 'SMA_50')
    out['bearish_cross'] = crossed_below('SMA_20', 'SMA_50')

    # EMA cross: 12-period EMA vs 26-period EMA (momentum turning up / down)
    out['ema_bullish_cross'] = crossed_above('EMA_12', 'EMA_26')
    out['ema_bearish_cross'] = crossed_below('EMA_12', 'EMA_26')

    # Number of averages the close sits above: 0 = very bearish, 5 = very bullish
    averages = ('SMA_20', 'SMA_50', 'SMA_200', 'EMA_12', 'EMA_26')
    out['trend_strength'] = sum((close > out[name]).astype(int) for name in averages)

    # Relative distance of the close from each average (positive = above, negative = below)
    distance_columns = (
        ('price_sma20_dist', 'SMA_20'),
        ('price_sma50_dist', 'SMA_50'),
        ('price_sma200_dist', 'SMA_200'),
        ('price_ema12_dist', 'EMA_12'),
        ('price_ema26_dist', 'EMA_26'),
    )
    for column, average in distance_columns:
        out[column] = ((close - out[average]) / out[average]).fillna(0)

    # Bullish alignment: SMA_20 > SMA_50 > SMA_200
    short_over_mid = out['SMA_20'] > out['SMA_50']
    mid_over_long = out['SMA_50'] > out['SMA_200']
    out['bullish_alignment'] = (short_over_mid & mid_over_long).astype(int)

    return out

## 3.2. Relative Strength Index (RSI)

https://www.investopedia.com/terms/r/rsi.asp
- บ่งบอกความแรงของราคา (Momentum) ในช่วงเวลาที่กำหนด ซึ่งมักนิยมใช้ 14 วัน
- มีค่าอยู่ในช่วงระหว่าง 0 ถึง 100

พิจารณา ETHBTC
- RSI สูง (>70) = คนซื้อ ETH ด้วย BTC เยอะมาก 
    - ETH อาจแพงเกินไปที่จะซื้อตอนนี้ 
    - ตอนนี้เราควรขาย ETH และซื้อ BTC
- RSI ต่ำ (<30) = คนขาย ETH เพื่อซื้อ BTC เยอะมาก 
    - ETH อาจถูกเกินไปที่จะขายตอนนี้ 
    - ตอนนี้เราควรซื้อ ETH และขาย BTC
- RSI กลางๆ (~50) = การซื้อขาย ETH/BTC ปกติดี  
    - ตอนนี้ควรรอดูสถานการณ์ก่อน

In [None]:
def add_rsi(df:pd.DataFrame, period=14):
    """Return a copy of ``df`` with an ``rsi`` column computed from ``close``.

    Row-to-row changes are split into gains and losses, each smoothed with an
    exponential mean using ``alpha = 1 / period``; RSI is ``100 - 100 / (1 + RS)``
    with ``RS = avg gain / avg loss``. Rows where both averages are zero (e.g. the
    first row) end up as NaN.
    """
    out = df.copy()

    change = out['close'].diff()

    # Keep only upward moves / only the magnitude of downward moves; everything
    # else (including the leading NaN) becomes 0.
    up_moves = change.where(change > 0, 0)
    down_moves = -change.where(change < 0, 0)

    smoothing = dict(alpha=1/period, adjust=False)
    avg_up = up_moves.ewm(**smoothing).mean()
    avg_down = down_moves.ewm(**smoothing).mean()

    relative_strength = avg_up / avg_down
    out['rsi'] = 100 - (100 / (1 + relative_strength))

    return out

## 3.3. MACD

- ใช้ดูแนวโน้ม (trend) และโมเมนตัม (momentum) ของราคา 
- ดูจากความแตกต่างของค่าเฉลี่ยเคลื่อนที่แบบ EMA สองเส้น (fast: EMA12 ลบกับ slow: EMA26)

พิจารณา ETHBTC
- MACD > 0 แสดงว่าราคากำลังขึ้น หรือก็คือมีแนวโน้มว่าในอนาคต ETH จะมีราคาสูงขึ้นเมื่อเทียบกับ BTC
    - ถ้าเราถือ ETH เราควรถือไว้ รอขายในอนาคต
    - ถ้าเราถือ BTC เราควรขายเพื่อซื้อ ETH
- MACD < 0 แสดงว่าราคากำลังลง หรือก็คือมีแนวโน้มว่าในอนาคต ETH จะมีราคาต่ำลงเมื่อเทียบกับ BTC
    - ถ้าเราถือ ETH เราควรขายเพื่อซื้อ BTC
    - ถ้าเราถือ BTC เราควรถือไว้ รอขายในอนาคต

** ยิ่งค่าห่างจาก 0 ยิ่งมีแนวโน้มที่จะไปทางนั้นๆ สูง

- จังหวะที่ควรซื้อ หรือขาย คือจังหวะที่เส้นของ MACD ตัดกับเส้น MACD_Signal
    - MACD ตัดแล้วขึ้นสูงกว่า MACD_Signal: เป็นช่วงราคากำลังขึ้น
    - MACD ตัดแล้วต่ำกว่า MACD_Signal: เป็นช่วงราคากำลังลง

In [None]:
def add_macd(df, fast=12, slow=26, signal=9):
    """Return a copy of ``df`` with MACD columns computed from ``close``.

    Adds ``ema_fast`` and ``ema_slow`` (EMAs with spans ``fast`` / ``slow``),
    ``macd`` (their difference) and ``macd_signal`` (EMA of ``macd`` with span
    ``signal``). The input frame is left untouched, matching
    ``add_moving_average`` and ``add_rsi``.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    df = df.copy()

    # Calculate EMA fast and slow
    df['ema_fast'] = df['close'].ewm(span=fast, adjust=False).mean()
    df['ema_slow'] = df['close'].ewm(span=slow, adjust=False).mean()

    # MACD line
    df['macd'] = df['ema_fast'] - df['ema_slow']

    # Signal line
    df['macd_signal'] = df['macd'].ewm(span=signal, adjust=False).mean()

    return df

## 3.4. Bollinger Bands

- ประกอบด้วย 3 เส้นหลัก:

1. เส้นกลาง (Middle Band) : SMA ของ close หมายถึงแนวโน้มราคากลาง ๆ ในช่วงเวลาที่กำหนด

2. เส้นบน (Upper Band): SMA + 2SD บ่งบอกขอบเขตราคาที่ "สูงกว่าปกติ" หรือเป็นระดับแนวต้าน

3. เส้นล่าง (Lower Band): SMA - 2SD บ่งบอกขอบเขตราคาที่ "ต่ำกว่าปกติ" หรือเป็นระดับแนวรับ

พิจารณา ETHBTC
- มี 4 Case ที่เกิดขึ้นได้
    1. Close Price ชิด Upper Band: ราคาสูงเกินไปแล้ว อาจมีโอกาสราคาตกลงในเร็ว ๆ นี้
        - ถ้าเราถือ ETH ควรขายเพื่อซื้อ BTC
        - ถ้าเราถือ BTC ควรถือไว้ รอขายในอนาคต
    2. Close Price ชิด Lower Band: ราคาต่ำเกินไปแล้ว อาจมีโอกาสราคาขึ้นในเร็ว ๆ นี้
        - ถ้าเราถือ ETH ควรถือไว้ รอขายในอนาคต
        - ถ้าเราถือ BTC ควรขายเพื่อซื้อ ETH
    3. Upper Band กับ Lower Band เข้ามาชิดกัน: ความผันผวนต่ำ เตรียมเคลื่อนไหว
        - ถ้า Close Price สูงกว่า Upper Band อาจเป็นสัญญาณซื้อ ETH ขาย BTC
        - ถ้า Close Price ต่ำกว่า Lower Band อาจเป็นสัญญาณขาย ETH ซื้อ BTC
    4. Upper Band กับ Lower Band ห่างออกจากกัน: ความผันผวนสูง
        - ถ้า Close Price สูงขึ้น และอยู่ใกล้ Upper Band ควรถือ ETH ต่อเนื่อง หรือซื้อเพิ่ม
        - ถ้า Close Price ลดลง และอยู่ใกล้ Lower Band → ควรขาย ETH ซื้อ BTC

In [None]:
def add_bollinger(df, period=20, std_dev=2):
    """Return a copy of ``df`` with Bollinger Band columns computed from ``close``.

    Adds ``bb_middle`` (rolling mean over ``period`` rows), ``bb_std`` (rolling
    standard deviation) and ``bb_upper`` / ``bb_lower`` (middle plus/minus
    ``std_dev`` standard deviations). The first ``period - 1`` rows are NaN
    because the rolling window is not yet full. The input frame is left untouched.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    df = df.copy()

    rolling_close = df['close'].rolling(window=period)
    df['bb_middle'] = rolling_close.mean()
    df['bb_std'] = rolling_close.std()

    df['bb_upper'] = df['bb_middle'] + std_dev * df['bb_std']
    df['bb_lower'] = df['bb_middle'] - std_dev * df['bb_std']

    return df

In [None]:
def add_all_indicators(df) :
    """Run ``df`` through every indicator step and return the enriched frame.

    Steps are applied in order: moving averages, RSI, MACD, Bollinger Bands.
    """
    indicator_steps = (add_moving_average, add_rsi, add_macd, add_bollinger)
    for step in indicator_steps:
        df = step(df)
    return df

# 4. Create Model

## EDA

In [None]:
# Enrich both frames with the indicators and index them by candle open time.
ind_train_df = add_all_indicators(train_df).set_index("timestamp")

ind_test_df = add_all_indicators(test_df).set_index("timestamp")

In [None]:
ind_train_df

In [None]:
ind_test_df

In [None]:
ind_train_df.info()

In [None]:
ind_train_df.isna().sum()

In [None]:
print("bb_middle\n", ind_train_df["bb_middle"])
print("bb_std\n", ind_train_df["bb_std"])
print("bb_upper\n", ind_train_df["bb_upper"])
print("bb_lower\n", ind_train_df["bb_lower"])
print("rsi\n", ind_train_df["rsi"])

In [None]:
ind_train_df.dropna(subset=['bb_lower', 'bb_upper', 'bb_middle', 'bb_std', 'rsi'], inplace=True)

In [None]:
ind_train_df.isna().sum()

In [None]:
# Label each row by the next row's close: -1 if it is lower, otherwise 1.
# NOTE(review): the last row has no next close (shift(-1) is NaN), so the
# comparison is False and it is labelled 1 — confirm whether it should be dropped.
next_close = ind_train_df['close'].shift(-1)
price_falls = next_close < ind_train_df['close']
ind_train_df['price_direction'] = np.where(price_falls, -1, 1)
ind_train_df['price_direction'].value_counts(normalize=True)

- -1 คือ ราคาปิดในอนาคต ***น้อยกว่า*** ราคาปิดปัจจุบัน -> ในตอนนี้ควรขาย
- 1 คือ ราคาปิดในอนาคต ***มากกว่าหรือเท่ากับ*** ราคาปิดปัจจุบัน -> ในตอนนี้ควรซื้อ

In [None]:
ind_test_df.info()

In [None]:
ind_test_df.isna().sum()

In [None]:
print("bb_middle\n", ind_test_df["bb_middle"])
print("bb_std\n", ind_test_df["bb_std"])
print("bb_upper\n", ind_test_df["bb_upper"])
print("bb_lower\n", ind_test_df["bb_lower"])
print("rsi\n", ind_test_df["rsi"])

In [None]:
ind_test_df.dropna(subset=['bb_lower', 'bb_upper', 'bb_middle', 'bb_std', 'rsi'], inplace=True)

In [None]:
ind_test_df.isna().sum()

In [None]:
# Label each row by the next row's close: -1 if it is lower, otherwise 1.
# NOTE(review): the last row has no next close (shift(-1) is NaN), so the
# comparison is False and it is labelled 1 — confirm whether it should be dropped.
next_close = ind_test_df['close'].shift(-1)
price_falls = next_close < ind_test_df['close']
ind_test_df['price_direction'] = np.where(price_falls, -1, 1)
ind_test_df['price_direction'].value_counts(normalize=True)

- -1 คือ ราคาปิดในอนาคต ***น้อยกว่า*** ราคาปิดปัจจุบัน -> ในตอนนี้ควรขาย
- 1 คือ ราคาปิดในอนาคต ***มากกว่าหรือเท่ากับ*** ราคาปิดปัจจุบัน -> ในตอนนี้ควรซื้อ

## Preparing to Train Model

In [None]:
ind_train_df.columns

In [None]:
ind_train_df['price_direction'].value_counts(normalize=True)

In [None]:
random_state = 2025
test_size = 0.25

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE

# Model inputs, grouped by indicator family (order matters to the helpers that
# pair this list with model coefficients / importances).
features = [
    # moving averages
    *(f'SMA_{window}' for window in (20, 50, 200)),
    *(f'EMA_{span}' for span in (12, 26)),
    # momentum
    'rsi',
    'macd', 'macd_signal',
    # Bollinger Bands
    'bb_upper', 'bb_middle', 'bb_lower',
    # derived trend features
    'trend_strength',
    *(f'price_{average}_dist' for average in ('sma20', 'sma50', 'sma200', 'ema12', 'ema26')),
    'bullish_alignment',
]

# Label column: -1 (next close lower) or 1 (next close higher or equal).
target = 'price_direction'

In [None]:
def model_evaluation(model, features, target, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Labels are reported in the -1 (sell) / 1 (buy) convention. Models trained on
    0/1-encoded labels are detected and mapped back (0 -> -1, 1 -> 1).

    Args:
        model: fitted estimator with ``predict`` and either ``predict_proba`` or
            ``decision_function``.
        features, target: passed through unchanged into the result dict.
        X_test, y_test: held-out inputs and true labels.

    Returns:
        dict with the model, features, target, accuracy, per-class
        precision/recall/F1 (``*_sell`` for -1, ``*_buy`` for 1) and ``roc_auc``
        (None when it cannot be computed).
    """
    y_pred = model.predict(X_test)

    # Decide the encoding from the true labels AND the predictions. Looking at
    # y_test alone misfires when a -1/1 test set happens to contain only 1s:
    # it would look 0/1-encoded and every -1 prediction would be rewritten to 1.
    seen_labels = set(np.unique(y_test)) | set(np.unique(y_pred))
    if seen_labels.issubset({0, 1}):
        y_pred = np.where(y_pred == 0, -1, 1)
        y_test = np.where(y_test == 0, -1, 1)

    accuracy = accuracy_score(y_test, y_pred)

    # average=None with labels=[-1, 1] yields [sell_score, buy_score]
    precision_sell, precision_buy = precision_score(y_test, y_pred, average=None, labels=[-1, 1])
    recall_sell, recall_buy = recall_score(y_test, y_pred, average=None, labels=[-1, 1])
    f1_sell, f1_buy = f1_score(y_test, y_pred, average=None, labels=[-1, 1])

    try:
        if hasattr(model, "predict_proba"):
            # column 1 is the probability of the larger class label (buy)
            y_score = model.predict_proba(X_test)[:, 1]
        else:
            y_score = model.decision_function(X_test)
        roc_auc = roc_auc_score(y_test, y_score)
    except (ValueError, AttributeError):
        # e.g. only one class present in y_test, or the model exposes no score
        # function; ROC AUC is then reported as missing instead of failing.
        roc_auc = None

    return {
        "model": model,
        "features": features,
        "target": target,
        "accuracy": accuracy,
        "precision_sell": precision_sell,
        "precision_buy": precision_buy,
        "recall_sell": recall_sell,
        "recall_buy": recall_buy,
        "f1_sell": f1_sell,
        "f1_buy": f1_buy,
        "roc_auc": roc_auc
    }
    

In [None]:
def show_model_results(lr_evaluation, rf_evaluation, xgb_evaluation, knn_evaluation, svm_evaluation):
    """Collect the five evaluation dicts into one comparison table.

    Returns:
        (results_df, features_by_model): a DataFrame indexed by model label
        (LR, RF, XGB, KNN, SVM) with one column per metric, and a dict mapping
        each label to the ``features`` entry of its evaluation.
    """
    evaluations = {
        "LR": lr_evaluation,
        "RF": rf_evaluation,
        "XGB": xgb_evaluation,
        "KNN": knn_evaluation,
        "SVM": svm_evaluation,
    }
    metric_names = (
        "accuracy",
        "precision_sell", "precision_buy",
        "recall_sell", "recall_buy",
        "f1_sell", "f1_buy",
        "roc_auc",
    )

    # One column per metric, rows in the same order as `evaluations`
    table = {"model": list(evaluations)}
    for metric in metric_names:
        table[metric] = [evaluation[metric] for evaluation in evaluations.values()]

    results_df = pd.DataFrame(table).set_index("model")
    features_by_model = {
        label: evaluation["features"] for label, evaluation in evaluations.items()
    }
    return results_df, features_by_model

## Model Training V1

### Logistic Regression

In [None]:
def lr_model_v1(df):
    """Train a default LogisticRegression and evaluate it on the most recent rows.

    Reads the module-level ``features``, ``target``, ``test_size`` and
    ``random_state``. Returns ``(evaluation_dict, fitted_model)``.
    """
    X = df[features]
    y = df[target]

    # 1. Train/Test split: chronological (no shuffling), last `test_size` fraction held out
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                        shuffle=False)

    # 2. Feature Scaling: fit on the training rows only; fitting on the full frame
    #    would leak the held-out rows' mean/variance into training
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = LogisticRegression(random_state=random_state)
    model.fit(X_train, y_train)

    # 3. Scoring
    lr_evaluation = model_evaluation(model, features, target, X_test, y_test)
    return lr_evaluation, model

### Random Forest Classification

In [None]:
def rf_model_v1(df):
    """Train a default RandomForestClassifier and evaluate it on the most recent rows.

    Reads the module-level ``features``, ``target``, ``test_size`` and
    ``random_state``. Returns ``(evaluation_dict, fitted_model)``.
    """
    X = df[features]
    y = df[target]

    # 1. Train/Test split: chronological (no shuffling), last `test_size` fraction held out
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                        shuffle=False)

    # 2. Feature Scaling: fit on the training rows only; fitting on the full frame
    #    would leak the held-out rows' mean/variance into training
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)

    # 3. Scoring
    rf_evaluation = model_evaluation(model, features, target, X_test, y_test)
    return rf_evaluation, model

### XGBoost

In [None]:
def xgb_model_v1(df):
    """Train a default XGBClassifier and evaluate it on the most recent rows.

    The -1/1 target is re-encoded to 0/1 before training. Reads the module-level
    ``features``, ``target``, ``test_size`` and ``random_state``. Returns
    ``(evaluation_dict, fitted_model)``.
    """
    X = df[features]
    y = df[target].map({-1: 0, 1: 1})

    # 1. Train/Test split: chronological (no shuffling), last `test_size` fraction held out
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                        shuffle=False)

    # 2. Feature Scaling: fit on the training rows only; fitting on the full frame
    #    would leak the held-out rows' mean/variance into training
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = XGBClassifier(random_state=random_state)
    model.fit(X_train, y_train)

    # 3. Scoring
    xgb_evaluation = model_evaluation(model, features, target, X_test, y_test)
    return xgb_evaluation, model

### K-Nearest Neighbors

In [None]:
def knn_model_v1(df):
    """Train a default KNeighborsClassifier and evaluate it on the most recent rows.

    Reads the module-level ``features``, ``target`` and ``test_size``. Returns
    ``(evaluation_dict, fitted_model)``.
    """
    X = df[features]
    y = df[target]

    # 1. Train/Test split: chronological (no shuffling), last `test_size` fraction held out
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                        shuffle=False)

    # 2. Feature Scaling: fit on the training rows only; fitting on the full frame
    #    would leak the held-out rows' mean/variance into training
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = KNeighborsClassifier()
    model.fit(X_train, y_train)

    # 3. Scoring
    knn_evaluation = model_evaluation(model, features, target, X_test, y_test)
    return knn_evaluation, model

### Support Vector Machine

In [None]:
def svm_model_v1(df):
    """Train a default SVC and evaluate it on the most recent rows.

    Reads the module-level ``features``, ``target``, ``test_size`` and
    ``random_state``. Returns ``(evaluation_dict, fitted_model)``.
    """
    X = df[features]
    y = df[target]

    # 1. Train/Test split: chronological (no shuffling), last `test_size` fraction held out
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                        shuffle=False)

    # 2. Feature Scaling: fit on the training rows only; fitting on the full frame
    #    would leak the held-out rows' mean/variance into training
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = SVC(random_state=random_state)
    model.fit(X_train, y_train)

    # 3. Scoring
    svm_evaluation = model_evaluation(model, features, target, X_test, y_test)
    return svm_evaluation, model

## Model Summary V1

In [None]:
# Train and evaluate every V1 baseline on the same indicator frame, in the
# order LR, RF, XGB, KNN, SVM.
v1_trainers = (lr_model_v1, rf_model_v1, xgb_model_v1, knn_model_v1, svm_model_v1)
(
    (lr_evaluation, lr_model),
    (rf_evaluation, rf_model),
    (xgb_evaluation, xgb_model),
    (knn_evaluation, knn_model),
    (svm_evaluation, svm_model),
) = [trainer(ind_train_df) for trainer in v1_trainers]

results_df_v1, result_features_v1 = show_model_results(
    lr_evaluation, rf_evaluation, xgb_evaluation, knn_evaluation, svm_evaluation
)

results_df_v1

## Model Training V2


- เพิ่ม Hyperparameter Tuning 
- เพิ่ม SMOTE

In [None]:
from collections import Counter

# Same feature set as V1, restated so this section can be tuned independently.
sma_features = ['SMA_20', 'SMA_50', 'SMA_200']
ema_features = ['EMA_12', 'EMA_26']
oscillator_features = ['rsi', 'macd', 'macd_signal']
band_features = ['bb_upper', 'bb_middle', 'bb_lower']
trend_features = [
    'trend_strength',
    'price_sma20_dist', 'price_sma50_dist', 'price_sma200_dist',
    'price_ema12_dist', 'price_ema26_dist',
    'bullish_alignment',
]
features = sma_features + ema_features + oscillator_features + band_features + trend_features

# Label column: -1 (next close lower) or 1 (next close higher or equal).
target = 'price_direction'

### Logistic Regression

In [None]:
def lr_model_v2(df):
    """Tune, train and evaluate a LogisticRegression on SMOTE-balanced training data.

    Reads the module-level ``features``, ``target``, ``test_size`` and
    ``random_state``. The last ``test_size`` fraction of rows (no shuffling) is
    held out. Returns ``(evaluation_dict, fitted_best_model)``.
    """
    X = df[features]
    y = df[target]

    # 1. Train/Test split first, so nothing below sees the held-out rows
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=False
    )

    # 2. Feature Scaling: fit on the training rows only; fitting on the full frame
    #    would leak the held-out rows' mean/variance into training
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # 3. Balance Data (SMOTE with dynamic k_neighbors)
    counter = Counter(y_train)
    min_class_count = min(counter.values())
    if min_class_count <= 1:
        # too few minority samples to interpolate between; train on the data as-is
        X_train_res, y_train_res = X_train, y_train
    else:
        k_neighbors = min(5, min_class_count - 1)
        sm = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
        X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

    # 4. Hyperparameter Tuning
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear'],
        'class_weight': ['balanced']
    }
    grid = GridSearchCV(
        LogisticRegression(max_iter=1000, random_state=random_state),
        param_grid, scoring='accuracy'
    )
    grid.fit(X_train_res, y_train_res)

    # 5. GridSearchCV refits the best configuration on the whole resampled
    #    training set (refit=True by default), so no second fit is needed
    model = grid.best_estimator_

    # 6. Scoring
    lr_evaluation = model_evaluation(model, features, target, X_test, y_test)
    return lr_evaluation, model

### Random Forest Classification

In [None]:
def rf_model_v2(df):
    """Tune, train and evaluate a RandomForestClassifier on SMOTE-balanced training data.

    Reads the module-level ``features``, ``target``, ``test_size`` and
    ``random_state``. The last ``test_size`` fraction of rows (no shuffling) is
    held out. Returns ``(evaluation_dict, fitted_best_model)``.
    """
    X = df[features]
    y = df[target]

    # 1. Train/Test split first, so nothing below sees the held-out rows
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=False
    )

    # 2. Feature Scaling: fit on the training rows only; fitting on the full frame
    #    would leak the held-out rows' mean/variance into training
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # 3. Balance Data (SMOTE with dynamic k_neighbors)
    counter = Counter(y_train)
    min_class_count = min(counter.values())
    if min_class_count <= 1:
        # too few minority samples to interpolate between; train on the data as-is
        X_train_res, y_train_res = X_train, y_train
    else:
        k_neighbors = min(5, min_class_count - 1)
        sm = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
        X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

    # 4. Hyperparameter Tuning
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [4, 8, 16, None],
        'min_samples_split': [2, 5, 10],
        'class_weight': ['balanced']
    }

    grid = GridSearchCV(RandomForestClassifier(random_state=random_state), param_grid,
                        scoring='accuracy')
    grid.fit(X_train_res, y_train_res)

    # 5. GridSearchCV refits the best configuration on the whole resampled
    #    training set (refit=True by default), so no second fit is needed
    model = grid.best_estimator_

    # 6. Scoring
    rf_evaluation = model_evaluation(model, features, target, X_test, y_test)
    return rf_evaluation, model

### XGBoost

In [None]:
def xgb_model_v2(df):
    """Tune, train and evaluate an XGBClassifier on SMOTE-balanced training data.

    The -1/1 target is re-encoded to 0/1 before training. Reads the module-level
    ``features``, ``target``, ``test_size`` and ``random_state``. The last
    ``test_size`` fraction of rows (no shuffling) is held out. Returns
    ``(evaluation_dict, fitted_best_model)``.
    """
    X = df[features]
    y = df[target].map({-1: 0, 1: 1})

    # 1. Train/Test split first, so nothing below sees the held-out rows
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=False
    )

    # 2. Feature Scaling: fit on the training rows only; fitting on the full frame
    #    would leak the held-out rows' mean/variance into training
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # 3. Balance Data (SMOTE with dynamic k_neighbors)
    counter = Counter(y_train)
    min_class_count = min(counter.values())
    if min_class_count <= 1:
        # too few minority samples to interpolate between; train on the data as-is
        X_train_res, y_train_res = X_train, y_train
    else:
        k_neighbors = min(5, min_class_count - 1)
        sm = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
        X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

    # 4. Hyperparameter Tuning
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 6, 10],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1.0],
    }

    grid = GridSearchCV(XGBClassifier(eval_metric='logloss', random_state=random_state),
        param_grid, scoring='accuracy'
    )
    grid.fit(X_train_res, y_train_res)

    # 5. GridSearchCV refits the best configuration on the whole resampled
    #    training set (refit=True by default), so no second fit is needed. This
    #    also avoids passing the deprecated `use_label_encoder` argument.
    model = grid.best_estimator_

    # 6. Scoring
    xgb_evaluation = model_evaluation(model, features, target, X_test, y_test)
    return xgb_evaluation, model

### K-Nearest Neighbors

In [None]:
def knn_model_v2(df):
    """Tune, train and evaluate a KNeighborsClassifier on SMOTE-balanced training data.

    Reads the module-level ``features``, ``target``, ``test_size`` and
    ``random_state``. The last ``test_size`` fraction of rows (no shuffling) is
    held out. Returns ``(evaluation_dict, fitted_best_model)``.
    """
    X = df[features]
    y = df[target]

    # 1. Train/Test split first, so nothing below sees the held-out rows
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=False
    )

    # 2. Feature Scaling: fit on the training rows only; fitting on the full frame
    #    would leak the held-out rows' mean/variance into training
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # 3. Balance Data (SMOTE with dynamic k_neighbors)
    counter = Counter(y_train)
    min_class_count = min(counter.values())
    if min_class_count <= 1:
        # too few minority samples to interpolate between; train on the data as-is
        X_train_res, y_train_res = X_train, y_train
    else:
        k_neighbors = min(5, min_class_count - 1)
        sm = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
        X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

    # 4. Hyperparameter Tuning
    param_grid = {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    }
    grid = GridSearchCV(KNeighborsClassifier(), param_grid, scoring='accuracy')
    grid.fit(X_train_res, y_train_res)

    # 5. GridSearchCV refits the best configuration on the whole resampled
    #    training set (refit=True by default), so no second fit is needed
    model = grid.best_estimator_

    # 6. Scoring
    knn_evaluation = model_evaluation(model, features, target, X_test, y_test)
    return knn_evaluation, model

### Support Vector Machine

In [None]:
def svm_model_v2(df):
    """Tune, train and evaluate an SVC on SMOTE-balanced training data.

    Reads the module-level ``features``, ``target``, ``test_size`` and
    ``random_state``. The last ``test_size`` fraction of rows (no shuffling) is
    held out. Returns ``(evaluation_dict, fitted_best_model)``.
    """
    X = df[features]
    y = df[target]

    # 1. Train/Test split first, so nothing below sees the held-out rows
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=False
    )

    # 2. Feature Scaling: fit on the training rows only; fitting on the full frame
    #    would leak the held-out rows' mean/variance into training
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # 3. Balance Data (SMOTE with dynamic k_neighbors)
    counter = Counter(y_train)
    min_class_count = min(counter.values())
    if min_class_count <= 1:
        # too few minority samples to interpolate between; train on the data as-is
        X_train_res, y_train_res = X_train, y_train
    else:
        k_neighbors = min(5, min_class_count - 1)
        sm = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
        X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

    # 4. Hyperparameter Tuning
    param_grid = {
        'C': [1, 10, 100],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto'],
        'class_weight': ['balanced']
    }
    # random_state is set on the searched estimator so the refitted best model
    # carries the same setting the final model used to be built with
    grid = GridSearchCV(SVC(random_state=random_state), param_grid, scoring='accuracy')
    grid.fit(X_train_res, y_train_res)

    # 5. GridSearchCV refits the best configuration on the whole resampled
    #    training set (refit=True by default), so no second fit is needed
    model = grid.best_estimator_

    # 6. Scoring
    svm_evaluation = model_evaluation(model, features, target, X_test, y_test)
    return svm_evaluation, model

## Model Result V2

In [None]:
# Train and evaluate every V2 model on the same indicator frame, in the order
# LR, RF, XGB, KNN, SVM.
v2_trainers = (lr_model_v2, rf_model_v2, xgb_model_v2, knn_model_v2, svm_model_v2)
(
    (lr_evaluation_v2, lr_model2),
    (rf_evaluation_v2, rf_model2),
    (xgb_evaluation_v2, xgb_model2),
    (knn_evaluation_v2, knn_model2),
    (svm_evaluation_v2, svm_model2),
) = [trainer(ind_train_df) for trainer in v2_trainers]

results_df_v2, result_features_v2 = show_model_results(
    lr_evaluation_v2, rf_evaluation_v2, xgb_evaluation_v2,
    knn_evaluation_v2, svm_evaluation_v2,
)

results_df_v2

## Model Training V3

- เพิ่มการทำ Dynamic Features Selection
- เพิ่ม Class Weight เพิ่มโอกาสการทาย class -1 เพราะว่าใน Data class 1 มีสัดส่วนมาก เมื่อเทียบกับ class -1

In [None]:
# Candidate feature pool for V3; the feature-selection helpers below index model
# coefficients / importances by this list, so its order must match training.
features = [
    'SMA_20',
    'SMA_50',
    'SMA_200',
    'EMA_12',
    'EMA_26',
    'rsi',
    'macd',
    'macd_signal',
    'bb_upper',
    'bb_middle',
    'bb_lower',
    'trend_strength',
    'price_sma20_dist',
    'price_sma50_dist',
    'price_sma200_dist',
    'price_ema12_dist',
    'price_ema26_dist',
    'bullish_alignment',
]

# Label column: -1 (next close lower) or 1 (next close higher or equal).
target = 'price_direction'

In [None]:
def get_top_features_by_coef(model, top_n=10, feature_names=None):
    """Return the names of the `top_n` features with the largest |coefficient|.

    Args:
        model: Fitted estimator exposing a 2-D `coef_`; only its first row is used.
        top_n: Maximum number of feature names to return.
        feature_names: Names matching the columns the model was fitted on, in
            the same order. Defaults to the notebook-level `features` list.

    Returns:
        list[str]: Feature names ordered from largest to smallest absolute
        coefficient (at most `top_n` entries).

    Raises:
        ValueError: If the number of names differs from the number of coefficients.
    """
    names = features if feature_names is None else feature_names
    coef_abs = np.abs(model.coef_[0])
    feature_importance = pd.Series(coef_abs, index=names)
    top_feature_importance = feature_importance.sort_values(ascending=False).head(top_n)
    return top_feature_importance.index.tolist()

In [None]:
def get_top_features_by_importance(model, top_n=10, feature_names=None):
    """Return the names of the `top_n` features with the highest importance.

    Args:
        model: Fitted estimator exposing `feature_importances_`.
        top_n: Maximum number of feature names to return.
        feature_names: Names matching the columns the model was fitted on, in
            the same order. Defaults to the notebook-level `features` list.

    Returns:
        list[str]: Feature names ordered from most to least important
        (at most `top_n` entries).

    Raises:
        ValueError: If the number of names differs from the number of importances.
    """
    names = features if feature_names is None else feature_names
    importances = model.feature_importances_
    feature_importance = pd.Series(importances, index=names)
    top_feature_importance = feature_importance.sort_values(ascending=False).head(top_n)
    return top_feature_importance.index.tolist()

### Logistic Regression

In [None]:
def lr_model_v3(df):
    """Train the V3 logistic-regression classifier with dynamic feature selection.

    Steps: chronological train/test split -> standardise -> SMOTE on the
    training part -> rank features with a first fit on every feature -> repeat
    the preparation on the top-ranked features -> grid-search C / solver /
    class_weight -> refit the best configuration.

    Relies on notebook-level names: `features`, `target`, `test_size`,
    `random_state`, `get_top_features_by_coef` and `model_evaluation`.

    Args:
        df: DataFrame holding every column in `features` plus the `target` column.

    Returns:
        tuple: (result of `model_evaluation` on the held-out rows, fitted
        LogisticRegression). The model is fitted on standardised values of the
        selected features.
    """
    y = df[target]

    def _prepare(columns):
        """Split chronologically, scale with train-only statistics, SMOTE the train part."""
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            df[columns], y, test_size=test_size, shuffle=False
        )
        # Fit the scaler on the training rows only: fitting it on the whole
        # frame would leak statistics of the held-out rows into training.
        scaler = StandardScaler().fit(X_train_raw)
        X_train = scaler.transform(X_train_raw)
        X_test = scaler.transform(X_test_raw)

        # k_neighbors is capped at (minority size - 1); with a single minority
        # sample the data is left unbalanced.
        minority_count = min(Counter(y_train).values())
        if minority_count > 1:
            smote = SMOTE(random_state=random_state, k_neighbors=min(5, minority_count - 1))
            X_train, y_train = smote.fit_resample(X_train, y_train)
        return X_train, X_test, y_train, y_test

    # 1. Rank features with a model fitted on every candidate feature.
    X_train_res, _, y_train_res, _ = _prepare(features)
    model_all = LogisticRegression(max_iter=1000, random_state=random_state, class_weight='balanced')
    model_all.fit(X_train_res, y_train_res)
    top_features = get_top_features_by_coef(model_all)

    # 2. Rebuild the train/test matrices with the selected features only.
    X_train_res_top, X_test_top, y_train_res_top, y_test_top = _prepare(top_features)

    # 3. Hyperparameter tuning (class_weight candidates favour either class).
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear'],
        'class_weight': [{-1: 5, 1: 1}, {-1: 2, 1: 1}, "balanced", {-1: 1, 1: 2}, {-1: 1, 1: 5}]
    }
    grid = GridSearchCV(
        LogisticRegression(max_iter=1000, random_state=random_state),
        param_grid, scoring='accuracy'
    )
    grid.fit(X_train_res_top, y_train_res_top)

    # 4. Train the final model with the best parameters.
    model = LogisticRegression(max_iter=1000, **grid.best_params_, random_state=random_state)
    model.fit(X_train_res_top, y_train_res_top)

    lr_evaluation = model_evaluation(model, top_features, target, X_test_top, y_test_top)

    return lr_evaluation, model

### Random Forest Classification

In [None]:
def rf_model_v3(df):
    """Train the V3 random-forest classifier with dynamic feature selection.

    Steps: chronological train/test split -> standardise -> SMOTE on the
    training part -> rank features by the importances of a first forest fitted
    on every feature -> repeat the preparation on the top-ranked features ->
    grid-search the forest -> refit the best configuration.

    Relies on notebook-level names: `features`, `target`, `test_size`,
    `random_state`, `get_top_features_by_importance` and `model_evaluation`.

    Args:
        df: DataFrame holding every column in `features` plus the `target` column.

    Returns:
        tuple: (result of `model_evaluation` on the held-out rows, fitted
        RandomForestClassifier). The model is fitted on standardised values of
        the selected features.
    """
    y = df[target]

    def _prepare(columns):
        """Split chronologically, scale with train-only statistics, SMOTE the train part."""
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            df[columns], y, test_size=test_size, shuffle=False
        )
        # Fit the scaler on the training rows only: fitting it on the whole
        # frame would leak statistics of the held-out rows into training.
        scaler = StandardScaler().fit(X_train_raw)
        X_train = scaler.transform(X_train_raw)
        X_test = scaler.transform(X_test_raw)

        # k_neighbors is capped at (minority size - 1); with a single minority
        # sample the data is left unbalanced.
        minority_count = min(Counter(y_train).values())
        if minority_count > 1:
            smote = SMOTE(random_state=random_state, k_neighbors=min(5, minority_count - 1))
            X_train, y_train = smote.fit_resample(X_train, y_train)
        return X_train, X_test, y_train, y_test

    # 1. Rank features with a forest fitted on every candidate feature.
    X_train_res, _, y_train_res, _ = _prepare(features)
    model_all = RandomForestClassifier(random_state=random_state, class_weight='balanced')
    model_all.fit(X_train_res, y_train_res)
    top_features = get_top_features_by_importance(model_all)

    # 2. Rebuild the train/test matrices with the selected features only.
    X_train_res_top, X_test_top, y_train_res_top, y_test_top = _prepare(top_features)

    # 3. Hyperparameter tuning (class_weight candidates favour either class).
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [4, 8, 16, None],
        'min_samples_split': [2, 5, 10],
        'class_weight': [{-1: 5, 1: 1}, {-1: 2, 1: 1}, "balanced", {-1: 1, 1: 2}, {-1: 1, 1: 5}]
    }
    grid = GridSearchCV(
        RandomForestClassifier(random_state=random_state),
        param_grid, scoring='accuracy'
    )
    grid.fit(X_train_res_top, y_train_res_top)

    # 4. Train the final model with the best parameters.
    model = RandomForestClassifier(random_state=random_state, **grid.best_params_)
    model.fit(X_train_res_top, y_train_res_top)

    rf_evaluation = model_evaluation(model, top_features, target, X_test_top, y_test_top)

    return rf_evaluation, model

### XGBoost

In [None]:
def xgb_model_v3(df):
    """Train the V3 XGBoost classifier with dynamic feature selection.

    The target is remapped from {-1, 1} to {0, 1} before training, so the
    returned model is fitted on (and evaluated against) 0/1 labels.

    Steps: chronological train/test split -> standardise -> SMOTE on the
    training part -> rank features by the importances of a first booster fitted
    on every feature -> repeat the preparation on the top-ranked features ->
    grid-search the booster -> refit the best configuration.

    Relies on notebook-level names: `features`, `target`, `test_size`,
    `random_state`, `get_top_features_by_importance` and `model_evaluation`.

    Args:
        df: DataFrame holding every column in `features` plus the `target`
            column with values -1 / 1.

    Returns:
        tuple: (result of `model_evaluation` on the held-out rows, fitted
        XGBClassifier). The model is fitted on standardised values of the
        selected features.
    """
    # Remap -1 -> 0 and 1 -> 1 for the booster.
    y = df[target].map({-1: 0, 1: 1})

    def _prepare(columns):
        """Split chronologically, scale with train-only statistics, SMOTE the train part."""
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            df[columns], y, test_size=test_size, shuffle=False
        )
        # Fit the scaler on the training rows only: fitting it on the whole
        # frame would leak statistics of the held-out rows into training.
        scaler = StandardScaler().fit(X_train_raw)
        X_train = scaler.transform(X_train_raw)
        X_test = scaler.transform(X_test_raw)

        # k_neighbors is capped at (minority size - 1); with a single minority
        # sample the data is left unbalanced.
        minority_count = min(Counter(y_train).values())
        if minority_count > 1:
            smote = SMOTE(random_state=random_state, k_neighbors=min(5, minority_count - 1))
            X_train, y_train = smote.fit_resample(X_train, y_train)
        return X_train, X_test, y_train, y_test

    # 1. Rank features with a booster fitted on every candidate feature.
    X_train_res, _, y_train_res, _ = _prepare(features)
    model_all = XGBClassifier(eval_metric='logloss', random_state=random_state)
    model_all.fit(X_train_res, y_train_res)

    # 2. Pick the top N features; if that selects every feature, shrink N until
    #    the selection is a strict subset (or N reaches 1).
    top_n = 10
    top_features = get_top_features_by_importance(model_all, top_n=top_n)
    while len(top_features) == len(features) and top_n > 1:
        top_n -= 1
        top_features = get_top_features_by_importance(model_all, top_n=top_n)

    # 3. Rebuild the train/test matrices with the selected features only.
    X_train_res_top, X_test_top, y_train_res_top, y_test_top = _prepare(top_features)

    # 4. Hyperparameter tuning
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 6, 10],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1.0],
        "scale_pos_weight": [1, 2, 5],
    }

    grid = GridSearchCV(
        XGBClassifier(eval_metric='logloss', random_state=random_state),
        param_grid, scoring='accuracy'
    )
    grid.fit(X_train_res_top, y_train_res_top)

    # 5. Train the final model with the best parameters.
    model = XGBClassifier(eval_metric='logloss', random_state=random_state, **grid.best_params_)
    model.fit(X_train_res_top, y_train_res_top)

    xgb_evaluation = model_evaluation(model, top_features, target, X_test_top, y_test_top)

    return xgb_evaluation, model

### K-Nearest Neighbors

In [None]:
def knn_model_v3(df):
    """Train the V3 k-nearest-neighbours classifier with dynamic feature selection.

    KNN exposes neither coefficients nor importances, so features are ranked
    with a univariate `SelectKBest(f_classif)` fitted on the balanced training
    data instead of a first model fit.

    Steps: chronological train/test split -> standardise -> SMOTE on the
    training part -> SelectKBest -> repeat the preparation on the selected
    features -> grid-search the classifier -> refit the best configuration.

    Relies on notebook-level names: `features`, `target`, `test_size`,
    `random_state` and `model_evaluation`.

    Args:
        df: DataFrame holding every column in `features` plus the `target` column.

    Returns:
        tuple: (result of `model_evaluation` on the held-out rows, fitted
        KNeighborsClassifier). The model is fitted on standardised values of
        the selected features.
    """
    from sklearn.feature_selection import SelectKBest, f_classif

    y = df[target]

    def _prepare(columns):
        """Split chronologically, scale with train-only statistics, SMOTE the train part."""
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            df[columns], y, test_size=test_size, shuffle=False
        )
        # Fit the scaler on the training rows only: fitting it on the whole
        # frame would leak statistics of the held-out rows into training.
        scaler = StandardScaler().fit(X_train_raw)
        X_train = scaler.transform(X_train_raw)
        X_test = scaler.transform(X_test_raw)

        # k_neighbors is capped at (minority size - 1); with a single minority
        # sample the data is left unbalanced.
        minority_count = min(Counter(y_train).values())
        if minority_count > 1:
            smote = SMOTE(random_state=random_state, k_neighbors=min(5, minority_count - 1))
            X_train, y_train = smote.fit_resample(X_train, y_train)
        return X_train, X_test, y_train, y_test

    # 1. Select the top N features with a univariate F-test on the training data.
    X_train_res, _, y_train_res, _ = _prepare(features)
    top_n = 10
    selector = SelectKBest(score_func=f_classif, k=min(top_n, X_train_res.shape[1]))
    selector.fit(X_train_res, y_train_res)
    mask = selector.get_support()
    # Selected names keep their order from `features`, not their score order.
    top_features = [feature for feature, selected in zip(features, mask) if selected]

    # 2. Rebuild the train/test matrices with the selected features only.
    X_train_res_top, X_test_top, y_train_res_top, y_test_top = _prepare(top_features)

    # 3. Hyperparameter tuning
    param_grid = {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    }
    grid = GridSearchCV(KNeighborsClassifier(), param_grid, scoring='accuracy')
    grid.fit(X_train_res_top, y_train_res_top)

    # 4. Train the final model with the best parameters.
    model = KNeighborsClassifier(**grid.best_params_)
    model.fit(X_train_res_top, y_train_res_top)

    knn_evaluation = model_evaluation(model, top_features, target, X_test_top, y_test_top)

    return knn_evaluation, model

### Support Vector Machine

In [None]:
def svm_model_v3(df):
    """Train the V3 support-vector classifier with dynamic feature selection.

    Steps: chronological train/test split -> standardise -> SMOTE on the
    training part -> rank features by |coef_| of a first linear-kernel SVC
    fitted on every feature -> repeat the preparation on the top-ranked
    features -> grid-search the SVC -> refit the best configuration.

    Relies on notebook-level names: `features`, `target`, `test_size`,
    `random_state`, `get_top_features_by_coef` and `model_evaluation`.

    Args:
        df: DataFrame holding every column in `features` plus the `target` column.

    Returns:
        tuple: (result of `model_evaluation` on the held-out rows, fitted SVC).
        The model is fitted on standardised values of the selected features.
    """
    y = df[target]

    def _prepare(columns):
        """Split chronologically, scale with train-only statistics, SMOTE the train part."""
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            df[columns], y, test_size=test_size, shuffle=False
        )
        # Fit the scaler on the training rows only: fitting it on the whole
        # frame would leak statistics of the held-out rows into training.
        scaler = StandardScaler().fit(X_train_raw)
        X_train = scaler.transform(X_train_raw)
        X_test = scaler.transform(X_test_raw)

        # k_neighbors is capped at (minority size - 1); with a single minority
        # sample the data is left unbalanced.
        minority_count = min(Counter(y_train).values())
        if minority_count > 1:
            smote = SMOTE(random_state=random_state, k_neighbors=min(5, minority_count - 1))
            X_train, y_train = smote.fit_resample(X_train, y_train)
        return X_train, X_test, y_train, y_test

    # 1. Rank features with a linear-kernel SVC fitted on every candidate
    #    feature (the ranking helper reads `coef_`).
    X_train_res, _, y_train_res, _ = _prepare(features)
    model_all = SVC(kernel='linear', class_weight='balanced', random_state=random_state)
    model_all.fit(X_train_res, y_train_res)
    top_features = get_top_features_by_coef(model_all)

    # 2. Rebuild the train/test matrices with the selected features only.
    X_train_res_top, X_test_top, y_train_res_top, y_test_top = _prepare(top_features)

    # 3. Hyperparameter tuning (class_weight candidates favour either class).
    param_grid = {
        'C': [1, 10, 100],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto'],
        'class_weight': [{-1: 5, 1: 1}, {-1: 2, 1: 1}, "balanced", {-1: 1, 1: 2}, {-1: 1, 1: 5}]
    }
    grid = GridSearchCV(SVC(), param_grid, scoring='accuracy')
    grid.fit(X_train_res_top, y_train_res_top)

    # 4. Train the final model with the best parameters.
    model = SVC(**grid.best_params_, random_state=random_state)
    model.fit(X_train_res_top, y_train_res_top)

    svm_evaluation = model_evaluation(model, top_features, target, X_test_top, y_test_top)
    return svm_evaluation, model

## Model Result V3

In [None]:
# Train every V3 model on `ind_train_df`; each trainer returns an
# (evaluation, fitted model) pair.
lr_evaluation_v3, lr_model3 = lr_model_v3(ind_train_df)
rf_evaluation_v3, rf_model3 = rf_model_v3(ind_train_df)
xgb_evaluation_v3, xgb_model3 = xgb_model_v3(ind_train_df)
knn_evaluation_v3, knn_model3 = knn_model_v3(ind_train_df)
svm_evaluation_v3, svm_model3 = svm_model_v3(ind_train_df)

# Collect the five evaluations (order: LR, RF, XGB, KNN, SVM) into one results
# table plus a second object describing the features used per model.
results_df_v3, result_features_v3 = show_model_results(lr_evaluation_v3, rf_evaluation_v3, 
                                                       xgb_evaluation_v3, knn_evaluation_v3, 
                                                       svm_evaluation_v3)

# Last expression of the cell: rendered as the cell output.
results_df_v3

## Comparison

In [None]:
# Grouped bar charts: one subplot per metric, one bar per model and version.
metrics = [
    'accuracy',
    'precision_sell', 'precision_buy',
    'recall_sell', 'recall_buy',
    'f1_sell', 'f1_buy',
    'roc_auc',
]

models = results_df_v1.index.tolist()
x = np.arange(len(models))
width = 0.25

# (horizontal offset of the bar group, results table, legend label) per version.
versioned_results = [
    (-width, results_df_v1, 'V1'),
    (0, results_df_v2, 'V2'),
    (width, results_df_v3, 'V3'),
]

fig, axes = plt.subplots(
    nrows=len(metrics), ncols=1,
    figsize=(14, 4 * len(metrics)),
    sharey=True,
)

for ax, metric in zip(axes, metrics):
    for offset, version_results, version_label in versioned_results:
        ax.bar(x + offset, version_results[metric], width, label=version_label)
    ax.set_title(metric)
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=20)
    ax.legend()

plt.tight_layout()
plt.show()

# 5. Picking Model

ทดสอบกับหลายๆ เหรียญ แล้วหา Model ที่ดีที่สุด

In [None]:
def create_train_test_ind_df(symbol):
    """Download candles for `symbol` and build labelled train / test frames.

    Training candles use a 5m interval over 4 days; test candles use a 1m
    interval over 1 day and are restricted to timestamps after the last
    training candle. Both frames get the indicator columns from
    `add_all_indicators`, are indexed by `timestamp`, lose the rows whose
    Bollinger-band / RSI columns are missing, and receive `price_direction`:
    -1 when the next close is lower than the current close, otherwise 1.

    The final row of each frame has no next candle, so its direction cannot be
    known; it is dropped instead of being labelled 1 by default.

    Args:
        symbol: Trading pair passed to `collect_historical_data`.

    Returns:
        tuple: (ind_train_df, ind_test_df). Either frame may be empty.
    """
    required_cols = ['bb_lower', 'bb_upper', 'bb_middle', 'bb_std', 'rsi']

    def _add_indicators_and_target(raw_df):
        """Add indicators, index by timestamp, drop incomplete rows and label direction."""
        ind_df = (
            add_all_indicators(raw_df)
            .set_index("timestamp")
            .dropna(subset=required_cols)
        )
        next_close = ind_df['close'].shift(-1)
        ind_df['price_direction'] = np.where(next_close < ind_df['close'], -1, 1)
        # The last row's next close is NaN, which the comparison above turns
        # into a fabricated label of 1; remove that row.
        return ind_df.iloc[:-1].copy()

    api = BinanceAPI(api_key, api_secret)
    train_df = collect_historical_data(api, symbol, interval="5m", days=4)
    test_df = collect_historical_data(api, symbol, interval="1m", days=1, end_time=None)
    # Keep only test candles newer than every training candle; copy so later
    # column assignments do not operate on a slice of the downloaded frame.
    test_df = test_df[test_df['timestamp'] > train_df['timestamp'].max()].copy()

    ind_train_df = _add_indicators_and_target(train_df)
    ind_test_df = _add_indicators_and_target(test_df)

    return ind_train_df, ind_test_df

In [None]:
def choose_metric_weights(buy_prop):
    """Pick the metric weights used to score models, based on class balance.

    Args:
        buy_prop: Share of the buy class (label 1) in the training data.

    Returns:
        dict: Metric name -> weight. Below 0.3 the buy-side metrics are
        weighted, above 0.7 the sell-side metrics, otherwise both F1 scores
        plus ROC AUC.
    """
    buy_is_rare = buy_prop < 0.3
    sell_is_rare = buy_prop > 0.7

    if buy_is_rare:
        return {'f1_buy': 0.5, 'recall_buy': 0.3, 'roc_auc': 0.2}
    if sell_is_rare:
        return {'f1_sell': 0.5, 'recall_sell': 0.3, 'roc_auc': 0.2}
    return {'f1_sell': 0.25, 'f1_buy': 0.25, 'roc_auc': 0.5}

In [None]:
def select_best_model(result_df, train_df,
                      imbalance_threshold=0.2,
                      severe_imbalance_penalty=0.7,
                      mild_imbalance_penalty=0.9):
    """Return the index label of the best-scoring row in `result_df`.

    Each row is scored as the weighted sum of the metrics chosen by
    `choose_metric_weights` for the buy-class share found in `train_df`.
    Rows whose `recall_sell` or `recall_buy` is 0 get a sentinel score of -999.

    Args:
        result_df: One row per model; must contain `recall_sell`, `recall_buy`
            and every metric named by `choose_metric_weights`.
        train_df: Frame with a `price_direction` column (buy class is 1).
        imbalance_threshold: Buy share below which the severe penalty applies.
        severe_imbalance_penalty: Score multiplier for a buy share below
            `imbalance_threshold`.
        mild_imbalance_penalty: Score multiplier for a buy share below 0.3.

    Returns:
        The `result_df` index label with the highest score.
    """
    class_shares = train_df['price_direction'].value_counts(normalize=True).sort_index()
    buy_prop = class_shares.get(1, 0.0)
    metric_weights = choose_metric_weights(buy_prop)

    # The penalty depends only on the class balance, so it is identical for
    # every non-sentinel row.
    if buy_prop < imbalance_threshold:
        penalty = severe_imbalance_penalty
    elif buy_prop < 0.3:
        penalty = mild_imbalance_penalty
    else:
        penalty = None

    def score_row(row):
        # A model that never recalls one of the two classes is ruled out.
        if row['recall_sell'] == 0 or row['recall_buy'] == 0:
            return -999
        weighted = 0
        for metric_name, weight in metric_weights.items():
            weighted = weighted + row[metric_name] * weight
        if penalty is None:
            return weighted
        return weighted * penalty

    return result_df.apply(score_row, axis=1).idxmax()

In [None]:
def get_best_model(ind_train_df):
    """Train every model of versions V1-V3 and return the best one.

    For each version the five trainers (LR, RF, XGB, KNN, SVM) are run on
    `ind_train_df`, their evaluations are combined with `show_model_results`,
    and the result rows are suffixed with the version ("LR_V1", ...). The
    winner is chosen by `select_best_model` over all 15 rows.

    Args:
        ind_train_df: Training frame passed unchanged to every trainer.

    Returns:
        list | None: `[fitted model, features entry for that model]` of the
        winning model, or None when `ind_train_df` has no rows.
    """
    if ind_train_df.shape[0] == 0:
        return None

    model_names = ("LR", "RF", "XGB", "KNN", "SVM")
    # Trainers listed in the same order as `model_names`.
    trainers_by_version = {
        "V1": (lr_model_v1, rf_model_v1, xgb_model_v1, knn_model_v1, svm_model_v1),
        "V2": (lr_model_v2, rf_model_v2, xgb_model_v2, knn_model_v2, svm_model_v2),
        "V3": (lr_model_v3, rf_model_v3, xgb_model_v3, knn_model_v3, svm_model_v3),
    }

    version_frames = []
    fitted_by_version = {}
    features_by_version = {}

    for version, trainers in trainers_by_version.items():
        evaluations = []
        fitted_models = []
        for trainer in trainers:
            evaluation, fitted_model = trainer(ind_train_df)
            evaluations.append(evaluation)
            fitted_models.append(fitted_model)

        version_df, version_features = show_model_results(*evaluations)
        version_df.index = [f"{name}_{version}" for name in version_df.index]

        version_frames.append(version_df)
        fitted_by_version[version] = fitted_models
        features_by_version[version] = version_features

    result_df = pd.concat(version_frames)
    best_model = select_best_model(result_df, ind_train_df)

    # Key format matches the suffixed result index, e.g. "XGB_V2".
    map_model = {
        f"{name}_{version}": [fitted_model, features_by_version[version][name]]
        for version, fitted_models in fitted_by_version.items()
        for name, fitted_model in zip(model_names, fitted_models)
    }

    return map_model[best_model]

# 6. Simulation

In [None]:
def simulate(symbol=None):
    """Pick the best model for `symbol` and report its accuracy on fresh candles.

    Prints a sample of available symbols, builds the train/test frames with
    `create_train_test_ind_df`, selects a model with `get_best_model` and
    prints accuracy, confusion matrix and classification report for the test
    frame.

    Args:
        symbol: Trading pair to simulate; falls back to "ETHBTC" when empty.

    Returns:
        None. All results are printed.
    """
    api = BinanceAPI(api_key, api_secret)
    print("Available symbols:")
    for s in api.get_n_symbol(10):
        print(s)
    if not symbol:
        symbol = "ETHBTC"
    print(f"\nSelected symbol: {symbol}")

    ind_train_df, ind_test_df = create_train_test_ind_df(symbol)
    if ind_train_df.shape[0] == 0 or ind_test_df.shape[0] == 0:
        print(f"{symbol} is invalid or has insufficient data. Try again")
        return

    print(f"\nTrain set size: {ind_train_df.shape}, Test set size: {ind_test_df.shape}")

    # get_best_model returns None for an empty training frame; guard the
    # unpacking instead of failing with a TypeError.
    selection = get_best_model(ind_train_df)
    if selection is None:
        print(f"{symbol} is invalid or has insufficient data. Try again")
        return
    best_model, best_features = selection

    # NOTE(review): the V3 trainers fit their models on StandardScaler output,
    # while these test features are passed unscaled (and come from 1m candles
    # versus 5m for training). The fitted scaler is not returned by the
    # trainers, so this cannot be corrected here — confirm and fix upstream.
    x_test = ind_test_df[best_features]
    y_test = ind_test_df['price_direction']
    y_pred = best_model.predict(x_test)

    # xgb_model_v3 trains on labels remapped to 0/1, so such a model predicts
    # 0/1 while y_test holds -1/1. Map the predictions back before scoring.
    model_classes = getattr(best_model, "classes_", None)
    if model_classes is not None and set(np.asarray(model_classes).tolist()) == {0, 1}:
        y_pred = np.where(np.asarray(y_pred) == 0, -1, 1)

    print("\n=== Test Result ===")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

In [None]:
# Run the simulation without a symbol; simulate() then uses "ETHBTC".
simulate()