# 0. Setting

## 0.1. Installation

In [None]:
# !pip install python-dotenv
# !pip install python-binance

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from dotenv import load_dotenv
from datetime import datetime

In [None]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Fall back to an empty string when a credential is unset or blank.
api_key = os.environ.get("API_KEY") or ""
api_secret = os.environ.get("API_SECRET") or ""

## 0.2. Connecting API

In [None]:
import requests
import json

# Base URL and endpoint path for the ticker-price request made below.
url = "https://api.binance.com"
api_call = "/api/v3/ticker/price"
# The API key travels in the X-MBX-APIKEY request header.
headers = {"content-type": "application/json", "X-MBX-APIKEY": api_key}

try:
    response = requests.get(url + api_call, headers=headers, timeout=10)
    # Raise requests.HTTPError for 4xx/5xx responses instead of parsing them.
    response.raise_for_status()
    response_data = response.json()
    # Print only the first record as a quick sanity check.
    # NOTE(review): slicing assumes the decoded payload is a list — confirm.
    print(response_data[:1])
except requests.RequestException as e:
    # Report the failure and leave an empty result for the next cell to test.
    print(e)
    response_data = []

In [None]:
# Preview the ticker payload as a table, or say so when nothing was fetched.
if not response_data:
    print("No data from API")
else:
    df = pd.DataFrame.from_records(response_data)
    print(df.head())

# 1. Binance API

```text
Documentation: https://developers.binance.com/docs/binance-spot-api-docs
```

In [None]:
class BinanceAPI:
    """Small client for a handful of Binance spot REST endpoints.

    Every request is sent with a timeout and raises ``requests.HTTPError`` on a
    4xx/5xx response, so callers do not receive an error payload where data
    was expected.
    """

    def __init__(self, api_key=None, api_secret=None, timeout=10, session=None):
        """Store credentials and connection settings.

        Args:
            api_key: Value sent in the ``X-MBX-APIKEY`` header by ``get_n_symbol``.
            api_secret: Stored on the instance; no method of this class reads it.
            timeout: Seconds to wait for each HTTP request.
            session: Optional object with a ``requests``-compatible ``get``
                method (for example a ``requests.Session``). Defaults to the
                ``requests`` module itself.
        """
        self.base_url = "https://api.binance.com"
        self.api_key = api_key
        self.api_secret = api_secret
        self.timeout = timeout
        self._http = session if session is not None else requests

    def _get(self, endpoint, params=None, headers=None):
        """Send a GET to ``endpoint`` and return the decoded JSON body."""
        response = self._http.get(
            self.base_url + endpoint,
            params=params,
            headers=headers,
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    def get_klines(self, symbol, interval, limit=1000, start_time=None, end_time=None):
        """Return historical candlestick (kline) data for ``symbol``.

        Args:
            symbol: Trading pair, e.g. ``"ETHBTC"``.
            interval: Candle interval string, e.g. ``"1m"``.
            limit: Value sent as the ``limit`` query parameter.
            start_time: Optional ``startTime`` query value.
            end_time: Optional ``endTime`` query value.

        Returns:
            The decoded JSON response.
        """
        params = {
            'symbol': symbol,
            'interval': interval,
            'limit': limit
        }

        # Compare against None so that an explicit 0 is still forwarded.
        if start_time is not None:
            params['startTime'] = start_time
        if end_time is not None:
            params['endTime'] = end_time

        return self._get("/api/v3/klines", params=params)

    def get_n_symbol(self, n):
        """Return the first ``n`` symbols from the ticker-price listing.

        Returns:
            pandas.Series: the ``symbol`` column, truncated to ``n`` rows.
        """
        headers = {"content-type": "application/json", "X-MBX-APIKEY": self.api_key}
        records = self._get("/api/v3/ticker/price", headers=headers)
        df = pd.DataFrame.from_records(records)
        # head(n) is positional; the label slice .loc[:n] would include row n too.
        return df["symbol"].head(n)

    def get_server_time(self, as_timestamp=False):
        """Return the server time reported by the API.

        Args:
            as_timestamp: When true, return the raw ``serverTime`` value.

        Returns:
            The raw ``serverTime`` value, or a ``"%Y-%m-%d %H:%M:%S"`` string
            built with ``datetime.fromtimestamp`` (local time zone).
        """
        ts = self._get("/api/v3/time")["serverTime"]
        if as_timestamp:
            return ts
        # serverTime is divided by 1000 to obtain seconds for fromtimestamp.
        moment = datetime.fromtimestamp(ts / 1000)
        return moment.strftime("%Y-%m-%d %H:%M:%S")

    def get_24hr_ticker(self, symbol):
        """Return the 24-hour price-change statistics for ``symbol``."""
        return self._get("/api/v3/ticker/24hr", params={'symbol': symbol})

    def get_orderbook(self, symbol, limit=100):
        """Return the current order book for ``symbol``."""
        return self._get("/api/v3/depth", params={'symbol': symbol, 'limit': limit})

In [None]:
api = BinanceAPI(api_key, api_secret)
api

In [None]:
api.get_server_time()

In [None]:
api.get_n_symbol(10)

In [None]:
api.get_klines("ETHBTC", "1m", limit=5)

# 2. Collect Data

In [None]:
def collect_historical_data(api: "BinanceAPI", symbol, interval='1m', days=1, end_time=None):
    """Fetch klines for the ``days`` leading up to ``end_time`` as a typed DataFrame.

    Args:
        api: Client exposing ``get_server_time(as_timestamp=True)`` and
            ``get_klines(symbol=, interval=, start_time=, end_time=)``.
        symbol: Trading pair, e.g. ``"ETHBTC"``.
        interval: Candle interval string passed through to ``get_klines``.
        days: Length of the window in days (fractions allowed).
        end_time: Window end in epoch milliseconds; defaults to the server time.

    Returns:
        pandas.DataFrame: one row per kline. Price and volume columns are
        numeric, ``timestamp`` / ``close_time`` are datetimes, and the
        ``ignore`` field is dropped.

    Raises:
        ValueError: If the API response is not a list of klines.

    Note:
        No ``limit`` is passed, so ``get_klines`` applies its own default.
        NOTE(review): a window holding more candles than that limit is
        presumably returned truncated — confirm against the API docs.
    """
    if end_time is None:
        end_time = int(api.get_server_time(as_timestamp=True))
    # int() keeps the millisecond offset integral even for fractional `days`.
    start_time = end_time - int(days * 24 * 60 * 60 * 1000)

    klines = api.get_klines(
        symbol=symbol,
        interval=interval,
        start_time=start_time,
        end_time=end_time,
    )
    # Anything other than a list (e.g. an error object) cannot be tabulated.
    if not isinstance(klines, list):
        raise ValueError(f"Unexpected klines response for {symbol}: {klines!r}")

    df = pd.DataFrame(klines, columns=[
        'timestamp', 'open', 'high', 'low', 'close', 'volume',
        'close_time', 'quote_asset_volume', 'number_of_trades',
        'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume', 'ignore'
    ])

    # Convert every price/volume field from its string form to a number,
    # including the two taker-buy volumes.
    numeric_columns = [
        'open', 'high', 'low', 'close', 'volume', 'quote_asset_volume',
        'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume',
    ]
    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col])

    # Both time fields are epoch milliseconds.
    for col in ('timestamp', 'close_time'):
        df[col] = pd.to_datetime(df[col], unit='ms')

    return df.drop(columns=['ignore'])

In [None]:
api = BinanceAPI(api_key, api_secret)
symbol = "ETHBTC"

# Two 5-minute-candle windows; each call ends at the server time it fetches.
train_df = collect_historical_data(api, symbol, interval="5m", days=4)
test_df = collect_historical_data(api, symbol, interval="5m", days=1)

# Keep only test candles newer than the newest training candle so the two
# sets do not overlap. .copy() makes the result an independent frame, so later
# column assignments do not act on a slice of the original
# (avoids pandas' SettingWithCopyWarning).
test_df = test_df[test_df['timestamp'] > train_df['timestamp'].max()].copy()

In [None]:
train_df

In [None]:
test_df

<b>Columns</b>

1) <b>open</b>: ราคา *แรกสุด* ที่มีการซื้อขายในช่วงเวลา t
2) <b>high</b>: ราคา *สูงสุด* ที่มีการซื้อขายในช่วงเวลา t
3) <b>low</b>: ราคา *ต่ำสุด* ที่มีการซื้อขายในช่วงเวลา t
4) <b>close</b>: ราคา *สุดท้าย* ที่มีการซื้อขายในช่วงเวลา t

``` 
4 Columns นี้มีการพิจารณาค่าตัวเลขเหมือนกัน เช่น 0.2389 คือ 1 ETH แลกได้ 0.2389 BTC
```

---

5. <b>volume</b>: จำนวนเหรียญหลักรวมที่มีการซื้อขายในช่วงเวลา t
6. <b>quote_asset_volume</b>: จำนวนเหรียญคู่รวมที่มีการซื้อขายในช่วงเวลา t (คิดเป็นจำนวน BTC รวม)
7. <b>number_of_trades</b>: จำนวนครั้งที่มีการซื้อขายในช่วงเวลา t
8. <b>taker_buy_base_asset_volume</b>: จำนวนเหรียญหลักรวมที่มีการรีบซื้อในทันทีในช่วงเวลา t
9. <b>taker_buy_quote_asset_volume</b>: จำนวนเหรียญคู่รวมที่มีการรีบซื้อในทันทีในช่วงเวลา t

---

10. <b>timestamp</b>: เวลาเริ่มต้นของการซื้อขาย
11. <b>close_time</b>: เวลาสิ้นสุดของการซื้อขาย

---

In [None]:
train_df.info()

In [None]:
# Cast the two taker-buy volume columns to float, then re-inspect the dtypes.
for _taker_col in ("taker_buy_base_asset_volume", "taker_buy_quote_asset_volume"):
    train_df[_taker_col] = train_df[_taker_col].astype(float)

train_df.info()

In [None]:
test_df.info()

In [None]:
# Cast the two taker-buy volume columns to float, then re-inspect the dtypes.
for _taker_col in ("taker_buy_base_asset_volume", "taker_buy_quote_asset_volume"):
    test_df[_taker_col] = test_df[_taker_col].astype(float)

test_df.info()

# 3. Create Indicators

## 3.1. Moving Average

คำนวณค่าเฉลี่ยแบบเคลื่อนที่ทุกๆ n จุด แล้วดูแนวโน้มค่าเฉลี่ยเหล่านั้น

- Bullish: มีแนวโน้มว่าในอนาคต ราคาสูงขึ้น -> 1 ETH มีแนวโน้มจะได้ BTC มากขึ้น
    - ถ้าเรามี ETH อยู่ เราควรถือไว้ หรือซื้อ ETH เพิ่มเติม
    - ถ้าเรามี BTC อยู่ เราควรขายเพื่อซื้อ ETH
     
- Bearish: มีแนวโน้มว่าในอนาคต ราคาลดลง -> 1 ETH มีแนวโน้มจะได้ BTC น้อยลง
    - ถ้าเรามี ETH อยู่ เราควรขายเพื่อซื้อ BTC
    - ถ้าเรามี BTC อยู่ เราควรถือไว้ หรือซื้อ BTC เพิ่มเติม

In [None]:
def add_moving_average(df: pd.DataFrame):
    """Return a copy of ``df`` with moving-average features derived from ``close``.

    Added columns, in order: ``SMA_20/50/200``, ``EMA_12/26``, six 0/1
    crossover flags, ``trend_strength`` (0-5), five relative price-distance
    columns and ``bullish_alignment``. Windows are counted in rows (periods).
    The input frame is left untouched.
    """
    out = df.copy()
    close = out["close"]

    # Simple moving averages; min_periods=1 yields a value from the first row.
    for window in (20, 50, 200):
        out[f"SMA_{window}"] = close.rolling(window, min_periods=1).mean()

    # Exponential moving averages, recursive form (adjust=False).
    for span in (12, 26):
        out[f"EMA_{span}"] = close.ewm(span=span, adjust=False).mean()

    def crossed_above(fast, slow):
        # 1 on the row where `fast` moves above `slow` after being at or below it.
        now_above = out[fast] > out[slow]
        was_at_or_below = out[fast].shift(1) <= out[slow].shift(1)
        return (now_above & was_at_or_below).astype(int)

    def crossed_below(fast, slow):
        # 1 on the row where `fast` moves below `slow` after being at or above it.
        now_below = out[fast] < out[slow]
        was_at_or_above = out[fast].shift(1) >= out[slow].shift(1)
        return (now_below & was_at_or_above).astype(int)

    # Long-term pair (50 vs 200 periods).
    out["golden_cross"] = crossed_above("SMA_50", "SMA_200")
    out["death_cross"] = crossed_below("SMA_50", "SMA_200")

    # Short-term pair (20 vs 50 periods).
    out["bullish_cross"] = crossed_above("SMA_20", "SMA_50")
    out["bearish_cross"] = crossed_below("SMA_20", "SMA_50")

    # Momentum pair (EMA 12 vs EMA 26).
    out["ema_bullish_cross"] = crossed_above("EMA_12", "EMA_26")
    out["ema_bearish_cross"] = crossed_below("EMA_12", "EMA_26")

    # Number of averages the close sits above: 0 = very bearish, 5 = very bullish.
    above_flags = [
        (close > out[name]).astype(int)
        for name in ("SMA_20", "SMA_50", "SMA_200", "EMA_12", "EMA_26")
    ]
    strength = above_flags[0]
    for flag in above_flags[1:]:
        strength = strength + flag
    out["trend_strength"] = strength

    # Relative distance of the close from each average
    # (positive = above the average, negative = below it).
    distance_columns = (
        ("price_sma20_dist", "SMA_20"),
        ("price_sma50_dist", "SMA_50"),
        ("price_sma200_dist", "SMA_200"),
        ("price_ema12_dist", "EMA_12"),
        ("price_ema26_dist", "EMA_26"),
    )
    for column, average in distance_columns:
        out[column] = ((close - out[average]) / out[average]).fillna(0)

    # 1 when the averages are stacked SMA_20 > SMA_50 > SMA_200.
    stacked = (out["SMA_20"] > out["SMA_50"]) & (out["SMA_50"] > out["SMA_200"])
    out["bullish_alignment"] = stacked.astype(int)

    return out

## 3.2. Relative Strength Index (RSI)

https://www.investopedia.com/terms/r/rsi.asp
- บ่งบอกความแรงของราคา (Momentum) ในช่วงเวลาที่กำหนด ซึ่งมักนิยมใช้ 14 วัน
- มีค่าอยู่ในช่วงระหว่าง 0 ถึง 100

พิจารณา ETHBTC
- RSI สูง (>70) = คนซื้อ ETH ด้วย BTC เยอะมาก 
    - ETH อาจแพงเกินไปที่จะซื้อตอนนี้ 
    - ตอนนี้เราควรขาย ETH และซื้อ BTC
- RSI ต่ำ (<30) = คนขาย ETH เพื่อซื้อ BTC เยอะมาก 
    - ETH อาจถูกเกินไปที่จะขายตอนนี้ 
    - ตอนนี้เราควรซื้อ ETH และขาย BTC
- RSI กลางๆ (~50) = การซื้อขาย ETH/BTC ปกติดี  
    - ตอนนี้ควรรอดูสถานการณ์ก่อน

In [None]:
def add_rsi(df:pd.DataFrame, period=14):
    """Return a copy of ``df`` with an ``rsi`` column computed from ``close``.

    Gains and losses are smoothed with an exponential mean using
    ``alpha = 1 / period`` (``adjust=False``). Rows where both smoothed
    averages are zero produce NaN.

    Args:
        df: Frame with a ``close`` column.
        period: Smoothing period.
    """
    out = df.copy()
    change = out['close'].diff()

    # Split the close-to-close change into up-moves and (sign-flipped) down-moves.
    up_moves = change.where(change > 0, 0)
    down_moves = -change.where(change < 0, 0)

    def smooth(series):
        return series.ewm(alpha=1/period, adjust=False).mean()

    # RSI = 100 - 100 / (1 + RS), where RS = average gain / average loss.
    relative_strength = smooth(up_moves) / smooth(down_moves)
    out['rsi'] = 100 - (100 / (1 + relative_strength))

    return out

## 3.3. MACD

- ใช้ดูแนวโน้ม (trend) และโมเมนตัม (momentum) ของราคา 
- ดูจากความแตกต่างของค่าเฉลี่ยเคลื่อนที่แบบ EMA สองเส้น (fast: EMA12 ลบกับ slow: EMA26)

พิจารณา ETHBTC
- MACD > 0 แสดงว่าราคากำลังขึ้น หรือก็คือมีแนวโน้มว่าในอนาคต ETH จะมีราคาสูงขึ้นเมื่อเทียบกับ BTC
    - ถ้าเราถือ ETH เราควรถือไว้ รอขายในอนาคต
    - ถ้าเราถือ BTC เราควรขายเพื่อซื้อ ETH
- MACD < 0 แสดงว่าราคากำลังลง หรือก็คือมีแนวโน้มว่าในอนาคต ETH จะมีราคาต่ำลงเมื่อเทียบกับ BTC
    - ถ้าเราถือ ETH เราควรขายเพื่อซื้อ BTC
    - ถ้าเราถือ BTC เราควรถือไว้ รอขายในอนาคต

** ยิ่งค่าห่างจาก 0 ยิ่งมีแนวโน้มที่จะไปทางนั้นๆ สูง

- จังหวะที่ควรซื้อ หรือขาย คือจังหวะที่เส้นของ MACD ตัดกับเส้น MACD_Signal
    - MACD ตัดแล้วขึ้นสูงกว่า MACD_Signal: เป็นช่วงราคากำลังขึ้น
    - MACD ตัดแล้วต่ำกว่า MACD_Signal: เป็นช่วงราคากำลังลง

In [None]:
def add_macd(df, fast=12, slow=26, signal=9):
    """Return a copy of ``df`` with MACD columns computed from ``close``.

    Adds ``ema_fast``, ``ema_slow``, ``macd`` (fast minus slow) and
    ``macd_signal`` (EMA of ``macd``). The input frame is not modified,
    matching ``add_moving_average`` and ``add_rsi``.

    Args:
        df: Frame with a ``close`` column.
        fast: Span of the fast EMA.
        slow: Span of the slow EMA.
        signal: Span of the signal-line EMA.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # Fast and slow exponential moving averages of the close.
    df['ema_fast'] = df['close'].ewm(span=fast, adjust=False).mean()
    df['ema_slow'] = df['close'].ewm(span=slow, adjust=False).mean()

    # MACD line
    df['macd'] = df['ema_fast'] - df['ema_slow']

    # Signal line
    df['macd_signal'] = df['macd'].ewm(span=signal, adjust=False).mean()

    return df

## 3.4. Bollinger Bands

- ประกอบด้วย 3 เส้นหลัก:

1. เส้นกลาง (Middle Band) : SMA ของ close หมายถึงแนวโน้มราคากลาง ๆ ในช่วงเวลาที่กำหนด

2. เส้นบน (Upper Band): SMA + 2SD บ่งบอกขอบเขตราคาที่ "สูงกว่าปกติ" หรือเป็นระดับแนวต้าน

3. เส้นล่าง (Lower Band): SMA - 2SD บ่งบอกขอบเขตราคาที่ "ต่ำกว่าปกติ" หรือเป็นระดับแนวรับ

พิจารณา ETHBTC
- มี 4 Case ที่เกิดขึ้นได้
    1. Close Price ชิด Upper Band: ราคาสูงเกินไปแล้ว อาจมีโอกาสราคาตกลงในเร็ว ๆ นี้
        - ถ้าเราถือ ETH ควรขายเพื่อซื้อ BTC
        - ถ้าเราถือ BTC ควรถือไว้ รอขายในอนาคต
    2. Close Price ชิด Lower Band: ราคาต่ำเกินไปแล้ว อาจมีโอกาสราคาขึ้นในเร็ว ๆ นี้
        - ถ้าเราถือ ETH ควรถือไว้ รอขายในอนาคต
        - ถ้าเราถือ BTC ควรขายเพื่อซื้อ ETH
    3. Upper Band กับ Lower Band เข้ามาชิดกัน: ความผันผวนต่ำ เตรียมเคลื่อนไหว
        - ถ้า Close Price สูงกว่า Upper Band อาจเป็นสัญญาณซื้อ ETH ขาย BTC
        - ถ้า Close Price ต่ำกว่า Lower Band อาจเป็นสัญญาณขาย ETH ซื้อ BTC
    4. Upper Band กับ Lower Band ห่างออกจากกัน: ความผันผวนสูง
        - ถ้า Close Price สูงขึ้น และอยู่ใกล้ Upper Band ควรถือ ETH ต่อเนื่อง หรือซื้อเพิ่ม
        - ถ้า Close Price ลดลง และอยู่ใกล้ Lower Band → ควรขาย ETH ซื้อ BTC

In [None]:
def add_bollinger(df, period=20, std_dev=2):
    """Return a copy of ``df`` with Bollinger Band columns computed from ``close``.

    Adds ``bb_middle`` (rolling mean), ``bb_std`` (rolling standard deviation),
    ``bb_upper`` and ``bb_lower`` (middle plus/minus ``std_dev`` deviations).
    The first ``period - 1`` rows are NaN because the rolling window is not
    yet full. The input frame is not modified, matching
    ``add_moving_average`` and ``add_rsi``.

    Args:
        df: Frame with a ``close`` column.
        period: Rolling window length in rows.
        std_dev: Band width in standard deviations.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    window = df['close'].rolling(window=period)
    df['bb_middle'] = window.mean()
    df['bb_std'] = window.std()

    df['bb_upper'] = df['bb_middle'] + std_dev * df['bb_std']
    df['bb_lower'] = df['bb_middle'] - std_dev * df['bb_std']

    return df

In [None]:
def add_all_indicators(df):
    """Return a copy of ``df`` enriched with every indicator in this notebook.

    The steps run in a fixed order: moving averages, RSI, MACD, Bollinger Bands.
    """
    enriched = df.copy()
    for add_step in (add_moving_average, add_rsi, add_macd, add_bollinger):
        enriched = add_step(enriched)
    return enriched

# 4. Create Model

## EDA

In [None]:
# Build the indicator tables for both splits and index each by `timestamp`.
ind_train_df = add_all_indicators(train_df).set_index("timestamp")

ind_test_df = add_all_indicators(test_df).set_index("timestamp")

In [None]:
ind_train_df

In [None]:
ind_test_df

In [None]:
ind_train_df.info()

In [None]:
ind_train_df.isna().sum()

In [None]:
# Show the columns that contain NaN in their first rows.
for _column in ("bb_middle", "bb_std", "bb_upper", "bb_lower", "rsi"):
    print(f"{_column}\n", ind_train_df[_column])

In [None]:
# Discard rows in which any Bollinger or RSI value is still undefined.
ind_train_df = ind_train_df.dropna(subset=['bb_lower', 'bb_upper', 'bb_middle', 'bb_std', 'rsi'])

In [None]:
ind_train_df.isna().sum()

In [None]:
# Label each candle by the direction of the NEXT close:
# -1 when the next close is lower, 1 when it is higher or equal.
_next_close = ind_train_df['close'].shift(-1)
ind_train_df['price_direction'] = np.where(
    _next_close < ind_train_df['close'], -1, 1
)

# The newest candle has no successor, so its direction is unknown. Drop it;
# otherwise the NaN comparison is False and the row is silently labelled 1.
# (Re-running this cell trims one more row each time.)
ind_train_df = ind_train_df.iloc[:-1].copy()
ind_train_df['price_direction'].value_counts(normalize=True)

- -1 คือ ราคาปิดในอนาคต ***น้อยกว่า*** ราคาปิดปัจจุบัน -> ในตอนนี้ควรขาย
- 1 คือ ราคาปิดในอนาคต ***มากกว่าหรือเท่ากับ*** ราคาปิดปัจจุบัน -> ในตอนนี้ควรซื้อ

In [None]:
ind_test_df.info()

In [None]:
ind_test_df.isna().sum()

In [None]:
# Show the columns that contain NaN in their first rows.
for _column in ("bb_middle", "bb_std", "bb_upper", "bb_lower", "rsi"):
    print(f"{_column}\n", ind_test_df[_column])

In [None]:
# Discard rows in which any Bollinger or RSI value is still undefined.
ind_test_df = ind_test_df.dropna(subset=['bb_lower', 'bb_upper', 'bb_middle', 'bb_std', 'rsi'])

In [None]:
ind_test_df.isna().sum()

In [None]:
# Label each candle by the direction of the NEXT close:
# -1 when the next close is lower, 1 when it is higher or equal.
_next_close = ind_test_df['close'].shift(-1)
ind_test_df['price_direction'] = np.where(
    _next_close < ind_test_df['close'], -1, 1
)

# The newest candle has no successor, so its direction is unknown. Drop it;
# otherwise the NaN comparison is False and the row is silently labelled 1.
# (Re-running this cell trims one more row each time.)
ind_test_df = ind_test_df.iloc[:-1].copy()
ind_test_df['price_direction'].value_counts(normalize=True)

- -1 คือ ราคาปิดในอนาคต ***น้อยกว่า*** ราคาปิดปัจจุบัน -> ในตอนนี้ควรขาย
- 1 คือ ราคาปิดในอนาคต ***มากกว่าหรือเท่ากับ*** ราคาปิดปัจจุบัน -> ในตอนนี้ควรซื้อ

## Preparing to Train Model

In [None]:
ind_train_df.columns

In [None]:
ind_train_df['price_direction'].value_counts(normalize=True)

In [None]:
import multiprocessing

# Use about a third of the available cores for parallel work, never fewer than one.
n_cores = multiprocessing.cpu_count()
optimal_jobs = (n_cores // 3) or 1

# Seed shared by every estimator and resampler in this notebook.
random_state = 2025

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE

# Model inputs, grouped by the indicator family that produces them.
features = [
    # Simple and exponential moving averages
    'SMA_20', 'SMA_50', 'SMA_200', 'EMA_12', 'EMA_26',
    # Momentum
    'rsi', 'macd', 'macd_signal',
    # Bollinger Bands
    'bb_upper', 'bb_middle', 'bb_lower',
    # Features derived from the close and the averages
    'trend_strength',
    'price_sma20_dist', 'price_sma50_dist', 'price_sma200_dist',
    'price_ema12_dist', 'price_ema26_dist',
    'bullish_alignment',
]

# Label column: -1 / 1 direction of the next close.
target = 'price_direction'

In [None]:
def model_evaluation(model, features, target, X_test, y_test):
    """Score a fitted binary classifier on a held-out set.

    Labels and predictions may use either the ``-1/1`` or the ``0/1``
    encoding; both are normalised to ``-1`` (sell) / ``1`` (buy) before any
    metric is computed.

    Args:
        model: Fitted estimator with ``predict`` and either ``predict_proba``
            or ``decision_function``.
        features: Passed through unchanged into the result.
        target: Passed through unchanged into the result.
        X_test: Feature matrix handed to the model.
        y_test: True labels in ``-1/1`` or ``0/1`` encoding.

    Returns:
        dict: the model, ``features``, ``target``, accuracy, per-class
        precision/recall/F1, ``roc_auc``, the number of distinct predicted
        classes (``prediction_diversity``) and the share of the rarest
        predicted class (``min_pred_ratio``).
    """
    # Map 0 -> -1 and leave every other value alone. Doing this independently
    # for labels and predictions is safe even when one of them holds a single
    # class, where guessing the encoding from the observed values is ambiguous.
    y_true = np.asarray(y_test)
    y_pred = np.asarray(model.predict(X_test))
    y_true = np.where(y_true == 0, -1, y_true)
    y_pred = np.where(y_pred == 0, -1, y_pred)

    # Prediction diversity: how many classes the model actually emits.
    unique_preds = np.unique(y_pred)

    # Class distribution of the predictions.
    pred_counts = pd.Series(y_pred).value_counts(normalize=True)
    min_pred_ratio = pred_counts.min()

    # Metrics. labels=[-1, 1] fixes the order of the per-class arrays.
    accuracy = accuracy_score(y_true, y_pred)
    precision_sell, precision_buy = precision_score(
        y_true, y_pred, average=None, labels=[-1, 1], zero_division=0)
    recall_sell, recall_buy = recall_score(
        y_true, y_pred, average=None, labels=[-1, 1], zero_division=0)
    f1_sell, f1_buy = f1_score(
        y_true, y_pred, average=None, labels=[-1, 1], zero_division=0)

    # ROC AUC is best-effort: if scoring fails for any reason, fall back to
    # 0.5. Only Exception is caught so KeyboardInterrupt and SystemExit
    # still propagate.
    try:
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test)[:, 1]
        else:
            y_score = model.decision_function(X_test)
        roc_auc = roc_auc_score(y_true, y_score)
    except Exception:
        roc_auc = 0.5

    return {
        "model": model,
        "features": features,
        "target": target,
        "accuracy": accuracy,
        "precision_sell": precision_sell,
        "precision_buy": precision_buy,
        "recall_sell": recall_sell,
        "recall_buy": recall_buy,
        "f1_sell": f1_sell,
        "f1_buy": f1_buy,
        "roc_auc": roc_auc,
        "prediction_diversity": len(unique_preds),
        "min_pred_ratio": min_pred_ratio
    }

In [None]:
def show_model_results(lr_evaluation, rf_evaluation, xgb_evaluation, knn_evaluation, svm_evaluation):
    """Tabulate five evaluation dicts side by side.

    Args:
        lr_evaluation, rf_evaluation, xgb_evaluation, knn_evaluation,
        svm_evaluation: Dicts containing the metric keys listed below and a
            ``"features"`` entry.

    Returns:
        tuple: ``(results_df, features_by_model)`` — a DataFrame indexed by
        model label (``LR``, ``RF``, ``XGB``, ``KNN``, ``SVM``) with one column
        per metric, and a dict mapping each label to that evaluation's
        ``"features"`` entry.
    """
    labelled = [
        ("LR", lr_evaluation),
        ("RF", rf_evaluation),
        ("XGB", xgb_evaluation),
        ("KNN", knn_evaluation),
        ("SVM", svm_evaluation),
    ]
    metric_names = [
        "accuracy",
        "precision_sell", "precision_buy",
        "recall_sell", "recall_buy",
        "f1_sell", "f1_buy",
        "roc_auc",
        "prediction_diversity", "min_pred_ratio",
    ]

    # One column per metric, rows in the fixed model order above.
    table = {"model": [label for label, _ in labelled]}
    for metric in metric_names:
        table[metric] = [evaluation[metric] for _, evaluation in labelled]

    results_df = pd.DataFrame(table).set_index("model")
    features_by_model = {label: evaluation["features"] for label, evaluation in labelled}
    return results_df, features_by_model

In [None]:
def custom_train_test_split(X, y, test_size=0.2):
    """Split ``X`` / ``y`` in their existing order (no shuffling).

    The fraction actually used is the smaller of ``test_size`` and
    ``max(1, int(len(X) * 0.1)) / len(X)``, i.e. it never exceeds roughly 10%
    of the rows.

    NOTE(review): the original local name ``min_test_size`` suggests a floor
    was intended, yet ``min`` makes it a cap — confirm the intent.

    Returns:
        Whatever ``train_test_split`` returns for the two inputs.
    """
    n_rows = len(X)
    ten_percent_rows = max(1, int(n_rows * 0.1))
    fraction = min(test_size, ten_percent_rows / n_rows)

    return train_test_split(X, y, test_size=fraction, shuffle=False, random_state=random_state)

## Model Training V1

### Logistic Regression

In [None]:
def lr_v1(df):
    """Fit a baseline logistic regression on standardised features.

    Args:
        df: Frame holding every column in the module-level ``features`` list
            plus the ``target`` column.

    Returns:
        tuple: ``(model, scaler)`` — the fitted classifier and the
        ``StandardScaler`` fitted on the same rows.
    """
    inputs, labels = df[features], df[target]

    # Standardise the features; the scaler is returned for reuse on test data.
    scaler = StandardScaler()
    inputs_scaled = scaler.fit_transform(inputs)

    # Balanced class weights, fitted on every row of `df`.
    classifier = LogisticRegression(random_state=random_state, class_weight='balanced', max_iter=1000)
    classifier.fit(inputs_scaled, labels)

    return classifier, scaler

### Random Forest Classification

In [None]:
def rf_v1(df):
    """Fit a baseline random forest classifier on standardised features.

    Args:
        df: Frame holding every column in the module-level ``features`` list
            plus the ``target`` column.

    Returns:
        tuple: ``(model, scaler)`` — the fitted classifier and the
        ``StandardScaler`` fitted on the same rows.
    """
    inputs, labels = df[features], df[target]

    # Standardise the features; the scaler is returned for reuse on test data.
    scaler = StandardScaler()
    inputs_scaled = scaler.fit_transform(inputs)

    # Balanced class weights, fitted on every row of `df`.
    classifier = RandomForestClassifier(random_state=random_state, class_weight='balanced')
    classifier.fit(inputs_scaled, labels)

    return classifier, scaler

### XGBoost

In [None]:
def xgb_v1(df):
    """Fit a baseline XGBoost classifier on standardised features.

    The ``-1/1`` target is remapped to ``0/1`` before fitting.

    Args:
        df: Frame holding every column in the module-level ``features`` list
            plus the ``target`` column.

    Returns:
        tuple: ``(model, scaler)`` — the fitted classifier and the
        ``StandardScaler`` fitted on the same rows.
    """
    inputs = df[features]
    labels = df[target].map({-1: 0, 1: 1})

    # Standardise the features; the scaler is returned for reuse on test data.
    scaler = StandardScaler()
    inputs_scaled = scaler.fit_transform(inputs)

    # Weight the positive class by the negative-to-positive count ratio.
    negatives = (labels == 0).sum()
    positives = (labels == 1).sum()
    classifier = XGBClassifier(random_state=random_state, scale_pos_weight=negatives / positives)
    classifier.fit(inputs_scaled, labels)

    return classifier, scaler

### K-Nearest Neighbors

In [None]:
def knn_v1(df):
    """Fit a baseline distance-weighted k-nearest-neighbours classifier.

    Args:
        df: Frame holding every column in the module-level ``features`` list
            plus the ``target`` column.

    Returns:
        tuple: ``(model, scaler)`` — the fitted classifier and the
        ``StandardScaler`` fitted on the same rows.
    """
    inputs, labels = df[features], df[target]

    # Standardise the features; the scaler is returned for reuse on test data.
    scaler = StandardScaler()
    inputs_scaled = scaler.fit_transform(inputs)

    # Fitted on every row of `df`.
    classifier = KNeighborsClassifier(weights='distance')
    classifier.fit(inputs_scaled, labels)

    return classifier, scaler

### Support Vector Machine

In [None]:
def svm_v1(df):
    """Fit a baseline support vector classifier on standardised features.

    Args:
        df: Frame holding every column in the module-level ``features`` list
            plus the ``target`` column.

    Returns:
        tuple: ``(model, scaler)`` — the fitted classifier and the
        ``StandardScaler`` fitted on the same rows.
    """
    inputs, labels = df[features], df[target]

    # Standardise the features; the scaler is returned for reuse on test data.
    scaler = StandardScaler()
    inputs_scaled = scaler.fit_transform(inputs)

    # Balanced class weights, fitted on every row of `df`.
    classifier = SVC(random_state=random_state, class_weight='balanced')
    classifier.fit(inputs_scaled, labels)

    return classifier, scaler

## Model Summary V1

In [None]:
# Train each V1 model on the training indicators; each call returns (model, scaler).
lr_model_v1_result, lr_scaler_v1_result = lr_v1(ind_train_df)
rf_model_v1_result, rf_scaler_v1_result = rf_v1(ind_train_df)
xgb_model_v1_result, xgb_scaler_v1_result = xgb_v1(ind_train_df)
knn_model_v1_result, knn_scaler_v1_result = knn_v1(ind_train_df)
svm_model_v1_result, svm_scaler_v1_result = svm_v1(ind_train_df)

# Held-out features and labels.
X_test = ind_test_df[features]
y_test = ind_test_df[target]

# Scale the test features with each model's own training-time scaler.
X_test_lr = lr_scaler_v1_result.transform(X_test)
X_test_rf = rf_scaler_v1_result.transform(X_test)
X_test_xgb = xgb_scaler_v1_result.transform(X_test)
X_test_knn = knn_scaler_v1_result.transform(X_test)
X_test_svm = svm_scaler_v1_result.transform(X_test)

# XGBoost was trained on 0/1 labels, so its test labels use the same encoding.
y_test_xgb = y_test.map({-1: 0, 1: 1})

# Score every model on the held-out set.
lr_evaluation = model_evaluation(lr_model_v1_result, features, target, X_test_lr, y_test)
rf_evaluation = model_evaluation(rf_model_v1_result, features, target, X_test_rf, y_test)
xgb_evaluation = model_evaluation(xgb_model_v1_result, features, target, X_test_xgb, y_test_xgb)
knn_evaluation = model_evaluation(knn_model_v1_result, features, target, X_test_knn, y_test)
svm_evaluation = model_evaluation(svm_model_v1_result, features, target, X_test_svm, y_test)

# Collect the metrics into one comparison table.
results_df_v1, result_features_v1 = show_model_results(lr_evaluation, rf_evaluation, xgb_evaluation, 
                                                       knn_evaluation, svm_evaluation)

results_df_v1

## Model Training V2


- เพิ่ม Hyperparameter Tuning 
- เพิ่ม SMOTE

In [None]:
from collections import Counter

# Model inputs for the V2 models, grouped by the indicator family that produces them.
features = [
    # Simple and exponential moving averages
    'SMA_20', 'SMA_50', 'SMA_200', 'EMA_12', 'EMA_26',
    # Momentum
    'rsi', 'macd', 'macd_signal',
    # Bollinger Bands
    'bb_upper', 'bb_middle', 'bb_lower',
    # Features derived from the close and the averages
    'trend_strength',
    'price_sma20_dist', 'price_sma50_dist', 'price_sma200_dist',
    'price_ema12_dist', 'price_ema26_dist',
    'bullish_alignment',
]

# Label column: -1 / 1 direction of the next close.
target = 'price_direction'

In [None]:
def custom_smote(X_train, y_train):
    """Oversample the minority class with SMOTE when the data is clearly imbalanced.

    SMOTE runs only when the majority/minority count ratio exceeds 2.0 and the
    minority class has at least 10 samples. In every other case, including
    empty input, the inputs are returned unchanged.

    Resampling is best-effort: if anything fails, a ``RuntimeWarning`` is
    emitted and the original data is returned, so the fallback is visible
    rather than silent.

    Args:
        X_train: Feature matrix.
        y_train: Iterable of class labels.

    Returns:
        tuple: ``(X, y)`` — resampled data, or the original objects.
    """
    import warnings

    try:
        class_counts = Counter(y_train)
        if not class_counts:
            # Nothing to balance.
            return X_train, y_train

        min_class_count = min(class_counts.values())
        max_class_count = max(class_counts.values())
        imbalance_ratio = max_class_count / min_class_count

        if imbalance_ratio <= 2.0 or min_class_count < 10:
            return X_train, y_train

        # k_neighbors is kept below the minority-class size.
        k_neighbors = min(3, min_class_count - 1)
        sampler = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
        return sampler.fit_resample(X_train, y_train)
    except Exception as exc:
        warnings.warn(
            f"SMOTE resampling failed ({exc}); using the original data.",
            RuntimeWarning,
            stacklevel=2,
        )
        return X_train, y_train

### Logistic Regression

In [None]:
def lr_v2(df):
    """Fit a tuned logistic regression: scale, optionally SMOTE, grid-search, refit.

    Args:
        df: Frame holding every column in the module-level ``features`` list
            plus the ``target`` column.

    Returns:
        tuple: ``(model, scaler)`` — the classifier refitted with the best
        grid-search parameters, and the fitted ``StandardScaler``.
    """
    inputs, labels = df[features], df[target]

    # Standardise, then rebalance the classes (custom_smote may be a no-op).
    scaler = StandardScaler()
    balanced_X, balanced_y = custom_smote(scaler.fit_transform(inputs), labels)

    # 3-fold cross-validated search, scored on accuracy.
    search_space = {
        'C': [0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear'],
        'class_weight': ['balanced'],
    }
    search = GridSearchCV(
        LogisticRegression(max_iter=1000, random_state=random_state),
        search_space,
        n_jobs=optimal_jobs,
        cv=3,
        scoring='accuracy',
    )
    search.fit(balanced_X, balanced_y)

    # Final model: best parameters, fitted on all of the (resampled) data.
    tuned = LogisticRegression(max_iter=1000, **search.best_params_, random_state=random_state)
    tuned.fit(balanced_X, balanced_y)

    return tuned, scaler

### Random Forest Classification

In [None]:
def rf_v2(df):
    """Fit a tuned random forest: scale, optionally SMOTE, grid-search, refit.

    Args:
        df: Frame holding every column in the module-level ``features`` list
            plus the ``target`` column.

    Returns:
        tuple: ``(model, scaler)`` — the classifier refitted with the best
        grid-search parameters, and the fitted ``StandardScaler``.
    """
    inputs, labels = df[features], df[target]

    # Standardise, then rebalance the classes (custom_smote may be a no-op).
    scaler = StandardScaler()
    balanced_X, balanced_y = custom_smote(scaler.fit_transform(inputs), labels)

    # 3-fold cross-validated search, scored on accuracy.
    search_space = {
        'n_estimators': [100, 200],
        'max_depth': [4, 8, 16, None],
        'min_samples_split': [2, 5, 10],
        'class_weight': ['balanced'],
    }
    search = GridSearchCV(
        RandomForestClassifier(random_state=random_state),
        search_space,
        n_jobs=optimal_jobs,
        cv=3,
        scoring='accuracy',
    )
    search.fit(balanced_X, balanced_y)

    # Final model: best parameters, fitted on all of the (resampled) data.
    tuned = RandomForestClassifier(random_state=random_state, **search.best_params_)
    tuned.fit(balanced_X, balanced_y)

    return tuned, scaler

### XGBoost

In [None]:
def xgb_v2(df):
    """Fit a tuned XGBoost classifier: scale, optionally SMOTE, grid-search, refit.

    The ``-1/1`` target is remapped to ``0/1`` before fitting.

    Args:
        df: Frame holding every column in the module-level ``features`` list
            plus the ``target`` column.

    Returns:
        tuple: ``(model, scaler)`` — the classifier refitted with the best
        grid-search parameters, and the fitted ``StandardScaler``.
    """
    X = df[features]
    y = df[target].map({-1: 0, 1: 1})

    # 1. Feature scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 2. Balance the classes (custom_smote may be a no-op)
    X_res, y_res = custom_smote(X_scaled, y)

    # 3. Hyperparameter tuning with 3-fold cross-validation
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 6, 10],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1.0],
    }
    grid = GridSearchCV(
        XGBClassifier(eval_metric='logloss', random_state=random_state),
        param_grid,
        n_jobs=optimal_jobs,
        cv=3,
        scoring='accuracy'
    )
    grid.fit(X_res, y_res)

    # 4. Train the final model on all of the (resampled) data. It is built
    #    with the same base arguments as the estimator tuned above; the
    #    deprecated `use_label_encoder` flag is not passed (labels are
    #    already 0/1).
    model = XGBClassifier(eval_metric='logloss',
                          random_state=random_state, **grid.best_params_)
    model.fit(X_res, y_res)

    # 5. Return the model and the scaler
    return model, scaler

### K-Nearest Neighbors

In [None]:
def knn_v2(df):
    """Fit a tuned k-nearest-neighbours classifier: scale, optionally SMOTE, grid-search, refit.

    Args:
        df: Frame holding every column in the module-level ``features`` list
            plus the ``target`` column.

    Returns:
        tuple: ``(model, scaler)`` — the classifier refitted with the best
        grid-search parameters, and the fitted ``StandardScaler``.
    """
    inputs, labels = df[features], df[target]

    # Standardise, then rebalance the classes (custom_smote may be a no-op).
    scaler = StandardScaler()
    balanced_X, balanced_y = custom_smote(scaler.fit_transform(inputs), labels)

    # 3-fold cross-validated search, scored on accuracy.
    search_space = {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    }
    search = GridSearchCV(
        KNeighborsClassifier(),
        search_space,
        n_jobs=optimal_jobs,
        cv=3,
        scoring='accuracy',
    )
    search.fit(balanced_X, balanced_y)

    # Final model: best parameters, fitted on all of the (resampled) data.
    tuned = KNeighborsClassifier(**search.best_params_)
    tuned.fit(balanced_X, balanced_y)

    return tuned, scaler

### Support Vector Machine

In [None]:
def svm_v2(df):
    """Fit a tuned support vector classifier: scale, optionally SMOTE, grid-search, refit.

    Args:
        df: Frame holding every column in the module-level ``features`` list
            plus the ``target`` column.

    Returns:
        tuple: ``(model, scaler)`` — the classifier refitted with the best
        grid-search parameters, and the fitted ``StandardScaler``.
    """
    inputs, labels = df[features], df[target]

    # Standardise, then rebalance the classes (custom_smote may be a no-op).
    scaler = StandardScaler()
    balanced_X, balanced_y = custom_smote(scaler.fit_transform(inputs), labels)

    # 3-fold cross-validated search, scored on accuracy.
    search_space = {
        'C': [1, 10, 100],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto'],
        'class_weight': ['balanced'],
    }
    search = GridSearchCV(
        SVC(),
        search_space,
        n_jobs=optimal_jobs,
        cv=3,
        scoring='accuracy',
    )
    search.fit(balanced_X, balanced_y)

    # Final model: best parameters, fitted on all of the (resampled) data.
    tuned = SVC(**search.best_params_, random_state=random_state)
    tuned.fit(balanced_X, balanced_y)

    return tuned, scaler

## Model Result V2

In [None]:
# Train each V2 model on the training indicators; each call returns (model, scaler).
lr_model_v2_result, lr_scaler_v2_result = lr_v2(ind_train_df)
rf_model_v2_result, rf_scaler_v2_result = rf_v2(ind_train_df)
xgb_model_v2_result, xgb_scaler_v2_result = xgb_v2(ind_train_df)
knn_model_v2_result, knn_scaler_v2_result = knn_v2(ind_train_df)
svm_model_v2_result, svm_scaler_v2_result = svm_v2(ind_train_df)

# Held-out features and labels.
X_test = ind_test_df[features]
y_test = ind_test_df[target]

# Scale the test features with each model's own training-time scaler.
X_test_lr = lr_scaler_v2_result.transform(X_test)
X_test_rf = rf_scaler_v2_result.transform(X_test)
X_test_xgb = xgb_scaler_v2_result.transform(X_test)
X_test_knn = knn_scaler_v2_result.transform(X_test)
X_test_svm = svm_scaler_v2_result.transform(X_test)

# XGBoost was trained on 0/1 labels, so its test labels use the same encoding.
y_test_xgb = y_test.map({-1: 0, 1: 1})

# Score every model on the held-out set.
lr_evaluation_v2 = model_evaluation(lr_model_v2_result, features, target, X_test_lr, y_test)
rf_evaluation_v2 = model_evaluation(rf_model_v2_result, features, target, X_test_rf, y_test)
xgb_evaluation_v2 = model_evaluation(xgb_model_v2_result, features, target, X_test_xgb, y_test_xgb)
knn_evaluation_v2 = model_evaluation(knn_model_v2_result, features, target, X_test_knn, y_test)
svm_evaluation_v2 = model_evaluation(svm_model_v2_result, features, target, X_test_svm, y_test)

# Collect the metrics into one comparison table.
results_df_v2, result_features_v2 = show_model_results(lr_evaluation_v2, 
                                                       rf_evaluation_v2, xgb_evaluation_v2, 
                                                       knn_evaluation_v2, svm_evaluation_v2)

results_df_v2

## Model Training V3

- เพิ่มการทำ Dynamic Features Selection
- เพิ่ม Class Weight Hyperparameter

In [None]:
# Candidate inputs for the V3 models, grouped by the indicator family that produces them.
features = [
    # Simple and exponential moving averages
    'SMA_20', 'SMA_50', 'SMA_200', 'EMA_12', 'EMA_26',
    # Momentum
    'rsi', 'macd', 'macd_signal',
    # Bollinger Bands
    'bb_upper', 'bb_middle', 'bb_lower',
    # Features derived from the close and the averages
    'trend_strength',
    'price_sma20_dist', 'price_sma50_dist', 'price_sma200_dist',
    'price_ema12_dist', 'price_ema26_dist',
    'bullish_alignment',
]

# Label column: -1 / 1 direction of the next close.
target = 'price_direction'

In [None]:
def get_top_features_by_coef(model, top_n=10, feature_names=None):
    """Rank features by absolute coefficient and return the strongest names.

    Args:
        model: Fitted estimator exposing ``coef_``; only its first row
            (``coef_[0]``) is used.
        top_n: Maximum number of feature names to return.
        feature_names: Names matching the columns the model was fitted on, in
            the same order. Defaults to the module-level ``features`` list, so
            existing two-argument calls behave as before.

    Returns:
        List of up to ``top_n`` feature names, largest ``|coef|`` first.
    """
    # Resolve the global lazily so callers that pass explicit names never
    # depend on notebook state.
    names = features if feature_names is None else feature_names
    coef_abs = np.abs(model.coef_[0])
    feature_importance = pd.Series(coef_abs, index=names)
    top_feature_importance = feature_importance.sort_values(ascending=False).head(top_n)
    return top_feature_importance.index.tolist()

In [None]:
def get_top_features_by_importance(model, top_n=10, feature_names=None):
    """Rank features by ``feature_importances_`` and return the strongest names.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        top_n: Maximum number of feature names to return.
        feature_names: Names matching the columns the model was fitted on, in
            the same order. Defaults to the module-level ``features`` list, so
            existing two-argument calls behave as before.

    Returns:
        List of up to ``top_n`` feature names, most important first.
    """
    # Resolve the global lazily so callers that pass explicit names never
    # depend on notebook state.
    names = features if feature_names is None else feature_names
    importances = model.feature_importances_
    feature_importance = pd.Series(importances, index=names)
    top_feature_importance = feature_importance.sort_values(ascending=False).head(top_n)
    return top_feature_importance.index.tolist()

### Logistic Regression

In [None]:
def lr_v3(df):
    """Train the V3 logistic-regression model with coefficient-based feature selection.

    Steps: scale all ``features`` -> balance with ``custom_smote`` -> fit a
    preliminary model -> keep the top-ranked features (at most 15) -> re-scale
    and re-balance on that subset -> grid-search hyperparameters (including
    ``class_weight``) -> fit the final model.

    Args:
        df: Training frame containing every column in ``features`` and ``target``.

    Returns:
        Tuple ``(model, scaler_top, top_features)``: the fitted classifier,
        the scaler fitted on the selected columns, and the selected column names.
    """
    X = df[features]
    y = df[target]

    # 1. Feature scaling on the full feature set
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 2. Balance the classes (SMOTE) before feature selection
    X_res, y_res = custom_smote(X_scaled, y)

    # 3. Fit on every feature to obtain coefficients for ranking
    model_all = LogisticRegression(max_iter=1000, random_state=random_state, class_weight='balanced')
    model_all.fit(X_res, y_res)

    # 4. Keep the top-N features
    top_features = get_top_features_by_coef(model_all, top_n=min(15, len(features)))

    # 5. Rebuild the design matrix from the selected columns with a fresh scaler
    X_top = df[top_features]
    scaler_top = StandardScaler()
    X_top_scaled = scaler_top.fit_transform(X_top)

    # 6. Balance the classes again on the reduced matrix
    X_top_res, y_top_res = custom_smote(X_top_scaled, y)

    # 7. Hyperparameter tuning with cross-validation
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear'],
        'class_weight': ['balanced', {-1: 2, 1: 1}, {-1: 1, 1: 2}]
    }

    try:
        grid = GridSearchCV(
            LogisticRegression(max_iter=1000, random_state=random_state),
            param_grid,
            n_jobs=optimal_jobs,
            cv=3,
            scoring='accuracy'
        )
        grid.fit(X_top_res, y_top_res)
        best_params = grid.best_params_
    except Exception as exc:
        # Tuning is best-effort: fall back to fixed defaults, but report why
        # instead of hiding the failure (and let KeyboardInterrupt through).
        print(f"lr_v3: grid search failed ({exc}); using default hyperparameters")
        best_params = {'C': 1, 'solver': 'lbfgs', 'class_weight': 'balanced'}

    # 8. Train the final model
    model = LogisticRegression(max_iter=1000, **best_params, random_state=random_state)
    model.fit(X_top_res, y_top_res)

    # 9. Return the model, its scaler, and the selected features
    return model, scaler_top, top_features

### Random Forest Classification

In [None]:
def rf_v3(df):
    """Train the V3 random-forest model with importance-based feature selection.

    Steps: scale all ``features`` -> balance with ``custom_smote`` -> fit a
    preliminary forest -> keep the top-ranked features (at most 15) ->
    re-scale and re-balance on that subset -> grid-search hyperparameters
    (including ``class_weight``) -> fit the final model.

    Args:
        df: Training frame containing every column in ``features`` and ``target``.

    Returns:
        Tuple ``(model, scaler_top, top_features)``: the fitted classifier,
        the scaler fitted on the selected columns, and the selected column names.
    """
    X = df[features]
    y = df[target]

    # 1. Feature scaling on the full feature set
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 2. Balance the classes (SMOTE) before feature selection
    X_res, y_res = custom_smote(X_scaled, y)

    # 3. Fit on every feature to obtain importances for ranking
    model_all = RandomForestClassifier(random_state=random_state, class_weight='balanced')
    model_all.fit(X_res, y_res)

    # 4. Keep the top-N features
    top_features = get_top_features_by_importance(model_all, top_n=min(15, len(features)))

    # 5. Rebuild the design matrix from the selected columns with a fresh scaler
    X_top = df[top_features]
    scaler_top = StandardScaler()
    X_top_scaled = scaler_top.fit_transform(X_top)

    # 6. Balance the classes again on the reduced matrix
    X_top_res, y_top_res = custom_smote(X_top_scaled, y)

    # 7. Hyperparameter tuning with cross-validation
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [4, 8, 16, None],
        'min_samples_split': [2, 5, 10],
        'class_weight': ['balanced', {-1: 2, 1: 1}, {-1: 1, 1: 2}]
    }

    try:
        grid = GridSearchCV(
            RandomForestClassifier(random_state=random_state),
            param_grid,
            n_jobs=optimal_jobs,
            cv=3,
            scoring='accuracy'
        )
        grid.fit(X_top_res, y_top_res)
        best_params = grid.best_params_
    except Exception as exc:
        # Tuning is best-effort: fall back to fixed defaults, but report why
        # instead of hiding the failure (and let KeyboardInterrupt through).
        print(f"rf_v3: grid search failed ({exc}); using default hyperparameters")
        best_params = {'n_estimators': 100, 'max_depth': 8, 'min_samples_split': 5, 'class_weight': 'balanced'}

    # 8. Train the final model
    model = RandomForestClassifier(random_state=random_state, **best_params)
    model.fit(X_top_res, y_top_res)

    # 9. Return the model, its scaler, and the selected features
    return model, scaler_top, top_features

### XGBoost

In [None]:
def xgb_v3(df):
    """Train the V3 XGBoost model with importance-based feature selection.

    The target is remapped from -1/1 to 0/1 before fitting, so the returned
    model predicts 0/1 and callers must map predictions back themselves.

    Steps: scale all ``features`` -> balance with ``custom_smote`` -> fit a
    preliminary booster -> keep the top-ranked features (at most 15) ->
    re-scale and re-balance on that subset -> grid-search hyperparameters ->
    fit the final model.

    Args:
        df: Training frame containing every column in ``features`` and ``target``.

    Returns:
        Tuple ``(model, scaler_top, top_features)``: the fitted classifier,
        the scaler fitted on the selected columns, and the selected column names.
    """
    X = df[features]
    y = df[target].map({-1: 0, 1: 1})

    # 1. Feature scaling on the full feature set
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 2. Balance the classes (SMOTE) before feature selection
    X_res, y_res = custom_smote(X_scaled, y)

    # 3. Fit on every feature to obtain importances for ranking
    model_all = XGBClassifier(eval_metric='logloss', random_state=random_state)
    model_all.fit(X_res, y_res)

    # 4. Keep the top-N features
    top_n = min(15, len(features))
    top_features = get_top_features_by_importance(model_all, top_n=top_n)

    # 5. Rebuild the design matrix from the selected columns with a fresh scaler
    X_top = df[top_features]
    scaler_top = StandardScaler()
    X_top_scaled = scaler_top.fit_transform(X_top)

    # 6. Balance the classes again on the reduced matrix
    X_top_res, y_top_res = custom_smote(X_top_scaled, y)

    # 7. Hyperparameter tuning with cross-validation
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 6, 10],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1.0],
    }

    try:
        grid = GridSearchCV(
            XGBClassifier(eval_metric='logloss', random_state=random_state),
            param_grid,
            n_jobs=optimal_jobs,
            cv=3,
            scoring='accuracy'
        )
        grid.fit(X_top_res, y_top_res)
        best_params = grid.best_params_
    except Exception as exc:
        # Tuning is best-effort: fall back to fixed defaults, but report why
        # instead of hiding the failure (and let KeyboardInterrupt through).
        print(f"xgb_v3: grid search failed ({exc}); using default hyperparameters")
        best_params = {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1, 'subsample': 0.8}

    # 8. Train the final model
    model = XGBClassifier(eval_metric='logloss', random_state=random_state, **best_params)
    model.fit(X_top_res, y_top_res)

    # 9. Return the model, its scaler, and the selected features
    return model, scaler_top, top_features

### K-Nearest Neighbors

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

def knn_v3(df):
    """Train the V3 k-nearest-neighbours model with univariate feature selection.

    KNN exposes neither coefficients nor importances, so features are ranked
    with ``SelectKBest(f_classif)`` on the scaled, balanced data instead.

    Steps: scale all ``features`` -> balance with ``custom_smote`` -> select
    the k best features (at most 15) -> re-scale and re-balance on that
    subset -> grid-search hyperparameters -> fit the final model.

    Args:
        df: Training frame containing every column in ``features`` and ``target``.

    Returns:
        Tuple ``(model, scaler_top, top_features)``: the fitted classifier,
        the scaler fitted on the selected columns, and the selected column names.
    """
    X = df[features]
    y = df[target]

    # 1. Feature scaling on the full feature set
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 2. Balance the classes (SMOTE) before feature selection
    X_res, y_res = custom_smote(X_scaled, y)

    # 3. Feature selection with SelectKBest; only the support mask is needed,
    #    so fit without materialising the transformed matrix.
    top_n = min(15, len(features))
    selector = SelectKBest(score_func=f_classif, k=top_n)
    selector.fit(X_res, y_res)
    mask = selector.get_support()
    top_features = [feature for feature, selected in zip(features, mask) if selected]

    # 4. Rebuild the design matrix from the selected columns with a fresh scaler
    X_top = df[top_features]
    scaler_top = StandardScaler()
    X_top_scaled = scaler_top.fit_transform(X_top)

    # 5. Balance the classes again on the reduced matrix
    X_top_res, y_top_res = custom_smote(X_top_scaled, y)

    # 6. Hyperparameter tuning with cross-validation
    param_grid = {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    }

    try:
        grid = GridSearchCV(KNeighborsClassifier(),
                            param_grid,
                            n_jobs=optimal_jobs,
                            cv=3,
                            scoring='accuracy')
        grid.fit(X_top_res, y_top_res)
        best_params = grid.best_params_
    except Exception as exc:
        # Tuning is best-effort: fall back to fixed defaults, but report why
        # instead of hiding the failure (and let KeyboardInterrupt through).
        print(f"knn_v3: grid search failed ({exc}); using default hyperparameters")
        best_params = {'n_neighbors': 5, 'weights': 'uniform', 'metric': 'euclidean'}

    # 7. Train the final model
    model = KNeighborsClassifier(**best_params)
    model.fit(X_top_res, y_top_res)

    # 8. Return the model, its scaler, and the selected features
    return model, scaler_top, top_features

### Support Vector Machine

In [None]:
def svm_v3(df):
    """Train the V3 support-vector model with coefficient-based feature selection.

    A linear-kernel SVC is fitted first because only the linear kernel exposes
    ``coef_`` for ranking; the final model's kernel is chosen by grid search.

    Steps: scale all ``features`` -> balance with ``custom_smote`` -> fit a
    preliminary linear SVC -> keep the top-ranked features (at most 15) ->
    re-scale and re-balance on that subset -> grid-search hyperparameters
    (including ``class_weight``) -> fit the final model.

    Args:
        df: Training frame containing every column in ``features`` and ``target``.

    Returns:
        Tuple ``(model, scaler_top, top_features)``: the fitted classifier,
        the scaler fitted on the selected columns, and the selected column names.
    """
    X = df[features]
    y = df[target]

    # 1. Feature scaling on the full feature set
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 2. Balance the classes (SMOTE) before feature selection
    X_res, y_res = custom_smote(X_scaled, y)

    # 3. Fit on every feature to obtain coefficients for ranking
    model_all = SVC(kernel='linear', class_weight='balanced', random_state=random_state)
    model_all.fit(X_res, y_res)

    # 4. Keep the top-N features
    top_features = get_top_features_by_coef(model_all, top_n=min(15, len(features)))

    # 5. Rebuild the design matrix from the selected columns with a fresh scaler
    X_top = df[top_features]
    scaler_top = StandardScaler()
    X_top_scaled = scaler_top.fit_transform(X_top)

    # 6. Balance the classes again on the reduced matrix
    X_top_res, y_top_res = custom_smote(X_top_scaled, y)

    # 7. Hyperparameter tuning with cross-validation
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto'],
        'class_weight': ['balanced', {-1: 2, 1: 1}, {-1: 1, 1: 2}]
    }

    try:
        grid = GridSearchCV(SVC(random_state=random_state),
                            param_grid,
                            n_jobs=optimal_jobs,
                            cv=3,
                            scoring='accuracy')
        grid.fit(X_top_res, y_top_res)
        best_params = grid.best_params_
    except Exception as exc:
        # Tuning is best-effort: fall back to fixed defaults, but report why
        # instead of hiding the failure (and let KeyboardInterrupt through).
        print(f"svm_v3: grid search failed ({exc}); using default hyperparameters")
        best_params = {'C': 1, 'kernel': 'rbf', 'gamma': 'scale', 'class_weight': 'balanced'}

    # 8. Train the final model
    model = SVC(**best_params, random_state=random_state)
    model.fit(X_top_res, y_top_res)

    # 9. Return the model, its scaler, and the selected features
    return model, scaler_top, top_features

## Model Result V3

In [None]:
# --- V3: each trainer also returns the feature subset it selected ----------
lr_model_v3_result, lr_scaler_v3_result, lr_features_v3 = lr_v3(ind_train_df)
rf_model_v3_result, rf_scaler_v3_result, rf_features_v3 = rf_v3(ind_train_df)
xgb_model_v3_result, xgb_scaler_v3_result, xgb_features_v3 = xgb_v3(ind_train_df)
knn_model_v3_result, knn_scaler_v3_result, knn_features_v3 = knn_v3(ind_train_df)
svm_model_v3_result, svm_scaler_v3_result, svm_features_v3 = svm_v3(ind_train_df)

# --- Held-out labels; the XGBoost model is scored against 0/1 labels -------
y_test = ind_test_df[target]
y_test_xgb = y_test.map({-1: 0, 1: 1})

# Slice the test frame down to each model's own selected columns ...
X_test_lr, X_test_rf, X_test_xgb, X_test_knn, X_test_svm = [
    ind_test_df[selected_columns]
    for selected_columns in (
        lr_features_v3,
        rf_features_v3,
        xgb_features_v3,
        knn_features_v3,
        svm_features_v3,
    )
]

# ... and scale each slice with the scaler fitted alongside that model.
(
    X_test_lr_scaled,
    X_test_rf_scaled,
    X_test_xgb_scaled,
    X_test_knn_scaled,
    X_test_svm_scaled,
) = [
    fitted_scaler.transform(X_subset)
    for fitted_scaler, X_subset in (
        (lr_scaler_v3_result, X_test_lr),
        (rf_scaler_v3_result, X_test_rf),
        (xgb_scaler_v3_result, X_test_xgb),
        (knn_scaler_v3_result, X_test_knn),
        (svm_scaler_v3_result, X_test_svm),
    )
]

# --- Evaluate in the fixed order LR, RF, XGB, KNN, SVM ---------------------
(
    lr_evaluation_v3,
    rf_evaluation_v3,
    xgb_evaluation_v3,
    knn_evaluation_v3,
    svm_evaluation_v3,
) = [
    model_evaluation(estimator, selected_columns, target, X_eval, y_eval)
    for estimator, selected_columns, X_eval, y_eval in (
        (lr_model_v3_result, lr_features_v3, X_test_lr_scaled, y_test),
        (rf_model_v3_result, rf_features_v3, X_test_rf_scaled, y_test),
        (xgb_model_v3_result, xgb_features_v3, X_test_xgb_scaled, y_test_xgb),
        (knn_model_v3_result, knn_features_v3, X_test_knn_scaled, y_test),
        (svm_model_v3_result, svm_features_v3, X_test_svm_scaled, y_test),
    )
]

results_df_v3, result_features_v3 = show_model_results(
    lr_evaluation_v3,
    rf_evaluation_v3,
    xgb_evaluation_v3,
    knn_evaluation_v3,
    svm_evaluation_v3,
)

results_df_v3

## Comparison

In [None]:
# One subplot per metric, three side-by-side bars (V1 / V2 / V3) per model.
metrics = [
    'accuracy',
    'precision_sell', 'precision_buy',
    'recall_sell', 'recall_buy',
    'f1_sell', 'f1_buy',
    'roc_auc',
]

models = results_df_v1.index.tolist()
x = np.arange(len(models))
width = 0.25

fig, axes = plt.subplots(len(metrics), 1, figsize=(14, 4 * len(metrics)), sharey=True)

# Bars are drawn left to right in this order; slot 1 sits on the tick itself.
version_frames = (('V1', results_df_v1), ('V2', results_df_v2), ('V3', results_df_v3))

for ax, metric in zip(axes, metrics):
    for slot, (version_label, version_df) in enumerate(version_frames):
        ax.bar(x + (slot - 1) * width, version_df[metric], width, label=version_label)
    ax.set_title(metric)
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=20)
    ax.legend()

plt.tight_layout()
plt.show()

# 5. Picking Model

ทดสอบกับหลายๆ เหรียญ แล้วหา Model ที่ดีที่สุด

In [None]:
def _build_labelled_indicator_frame(raw_df):
    """Add indicators to raw candles, index by timestamp, and label next-candle direction."""
    frame = add_all_indicators(raw_df).set_index("timestamp")
    frame = frame.dropna(subset=['bb_lower', 'bb_upper', 'bb_middle', 'bb_std', 'rsi']).copy()

    # -1 when the next close is lower than the current close, otherwise 1.
    frame['price_direction'] = np.where(
        frame['close'].shift(-1) < frame['close'], -1, 1
    )

    # The final candle has no successor: shift(-1) yields NaN, the comparison is
    # False, and the row would always be labelled 1. Drop it rather than
    # train or score on a fabricated label.
    return frame.iloc[:-1].copy()


def create_train_test_ind_df(symbol):
    """Download 5-minute candles for ``symbol`` and build labelled train/test frames.

    Training data comes from a 4-day request and test data from a 1-day
    request; test rows are restricted to timestamps later than the newest
    training candle so the two sets do not overlap.
    NOTE(review): this presumes ``collect_historical_data`` returns a window
    such that the 1-day request extends past the 4-day one — confirm, since
    otherwise the test frame will be (nearly) empty.

    Args:
        symbol: Trading pair symbol passed to ``collect_historical_data``.

    Returns:
        Tuple ``(ind_train_df, ind_test_df)``, both indexed by timestamp, with
        indicator columns and a ``price_direction`` label (-1 or 1). The last
        candle of each frame is omitted because its label is unknown.
    """
    api = BinanceAPI(api_key, api_secret)
    train_df = collect_historical_data(api, symbol, interval="5m", days=4)
    test_df = collect_historical_data(api, symbol, interval="5m", days=1)
    test_df = test_df[test_df['timestamp'] > train_df['timestamp'].max()]

    ind_train_df = _build_labelled_indicator_frame(train_df)
    ind_test_df = _build_labelled_indicator_frame(test_df)

    return ind_train_df, ind_test_df

In [None]:
def select_best_model(eval_df):
    """Pick the model name with the best composite score from an evaluation table.

    Candidates must first pass a strict filter (both classes predicted,
    minority prediction share >= 5%, accuracy > 0.45, ROC AUC > 0.5). If none
    pass, a relaxed filter (both classes predicted, minority share >= 1%) is
    tried; if that is empty too, the highest-accuracy model is returned
    without scoring.

    Args:
        eval_df: DataFrame indexed by model name with columns
            ``prediction_diversity``, ``min_pred_ratio``, ``accuracy``,
            ``roc_auc``, ``recall_sell``, ``recall_buy``, ``f1_sell``, ``f1_buy``.

    Returns:
        Index label of the selected model.
    """
    predicts_both_classes = eval_df['prediction_diversity'] >= 2

    strict_mask = (
        predicts_both_classes
        & (eval_df['min_pred_ratio'] >= 0.05)
        & (eval_df['accuracy'] > 0.45)
        & (eval_df['roc_auc'] > 0.5)
    )
    valid_models = eval_df[strict_mask].copy()

    if valid_models.empty:
        print("\n--- No models meet the first criteria! ---")

        relaxed_mask = predicts_both_classes & (eval_df['min_pred_ratio'] >= 0.01)
        valid_models = eval_df[relaxed_mask].copy()

        if valid_models.empty:
            print("\nNo Model with prediction_diversity >= 2. So selecting model with highest accuracy...")
            return eval_df['accuracy'].idxmax()

    print("\n=== Calculating Composite Scores ===")
    weights = {
        'accuracy': 0.15,
        'balanced_f1': 0.30,
        'balanced_recall': 0.25,
        'roc_auc': 0.10,
        'prediction_stability': 0.20
    }

    # Column-wise arithmetic; terms are added in the same order as the weights above.
    valid_models['balanced_f1'] = (valid_models['f1_sell'] + valid_models['f1_buy']) / 2
    valid_models['balanced_recall'] = (valid_models['recall_sell'] + valid_models['recall_buy']) / 2
    # Closer to a 50/50 prediction split => closer to 1.
    valid_models['prediction_stability'] = 1 - (valid_models['min_pred_ratio'] - 0.5).abs()
    valid_models['composite_score'] = (
        weights['accuracy'] * valid_models['accuracy'] +
        weights['balanced_f1'] * valid_models['balanced_f1'] +
        weights['balanced_recall'] * valid_models['balanced_recall'] +
        weights['roc_auc'] * valid_models['roc_auc'] +
        weights['prediction_stability'] * valid_models['prediction_stability']
    )

    best_model_name = valid_models['composite_score'].idxmax()
    winner = valid_models.loc[best_model_name]

    print("\n=== Selected Model Details ===")
    print(f"Best Model: {best_model_name}")
    print(f"Accuracy: {winner['accuracy']:.4f}")
    print(f"Prediction Diversity: {winner['prediction_diversity']}")
    print(f"Min Prediction Ratio: {winner['min_pred_ratio']:.4f}")
    print(f"ROC AUC: {winner['roc_auc']:.4f}")
    print(f"Balanced F1: {winner['balanced_f1']:.4f}")
    print(f"Balanced Recall: {winner['balanced_recall']:.4f}")

    return best_model_name

In [None]:
from sklearn.model_selection import TimeSeriesSplit

def _cross_validate_candidate(model_name, model, scaler, X, y, n_splits=3):
    """Score one candidate with time-ordered CV and return its evaluation row as a dict."""
    # Models whose name contains 'XGB' are trained on 0/1 labels and predict 0/1.
    is_xgb = 'XGB' in model_name

    fold_accuracies = []
    pooled_predictions = []
    pooled_labels = []

    for train_idx, val_idx in TimeSeriesSplit(n_splits=n_splits).split(X):
        X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
        y_train_cv, y_val_cv = y.iloc[train_idx], y.iloc[val_idx]

        # Fresh scaler of the same class, fitted on this fold's training part only
        scaler_cv = scaler.__class__()
        X_train_cv_scaled = scaler_cv.fit_transform(X_train_cv)
        X_val_cv_scaled = scaler_cv.transform(X_val_cv)

        # Unfitted copy of the candidate carrying the same hyperparameters
        model_cv = model.__class__(**model.get_params())
        y_fit = y_train_cv.map({-1: 0, 1: 1}) if is_xgb else y_train_cv
        model_cv.fit(X_train_cv_scaled, y_fit)

        y_pred_cv = model_cv.predict(X_val_cv_scaled)
        if is_xgb:
            # Map 0/1 predictions back to -1/1 so every model is scored on the same labels
            y_pred_cv = np.where(y_pred_cv == 0, -1, 1)

        fold_accuracies.append(accuracy_score(y_val_cv, y_pred_cv))
        pooled_predictions.extend(y_pred_cv)
        pooled_labels.extend(y_val_cv)

    # Metrics below are computed on predictions pooled across all folds
    pooled_predictions = np.array(pooled_predictions)
    pooled_labels = np.array(pooled_labels)

    unique_preds = np.unique(pooled_predictions)
    min_pred_ratio = pd.Series(pooled_predictions).value_counts(normalize=True).min()

    # labels=[-1, 1] fixes the output order: index 0 is sell, index 1 is buy
    recall_sell, recall_buy = recall_score(
        pooled_labels, pooled_predictions, average=None, labels=[-1, 1], zero_division=0)
    f1_sell, f1_buy = f1_score(
        pooled_labels, pooled_predictions, average=None, labels=[-1, 1], zero_division=0)

    try:
        # Computed from hard class predictions, not probabilities
        roc_auc = roc_auc_score(pooled_labels, pooled_predictions)
    except ValueError:
        # Raised when the pooled true labels contain a single class
        roc_auc = 0.5

    return {
        'model': model_name,
        'accuracy': np.mean(fold_accuracies),
        'prediction_diversity': len(unique_preds),
        'min_pred_ratio': min_pred_ratio,
        'recall_sell': recall_sell,
        'recall_buy': recall_buy,
        'f1_sell': f1_sell,
        'f1_buy': f1_buy,
        'roc_auc': roc_auc
    }


def get_best_model(ind_train_df):
    """Train every V1/V2/V3 candidate and return the one chosen by ``select_best_model``.

    All fifteen candidates are fitted on the full training frame. Each is then
    scored with a 3-split ``TimeSeriesSplit`` in which a fresh scaler and an
    unfitted copy of the model (same hyperparameters) are refitted per fold on
    the raw fold data. The resulting table is handed to ``select_best_model``.
    A candidate whose evaluation raises is kept in the table with zeroed
    metrics so it cannot be selected by the scored path.

    Args:
        ind_train_df: Training frame with the ``features`` columns and ``target``.

    Returns:
        Tuple ``(best_model, best_scaler, best_features)`` for the model fitted
        on the full training frame, or ``(None, None, None)`` when the frame
        is empty.
    """
    if ind_train_df.shape[0] == 0:
        return None, None, None

    models = {}
    scalers = {}
    model_features = {}

    print("\n=== Training All Models ===")

    # V1 / V2 trainers return (model, scaler) and use the full feature list.
    # Order is preserved so the evaluation table keeps its original row order.
    full_feature_trainers = (
        ('LR_V1', lr_v1), ('RF_V1', rf_v1), ('XGB_V1', xgb_v1), ('KNN_V1', knn_v1), ('SVM_V1', svm_v1),
        ('LR_V2', lr_v2), ('RF_V2', rf_v2), ('XGB_V2', xgb_v2), ('KNN_V2', knn_v2), ('SVM_V2', svm_v2),
    )
    for name, train in full_feature_trainers:
        models[name], scalers[name] = train(ind_train_df)
        model_features[name] = features

    # V3 trainers additionally return the feature subset they selected.
    selecting_trainers = (
        ('LR_V3', lr_v3), ('RF_V3', rf_v3), ('XGB_V3', xgb_v3), ('KNN_V3', knn_v3), ('SVM_V3', svm_v3),
    )
    for name, train in selecting_trainers:
        models[name], scalers[name], model_features[name] = train(ind_train_df)

    print("\n=== Cross-Validation Model Evaluation ===")

    eval_data = []
    for model_name, model in models.items():
        try:
            row = _cross_validate_candidate(
                model_name,
                model,
                scalers[model_name],
                ind_train_df[model_features[model_name]],
                ind_train_df[target],
            )
        except Exception as e:
            # Keep going with the remaining candidates; record neutral metrics.
            print(f"{model_name}: Error - {e}")
            row = {
                'model': model_name,
                'accuracy': 0.0,
                'prediction_diversity': 0,
                'min_pred_ratio': 0.0,
                'recall_sell': 0.0,
                'recall_buy': 0.0,
                'f1_sell': 0.0,
                'f1_buy': 0.0,
                'roc_auc': 0.5
            }
        else:
            print(f"{model_name}: CV Accuracy={row['accuracy']:.4f}, "
                  f"Diversity={row['prediction_diversity']}, "
                  f"Min_ratio={row['min_pred_ratio']:.4f}")
        eval_data.append(row)

    eval_df = pd.DataFrame(eval_data)
    eval_df.set_index('model', inplace=True)

    print("\n=== Model Selection ===")
    best_model_name = select_best_model(eval_df)

    if best_model_name not in models:
        print(f"Selected model {best_model_name} not found, falling back to highest accuracy")
        best_model_name = eval_df['accuracy'].idxmax()

    best_model = models[best_model_name]
    best_scaler = scalers[best_model_name]
    best_features = model_features[best_model_name]

    return best_model, best_scaler, best_features

# 6. Simulation

In [None]:
def _replay_fixed_stake_trades(y_pred, y_true, initial_balance=1000, stake=10):
    """Replay predictions as a position-flipping strategy; return (balance, trades, correct)."""
    balance = initial_balance
    position = 0  # 0 before the first signal, 1 after a buy signal, -1 after a sell signal
    trades = 0
    correct_predictions = 0
    actuals = list(y_true)

    for predicted_direction, actual_direction in zip(y_pred, actuals):
        if predicted_direction == actual_direction:
            correct_predictions += 1

        # NOTE(review): the closing P&L is a fixed stake keyed on this row's
        # actual direction, not on the price move while the position was
        # held — confirm this is the intended scoring.
        if predicted_direction == 1 and position <= 0:  # Buy signal
            if position == -1:
                trades += 1
                balance += -stake if actual_direction == 1 else stake
            position = 1
            trades += 1

        elif predicted_direction == -1 and position >= 0:  # Sell signal
            if position == 1:
                trades += 1
                balance += stake if actual_direction == 1 else -stake
            position = -1
            trades += 1

    # Close whatever is still open against the last observed direction.
    if position != 0:
        trades += 1
        balance += stake if position == actuals[-1] else -stake

    return balance, trades, correct_predictions


def simulate(symbol=None):
    """Train on recent data for ``symbol``, test on newer data, and replay a toy strategy.

    When ``symbol`` is falsy the user is prompted (repeatedly, until a symbol
    with usable data is entered). When a symbol is supplied and yields no
    usable train or test rows, a message is printed and the function returns.
    Data is downloaded once per attempted symbol.

    Args:
        symbol: Trading pair symbol, or ``None`` to prompt interactively.

    Returns:
        None. All results are printed.
    """
    api = BinanceAPI(api_key, api_secret)
    interactive = not symbol

    while True:
        if interactive:
            print("Available symbols (Some might not work...):")
            for s in api.get_n_symbol(10):
                print(s)

            symbol = input("Enter a symbol: ").strip().upper()
            print(f"\nSelected symbol: {symbol}")

        ind_train_df, ind_test_df = create_train_test_ind_df(symbol)
        if ind_train_df.shape[0] > 0 and ind_test_df.shape[0] > 0:
            break

        if not interactive:
            # Nothing to retry with: stop here instead of carrying empty
            # frames into training and scoring.
            print(f"{symbol} is invalid or has insufficient data.")
            return
        print(f"{symbol} is invalid or has insufficient data. Try again")

    print(f"\nTrain set size: {ind_train_df.shape}, Test set size: {ind_test_df.shape}")

    print("\nTrain target distribution:")
    print(ind_train_df['price_direction'].value_counts(normalize=True))
    print("\nTest target distribution:")
    print(ind_test_df['price_direction'].value_counts(normalize=True))

    # Train with train data
    best_model, best_scaler, best_features = get_best_model(ind_train_df)

    if best_model is None:
        print("No valid model!")
        return

    print(f"Best model: {type(best_model).__name__}")
    print(f"Features used: {best_features}")

    # Test with test data
    print("\n=== Testing on Real Test Data ===")
    x_test = ind_test_df[best_features]
    y_test = ind_test_df['price_direction']

    x_test_scaled = best_scaler.transform(x_test)
    y_pred = best_model.predict(x_test_scaled)

    # XGBoost models are trained on 0/1 labels; map predictions back to -1/1.
    if 'XGB' in str(type(best_model)):
        y_pred = np.where(y_pred == 0, -1, 1)

    # Evaluation
    print("\n=== Test Results ===")
    accuracy = accuracy_score(y_test, y_pred)
    print(f"📊 Accuracy: {accuracy:.4f}")

    unique_preds = np.unique(y_pred)
    pred_counts = pd.Series(y_pred).value_counts(normalize=True)
    print(f"Prediction diversity: {len(unique_preds)} classes")
    print("Prediction distribution:")
    for pred, count in pred_counts.items():
        print(f"Class {pred}: {count:.2%}")

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Trading simulation
    print("\n=== Trading Strategy Simulation ===")
    initial_balance = 1000
    balance, trades, correct_predictions = _replay_fixed_stake_trades(
        y_pred, y_test, initial_balance=initial_balance
    )

    total_return = (balance - initial_balance) / initial_balance * 100
    prediction_accuracy = (correct_predictions / len(y_pred) * 100) if len(y_pred) > 0 else 0

    print(f"💵 Initial balance: ${initial_balance}")
    print(f"💰 Final balance: ${balance:.2f}")
    print(f"📈 Total return: {total_return:.2f}%")
    print(f"🎯 Prediction accuracy: {prediction_accuracy:.2f}%")
    print(f"📊 Total trades: {trades}")

    # Additional analysis
    print("\n=== Additional Analysis ===")
    print(f"🔮 Total predictions: {len(y_pred)}")
    print(f"✅ Correct predictions: {correct_predictions}")
    print(f"❌ Wrong predictions: {len(y_pred) - correct_predictions}")

    if len(unique_preds) < 2:
        print(f"⚠️ WARNING: Model only predicts {len(unique_preds)} class(es)")
        print("🔄 This indicates potential overfitting or poor model selection")
        print(f"📊 Model prediction distribution: {dict(pred_counts)}")

In [None]:
# Run the end-to-end simulation for a preset pair (skips the interactive prompt).
simulate(symbol="ETHBTC")