In [52]:
import pandas as pd
import json
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support, classification_report
from imblearn.over_sampling import SMOTE
import numpy as np
import joblib

data = {}

def load_json(file_path):
    with open(file_path, 'r', encoding="utf-8") as file:
        return json.load(file)


def preprocess_token_data(data: dict) -> pd.DataFrame:
    records = []
    for mint, token in data.items():
        for price_change in token.get("marketCapHistory", []):
            timestamp = price_change.get("timestamp")
            if timestamp > 10*10:
                timestamp = timestamp/1000
            records.append({
                "timestamp": timestamp,
                "marketCap": price_change.get("marketCap", 0),
                "symbol": token.get("symbol", "Unknown"),
                "volume": token.get("volume", 0),
                "numHolders": token.get("numHolders", 0),
                "sniperCount": token.get("sniperCount", 0),
                "progress": token.get("progress", 0),
                "buySellRatio": token.get("buySellRatio", 0),
                "liquidity": token.get("liquidity", 0),
                "volatility": token.get("volatility", 0)
            })

        for neo in token.get("Neo", []):
            records.append({
                "symbol": token.get("symbol", "Unknown"),
                "holderCounts": neo.get("holderCounts", 0),
                "tagsCount": neo.get("tagsCount", 0),
                "devHoldingSupplyPerc": neo.get("devHoldingSupplyPerc", 0),
                "insiderWalletsSupplyPerc": neo.get("insiderWalletsSupplyPerc", 0)
            })

        for trades in token.get("trades", []):
            records.append({
                "timestamp": trades.get("timestamp"),
                "symbol": token.get("symbol", "Unknown"),
                "solAmount": trades.get("solAmount", 0),
                "isBuy": trades.get("isBuy", 0),
            })

    return pd.DataFrame(records)


files = [
    'data/coiny_bez_limitu6.json',
    'data/coiny_bez_limitu7.json',
    'data/coiny_bez_limitu8.json',
    'data/coiny_bez_limitu9.json',
    'data/coiny_bez_limitu10.json',
    'data/coiny_bez_limitu11.json',
]

for file in files:
    data.update(load_json(file))

df = preprocess_token_data(data)
df

Unnamed: 0,timestamp,marketCap,symbol,volume,numHolders,sniperCount,progress,buySellRatio,liquidity,volatility,holderCounts,tagsCount,devHoldingSupplyPerc,insiderWalletsSupplyPerc,solAmount,isBuy
0,1.738860e+06,7887.522187,СОСКY,15.4587,21.0,10.0,24.06,0.0,0.0,0.0,,,,,,
1,1.738860e+06,7892.808009,СОСКY,15.4587,21.0,10.0,24.06,0.0,0.0,0.0,,,,,,
2,1.738860e+09,7885.040000,СОСКY,15.4587,21.0,10.0,24.06,0.0,0.0,0.0,,,,,,
3,1.738860e+06,7935.138594,СОСКY,15.4587,21.0,10.0,24.06,0.0,0.0,0.0,,,,,,
4,1.738860e+06,7973.608513,СОСКY,15.4587,21.0,10.0,24.06,0.0,0.0,0.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
409964,1.739034e+09,,OFI,,,,,,,,,,,,2.500000e+09,True
409965,1.739034e+06,5432.958982,LCSE,0.0000,0.0,0.0,0.00,0.0,0.0,0.0,,,,,,
409966,1.739034e+09,,LCSE,,,,,,,,,,,,2.005390e+07,True
409967,1.739034e+06,5476.591540,ice,0.0000,0.0,0.0,0.00,0.0,0.0,0.0,,,,,,


In [53]:
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
df.sort_values(['symbol', 'timestamp'], inplace=True)

df['marketCap_change'] = df.groupby('symbol')['marketCap'].pct_change().fillna(0)
df['marketCap_ema'] = df.groupby('symbol')['marketCap'].transform(lambda x: x.ewm(span=7).mean())
df['rsi'] = df.groupby('symbol')['marketCap'].transform(lambda x:
    100 - (100 / (1 + x.pct_change().fillna(0).rolling(window=7).mean() / x.pct_change().fillna(0).rolling(window=7).std())))

df['marketCap_change_ratio'] = (df['marketCap'] - df['marketCap'].shift(1)) / df['marketCap'].shift(1)
df['holders_growth'] = df['numHolders'].diff().fillna(0)
df['volatility_to_volume_ratio'] = df['volatility'] / (df['volume'] + 1)

## Normalization
scaler = StandardScaler()
features_to_scale = ['marketCap', 'volume', 'numHolders', 'volatility']
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

df

Unnamed: 0,timestamp,marketCap,symbol,volume,numHolders,sniperCount,progress,buySellRatio,liquidity,volatility,...,devHoldingSupplyPerc,insiderWalletsSupplyPerc,solAmount,isBuy,marketCap_change,marketCap_ema,rsi,marketCap_change_ratio,holders_growth,volatility_to_volume_ratio
120353,1970-01-01 00:00:00.001738867,-0.684289,,-0.386521,-0.468673,2.0,8.19,0.0,0.0,0.0,...,,,,,0.000000,5922.552091,,,0.0,0.0
120354,1970-01-01 00:00:00.001738867,-0.684289,,-0.386521,-0.468673,2.0,8.19,0.0,0.0,0.0,...,,,,,0.000000,5922.552091,,0.000000,0.0,0.0
120356,1970-01-01 00:00:00.001738867,-0.696512,,-0.386521,-0.468673,2.0,8.19,0.0,0.0,0.0,...,,,,,-0.028486,5849.596030,,-0.028486,0.0,0.0
120357,1970-01-01 00:00:00.001738867,-0.696512,,-0.386521,-0.468673,2.0,8.19,0.0,0.0,0.0,...,,,,,0.000000,5814.577120,,0.000000,0.0,0.0
120358,1970-01-01 00:00:00.001738867,-0.696512,,-0.386521,-0.468673,2.0,8.19,0.0,0.0,0.0,...,,,,,0.000000,5794.668803,,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124134,1970-01-01 00:00:01.738868202,,🪦,,,,,,,,...,,,20000000.0,True,0.000000,5560.532010,,,0.0,
124135,1970-01-01 00:00:01.738868202,,🪦,,,,,,,,...,,,225143630.0,False,0.000000,5560.532010,,,0.0,
124130,1970-01-01 00:00:01.738868203,-0.727690,🪦,-0.348167,-0.486215,7.0,0.09,0.0,0.0,0.0,...,,,,,-0.047634,5329.815208,-60.762522,,0.0,0.0
123649,NaT,,🪦,,,,,,,,...,5.1095,0.0,,,0.000000,5329.815208,-60.762522,,0.0,


In [54]:
def label_sell_opportunity(group):
    window_size = 10
    sell_threshold = 5

    group['rolling_max'] = group['marketCap'].rolling(window=window_size, min_periods=1).max()
    drawdown = (group['rolling_max'] - group['marketCap']) / group['rolling_max'] * 100
    sell_label = (drawdown > sell_threshold).astype(int)

    group['sell_signal_confidence'] = ((group['rsi'] > 65) & (sell_label == 1)).astype(int)

    return group['sell_signal_confidence']


df['label'] = df.groupby('symbol', group_keys=False, observed=True).apply(label_sell_opportunity)
df.fillna(0, inplace=True)

features = ['marketCap_change', 'marketCap_ema', 'rsi', 'marketCap_change_ratio', 'holders_growth', 'volatility_to_volume_ratio']
X = df[features]
y = df['label']

X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.mean())

In [55]:
df['label'].value_counts()

label
0    409848
1       121
Name: count, dtype: int64

## Improving Class Distribution in Training & Test Sets

In this section, I handle **class imbalance** using **SMOTETomek** for the training set and **downsampling** for the test set.

1. **SMOTETomek is applied to `X_train`** to **oversample the minority class (`1`)** and **remove noisy samples** from the majority class (`0`), creating a more balanced dataset for training.
2. **For the test set (`X_test`)**, I downsample class `0` to be **20× the size of class `1`**, ensuring a better-balanced evaluation while keeping real-world class ratios.

This helps the model **generalize better** and prevents it from being biased toward class `0` while still reflecting realistic market conditions as we can see in cell above


In [56]:
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek
from sklearn.utils import resample
import pandas as pd

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)

smote_tomek = SMOTETomek(random_state=42)
X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train, y_train)

print("Training Set (After SMOTETomek):")
print(y_train_resampled.value_counts())

print("\nTest Set (Before Balancing):")
print(y_test.value_counts())

test_df = X_test.copy()
test_df['label'] = y_test

# Separate majority and minority classes
test_majority = test_df[test_df['label'] == 0]
test_minority = test_df[test_df['label'] == 1]

test_majority_downsampled = resample(
    test_majority,
    replace=False,
    n_samples=len(test_minority) * 20,
    random_state=42
)

balanced_test_df = pd.concat([test_majority_downsampled, test_minority])

X_test = balanced_test_df.drop(columns=['label'])
y_test = balanced_test_df['label']

# Checking distribution
print("\nTest Set (After Balancing):")
print(y_test.value_counts())


Training Set (After SMOTETomek):
label
0    245872
1    245872
Name: count, dtype: int64

Test Set (Before Balancing):
label
0    163940
1        48
Name: count, dtype: int64

Test Set (After Balancing):
label
0    960
1     48
Name: count, dtype: int64


In [57]:
import xgboost as xgb
import optuna
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    classification_report, log_loss, f1_score
)


def objective(trial):
        params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': trial.suggest_int('max_depth', 10, 20),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.8, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 2, 8),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.5, 5),
        'n_estimators': trial.suggest_int('n_estimators', 1000, 1500),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 0.5),
        'max_delta_step': trial.suggest_int('max_delta_step', 2, 10),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'device': 'gpu'
        }

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        f1_scores = []

        X_np = X_train.to_numpy() if hasattr(X_train, "to_numpy") else X_train
        y_np = y_train.to_numpy() if hasattr(y_train, "to_numpy") else y_train

        for train_idx, val_idx in skf.split(X_np, y_np):
            X_tr, X_val = X_np[train_idx], X_np[val_idx]
            y_tr, y_val = y_np[train_idx], y_np[val_idx]

            model = xgb.XGBClassifier(**params)
            model.fit(X_tr, y_tr)

            y_pred = model.predict(X_val)
            f1_scores.append(f1_score(y_val, y_pred, average='macro'))

        return np.mean(f1_scores)

optimized_params = {
    'max_depth': 6,
    'learning_rate': 0.025,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'colsample_bylevel': 0.9,
    'reg_lambda': 1.5,
    'reg_alpha': 0.8,
    'n_estimators': 800,
    'min_child_weight': 1,
    'gamma': 0.1,
    'max_delta_step': 4,
    'grow_policy': 'depthwise',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'device': 'gpu',
    'early_stopping_rounds': 40
}


#Run Optimized Optuna Tuning
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=120)

# Get Best Hyperparameters
best_params = study.best_params
best_params['objective'] = 'binary:logistic'
best_params['eval_metric'] = 'logloss'
best_params['device'] = 'gpu'

print("Best Hyperparameters for 90%+ F1:", best_params)

#Train Final Model with Best Params
# **best_params uses params from hyperparametr tunning
final_model = xgb.XGBClassifier(**best_params)
eval_set = [(X_train, y_train), (X_test, y_test)]
final_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
)


# Make Predictions
y_pred_probs = final_model.predict_proba(X_test)[:, 1]
optimal_threshold = 0.5
y_pred = (y_pred_probs > optimal_threshold).astype(int)

print("Updated Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))
logloss = log_loss(y_test, y_pred_probs)
print(f"Log Loss: {logloss}")

final_model.save_model("models/crypto_sell_model.json")

[I 2025-03-09 13:59:40,655] A new study created in memory with name: no-name-60d9b4ef-b736-4274-83be-4676144aad4b
[I 2025-03-09 13:59:51,788] Trial 0 finished with value: 0.8251541933003661 and parameters: {'max_depth': 15, 'learning_rate': 0.022359734055157274, 'subsample': 0.7997474471211821, 'colsample_bytree': 0.8751025764669702, 'colsample_bylevel': 0.9442923292497837, 'scale_pos_weight': 6.291493818888802, 'reg_lambda': 2.342010385348029, 'reg_alpha': 4.699941436279374, 'n_estimators': 1490, 'min_child_weight': 5, 'gamma': 0.2960736997751553, 'max_delta_step': 2, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.8251541933003661.
[I 2025-03-09 13:59:59,866] Trial 1 finished with value: 0.8588353848523813 and parameters: {'max_depth': 18, 'learning_rate': 0.03272501409265597, 'subsample': 0.9181812493977851, 'colsample_bytree': 0.8408372998727804, 'colsample_bylevel': 0.874142681345706, 'scale_pos_weight': 3.8109073517855214, 'reg_lambda': 2.1099159630542026, 'reg_alpha':

Best Hyperparameters for 90%+ F1: {'max_depth': 12, 'learning_rate': 0.02644320112982116, 'subsample': 0.9496006817327238, 'colsample_bytree': 0.829646468866766, 'colsample_bylevel': 0.8000822522943694, 'scale_pos_weight': 3.068331402241279, 'reg_lambda': 2.727430976856886, 'reg_alpha': 4.786267901078364, 'n_estimators': 1263, 'min_child_weight': 6, 'gamma': 0.058976314915236194, 'max_delta_step': 8, 'grow_policy': 'lossguide', 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'device': 'gpu'}
[0]	validation_0-logloss:0.12445	validation_1-logloss:0.22033
[1]	validation_0-logloss:0.12099	validation_1-logloss:0.21345
[2]	validation_0-logloss:0.11768	validation_1-logloss:0.21148
[3]	validation_0-logloss:0.11443	validation_1-logloss:0.20529
[4]	validation_0-logloss:0.11127	validation_1-logloss:0.19960
[5]	validation_0-logloss:0.10821	validation_1-logloss:0.19440
[6]	validation_0-logloss:0.10527	validation_1-logloss:0.19274
[7]	validation_0-logloss:0.10239	validation_1-logloss:0.188