In [36]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Load dataset
df = pd.read_csv('merged_tweets_bitcoin_1min.csv')

# Drop missing values in essential columns
required_cols = ['avg_sentiment_score', 'tweet_count', 'Open', 'High', 'Low', 'Close', 'Volume']
df = df.dropna(subset=required_cols)

# Feature Engineering
df['price_change'] = df['Close'] - df['Open']
df['rolling_mean'] = df['Close'].rolling(window=5).mean()
df['rolling_std'] = df['Close'].rolling(window=5).std()
df['volatility'] = df['rolling_std'] / df['rolling_mean']
df['sentiment_change'] = df['avg_sentiment_score'].diff()
df['Close_lag1'] = df['Close'].shift(1)
df['Close_lag3'] = df['Close'].shift(3)
df['avg_sentiment_score_lag1'] = df['avg_sentiment_score'].shift(1)
df['avg_sentiment_score_lag3'] = df['avg_sentiment_score'].shift(3)

df.dropna(inplace=True)

# Volatility target label (binary classification: high vs low volatility)
vol_threshold = df['volatility'].median()
df['vol_label'] = (df['volatility'] > vol_threshold).astype(int)

# Features
features = ['avg_sentiment_score', 'tweet_count', 'Open', 'High', 'Low', 'Close', 'Volume',
            'price_change', 'rolling_mean', 'rolling_std', 'sentiment_change',
            'Close_lag1', 'Close_lag3', 'avg_sentiment_score_lag1', 'avg_sentiment_score_lag3']

# Normalize
scaler = MinMaxScaler()
df[features] = scaler.fit_transform(df[features])

# Create sequences
def create_sequences(data, labels, seq_len=10):
    X, y = [], []
    for i in range(len(data) - seq_len):
        X.append(data[i:i+seq_len])
        y.append(labels[i+seq_len])
    return np.array(X), np.array(y)

X, y = create_sequences(df[features].values, df['vol_label'].values)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Dataset and DataLoader
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_loader = DataLoader(TimeSeriesDataset(X_train, y_train), batch_size=32, shuffle=True)
test_loader = DataLoader(TimeSeriesDataset(X_test, y_test), batch_size=32, shuffle=False)

# LSTM + CNN Model
class LSTM_CNN_Model(nn.Module):
    def __init__(self, input_size, hidden_size=64, num_classes=2):
        super(LSTM_CNN_Model, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.conv1 = nn.Conv1d(in_channels=hidden_size, out_channels=32, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(32, num_classes)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)  # (batch, seq_len, hidden)
        lstm_out = lstm_out.permute(0, 2, 1)  # (batch, hidden, seq_len)
        conv_out = self.relu(self.conv1(lstm_out))
        pooled = self.pool(conv_out).squeeze(-1)  # (batch, out_channels)
        return self.fc(pooled)

# Instantiate model
input_size = X.shape[2]
model = LSTM_CNN_Model(input_size)

# Train
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Eval
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.numpy())
            all_labels.extend(y_batch.numpy())

    from sklearn.metrics import accuracy_score
    acc = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch+1}/{num_epochs}, Accuracy: {acc:.4f}")

# Final Report
print("\nFinal Test Classification Report:")
print(classification_report(all_labels, all_preds, target_names=['Low Vol', 'High Vol']))


Epoch 1/20, Accuracy: 0.5000
Epoch 2/20, Accuracy: 0.5000
Epoch 3/20, Accuracy: 0.6875
Epoch 4/20, Accuracy: 0.5739
Epoch 5/20, Accuracy: 0.6108
Epoch 6/20, Accuracy: 0.6562
Epoch 7/20, Accuracy: 0.6591
Epoch 8/20, Accuracy: 0.7188
Epoch 9/20, Accuracy: 0.7244
Epoch 10/20, Accuracy: 0.7159
Epoch 11/20, Accuracy: 0.7727
Epoch 12/20, Accuracy: 0.5795
Epoch 13/20, Accuracy: 0.7670
Epoch 14/20, Accuracy: 0.6989
Epoch 15/20, Accuracy: 0.7472
Epoch 16/20, Accuracy: 0.7244
Epoch 17/20, Accuracy: 0.8153
Epoch 18/20, Accuracy: 0.7642
Epoch 19/20, Accuracy: 0.7699
Epoch 20/20, Accuracy: 0.8182

Final Test Classification Report:
              precision    recall  f1-score   support

     Low Vol       0.88      0.74      0.80       176
    High Vol       0.77      0.90      0.83       176

    accuracy                           0.82       352
   macro avg       0.83      0.82      0.82       352
weighted avg       0.83      0.82      0.82       352

