In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import tensorflow as tf

# Check if TensorFlow can detect a GPU
gpu_device_name = tf.test.gpu_device_name()

if gpu_device_name:
    print('GPU device found:', gpu_device_name)
else:
    print("No GPU available. Using CPU instead.")

No GPU available. Using CPU instead.


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

In [4]:
import warnings
warnings.filterwarnings("ignore")

## Dataset loading, analysis and preprocessing

In [5]:
train_data= pd.read_csv("/content/drive/MyDrive/AML/A2/cf_train.csv")
test_data = pd.read_csv("/content/drive/MyDrive/AML/A2/cf_test.csv")

In [6]:
train_data.shape, test_data.shape

((62400, 31), (62400, 31))

In [7]:
train_data.isnull().sum()

Open_n_val                        0
High_n_val                        0
Low_n_val                         0
Close_n_val                       0
Volume_n_val                      0
SMA_10_val                        0
SMA_20_val                        0
CMO_14_val                        0
High_n-Low_n_val                  0
Open_n-Close_n_val                0
SMA_20-SMA_10_val                 0
Close_n_slope_3_val               0
Close_n_slope_5_val               0
Close_n_slope_10_val              0
Open_n_changelen_val              0
High_n_changelen_val              0
Low_n_changelen_val               0
Close_n_changelen_val             0
High_n-Low_n_changelen_val        0
Open_n-Close_n_changelen_val      0
SMA_20-SMA_10_changelen_val       0
Close_n_slope_3_changelen_val     0
Close_n_slope_5_changelen_val     0
Close_n_slope_10_changelen_val    0
row_num                           0
day                               0
era                               0
target_10_val               

In [8]:
train_data.dtypes

Open_n_val                        float64
High_n_val                        float64
Low_n_val                         float64
Close_n_val                       float64
Volume_n_val                      float64
SMA_10_val                        float64
SMA_20_val                        float64
CMO_14_val                        float64
High_n-Low_n_val                  float64
Open_n-Close_n_val                float64
SMA_20-SMA_10_val                 float64
Close_n_slope_3_val               float64
Close_n_slope_5_val               float64
Close_n_slope_10_val              float64
Open_n_changelen_val              float64
High_n_changelen_val              float64
Low_n_changelen_val               float64
Close_n_changelen_val             float64
High_n-Low_n_changelen_val        float64
Open_n-Close_n_changelen_val      float64
SMA_20-SMA_10_changelen_val       float64
Close_n_slope_3_changelen_val     float64
Close_n_slope_5_changelen_val     float64
Close_n_slope_10_changelen_val    

In [9]:
# Define the mapping of values to indices
value_to_index = {0.00: 0, 0.25: 1, 0.5: 2, 0.75: 3, 1.00: 4}

# Replace values with corresponding indices
train_data["target_10_val"] = train_data["target_10_val"].replace(value_to_index).astype(int)
test_data["target_10_val"] = test_data["target_10_val"].replace(value_to_index).astype(int)

In [10]:
train_data["target_10_val"].value_counts()

3    15522
4    15497
0    15428
1    15160
2      793
Name: target_10_val, dtype: int64

In [11]:
train_data["sigma"].value_counts()

_0.01_0.05_     9100
_0.075_0_       8255
_0_0_           8190
_0.01_0_        7930
_0.05_0.05_     7475
_0.03_0_        7280
_0_0.05_        7215
_0.075_0.05_    6955
Name: sigma, dtype: int64

In [12]:
label_encoder = LabelEncoder()

train_data["sigma"] = label_encoder.fit_transform(train_data["sigma"])
test_data["sigma"] = label_encoder.fit_transform(test_data["sigma"])

In [13]:
# Initialize scaler for feature scaling
scaler = StandardScaler()

# Train the model on cf_train.csv
X_train = train_data.drop(columns=["era", "day", "target_10_val"])
y_train = train_data["target_10_val"]
X_train_scaled = scaler.fit_transform(X_train)

# Online Learning

## Linear Model

In [14]:
# Initialize the SGDClassifier
classifier = SGDClassifier(loss='log', alpha=0.01, max_iter=1000, random_state=42)

classifier.fit(X_train_scaled, y_train)

### Inference and updation of model parameters

In [15]:
# Prediction loop
predictions = []
for i, test_row in test_data.iterrows():
    if i < 10:
        predictions.append(3)  # No prediction for the first 10 rows
        continue

    # Align columns with training data
    X_test = test_row.drop(["era", "day", "target_10_val"])[X_train.columns]

    # Scale features
    X_test_scaled = scaler.transform([X_test])

    # Get the true label for the current test row
    y_true = test_row["target_10_val"]

    # Update the model with the ground truth label of current row - 10the row
    X_update = []
    y_update = []
    update_row = test_data.iloc[i - 10]
    X_update.append(update_row.drop(["era", "day", "target_10_val"])[X_train.columns])
    y_update.append(update_row["target_10_val"])

    X_update_scaled = scaler.transform(X_update)
    classifier.partial_fit(X_update_scaled, y_update, classes = [0, 1, 2, 3, 4])

    # Make prediction for the current test row
    y_pred = classifier.predict(X_test_scaled)
    predictions.append(y_pred)


In [16]:
true_labels = test_data["target_10_val"]
accuracy = accuracy_score(true_labels[10:], predictions[10:])
print("Accuracy:", accuracy)

Accuracy: 0.535085750921622


## Neural Network & LSTM

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [18]:
X_train_reshaped = X_train_scaled.reshape(X_train.shape[0], X_train.shape[1], 1)  # Reshape for LSTM input

In [19]:
# Define the LSTM model with additional hidden layers
model = Sequential([
    LSTM(units=64, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])),  # LSTM layer
    Dense(32, activation='relu'),  # Additional hidden layer with 32 units and ReLU activation
    Dense(16, activation='relu'),  # Additional hidden layer with 16 units and ReLU activation
    Dense(5, activation='softmax')  # Output layer with softmax activation for multiclass classification
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [20]:
model.fit(X_train_reshaped, y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7e92401ca350>

### Inference and updation on model weights

In [22]:
predictions = []
for i, test_row in test_data.iterrows():
    if i < 10:
        predictions.append(3)  # No prediction for the first 10 rows
        continue

        # Update the model with the ground truth label of current row - 10the row
        update_row = test_data.iloc[i-10]
        X_update = scaler.transform(update_row.drop(columns=["era", "day", "target_10_val"])[X_train.columns].values.reshape(1, -1,))
        y_update = update_row["target_10_val"]
        model.fit(X_update, np.array([y_update]), epochs=1, verbose=0)

    X_test = scaler.transform(test_row.drop(columns=["era", "day", "target_10_val"])[X_train.columns].values.reshape(1, -1))
    y_pred = np.argmax(model.predict(X_test))  # Predict class with highest probability
    predictions.append(y_pred)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [14]:
true_labels = test_data["target_10_val"]
accuracy = accuracy_score(true_labels[10:], predictions[10:])
print("Accuracy:", accuracy)

Accuracy: 0.612518031735855


# **Continual ensemble learning for noisy data**

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
# Train the ensemble models
ensemble_models = []
for _ in range(5):  # Train 5 different models for ensemble
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    ensemble_models.append(model)

### Inference and updation of model parameters

In [17]:
# Prediction loop
predictions = []
for i, test_row in test_data[:10000].iterrows():
    if i < 10:
        predictions.append(3)  # No prediction for the first 10 rows
        continue

    # Update the model with the ground truth label of current row - 10the row
    update_row = test_data.iloc[i - 10]
    X_update = scaler.transform(update_row.drop(columns=["era", "day", "target_10_val"])[X_train.columns].values.reshape(1, -1,))
    y_update = update_row["target_10_val"]

    for model in ensemble_models:
        model.fit(X_update, [y_update])  # Update each model in the ensemble

    X_test = scaler.transform(test_row.drop(columns=["era", "day", "target_10_val"])[X_train.columns].values.reshape(1, -1,))

    # Make predictions using ensemble models
    y_pred_ensemble = np.concatenate([model.predict(X_test)[:, np.newaxis] for model in ensemble_models], axis=1)
    y_pred_ensemble = y_pred_ensemble.astype(int)  # Convert to integers
    ensemble_vote = np.argmax(np.bincount(y_pred_ensemble.flatten()))  # Flatten the array before bincount
    predictions.append(ensemble_vote)


In [1]:
true_labels = test_data["target_10_val"]
accuracy = accuracy_score(true_labels[10:10000], predictions[10:10000])
print("Accuracy:", accuracy)

Accuracy: 0.38174174174174175
