In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tensorflow as tf

# Check if TensorFlow can detect a GPU
gpu_device_name = tf.test.gpu_device_name()

if gpu_device_name:
    print('GPU device found:', gpu_device_name)
else:
    print("No GPU available. Using CPU instead.")

No GPU available. Using CPU instead.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
import warnings
warnings.filterwarnings("ignore")

## Dataset loading, analysis and preprocessing

In [None]:
df1 = pd.read_csv("/content/drive/MyDrive/AML/A2/df_train_shuffled.csv")
df2 = pd.read_csv("/content/drive/MyDrive/AML/A2/df_val.csv")
df3 = pd.read_csv("/content/drive/MyDrive/AML/A2/df_val_test.csv")

In [None]:
df1.shape, df2.shape, df3.shape

((175127, 28), (52212, 28), (56802, 28))

In [None]:
df1.isnull().sum()

Open_n_val                        0
High_n_val                        0
Low_n_val                         0
Close_n_val                       0
Volume_n_val                      0
SMA_10_val                        0
SMA_20_val                        0
CMO_14_val                        0
High_n-Low_n_val                  0
Open_n-Close_n_val                0
SMA_20-SMA_10_val                 0
Close_n_slope_3_val               0
Close_n_slope_5_val               0
Close_n_slope_10_val              0
Open_n_changelen_val              0
High_n_changelen_val              0
Low_n_changelen_val               0
Close_n_changelen_val             0
High_n-Low_n_changelen_val        0
Open_n-Close_n_changelen_val      0
SMA_20-SMA_10_changelen_val       0
Close_n_slope_3_changelen_val     0
Close_n_slope_5_changelen_val     0
Close_n_slope_10_changelen_val    0
row_num                           0
era                               0
target_10_val                     0
target_5_val                

In [None]:
df1.dtypes

Open_n_val                        float64
High_n_val                        float64
Low_n_val                         float64
Close_n_val                       float64
Volume_n_val                      float64
SMA_10_val                        float64
SMA_20_val                        float64
CMO_14_val                        float64
High_n-Low_n_val                  float64
Open_n-Close_n_val                float64
SMA_20-SMA_10_val                 float64
Close_n_slope_3_val               float64
Close_n_slope_5_val               float64
Close_n_slope_10_val              float64
Open_n_changelen_val              float64
High_n_changelen_val              float64
Low_n_changelen_val               float64
Close_n_changelen_val             float64
High_n-Low_n_changelen_val        float64
Open_n-Close_n_changelen_val      float64
SMA_20-SMA_10_changelen_val       float64
Close_n_slope_3_changelen_val     float64
Close_n_slope_5_changelen_val     float64
Close_n_slope_10_changelen_val    

In [None]:
# Define the mapping of values to indices
value_to_index = {0.00: 0, 0.25: 1, 0.5: 2, 0.75: 3, 1.00: 4}

# Replace values with corresponding indices
df1["target_10_val"] = df1["target_10_val"].replace(value_to_index).astype(int)
df2["target_10_val"] = df2["target_10_val"].replace(value_to_index).astype(int)
df3["target_10_val"] = df3["target_10_val"].replace(value_to_index).astype(int)

In [None]:
df1["target_10_val"].value_counts()

target_10_val
2    37405
1    35965
0    35965
4    32901
3    32891
Name: count, dtype: int64

In [None]:
# Initialize scaler for feature scaling
scaler = StandardScaler()

# Train the model on cf_train.csv
X_train = df1.drop(columns=["era", "target_5_val", "target_10_val"])
X_val = df2.drop(columns=["era", "target_5_val", "target_10_val"])
X_test = df3.drop(columns=["era", "target_5_val", "target_10_val"])

y_train = df1["target_10_val"]
y_val = df2["target_10_val"]
y_test = df3["target_10_val"]

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.fit_transform(X_val)
X_test_scaled = scaler.fit_transform(X_test)

# Online Learning

## Linear Model

In [None]:
# Initialize the SGDClassifier
classifier = SGDClassifier(loss='log', alpha=0.01, max_iter=1000, random_state=42)
classifier.fit(X_train_scaled, y_train)

In [None]:
# Predict on the validation data
y_val_pred = classifier.predict(X_test_scaled)

# Evaluate the performance of the classifier on the validation set
accuracy = accuracy_score(y_test, y_val_pred)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.28287736347311715


## Inference and updation of model parameters



In [None]:
final_prediction = pd.DataFrame(columns = ['id', 'prediction', 'row_num', 'round_no'])

In [None]:
round_no = [84,85,86, 92, 93, 94, 95, 96, 97, 98, 99, 102, 103,104,105,106,107,108,109,110,111,112,113, 117, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144 ]

### Continual Learning Loop

In [None]:

for round in round_no:

    train_data = pd.read_csv("/content/drive/MyDrive/AML/A2/live_data_02-Apr-2024/df_live_train_02-Apr-2024_" + str(round) + ".csv")
    test_data = pd.read_csv("/content/drive/MyDrive/AML/A2/live_data_02-Apr-2024/df_live_test_02-Apr-2024_" + str(round) + ".csv")

    value_to_index = {0.00: 0, 0.25: 1, 0.5: 2, 0.75: 3, 1.00: 4}
    train_data["target_10_val"] = train_data["target_10_val"].replace(value_to_index).astype(int)

    y_update = train_data['target_10_val']
    X_update = train_data.drop(columns=["era", "id", "target_5", "target_10" , "target_5_val", "target_10_val"])

    # Scale features
    X_update_scaled = scaler.transform(X_update)

    classifier.partial_fit(X_update_scaled, y_update, classes = [0, 1, 2, 3, 4])

    X_test = test_data.drop(columns=["id"])
    X_test_scaled = scaler.transform(X_test)

    # Make prediction for the current test row
    y_pred = classifier.predict(X_test_scaled)

    for i in range(len(test_data)):
      final_prediction.loc[len(final_prediction)] = [ test_data['id'][i],  y_pred[i], test_data['row_num'][i], round]



In [None]:
final_prediction.head()

Unnamed: 0,id,prediction,row_num,round_no
0,GI18OEQPFG.AF,2,84,84
1,UNY.AF,2,84,84
2,ONWNW-NHGB.AF,2,84,84
3,ZPK.AF,2,84,84
4,BOPY.AF,2,70,84


In [None]:
final_prediction.shape

(112, 4)

In [None]:
final_prediction.to_csv("/content/drive/MyDrive/AML/A2/predictions_02_04_2024_1.csv", index=False)

## Neural Network & LSTM

In [None]:
round_no = [i for i in range(76, 137)]

In [None]:
round_no.remove(127)
print(round_no)

[76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 128, 129, 130, 131, 132, 133, 134, 135, 136]


In [None]:
len(round_no)

60

In [None]:
final_prediction = pd.DataFrame(columns = ['id', 'prediction', 'row_num', 'round_no'])

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [None]:
X_train_reshaped = X_train_scaled.reshape(X_train.shape[0], X_train.shape[1], 1)  # Reshape for LSTM input
X_val_reshaped = X_val_scaled.reshape(X_val.shape[0], X_val.shape[1], 1)  # Reshape for LSTM input
X_test_reshaped = X_test_scaled.reshape(X_test.shape[0], X_test.shape[1], 1)  # Reshape for LSTM input

In [None]:
# Define the LSTM model with additional hidden layers
model = Sequential([
    LSTM(units=64, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])),  # LSTM layer
    Dense(32, activation='relu'),  # Additional hidden layer with 32 units and ReLU activation
    Dense(16, activation='relu'),  # Additional hidden layer with 16 units and ReLU activation
    Dense(5, activation='softmax')  # Output layer with softmax activation for multiclass classification
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
model.fit(X_train_reshaped, y_train, validation_data=(X_val_reshaped, y_val), epochs=10, batch_size=100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7cb632bd53c0>

### Inference and updation on model weights

In [None]:

for round in round_no:

    train_data = pd.read_csv("/content/drive/MyDrive/AML/A2/live_data_28-Mar-2024/df_live_train_28-Mar-2024_" + str(round) + ".csv")
    test_data = pd.read_csv("/content/drive/MyDrive/AML/A2/live_data_28-Mar-2024/df_live_test_28-Mar-2024_" + str(round) + ".csv")

    value_to_index = {0.00: 0, 0.25: 1, 0.5: 2, 0.75: 3, 1.00: 4}
    train_data["target_10_val"] = train_data["target_10_val"].replace(value_to_index).astype(int)

    y_update = train_data['target_10_val']
    X_update = train_data.drop(columns=["era", "id", "target_5", "target_10" , "target_5_val", "target_10_val"])

    # Scale features
    X_update_scaled = scaler.transform(X_update)
    X_update_reshaped = X_update_scaled.reshape(X_update_scaled.shape[0], X_update_scaled.shape[1], 1)

    model.fit(X_update, y_update, epochs=1, verbose=0)

    X_test = test_data.drop(columns=["id"])
    X_test_scaled = scaler.transform(X_test)
    X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

    y_pred = np.argmax(model.predict(X_test_reshaped), axis = 1)  # Predict class with highest probability

    for i in range(len(test_data)):
      final_prediction.loc[len(final_prediction)] = [ test_data['id'][i],  y_pred[i], test_data['row_num'][i], round]





In [None]:
index_to_value = {v: k for k, v in value_to_index.items()}
# Replace index values in the DataFrame column with original values
final_prediction["prediction"] = final_prediction["prediction"].replace(index_to_value)

In [None]:
final_prediction

Unnamed: 0,id,prediction,row_num,round_no
0,OCPY.AF,0.5,76,76
1,ABPVY.AF,0.0,76,76
2,QPZ.AF,0.0,53,76
3,QNOHE.AF,0.5,76,76
4,ONWNWUYQAT.AF,0.5,76,76
...,...,...,...,...
203,IVCVAQ.AF,0.0,136,136
204,CRY.AF,0.0,136,136
205,GI18OEQPFG.AF,0.0,136,136
206,VPVPVONAX.AF,0.0,136,136


In [None]:
final_prediction.to_csv("/content/drive/MyDrive/AML/A2/predictions_28-03-2024.csv", index=False)

# **Continual ensemble learning for noisy data**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Train the ensemble models
ensemble_models = []
for _ in range(6):  # Train 5 different models for ensemble
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_scaled, y_train)
    ensemble_models.append(model)

In [None]:
from collections import Counter


predictions = []
for model in ensemble_models:
    y_pred = model.predict(X_test_scaled)
    predictions.append(y_pred)

# Apply majority voting
ensemble_predictions = []
for i in range(len(X_test)):
    # Get predictions for the i-th sample from all models
    all_predictions = [pred[i] for pred in predictions]
    # Use Counter to count occurrences of each class in predictions
    vote_counts = Counter(all_predictions)
    # Get the class with the highest count (majority voting)
    majority_class = vote_counts.most_common(1)[0][0]
    ensemble_predictions.append(majority_class)

# Convert the list to numpy array
ensemble_predictions = np.array(ensemble_predictions)

test_accuracy = accuracy_score(y_test, ensemble_predictions)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.2921023907608887


### Inference and updation of model parameters

In [None]:
# Prediction loop
predictions = []
for i, test_row in test_data[:10000].iterrows():
    if i < 10:
        predictions.append(3)  # No prediction for the first 10 rows
        continue

    # Update the model with the ground truth label of current row - 10the row
    update_row = test_data.iloc[i - 10]
    X_update = scaler.transform(update_row.drop(columns=["era", "day", "target_10_val"])[X_train.columns].values.reshape(1, -1,))
    y_update = update_row["target_10_val"]

    for model in ensemble_models:
        model.fit(X_update, [y_update])  # Update each model in the ensemble

    X_test = scaler.transform(test_row.drop(columns=["era", "day", "target_10_val"])[X_train.columns].values.reshape(1, -1,))

    # Make predictions using ensemble models
    y_pred_ensemble = np.concatenate([model.predict(X_test)[:, np.newaxis] for model in ensemble_models], axis=1)
    y_pred_ensemble = y_pred_ensemble.astype(int)  # Convert to integers
    ensemble_vote = np.argmax(np.bincount(y_pred_ensemble.flatten()))  # Flatten the array before bincount
    predictions.append(ensemble_vote)


In [None]:
true_labels = test_data["target_10_val"]
accuracy = accuracy_score(true_labels[10:10000], predictions[10:10000])
print("Accuracy:", accuracy)

Accuracy: 0.6843343343343343
