In [None]:
import tensorflow as tf

# Ask TensorFlow for the name of a GPU device it can see; the value is
# falsy when no GPU is detected.
gpu_device_name = tf.test.gpu_device_name()

if not gpu_device_name:
    print("No GPU available. Using CPU instead.")
else:
    print('GPU device found:', gpu_device_name)

GPU device found: /device:GPU:0


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install tabpfn

Collecting tabpfn
  Downloading tabpfn-0.1.10-py3-none-any.whl (156 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/156.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.6/156.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tabpfn
Successfully installed tabpfn-0.1.10


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import random
from tabpfn import TabPFNClassifier

# Zero Noise

In [None]:
df = pd.read_csv("/content/drive/MyDrive/AML/A1/df_syn_train_0_0_.csv")

In [None]:
df.shape

(7800, 29)

In [None]:
df.head()

Unnamed: 0,Open_n_val,High_n_val,Low_n_val,Close_n_val,Volume_n_val,SMA_10_val,SMA_20_val,CMO_14_val,High_n-Low_n_val,Open_n-Close_n_val,...,Open_n-Close_n_changelen_val,SMA_20-SMA_10_changelen_val,Close_n_slope_3_changelen_val,Close_n_slope_5_changelen_val,Close_n_slope_10_changelen_val,row_num,day,era,target_10_val,target_5_val
0,0.5,0.5,0.5,0.5,0.0,0.5,0.25,1.0,1.0,0.5,...,0.5,0.75,0.25,0.75,0.5,75,537,2,0.75,0.75
1,0.5,0.5,0.5,0.5,0.0,0.5,0.5,1.0,1.0,0.5,...,0.5,0.5,0.75,0.5,0.5,76,537,2,0.75,0.75
2,0.5,0.5,0.5,0.5,0.0,0.5,0.5,1.0,1.0,0.5,...,0.5,0.25,0.5,0.5,0.75,77,537,2,0.75,0.75
3,0.5,0.5,0.5,0.5,0.0,0.5,0.5,1.0,1.0,0.5,...,0.25,0.25,0.5,0.75,0.5,78,537,2,0.75,0.75
4,0.5,0.5,0.5,0.5,0.0,0.5,0.5,1.0,1.0,0.5,...,0.75,0.25,0.5,0.5,0.5,79,537,2,0.75,0.75


In [None]:
# Features: every column except the label ('era') and the two target columns.
# NOTE(review): the df.head() preview lists 'row_num' and 'day' among the
# columns; they are not dropped here and therefore remain in X - confirm they
# are meant to be used as features when predicting 'era'.
X = df.drop(columns=['era', 'target_10_val', 'target_5_val'])

# Label to predict
y = df['era']

In [None]:
y.value_counts()

2     650
4     650
5     650
7     650
9     650
10    650
12    650
15    650
16    650
18    650
19    650
21    650
Name: era, dtype: int64

In [None]:
classes1 = [2, 4, 5, 7, 9, 10, 12, 15, 16, 18, 19, 21]
random.sample(classes1, 10)

[12, 2, 10, 5, 21, 19, 4, 15, 9, 18]

In [None]:
type(X), type(y)

(pandas.core.frame.DataFrame, pandas.core.series.Series)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.shape

(6240, 26)

In [None]:
X_test.shape

(1560, 26)

In [None]:
y_train.shape

(6240,)

In [None]:
# Convert the split from pandas objects to plain NumPy arrays.
# generate_samples_for_training indexes its inputs by integer position
# (y_train[random_index], X_train[selected_indices]); that needs positional
# ndarray indexing rather than pandas label-based lookup.
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

In [None]:
def generate_samples_for_training(X_train, y_train, classes, num_of_samples=1000, num_of_classes=10):
  """Draw a random training subset restricted to a random subset of classes.

  `num_of_classes` labels are picked at random (without replacement) from
  `classes`; then `num_of_samples` rows whose label is one of the picked
  classes are drawn uniformly at random, with replacement.

  The defaults (1000 rows, 10 classes) are the values that were previously
  hard-coded; presumably they were chosen to stay within TabPFN's input
  limits - TODO confirm.

  Args:
    X_train: array-like of shape (n_rows, n_features).
    y_train: array-like of shape (n_rows,) holding the label of each row.
    classes: sequence of candidate class labels to choose from.
    num_of_samples: number of rows to draw.
    num_of_classes: number of distinct classes to keep.

  Returns:
    Tuple (X_train_selected, y_train_selected) of NumPy arrays with
    `num_of_samples` rows each, row-aligned with one another.

  Raises:
    ValueError: if `classes` has fewer than `num_of_classes` entries (raised
      by random.sample), or if no row of `y_train` carries one of the picked
      classes (the previous rejection loop never terminated in that case).
  """
  X_train = np.asarray(X_train)
  y_train = np.asarray(y_train)

  selected_classes = random.sample(classes, num_of_classes)

  # Positions of every row whose label is one of the picked classes
  eligible_indices = np.flatnonzero(np.isin(y_train, selected_classes))
  if eligible_indices.size == 0:
    raise ValueError("y_train contains no samples of the selected classes")

  # Uniform draw with replacement over the eligible rows; this is the same
  # distribution the rejection-sampling loop produced, without the Python loop.
  selected_indices = np.random.choice(eligible_indices, size=num_of_samples, replace=True)

  return X_train[selected_indices], y_train[selected_indices]

In [None]:
# NOTE(review): the first cell reports a GPU, yet the classifier is pinned to
# the CPU here - confirm that device='cpu' is intentional.
classifier = TabPFNClassifier(device='cpu', N_ensemble_configurations=32)

# Seed both RNGs used by generate_samples_for_training (`random` for the class
# subset, `np.random` for the row draw) so the sampled subsets are repeatable,
# in line with random_state=42 used for the train/test split.
random.seed(42)
np.random.seed(42)

# Number of independently sampled training subsets; each one contributes one
# vote per test sample to the majority vote computed in the next cell.
no_of_iterations = 20

# One array of test-set predictions per iteration
predictions = []

for i in range(no_of_iterations):
    # Draw a fresh random subset of rows/classes for this iteration
    X_train_batch, y_train_batch = generate_samples_for_training(X_train, y_train, classes1)

    # Fit the classifier on the current subset
    classifier.fit(X_train_batch, y_train_batch)

    # Predict the whole test set with the classifier fitted on this subset
    y_pred = classifier.predict(X_test)

    # Keep the predictions for the majority vote
    predictions.append(y_pred)



In [None]:
# Stack the per-iteration predictions into a 2-D array:
# one row per iteration, one column per test sample.
predictions = np.array(predictions)

# For every test sample (column) keep the label that was predicted most often.
# np.bincount needs non-negative integer labels; argmax breaks ties in favour
# of the smallest label.
majority_voting = np.array([np.bincount(votes).argmax() for votes in predictions.T])

# Fraction of test samples whose voted label matches the true label
accuracy = accuracy_score(y_test, majority_voting)

print('Accuracy:', accuracy)

Accuracy: 0.9115384615384615


# Low Noise

In [None]:
df2 = pd.read_csv("/content/drive/MyDrive/AML/A1/df_synA_train_shuffled.csv")

In [None]:
# Label to predict
y2 = df2['era']

# Features: every column except the label, the two target columns and 'data_type'
X2 = df2.drop(columns=['era', 'target_10_val', 'target_5_val', 'data_type'])

In [None]:
y2.value_counts()

16    26000
23    26000
8     26000
1     26000
14    26000
0     26000
3     26000
7     26000
22    26000
18    26000
17    26000
15    26000
Name: era, dtype: int64

In [None]:
classes2 = [16, 23, 8, 1, 14, 0, 3, 7, 22, 18, 17, 15]
len(classes2)

12

In [None]:
df2.shape

(312000, 30)

In [None]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

In [None]:
# Convert the split from pandas objects to plain NumPy arrays.
# generate_samples_for_training indexes its inputs by integer position, which
# needs positional ndarray indexing rather than pandas label-based lookup.
X_train2 = X_train2.values
X_test2 = X_test2.values
y_train2 = y_train2.values
y_test2 = y_test2.values

In [None]:
X_train2.shape, X_test2.shape, y_train2.shape, y_test2.shape

((249600, 26), (62400, 26), (249600,), (62400,))

In [None]:
# NOTE(review): the first cell reports a GPU, yet the classifier is pinned to
# the CPU here - confirm that device='cpu' is intentional.
classifier2 = TabPFNClassifier(device='cpu', N_ensemble_configurations=5)

# Seed both RNGs used by generate_samples_for_training (`random` for the class
# subset, `np.random` for the row draw) so the sampled subsets are repeatable,
# in line with random_state=42 used for the train/test split.
random.seed(42)
np.random.seed(42)

# Number of independently sampled training subsets; each one contributes one
# vote per test sample to the majority vote computed in the next cell.
no_of_iterations = 20

# One array of test-set predictions per iteration
predictions = []

for i in range(no_of_iterations):
    # Draw a fresh random subset of rows/classes for this iteration
    X_train_batch, y_train_batch = generate_samples_for_training(X_train2, y_train2, classes2)

    # Fit the classifier on the current subset
    classifier2.fit(X_train_batch, y_train_batch)

    # Predict only the first 5000 test rows; the accuracy cell scores the same slice
    y_pred = classifier2.predict(X_test2[:5000])

    # Keep the predictions for the majority vote
    predictions.append(y_pred)



In [None]:
# Stack the per-iteration predictions into a 2-D array:
# one row per iteration, one column per test sample.
predictions = np.array(predictions)

# For every test sample (column) keep the label that was predicted most often.
# np.bincount needs non-negative integer labels; argmax breaks ties in favour
# of the smallest label.
majority_voting = np.array([np.bincount(votes).argmax() for votes in predictions.T])

# Fraction of the first 5000 test samples whose voted label matches the true label
accuracy = accuracy_score(y_test2[:5000], majority_voting)

print('Accuracy:', accuracy)

Accuracy: 0.679


# High Noise

In [None]:
df3 = pd.read_csv("/content/drive/MyDrive/AML/A1/df_synA_test_hard_shuffled_sample.csv")

In [None]:
df3.shape

  and should_run_async(code)


(249600, 30)

In [None]:
# Label to predict
y3 = df3['era']

# Features: every column except the label, the two target columns and 'data_type'
X3 = df3.drop(columns=['era', 'target_10_val', 'target_5_val', 'data_type'])

In [None]:
y3.value_counts()

  and should_run_async(code)


22    20951
23    20934
3     20904
18    20861
14    20857
1     20812
7     20805
0     20782
16    20732
8     20673
15    20645
17    20644
Name: era, dtype: int64

In [None]:
classes3 = [22, 23, 3, 18, 14, 1, 7, 0, 16, 8, 15, 17]
len(classes3)

  and should_run_async(code)


12

In [None]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=0.2, random_state=42)

In [None]:
X_train3.shape, X_test3.shape, y_train3.shape, y_test3.shape

((199680, 26), (49920, 26), (199680,), (49920,))

In [None]:
# Convert the split from pandas objects to plain NumPy arrays.
# generate_samples_for_training indexes its inputs by integer position, which
# needs positional ndarray indexing rather than pandas label-based lookup.
X_train3 = X_train3.values
X_test3 = X_test3.values
y_train3 = y_train3.values
y_test3 = y_test3.values

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


In [None]:
# NOTE(review): the first cell reports a GPU, yet the classifier is pinned to
# the CPU here - confirm that device='cpu' is intentional.
classifier3 = TabPFNClassifier(device='cpu', N_ensemble_configurations=5)

# Seed both RNGs used by generate_samples_for_training (`random` for the class
# subset, `np.random` for the row draw) so the sampled subsets are repeatable,
# in line with random_state=42 used for the train/test split.
random.seed(42)
np.random.seed(42)

# Number of independently sampled training subsets; each one contributes one
# vote per test sample to the majority vote computed in the next cell.
no_of_iterations = 50

# One array of test-set predictions per iteration
predictions = []

for i in range(no_of_iterations):
    # Draw a fresh random subset of rows/classes for this iteration
    X_train_batch, y_train_batch = generate_samples_for_training(X_train3, y_train3, classes3)

    # Fit the classifier on the current subset
    classifier3.fit(X_train_batch, y_train_batch)

    # Predict only the first 10000 test rows; the accuracy cell scores the same slice
    y_pred = classifier3.predict(X_test3[:10000])

    # Keep the predictions for the majority vote
    predictions.append(y_pred)



In [None]:
# Stack the per-iteration predictions into a 2-D array:
# one row per iteration, one column per test sample.
predictions = np.array(predictions)

# For every test sample (column) keep the label that was predicted most often.
# np.bincount needs non-negative integer labels; argmax breaks ties in favour
# of the smallest label.
majority_voting = np.array([np.bincount(votes).argmax() for votes in predictions.T])

# Fraction of the first 10000 test samples whose voted label matches the true label
accuracy = accuracy_score(y_test3[:10000], majority_voting)

print('Accuracy:', accuracy)

Accuracy: 0.4584


  and should_run_async(code)
