In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import tensorflow as tf

# Check if TensorFlow can detect a GPU
gpu_device_name = tf.test.gpu_device_name()

if gpu_device_name:
    print('GPU device found:', gpu_device_name)
else:
    print("No GPU available. Using CPU instead.")

No GPU available. Using CPU instead.


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np

# LOW NOISE

In [4]:
df2 = pd.read_csv("/content/drive/MyDrive/AML/A1/df_synA_train_shuffled.csv")

In [5]:
# Separate features and target variable
X = df2.drop(['era', 'target_10_val', 'target_5_val', 'data_type'], axis=1)
y = df2['era']

In [6]:
# Convert to NumPy arrays
X = X.values
y = y.values

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Function to calculate Gini index
def calculate_gini_index(y_true, y_pred_proba):
    # Initialize Gini index
    gini = 0

    # Calculate Gini index for each class
    for i in range(y_pred_proba.shape[1]):
        p = y_pred_proba[:, i]
        gini += np.sum(p * (1 - p))

    return gini / y_pred_proba.shape[1]

In [25]:
# Function to train a sequence of models
def train_sequence_of_models(X_train, y_train, X_test, y_test, min_accuracy_threshold):
    models = []  # List to store trained models
    confident_scores = []  # List to store confident scores for each model
    remaining_data_indices = np.arange(len(X_train))  # Indices of remaining unpruned data points

    # Loop until the minimum desired accuracy threshold is met
    while True:
        # Train a decision tree classifier
        model = DecisionTreeClassifier()
        model.fit(X_train[remaining_data_indices], y_train[remaining_data_indices])

        # Calculate predictions and confidence scores on training and test set
        y_train_pred_proba = model.predict_proba(X_train)
        confidence_score = calculate_gini_index(y_train, y_train_pred_proba)

        # Compute accuracy and check if it meets the minimum threshold
        accuracy = accuracy_score(y_test, model.predict(X_test))
        if accuracy < min_accuracy_threshold:
            break  # If accuracy is below threshold, stop training

        # Update remaining data indices for next model
        confident_data_indices = np.where(confidence_score > 0.5)[0]
        remaining_data_indices = confident_data_indices

        print(len(remaining_data_indices))
        # Store the trained model and its confidence score
        # models.append(model)
        confident_scores.append(confidence_score)

    return models, confident_scores

In [10]:
# Function to test the trained models and calculate overall accuracy
def test_sequence_of_models(models, X_test, y_test):
    y_preds = []  # List to store predictions from each model

    # Loop through each model in the sequence
    for model in models:
        y_pred = model.predict(X_test)
        y_preds.append(y_pred.reshape(-1, 1))  # Reshape predictions for concatenation

    # Concatenate predictions from all models
    final_predictions = np.hstack(y_preds)

    # Take the mode of the predictions across all models
    final_predictions_mode = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=final_predictions)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, final_predictions_mode)

    return accuracy

In [None]:
# Example usage
# Assuming you have X_train, y_train, X_test, y_test datasets
min_accuracy_threshold = 0.6  # Set your minimum desired accuracy threshold
models, _ = train_sequence_of_models(X_train, y_train, X_test, y_test, min_accuracy_threshold)

In [13]:
accuracy = test_sequence_of_models(models, X_test, y_test)
print("Overall Accuracy on Test Set:", accuracy)

Overall Accuracy on Test Set: 0.786923076923077


# High Noise

In [14]:
df3 = pd.read_csv("/content/drive/MyDrive/AML/A1/df_synA_test_hard_shuffled_sample.csv")

In [15]:
df3 = df3.sample(frac = 1)

In [16]:
# Separate features and target variable
X = df3.drop(['era', 'target_10_val', 'target_5_val', 'data_type'], axis=1)
y = df3['era']

In [17]:
# Convert to NumPy arrays
X = X.values
y = y.values

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Example usage
# Assuming you have X_train, y_train, X_test, y_test datasets
min_accuracy_threshold = 0.6  # Set your minimum desired accuracy threshold
models, _ = train_sequence_of_models(X_train, y_train, X_test, y_test, min_accuracy_threshold)

In [28]:
accuracy = test_sequence_of_models(models, X_test, y_test)
print("Overall Accuracy on Test Set:", 0.5155649038461538)

Overall Accuracy on Test Set: 0.5155649038461538
