In [113]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [114]:
# LOAD AND PREPROCESS THE DATA
# Load the dataset
df = pd.read_csv('Data/Polaris15arcmin.csv', sep=';')

length_df = len(df)
print("Length of the dataset: ", length_df)

Length of the dataset:  1406


In [115]:
# PREPROCESSING
df_cleaned = df.dropna()

length_df_cleaned = len(df_cleaned)
print("Length of the cleaned dataset: ", length_df_cleaned)

Length of the cleaned dataset:  1248


In [116]:
def augment_parallax_errors(df, anomaly_rate=0.1, error_factor=1.0):
    """
    Augments the parallax values with errors to simulate anomalies.
    
    Parameters:
    df (DataFrame): A DataFrame containing the original Gaia data.
    anomaly_rate (float): The fraction of total objects that should be anomalous.
    error_factor (float): A factor that determines the magnitude of the error introduced.

    Returns:
    DataFrame: A DataFrame with the augmented parallax data.
    """
    # Make a copy of the DataFrame to avoid altering the original data
    augmented_df = df.copy()
    
    # Calculate the number of objects to alter based on the anomaly rate
    num_anomalies = int(len(df) * anomaly_rate)
    
    # Select random indices for introducing anomalies
    anomaly_indices = np.random.choice(df.index, size=num_anomalies, replace=False)
    
    # Introduce errors in the parallax values
    # The error can be a random value that depends on the error_factor and the current parallax value
    augmented_df.loc[anomaly_indices, 'parallax'] += error_factor * np.random.randn(num_anomalies) * augmented_df.loc[anomaly_indices, 'parallax']
    
    # Mark the augmented objects as anomalous
    augmented_df.loc[anomaly_indices, 'correct/anomalous'] = 0  # Assuming 0 represents anomalous in your dataset
    
    return augmented_df

# Assuming you have a DataFrame `gaia_data` with your data
# augmented_data = augment_parallax_errors(gaia_data, anomaly_rate=0.1, error_factor=0.5)
# print(augmented_data.head())

In [117]:
augmented_df = augment_parallax_errors(df_cleaned, anomaly_rate=0.1, error_factor=1.0)

In [118]:
# Function to print class distribution
def print_class_distribution(y_data):
    count_class_0 = sum(y_data == 0)
    count_class_1 = sum(y_data == 1)
    total = len(y_data)
    print("Class 0: {:.2f}%, Class 1: {:.2f}%".format((count_class_0 / total) * 100, (count_class_1 / total) * 100))

In [119]:
# Splitting the augmented dataset into initial training set and unlabeled pool
initial_train_size = 150
# Randomly sample initial training data
initial_train_indices = np.random.choice(a=augmented_df.index, size=initial_train_size, replace=False)
X_initial = augmented_df.drop('correct/anomalous', axis=1).loc[initial_train_indices]
y_initial = augmented_df['correct/anomalous'].loc[initial_train_indices]
# Create the pool by excluding the initial training data
X_pool = augmented_df.drop('correct/anomalous', axis=1).drop(initial_train_indices)
y_pool = augmented_df['correct/anomalous'].drop(initial_train_indices)

In [120]:
# Initialize and train the Random Forest Classifier
rfc = RandomForestClassifier()
rfc.fit(X_initial, y_initial)

In [121]:
# UNCERTAINTY SAMPLING
# Active learning loop (acquisition function)
# Example: Adding 350 most uncertain points from the pool to the training set
for _ in range(7):  # Run 7 iterations (50 points each iteration)
    # Predict probabilities on the unlabeled data
    probabilities = rfc.predict_proba(X_pool)

    # Calculate uncertainty and select the most uncertain data points
    uncertainty = abs(probabilities[:, 1] - 0.5)
    n_most_uncertain = 50  # Number of points to acquire in each iteration
    most_uncertain_indices = uncertainty.argsort()[:n_most_uncertain]

    # Add these points to the training set
    most_uncertain_points = X_pool.iloc[most_uncertain_indices]
    most_uncertain_labels = y_pool.iloc[most_uncertain_indices]
    X_train_extended = pd.concat([X_initial, most_uncertain_points])
    y_train_extended = pd.concat([y_initial, most_uncertain_labels])

    # Print class distribution
    print_class_distribution(y_train_extended)

    # Retrain the model on the extended training set
    rfc.fit(X_train_extended, y_train_extended)

    # Update the pool by excluding the most uncertain points
    # Use boolean indexing for this
    mask = X_pool.index.isin(most_uncertain_points.index)
    X_pool = X_pool[~mask]
    y_pool = y_pool[~mask]

Class 0: 10.00%, Class 1: 90.00%
Class 0: 13.50%, Class 1: 86.50%
Class 0: 10.50%, Class 1: 89.50%
Class 0: 11.00%, Class 1: 89.00%
Class 0: 10.50%, Class 1: 89.50%
Class 0: 13.50%, Class 1: 86.50%
Class 0: 9.50%, Class 1: 90.50%


In [123]:
# CONFINDENCE SAMPLING
# for _ in range(7):  # Run 7 iterations
#     # Predict probabilities on the unlabeled data
#     probabilities = rfc.predict_proba(X_pool)

#     # Calculate confidence and select the most confident data points about the minority class
#     confidence = probabilities[:, 0]  # Assuming class 0 is the minority class
#     n_most_confident = 50  # Number of points to acquire in each iteration

#     most_confident_indices = confidence.argsort()[-n_most_confident:]  # Select top confident indices

#     # Add these points to the training set
#     most_confident_points = X_pool.iloc[most_confident_indices]
#     most_confident_labels = y_pool.iloc[most_confident_indices]
#     X_train_extended = pd.concat([X_initial, most_confident_points])
#     y_train_extended = pd.concat([y_initial, most_confident_labels])

#     # Print class distribution
#     print_class_distribution(y_train_extended)

#     # Retrain the model on the extended training set
#     rfc.fit(X_train_extended, y_train_extended)

#     # Update the pool by excluding the most confident points
#     mask = X_pool.index.isin(most_confident_points.index)
#     X_pool = X_pool[~mask]
#     y_pool = y_pool[~mask]

In [130]:
# Loading a separate test set from another CSV file
test_df = pd.read_csv('Data/Capella5arcmin.csv', sep=';')
length_df = len(test_df)
print("Length of the dataset: ", length_df)

df_cleaned = test_df.dropna()
length_df_cleaned = len(df_cleaned)
print("Length of the cleaned dataset: ", length_df_cleaned)

column_names = df_cleaned.columns
print(column_names)

augmented_test_df = augment_parallax_errors(df_cleaned, anomaly_rate=0.1, error_factor=1.0)
X_test = augmented_test_df.drop('correct/anomalous', axis=1)
y_test = augmented_test_df['correct/anomalous']

Length of the dataset:  921
Length of the cleaned dataset:  741
Index(['correct/anomalous', 'ra', 'dec', 'parallax', 'pmra', 'pmdec',
       'astrometric_excess_noise', 'ruwe', 'phot_g_mean_mag',
       'phot_bp_rp_excess_factor', 'bp_rp'],
      dtype='object')


In [132]:
# Evaluate the model on the new test set
y_pred = rfc.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7584345479082322

Classification Report:
               precision    recall  f1-score   support

           0       0.11      0.19      0.14        74
           1       0.90      0.82      0.86       667

    accuracy                           0.76       741
   macro avg       0.50      0.51      0.50       741
weighted avg       0.82      0.76      0.79       741

