In [35]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [36]:
# LOAD AND PREPROCESS THE DATA
# Load the dataset
df = pd.read_csv('Data/Polaris15arcmin.csv', sep=';')

length_df = len(df)
print("Length of the dataset: ", length_df)

Length of the dataset:  165


In [37]:
# PREPROCESSING
df_cleaned = df.dropna()

length_df_cleaned = len(df_cleaned)
print("Length of the cleaned dataset: ", length_df_cleaned)

Length of the cleaned dataset:  145


In [38]:
def augment_parallax_errors(df, anomaly_rate=0.1, error_factor=1.0):
    """
    Augments the parallax values with errors to simulate anomalies.
    
    Parameters:
    df (DataFrame): A DataFrame containing the original Gaia data.
    anomaly_rate (float): The fraction of total objects that should be anomalous.
    error_factor (float): A factor that determines the magnitude of the error introduced.

    Returns:
    DataFrame: A DataFrame with the augmented parallax data.
    """
    # Make a copy of the DataFrame to avoid altering the original data
    augmented_df = df.copy()
    
    # Calculate the number of objects to alter based on the anomaly rate
    num_anomalies = int(len(df) * anomaly_rate)
    
    # Select random indices for introducing anomalies
    anomaly_indices = np.random.choice(df.index, size=num_anomalies, replace=False)
    
    # Introduce errors in the parallax values
    # The error can be a random value that depends on the error_factor and the current parallax value
    augmented_df.loc[anomaly_indices, 'parallax'] += error_factor * np.random.randn(num_anomalies) * augmented_df.loc[anomaly_indices, 'parallax']
    
    # Mark the augmented objects as anomalous
    augmented_df.loc[anomaly_indices, 'correct/anomalous'] = 0  # Assuming 0 represents anomalous in your dataset
    
    return augmented_df

# Assuming you have a DataFrame `gaia_data` with your data
# augmented_data = augment_parallax_errors(gaia_data, anomaly_rate=0.1, error_factor=0.5)
# print(augmented_data.head())

In [39]:
augmented_df = augment_parallax_errors(df_cleaned, anomaly_rate=0.1, error_factor=1.0)

In [40]:
# Split the dataset into labeled and unlabeled parts for initial training
initial_train_size = 100  # Initial small training set
initial_df = augmented_df.sample(n=initial_train_size)
unlabeled_df = augmented_df.drop(initial_df.index)

In [41]:
# Split initial training data into features and target variable
X_initial = initial_df.drop('correct/anomalous', axis=1)
y_initial = initial_df['correct/anomalous']

In [42]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_initial, y_initial, test_size=0.3, random_state=42)

In [43]:
# Initialize and train the Random Forest Classifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

In [44]:
# Predict probabilities on the unlabeled data
X_unlabeled = unlabeled_df.drop('correct/anomalous', axis=1)
probabilities = rfc.predict_proba(X_unlabeled)

In [45]:
# Calculate uncertainty and select the most uncertain data points
uncertainty = abs(probabilities[:, 1] - 0.5)
n_most_uncertain = 50  # Number of points to acquire
most_uncertain_points = X_unlabeled.iloc[uncertainty.argsort()[:n_most_uncertain]]


In [46]:
# Add these points to the training set
most_uncertain_labels = unlabeled_df.loc[most_uncertain_points.index, 'correct/anomalous']
X_train_extended = pd.concat([X_train, most_uncertain_points])
y_train_extended = pd.concat([y_train, most_uncertain_labels])


In [47]:
# Retrain the model on the extended training set
rfc.fit(X_train_extended, y_train_extended)


In [48]:
# Making predictions on the test set and evaluate the model
y_pred = rfc.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8666666666666667

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.87      1.00      0.93        13

    accuracy                           0.87        15
   macro avg       0.43      0.50      0.46        15
weighted avg       0.75      0.87      0.80        15



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
