In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import glob
import matplotlib.pyplot as plt

In [2]:
# LOAD AND PREPROCESS THE DATA
# Collect every CSV in the data directory. glob makes no ordering guarantee,
# so the paths are sorted to keep the row order of the combined frame (and
# therefore any seeded sampling done on it later) the same on every run.
file_paths = sorted(glob.glob('Data/CSV/*.csv'))

# Fail early with an actionable message instead of pandas' generic
# "No objects to concatenate" error when the pattern matches nothing.
if not file_paths:
    raise ValueError("No CSV files found matching 'Data/CSV/*.csv'")

# Read each semicolon-separated CSV into its own DataFrame.
# NOTE: after the loop `df` stays bound to the LAST file only; use
# `concatenated_df` for anything that should see the full dataset.
dfs = []
for file_path in file_paths:
    df = pd.read_csv(file_path, sep=';')
    dfs.append(df)

# Concatenate all the DataFrames into one, renumbering the rows from 0
concatenated_df = pd.concat(dfs, ignore_index=True)

# Preview the first rows only rather than dumping the whole frame
print(concatenated_df.head())

length_df = len(concatenated_df)
print("Length of the dataset: ", length_df)

Length of the dataset:  1406


In [3]:
# Drop incomplete rows from the *combined* dataset. `df` is only the loop
# variable of the loading cell, i.e. the last CSV that was read, so cleaning
# it would silently discard every other input file.
df_cleaned = concatenated_df.dropna()

length_df_cleaned = len(df_cleaned)
print("Length of the cleaned dataset: ", length_df_cleaned)

Length of the cleaned dataset:  1248


In [4]:
def augment_parallax_errors(df, anomaly_rate=0.1, error_factor=1.0, random_state=None):
    """
    Return a copy of ``df`` in which a random subset of rows has a corrupted parallax.

    For each selected row the parallax becomes
    ``parallax + error_factor * z * parallax`` with ``z`` drawn from a standard
    normal distribution, and the row's ``'correct/anomalous'`` value is set to 0.
    The input frame itself is not modified.

    Parameters:
    df (DataFrame): Source data; must contain a ``'parallax'`` column. Rows are
        selected by index label, so the labels in ``df.index`` are assumed to
        be unique.
    anomaly_rate (float): Fraction of rows to corrupt, in [0, 1]. The number of
        rows is ``int(len(df) * anomaly_rate)``, i.e. rounded down.
    error_factor (float): Scale of the relative error added to the parallax.
    random_state (int or None): Seed for a private random generator. When None
        (the default) NumPy's global random state is used, so the outcome is
        governed by ``np.random.seed`` exactly as before this parameter existed.

    Returns:
    DataFrame: The augmented copy of ``df``.

    Raises:
    ValueError: If ``anomaly_rate`` is outside [0, 1].
    KeyError: If ``df`` has no ``'parallax'`` column.
    """
    # Validate up front so mistakes surface with a clear message rather than
    # as an obscure error from the sampling or indexing calls below.
    if not 0 <= anomaly_rate <= 1:
        raise ValueError(f"anomaly_rate must be between 0 and 1, got {anomaly_rate!r}")
    if 'parallax' not in df.columns:
        raise KeyError('parallax')

    # The np.random module and a RandomState instance expose the same
    # choice/randn API; using the module keeps the legacy global-seed behaviour.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Work on a copy to avoid altering the caller's data
    augmented_df = df.copy()

    # Number of rows to corrupt, then which rows (no row is picked twice)
    num_anomalies = int(len(df) * anomaly_rate)
    anomaly_indices = rng.choice(df.index, size=num_anomalies, replace=False)

    # Relative Gaussian error: scaled by error_factor and by the current parallax
    relative_error = error_factor * rng.randn(num_anomalies)
    selected_parallax = augmented_df.loc[anomaly_indices, 'parallax']
    augmented_df.loc[anomaly_indices, 'parallax'] = (
        selected_parallax + relative_error * selected_parallax
    )

    # Mark the corrupted rows with label 0 (the value this notebook uses for anomalous)
    augmented_df.loc[anomaly_indices, 'correct/anomalous'] = 0

    return augmented_df

# Assuming you have a DataFrame `gaia_data` with your data
# augmented_data = augment_parallax_errors(gaia_data, anomaly_rate=0.1, error_factor=0.5)
# print(augmented_data.head())

In [5]:
# Seed NumPy's global generator so the randomly chosen anomalies (and every
# number derived from them further down) are the same each time this cell runs.
np.random.seed(42)
augmented_df = augment_parallax_errors(df_cleaned, anomaly_rate=0.1, error_factor=1.0)

In [6]:
# Number of rows drawn for training; adjust as needed
sample_size = 500

# A fixed random_state keeps the drawn subset identical across runs
train_sample = augmented_df.sample(random_state=42, n=sample_size)

# Report how many rows ended up in the training sample
length_sampled_df = train_sample.shape[0]
print("Length of the sampled dataset: ", length_sampled_df)

Length of the sampled dataset:  500


In [7]:
# Show the schema of the combined dataset. `df` is just the leftover loop
# variable from the loading cell (the last CSV read), so it is not used here.
column_names = concatenated_df.columns
print(column_names)

Index(['correct/anomalous', 'ra', 'dec', 'parallax', 'pmra', 'pmdec',
       'astrometric_excess_noise', 'ruwe', 'phot_g_mean_mag',
       'phot_bp_rp_excess_factor', 'bp_rp'],
      dtype='object')


In [8]:
# Separate the label column from the predictor columns of the training sample
target_column = 'correct/anomalous'
y_train = train_sample[target_column]
X_train = train_sample.drop(columns=target_column)

In [9]:
# Count how many training rows carry each label value and print the tally
class_counts = y_train.value_counts()
print("Class distribution in training set:\n", class_counts)

Class distribution in training set:
 correct/anomalous
1    456
0     44
Name: count, dtype: int64


In [10]:
# Fit a Random Forest on the training sample. random_state pins the bootstrap
# and feature sub-sampling so the fitted model and its scores are reproducible.
# NOTE(review): the class counts printed above are heavily skewed towards
# label 1; consider class_weight='balanced' if label 0 keeps being missed.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [11]:
# Read the test set, which lives in its own semicolon-separated CSV file
test_csv_path = 'Data/Capella5arcmin.csv'
test_df = pd.read_csv(test_csv_path, sep=';')

# Keep only the rows that have no missing values in any column
test_df_cleaned = test_df.dropna(how='any')


In [12]:
# Inject synthetic anomalies into half of the test rows so that label 0 is
# present at evaluation time. Seeding NumPy's global generator first makes
# the corrupted rows, and hence the reported metrics, repeatable.
np.random.seed(123)
augmented_test_df = augment_parallax_errors(test_df_cleaned, anomaly_rate=0.5, error_factor=1.0)

In [13]:
# Pull the label column out of the augmented test set; the rest are predictors
y_test = augmented_test_df['correct/anomalous']
X_test = augmented_test_df.drop(columns=['correct/anomalous'])

In [14]:
# Predict labels for the test features with the fitted forest
y_pred = rfc.predict(X_test)

# Overall fraction of test rows whose predicted label matches the true one
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Per-class precision / recall / F1 summary
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)

Accuracy: 0.4993252361673414

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       370
           1       0.50      1.00      0.67       371

    accuracy                           0.50       741
   macro avg       0.25      0.50      0.33       741
weighted avg       0.25      0.50      0.33       741

