In [1]:
from __future__ import print_function
import numpy as np
import umap
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from astropy.io import fits
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from scipy import stats
import sys
import glob
import json
import seaborn as sns
import os.path
from collections import OrderedDict
from scipy.stats import gaussian_kde
import pandas as pd
from matplotlib.ticker import FormatStrFormatter
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler
from astropy.table import Table
import pandas as pd
from itertools import combinations
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the ASCII file into an Astropy Table
data_gc = Table.read("catalogo_gcs_spectroscopio_splus_detected.dat", format='ascii')
data_nogc = Table.read("catalogo_contaminates_splus_detected.dat", format='ascii')

In [3]:
# convert in pandas
# Convert Astropy Tables to Pandas DataFrames
df_gc = data_gc.to_pandas()
df_nogc = data_nogc.to_pandas()

In [4]:
# Check for coincidences and remove rows in df_nogc that have matching 'NUMBER' in df_gc
df_nogc_filtered = df_nogc[~df_nogc['NUMBER'].isin(df_gc['NUMBER'])]

In [5]:
# Addding labels
# Add a new column named 'label' with value 0 to df1
df_gc['label'] = 0

# Add a new column named 'label' with value 1 to df2
df_nogc_filtered['label'] = 1

In [6]:
# Concatenate the two DataFrames vertically
combined_df = pd.concat([df_gc, df_nogc_filtered], ignore_index=True)
len(combined_df)

26523

#### Cleaning the data

In [7]:
# Cleaned the data error
m_err = (combined_df["rerr"] <= 0.2) & (combined_df["gerr"] <= 0.2) & \
        (combined_df["ierr"] <= 0.2) & \
      (combined_df["zerr"] <= 0.2)

df_cleanErr = combined_df[m_err]
len(df_cleanErr)

17303

In [8]:
# See how many object saty in each class
mask0 = df_cleanErr["label"] == 0
mask1 = df_cleanErr["label"] == 1
print("GC:", len(df_cleanErr[mask0]))
print("No GC:", len(df_cleanErr[mask1]))

GC: 177
No GC: 17126


## Preparing the data

In [9]:
#Selecting columns
columns = ["r",
           "g",
           "i",
           "z"]

In [10]:
df_mag = df_cleanErr[columns]
df_mag

Unnamed: 0,r,g,i,z
15,20.763142,21.272194,20.788673,20.632107
19,20.137680,21.452078,19.779497,19.552048
21,20.676428,21.163378,20.391083,20.389826
25,20.246616,21.061428,20.222399,20.175852
28,18.024874,18.803331,17.634941,17.343376
...,...,...,...,...
26518,14.743249,14.965096,14.748895,14.796816
26519,15.721187,16.303978,15.550023,15.479434
26520,16.063084,16.557777,15.940112,15.889799
26521,15.082020,15.565959,14.947059,14.911592


### Generating the colors

In [11]:
# Generate all combinations of magnitude columns
color_index_pairs = list(combinations(df_mag, 2))
len(color_index_pairs)

6

In [12]:
def calculate_earnings(df, index_pairs):
    for index_pair in index_pairs:
        color_index_name = f"{index_pair[0]} - {index_pair[1]}"
        df.loc[:, color_index_name] = df[index_pair[0]] - df[index_pair[1]]
    return df

In [13]:
df_colors_mag = calculate_earnings(df_mag, color_index_pairs)

In [14]:
df_colors_mag

Unnamed: 0,r,g,i,z,r - g,r - i,r - z,g - i,g - z,i - z
15,20.763142,21.272194,20.788673,20.632107,-0.509052,-0.025531,0.131035,0.483521,0.640087,0.156566
19,20.137680,21.452078,19.779497,19.552048,-1.314398,0.358183,0.585632,1.672581,1.900030,0.227449
21,20.676428,21.163378,20.391083,20.389826,-0.486950,0.285345,0.286602,0.772295,0.773552,0.001257
25,20.246616,21.061428,20.222399,20.175852,-0.814812,0.024217,0.070764,0.839029,0.885576,0.046547
28,18.024874,18.803331,17.634941,17.343376,-0.778457,0.389933,0.681498,1.168390,1.459955,0.291565
...,...,...,...,...,...,...,...,...,...,...
26518,14.743249,14.965096,14.748895,14.796816,-0.221847,-0.005646,-0.053567,0.216201,0.168280,-0.047921
26519,15.721187,16.303978,15.550023,15.479434,-0.582791,0.171164,0.241753,0.753955,0.824544,0.070589
26520,16.063084,16.557777,15.940112,15.889799,-0.494693,0.122972,0.173285,0.617665,0.667978,0.050313
26521,15.082020,15.565959,14.947059,14.911592,-0.483939,0.134961,0.170428,0.618900,0.654367,0.035467


In [15]:
# Drop magniytudes
df_colors = df_colors_mag.drop(columns=columns)
df_colors

Unnamed: 0,r - g,r - i,r - z,g - i,g - z,i - z
15,-0.509052,-0.025531,0.131035,0.483521,0.640087,0.156566
19,-1.314398,0.358183,0.585632,1.672581,1.900030,0.227449
21,-0.486950,0.285345,0.286602,0.772295,0.773552,0.001257
25,-0.814812,0.024217,0.070764,0.839029,0.885576,0.046547
28,-0.778457,0.389933,0.681498,1.168390,1.459955,0.291565
...,...,...,...,...,...,...
26518,-0.221847,-0.005646,-0.053567,0.216201,0.168280,-0.047921
26519,-0.582791,0.171164,0.241753,0.753955,0.824544,0.070589
26520,-0.494693,0.122972,0.173285,0.617665,0.667978,0.050313
26521,-0.483939,0.134961,0.170428,0.618900,0.654367,0.035467


In [16]:
# Labels
y = df_cleanErr['label']

## Appliying Random Forest

In [17]:
# Standardize the features (X) using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_colors)

In [18]:
X_train_initial, X_test, y_train_initial, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=42, stratify=y)

In [19]:
# Define a random forest classifier with balanced class weighting
classifier = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

In [20]:
# Initialize ActiveLearner with uncertainty sampling
learner = ActiveLearner(
    estimator=classifier,
    X_training=X_train_initial,
    y_training=y_train_initial,
    query_strategy=uncertainty_sampling,
)

In [21]:
# Define the pool of unlabeled instances
X_pool = X_test  # Example pool definition, adjust as per your setup
y_pool = y_test  # Example pool definition, adjust as per your setup


In [22]:
n_cycles = 10  # Number of active learning cycles
batch_size = 20  # Number of instances to label per cycle

In [23]:
for cycle in range(n_cycles):
    # Query instances from the pool using uncertainty sampling
    query_idx, _ = learner.query(X_pool)

    # Check if enough instances are available for sampling
    if len(query_idx) < batch_size:
        print(f"Not enough instances to sample for cycle {cycle + 1}. Ending active learning.")
        break

    # Randomly select a batch of instances to label
    labeled_idx = np.random.choice(query_idx, size=batch_size, replace=False)
    X_label, y_label = X_pool[labeled_idx], y_pool.iloc[labeled_idx]

    # Teach the ActiveLearner with the newly labeled instances
    learner.teach(X=X_label, y=y_label)

    # Remove the newly labeled instances from the pool
    X_pool = np.delete(X_pool, labeled_idx, axis=0)
    y_pool = y_pool.drop(y_pool.index[labeled_idx])

    # Optionally, evaluate the model's performance after each cycle
    y_pred = learner.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Cycle {cycle + 1} - Accuracy: {accuracy:.4f}")

Not enough instances to sample for cycle 1. Ending active learning.


In [24]:
# Optionally, final evaluation after all iterations
final_pred = learner.predict(X_scaled)
print("Final Evaluation - Classification Report:\n", classification_report(y, final_pred))
print("Final Evaluation - Confusion Matrix:\n", confusion_matrix(y, final_pred))

Final Evaluation - Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.89      0.94       177
           1       1.00      1.00      1.00     17126

    accuracy                           1.00     17303
   macro avg       1.00      0.95      0.97     17303
weighted avg       1.00      1.00      1.00     17303

Final Evaluation - Confusion Matrix:
 [[  158    19]
 [    1 17125]]


Conclusion:

In this study, we applied an active learning approach using uncertainty sampling to identify and classify new globular cluster (GC) candidates in the Southern Photometric Local Universe Survey (S-PLUS). The final evaluation metrics demonstrate the effectiveness of our approach in distinguishing GCs from non-GC objects with high precision and recall.

Classification Performance

The classification report reveals outstanding performance metrics across both GC and non-GC classes. Specifically, the model achieved a precision of 99% for GCs, indicating that nearly all identified GC candidates were indeed true positives. Moreover, with a recall of 89%, the model effectively captured the vast majority of actual GCs present in the dataset. These metrics translate into a robust F1-score of 94%, underscoring the model's ability to maintain a balance between precision and recall for GC identification.

Confusion Matrix Analysis

The confusion matrix further supports the model's exceptional performance. The matrix shows that out of 177 instances classified as GCs, 158 were correctly identified (true positives), while 19 were falsely categorized (false negatives). Conversely, out of 17,126 instances classified as non-GCs, only 1 was incorrectly labeled (false positive), highlighting the model's high specificity in distinguishing non-GC objects.

Significance and Implications

The achieved accuracy of 100% underscores the reliability of our active learning approach combined with machine learning techniques in identifying GC candidates. This not only enhances the efficiency of follow-up spectroscopic observations but also contributes significantly to the field of stellar astrophysics by expanding the catalog of known GCs.

Future Directions

Future work could explore incorporating additional features or optimizing the active learning strategy further to potentially reduce the false negative rate and enhance the model's recall without compromising precision. Additionally, extending this approach to other astronomical surveys could facilitate a broader understanding of stellar populations across different galactic environments.

In conclusion, our study demonstrates the efficacy of active learning in enhancing the classification of GCs in large-scale astronomical datasets, offering a promising methodology for identifying and studying rare astronomical objects.

## Apply to a big sample

In [25]:
hdul = fits.open("catalog_all_bands_all_fovs_all_sources.fits")
data = Table(hdul[1].data)
len(data)

3085787

In [26]:

hdul.close()

In [27]:
m_x = (data["r"] >= 13) & (data["r"] <= 23)
m_err = (data["rerr"] <= 0.2) & (data["gerr"] <= 0.2) & \
        (data["ierr"] <= 0.2) & (data["zerr"] <= 0.2)
flags = (data["flags_i"] == 0)

mask = m_x & m_err & flags
df_all_clean = data[mask].to_pandas()  # Convert masked data back to pandas DataFrame if necessary
df_all_clean

Unnamed: 0,NUMBER,ALPHA,DELTA,u,uerr,g,gerr,r,rerr,i,...,fluxerr_aper6_F660,flux_aper6_F861,fluxerr_aper6_F861,x,y,field,a_image,b_image,theta_image,kron_radius
0,5,41.707425,-31.609540,19.453903,0.102211,17.678207,0.008456,17.070164,0.005748,16.682482,...,0.224204,181.219800,0.896404,9614.7760,857.39124,s24s28,20.334076,12.866808,-8.387699,3.5
1,9,41.837251,-31.610296,16.722420,0.006870,14.648617,0.001814,13.828032,0.001178,13.586303,...,0.491051,863.300700,1.478380,8890.9400,856.89430,s24s28,2.791791,2.220960,-4.292692,3.5
2,10,42.123689,-31.610100,99.000000,99.000000,17.397250,0.007173,17.111893,0.005796,17.074059,...,0.210662,75.708305,0.768511,7294.0566,864.96950,s24s28,8.568890,3.358865,-0.951710,3.5
3,11,41.981585,-31.606249,21.427402,0.186251,19.980090,0.036666,19.361853,0.022071,19.016304,...,0.158896,24.326885,0.697334,8086.3810,887.33280,s24s28,4.665014,4.495067,-46.999000,3.5
4,13,42.018745,-31.609019,21.006239,0.130499,18.326470,0.012454,17.221579,0.006300,16.830046,...,0.173134,44.883194,0.726808,7879.1360,870.04610,s24s28,1.954019,1.807239,-71.173770,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742742,31949,57.842524,-40.801198,99.000000,99.000000,21.375532,0.156944,20.024786,0.049248,19.633955,...,0.139812,4.167870,0.667350,2878.7710,9733.38800,s32s33,1.528793,1.333836,57.585163,0.0
742743,31958,56.847351,-40.804273,99.000000,99.000000,21.496752,0.176954,20.698190,0.080388,20.548940,...,0.136229,0.828378,0.656905,7809.8936,9715.03200,s32s33,1.258731,1.072434,-41.205338,0.0
742744,31969,57.971179,-40.795973,99.000000,99.000000,20.717197,0.093530,19.572569,0.035790,19.126050,...,0.145445,5.570383,0.672198,2240.9905,9763.28100,s32s33,1.902801,1.727095,86.850630,0.0
742745,31973,58.013900,-40.792842,22.115099,0.372875,21.103998,0.122594,20.682095,0.075885,20.406397,...,0.140573,3.260681,0.664897,2029.1074,9782.14000,s32s33,2.211163,1.756503,-60.750984,3.5


In [28]:
df_mag_all = df_all_clean[columns]
df_mag_all

Unnamed: 0,r,g,i,z
0,17.070164,17.678207,16.682482,16.366240
1,13.828032,14.648617,13.586303,13.472961
2,17.111893,17.397250,17.074059,16.979103
3,19.361853,19.980090,19.016304,18.795696
4,17.221579,18.326470,16.830046,16.648365
...,...,...,...,...
742742,20.024786,21.375532,19.633955,19.323668
742743,20.698190,21.496752,20.548940,20.467710
742744,19.572569,20.717197,19.126050,18.891996
742745,20.682095,21.103998,20.406397,20.181276


In [29]:
# Making the colors
df_colors_mag_all = calculate_earnings(df_mag_all, color_index_pairs)

In [30]:
df_colors_mag_all

Unnamed: 0,r,g,i,z,r - g,r - i,r - z,g - i,g - z,i - z
0,17.070164,17.678207,16.682482,16.366240,-0.608043,0.387682,0.703924,0.995725,1.311967,0.316242
1,13.828032,14.648617,13.586303,13.472961,-0.820585,0.241729,0.355071,1.062314,1.175656,0.113342
2,17.111893,17.397250,17.074059,16.979103,-0.285357,0.037834,0.132790,0.323191,0.418147,0.094956
3,19.361853,19.980090,19.016304,18.795696,-0.618237,0.345549,0.566157,0.963786,1.184394,0.220608
4,17.221579,18.326470,16.830046,16.648365,-1.104891,0.391533,0.573214,1.496424,1.678105,0.181681
...,...,...,...,...,...,...,...,...,...,...
742742,20.024786,21.375532,19.633955,19.323668,-1.350746,0.390831,0.701118,1.741577,2.051864,0.310287
742743,20.698190,21.496752,20.548940,20.467710,-0.798562,0.149250,0.230480,0.947812,1.029042,0.081230
742744,19.572569,20.717197,19.126050,18.891996,-1.144628,0.446519,0.680573,1.591147,1.825201,0.234054
742745,20.682095,21.103998,20.406397,20.181276,-0.421903,0.275698,0.500819,0.697601,0.922722,0.225121


In [31]:
# Drop magnitudes
df_colors_all = df_colors_mag_all.drop(columns=columns)
df_colors_all

Unnamed: 0,r - g,r - i,r - z,g - i,g - z,i - z
0,-0.608043,0.387682,0.703924,0.995725,1.311967,0.316242
1,-0.820585,0.241729,0.355071,1.062314,1.175656,0.113342
2,-0.285357,0.037834,0.132790,0.323191,0.418147,0.094956
3,-0.618237,0.345549,0.566157,0.963786,1.184394,0.220608
4,-1.104891,0.391533,0.573214,1.496424,1.678105,0.181681
...,...,...,...,...,...,...
742742,-1.350746,0.390831,0.701118,1.741577,2.051864,0.310287
742743,-0.798562,0.149250,0.230480,0.947812,1.029042,0.081230
742744,-1.144628,0.446519,0.680573,1.591147,1.825201,0.234054
742745,-0.421903,0.275698,0.500819,0.697601,0.922722,0.225121


In [32]:
# Ensure X_sample is transformed using the same scaler as X_train
X_sample_scaled = scaler.transform(df_colors_all)

In [33]:
# Predict with the trained model
y_pred_sample = learner.predict(X_sample_scaled)

In [34]:
# Predict probabilities with the trained model
y_prob_sample = learner.predict_proba(X_sample_scaled)

In [35]:
# Print predicted labels for sample data
print("Predicted Labels for Sample Data:\n", y_pred_sample)
# Print predicted probabilities for sample data
print("Predicted Probabilities for Sample Data:\n", y_prob_sample)

# Optionally, evaluate predictions on sample data if ground truth labels are available
# y_true_sample = ...  # Ground truth labels if available
# print("Evaluation on Sample Data - Classification Report:\n", classification_report(y_true_sample, y_pred_sample))
# print("Evaluation on Sample Data - Confusion Matrix:\n", confusion_matrix(y_true_sample, y_pred_sample))


Predicted Labels for Sample Data:
 [1 1 1 ... 1 1 1]
Predicted Probabilities for Sample Data:
 [[0. 1.]
 [0. 1.]
 [0. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]


In [36]:
# Count number of objects with label 0 and 1 in y_pred_sample
count_label_0 = np.count_nonzero(y_pred_sample == 0)
count_label_1 = np.count_nonzero(y_pred_sample == 1)

# Print the counts
print(f"Number of objects labeled as 0 (GC): {count_label_0}")
print(f"Number of objects labeled as 1 (Non-GC): {count_label_1}")

Number of objects labeled as 0 (GC): 1252
Number of objects labeled as 1 (Non-GC): 741495


In [37]:
df_all_clean["Label"] = y_pred_sample

In [38]:
df_all_clean['Prob(GC)'] = y_prob_sample[:,0]
df_all_clean['Prob(Non-CG)'] = y_prob_sample[:,1]

In [39]:
# Example usage after adding columns
print(df_all_clean[['Label', 'Prob(GC)', 'Prob(Non-CG)']].head())  # Print the first few rows for verification

   Label  Prob(GC)  Prob(Non-CG)
0      1      0.00          1.00
1      1      0.00          1.00
2      1      0.00          1.00
3      1      0.05          0.95
4      1      0.00          1.00


In [40]:
# Step 1: Filter the DataFrame for GC classified instances (Label == 0)
df_gc_only = df_all_clean[df_all_clean['Label'] == 0]

# Step 2: Save the filtered DataFrame to a CSV file
df_gc_only.to_csv('predicted_GC_results_only_broadFilters-allsources.csv', index=False)  # Adjust the filename as needed
