In [4]:
from __future__ import print_function
import numpy as np
import umap
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from astropy.io import fits
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from scipy import stats
import sys
import glob
import json
import seaborn as sns
import os.path
from collections import OrderedDict
from scipy.stats import gaussian_kde
import pandas as pd
from matplotlib.ticker import FormatStrFormatter
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler
from astropy.table import Table
import pandas as pd
from itertools import combinations
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Read the ASCII file into an Astropy Table
data_gc = Table.read("catalogo_gcs_spectroscopio_splus_detected.dat", format='ascii')
data_nogc = Table.read("catalogo_contaminates_splus_detected.dat", format='ascii')

In [7]:
# convert in pandas
# Convert Astropy Tables to Pandas DataFrames
df_gc = data_gc.to_pandas()
df_nogc = data_nogc.to_pandas()

In [8]:
# Check for coincidences and remove rows in df_nogc that have matching 'NUMBER' in df_gc
df_nogc_filtered = df_nogc[~df_nogc['NUMBER'].isin(df_gc['NUMBER'])]

In [9]:
# Addding labels
# Add a new column named 'label' with value 0 to df1
df_gc['label'] = 0

# Add a new column named 'label' with value 1 to df2
df_nogc_filtered['label'] = 1

In [10]:
# Concatenate the two DataFrames vertically
combined_df = pd.concat([df_gc, df_nogc_filtered], ignore_index=True)
len(combined_df)

26523

#### Cleaning the data

In [11]:
# Cleaned the data error
m_err = (combined_df["rerr"] <= 0.3) & (combined_df["gerr"] <= 0.3) & \
        (combined_df["ierr"] <= 0.3) & \
         (combined_df["F430err"] <= 0.3) & \
        (combined_df["F515err"] <= 0.3) & (combined_df["F660err"] <= 0.3) & \
        (combined_df["F861err"] <= 0.3) & (combined_df["zerr"] <= 0.3)



df_cleanErr = combined_df[m_err]
len(df_cleanErr)

16659

In [12]:
# See how many object saty in each class
mask0 = df_cleanErr["label"] == 0
mask1 = df_cleanErr["label"] == 1
print("GC:", len(df_cleanErr[mask0]))
print("No GC:", len(df_cleanErr[mask1]))

GC: 73
No GC: 16586


## Preparing the data

In [13]:
#Selecting columns
columns = ["r",
"g",
"i",
"z",
"F430",
"F515",
"F660",
"F861"]

In [14]:
df_mag = df_cleanErr[columns]
df_mag

Unnamed: 0,r,g,i,z,F430,F515,F660,F861
18,20.771109,21.634192,20.403067,20.642290,21.397750,21.053720,20.602430,20.654783
19,20.137680,21.452078,19.779497,19.552048,21.201336,20.714169,20.130114,19.531912
25,20.246616,21.061428,20.222399,20.175852,20.992937,20.684935,20.479723,19.814960
28,18.024874,18.803331,17.634941,17.343376,19.499080,18.669243,17.865160,17.431444
38,19.519290,20.294525,19.167469,19.019533,20.706673,20.065145,19.382828,19.034452
...,...,...,...,...,...,...,...,...
26518,14.743249,14.965096,14.748895,14.796816,15.080612,14.838267,14.801393,14.790771
26519,15.721187,16.303978,15.550023,15.479434,16.811510,16.123583,15.673627,15.517522
26520,16.063084,16.557777,15.940112,15.889799,16.958448,16.393812,16.033278,15.924335
26521,15.082020,15.565959,14.947059,14.911592,15.949590,15.412489,15.070141,14.933365


### Generating the colors

In [15]:
# Generate all combinations of magnitude columns
color_index_pairs = list(combinations(df_mag, 2))
len(color_index_pairs)

28

In [16]:
def calculate_earnings(df, index_pairs):
    for index_pair in index_pairs:
        color_index_name = f"{index_pair[0]} - {index_pair[1]}"
        df.loc[:, color_index_name] = df[index_pair[0]] - df[index_pair[1]]
    return df

In [17]:
df_colors_mag = calculate_earnings(df_mag, color_index_pairs)

In [18]:
df_colors_mag

Unnamed: 0,r,g,i,z,F430,F515,F660,F861,r - g,r - i,...,z - F430,z - F515,z - F660,z - F861,F430 - F515,F430 - F660,F430 - F861,F515 - F660,F515 - F861,F660 - F861
18,20.771109,21.634192,20.403067,20.642290,21.397750,21.053720,20.602430,20.654783,-0.863083,0.368042,...,-0.755460,-0.411430,0.039860,-0.012493,0.344030,0.795320,0.742967,0.451290,0.398937,-0.052353
19,20.137680,21.452078,19.779497,19.552048,21.201336,20.714169,20.130114,19.531912,-1.314398,0.358183,...,-1.649288,-1.162121,-0.578066,0.020136,0.487167,1.071222,1.669424,0.584055,1.182257,0.598202
25,20.246616,21.061428,20.222399,20.175852,20.992937,20.684935,20.479723,19.814960,-0.814812,0.024217,...,-0.817085,-0.509083,-0.303871,0.360892,0.308002,0.513214,1.177977,0.205212,0.869975,0.664763
28,18.024874,18.803331,17.634941,17.343376,19.499080,18.669243,17.865160,17.431444,-0.778457,0.389933,...,-2.155704,-1.325867,-0.521784,-0.088068,0.829837,1.633920,2.067636,0.804083,1.237799,0.433716
38,19.519290,20.294525,19.167469,19.019533,20.706673,20.065145,19.382828,19.034452,-0.775235,0.351821,...,-1.687140,-1.045612,-0.363295,-0.014919,0.641528,1.323845,1.672221,0.682317,1.030693,0.348376
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26518,14.743249,14.965096,14.748895,14.796816,15.080612,14.838267,14.801393,14.790771,-0.221847,-0.005646,...,-0.283796,-0.041451,-0.004577,0.006044,0.242345,0.279219,0.289841,0.036874,0.047496,0.010622
26519,15.721187,16.303978,15.550023,15.479434,16.811510,16.123583,15.673627,15.517522,-0.582791,0.171164,...,-1.332076,-0.644149,-0.194193,-0.038088,0.687927,1.137883,1.293988,0.449956,0.606061,0.156105
26520,16.063084,16.557777,15.940112,15.889799,16.958448,16.393812,16.033278,15.924335,-0.494693,0.122972,...,-1.068649,-0.504013,-0.143479,-0.034536,0.564636,0.925170,1.034113,0.360534,0.469477,0.108943
26521,15.082020,15.565959,14.947059,14.911592,15.949590,15.412489,15.070141,14.933365,-0.483939,0.134961,...,-1.037998,-0.500897,-0.158549,-0.021773,0.537101,0.879449,1.016225,0.342348,0.479124,0.136776


In [19]:
# Drop magniytudes
df_colors = df_colors_mag.drop(columns=columns)
df_colors

Unnamed: 0,r - g,r - i,r - z,r - F430,r - F515,r - F660,r - F861,g - i,g - z,g - F430,...,z - F430,z - F515,z - F660,z - F861,F430 - F515,F430 - F660,F430 - F861,F515 - F660,F515 - F861,F660 - F861
18,-0.863083,0.368042,0.128819,-0.626641,-0.282611,0.168679,0.116326,1.231125,0.991902,0.236442,...,-0.755460,-0.411430,0.039860,-0.012493,0.344030,0.795320,0.742967,0.451290,0.398937,-0.052353
19,-1.314398,0.358183,0.585632,-1.063656,-0.576489,0.007566,0.605768,1.672581,1.900030,0.250742,...,-1.649288,-1.162121,-0.578066,0.020136,0.487167,1.071222,1.669424,0.584055,1.182257,0.598202
25,-0.814812,0.024217,0.070764,-0.746321,-0.438319,-0.233107,0.431656,0.839029,0.885576,0.068491,...,-0.817085,-0.509083,-0.303871,0.360892,0.308002,0.513214,1.177977,0.205212,0.869975,0.664763
28,-0.778457,0.389933,0.681498,-1.474206,-0.644369,0.159714,0.593430,1.168390,1.459955,-0.695749,...,-2.155704,-1.325867,-0.521784,-0.088068,0.829837,1.633920,2.067636,0.804083,1.237799,0.433716
38,-0.775235,0.351821,0.499757,-1.187383,-0.545855,0.136462,0.484838,1.127056,1.274992,-0.412148,...,-1.687140,-1.045612,-0.363295,-0.014919,0.641528,1.323845,1.672221,0.682317,1.030693,0.348376
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26518,-0.221847,-0.005646,-0.053567,-0.337363,-0.095018,-0.058144,-0.047522,0.216201,0.168280,-0.115516,...,-0.283796,-0.041451,-0.004577,0.006044,0.242345,0.279219,0.289841,0.036874,0.047496,0.010622
26519,-0.582791,0.171164,0.241753,-1.090323,-0.402396,0.047560,0.203665,0.753955,0.824544,-0.507532,...,-1.332076,-0.644149,-0.194193,-0.038088,0.687927,1.137883,1.293988,0.449956,0.606061,0.156105
26520,-0.494693,0.122972,0.173285,-0.895364,-0.330728,0.029806,0.138749,0.617665,0.667978,-0.400671,...,-1.068649,-0.504013,-0.143479,-0.034536,0.564636,0.925170,1.034113,0.360534,0.469477,0.108943
26521,-0.483939,0.134961,0.170428,-0.867570,-0.330469,0.011879,0.148655,0.618900,0.654367,-0.383631,...,-1.037998,-0.500897,-0.158549,-0.021773,0.537101,0.879449,1.016225,0.342348,0.479124,0.136776


In [20]:
# Labels
y = df_cleanErr['label']

## Appliying Random Forest

In [21]:
# Standardize the features (X) using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_colors)

In [22]:
# Increase the size of initial training set relative to X_pool
X_train, X_pool, y_train, y_pool = train_test_split(X_scaled, y, test_size=0.1, random_state=42)

In [23]:
# Initialize a random forest classifier with class weighting
classifier = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

In [24]:
# Initialize ActiveLearner with uncertainty sampling
learner = ActiveLearner(
    estimator=classifier,
    X_training=X_train, y_training=y_train,
    query_strategy=uncertainty_sampling,
)

In [25]:
# Define the number of iterations for active learning
iterations = 10  # Adjust as needed

In [26]:
# Active learning loop
for iteration in range(iterations):
    # Query the instances from the pool to be labeled using uncertainty_sampling
    query_idx, query_instance = learner.query(X_pool)
    
    # Check if there are enough instances to sample
    if len(query_idx) < 5:  # Adjust the batch size as needed
        print(f"Not enough instances to sample for iteration {iteration + 1}. Ending active learning.")
        break
    
    # Simulate or perform manual labeling (replace with actual labeling process)
    labeled_idx = np.random.choice(query_idx, size=min(5, len(query_idx)), replace=False)
    X_label, y_label = X_pool[labeled_idx], y_pool[labeled_idx]

    # Teach the ActiveLearner with the newly labeled instances
    learner.teach(X=X_label, y=y_label)

    # Remove the newly labeled instances from the pool
    X_pool = np.delete(X_pool, labeled_idx, axis=0)
    y_pool = np.delete(y_pool, labeled_idx, axis=0)

    # Optionally, evaluate the model's performance after each iteration
    y_pred = learner.predict(X_scaled)  # Predict on entire dataset for evaluation
    print(f"Iteration {iteration + 1} - Classification Report:\n", classification_report(y, y_pred))
    print(f"Iteration {iteration + 1} - Confusion Matrix:\n", confusion_matrix(y, y_pred))

Not enough instances to sample for iteration 1. Ending active learning.


In [27]:
# Optionally, final evaluation after all iterations
final_pred = learner.predict(X_scaled)
print("Final Evaluation - Classification Report:\n", classification_report(y, final_pred))
print("Final Evaluation - Confusion Matrix:\n", confusion_matrix(y, final_pred))

Final Evaluation - Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.90      0.95        73
           1       1.00      1.00      1.00     16586

    accuracy                           1.00     16659
   macro avg       1.00      0.95      0.97     16659
weighted avg       1.00      1.00      1.00     16659

Final Evaluation - Confusion Matrix:
 [[   66     7]
 [    0 16586]]


Conclusion:

The classification model achieved outstanding performance in identifying Globular Clusters (GC) within the dataset. With an overall accuracy of 100%, the model demonstrated robust capability in distinguishing GC from Non-GC objects. Specifically:

    Precision and Recall:
        For GC (Class 0), the model achieved a precision of 100%, indicating that all objects classified as GC were indeed GC. The recall of 90% suggests that 90% of all actual GC objects were correctly identified by the model.
        For Non-GC (Class 1), the precision and recall were both 100%, indicating perfect identification of Non-GC objects.

    F1-Score:
        The weighted average F1-score of 1.00 indicates excellent balance between precision and recall across both classes, reflecting high model performance.

    Confusion Matrix:
        The confusion matrix illustrates that out of the 73 GC objects, 66 were correctly classified, with 7 misclassifications as Non-GC. Importantly, there were no False Negatives (GC misclassified as Non-GC), and a large majority of the Non-GC objects (16,586 out of 16,593) were correctly identified.

Overall, these results indicate that the model is highly effective in identifying GC objects based on the features used for training. Further validation and testing on independent datasets would confirm the generalizability and reliability of the model in practical astronomical applications.

## Apply to a big sample

In [None]:
# Example new sample data (X_sample) - Make sure X_sample is preprocessed similarly to X_train
data = Table.read("catalog_all_bands_all_fovs_all_sources_106_rband.dat", format='ascii')

In [136]:
df_all = data.to_pandas()

In [98]:
df_all

Unnamed: 0,NUMBER,ALPHA,DELTA,u,uerr,g,gerr,r,rerr,i,...,fwhm_r,fwhm_psf_r,ellog_r,ellip_r,class_r,spread_r,flux_radius_r,area_r,mumax_r,kron_radius_r
0,5,41.707425,-31.609540,19.453903,0.102211,17.678207,0.008456,17.070164,0.005748,16.682482,...,28.233664,1.35,1.580351,0.367229,0.028626,0.024721,10.207849,1979,18.582102,3.500000
1,6,42.169686,-31.536274,20.278736,0.066860,18.145863,0.010459,17.325224,0.006315,16.626760,...,35.868310,1.35,5.854529,0.829192,0.028549,0.016807,15.456670,2995,19.180944,3.500000
2,9,41.837251,-31.610296,16.722420,0.006870,14.648617,0.001814,13.828032,0.001178,13.586303,...,3.187606,1.35,1.257020,0.204468,0.999676,0.000744,1.706050,534,15.280809,3.500000
3,10,42.123689,-31.610100,99.000000,99.000000,17.397250,0.007173,17.111893,0.005796,17.074059,...,15.544360,1.35,2.551127,0.608016,0.028643,0.030338,4.576429,684,18.362354,3.500000
4,11,41.981585,-31.606249,21.427402,0.186251,19.980090,0.036666,19.361853,0.022071,19.016304,...,11.915299,1.35,1.037807,0.036430,0.000408,0.025243,6.217256,362,21.232092,3.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1749399,31975,58.172880,-40.790009,21.073397,0.153950,19.439800,0.038556,18.584753,0.019846,18.299133,...,2.829213,1.46,1.049959,0.047582,0.999702,0.000744,1.639664,53,19.885939,3.500000
1749400,31976,56.880399,-40.744566,99.000000,99.000000,21.011417,0.114948,20.388622,0.061590,20.034092,...,14.759645,1.46,1.708737,0.414772,0.000335,0.019123,9.469728,76,22.024738,6.373395
1749401,31979,57.352097,-40.795468,99.000000,99.000000,99.000000,99.000000,21.529171,0.152961,21.147512,...,8.299409,1.46,1.246947,0.198041,0.008161,0.009048,2.399784,17,22.620626,3.500000
1749402,31980,57.672448,-40.797118,99.000000,99.000000,99.000000,99.000000,22.054295,0.249689,21.836752,...,5.348808,1.46,1.512234,0.338726,0.478432,-0.008490,1.922813,4,22.879822,5.111665


In [137]:
# Cleaned the data
m_x =  (df_all["r"] >= 13) & (df_all["r"] <= 23)
m_err = (df_all["rerr"] <= 0.3) & (df_all["gerr"] <= 0.3) & \
        (df_all["ierr"] <= 0.3) & \
        (df_all["F430err"] <= 0.3) & \
        (df_all["F515err"] <= 0.3) &  (df_all["F660err"] <= 0.3) & \
        (df_all["F861err"] <= 0.3) &  (df_all["zerr"] <= 0.3)

flags = (df_all["flags_r"] == 0) & (df_all["flags_i"] == 0)

mask = m_x & m_err & flags

df_all_clean = df_all[mask]
len(df_all_clean)

327079

In [138]:
df_mag_all = df_all_clean[columns]
df_mag_all

Unnamed: 0,r,g,i,z,F430,F515,F660,F861
0,17.070164,17.678207,16.682482,16.366240,16.828392,17.974620,17.164143,16.585669
2,13.828032,14.648617,13.586303,13.472961,15.245046,14.608890,13.802773,13.481914
3,17.111893,17.397250,17.074059,16.979103,16.385063,17.409624,17.408487,17.508982
4,19.361853,19.980090,19.016304,18.795696,20.733458,19.546267,19.256630,18.891642
5,17.221579,18.326470,16.830046,16.648365,19.242409,18.357973,17.074144,16.697836
...,...,...,...,...,...,...,...,...
1749373,18.763690,19.661716,18.455479,18.277122,20.111898,19.648975,18.638176,18.348700
1749374,18.142593,18.686644,17.960854,17.891920,19.070671,18.506151,18.122720,17.877625
1749375,18.275616,19.581072,17.622740,17.352358,20.124254,19.617146,18.030436,17.426100
1749382,14.789453,15.418627,14.624699,14.555744,15.922544,15.268383,14.743403,14.579458


In [139]:
# Making the colors
df_colors_mag_all = calculate_earnings(df_mag_all, color_index_pairs)

In [140]:
df_colors_mag_all

Unnamed: 0,r,g,i,z,F430,F515,F660,F861,r - g,r - i,...,z - F430,z - F515,z - F660,z - F861,F430 - F515,F430 - F660,F430 - F861,F515 - F660,F515 - F861,F660 - F861
0,17.070164,17.678207,16.682482,16.366240,16.828392,17.974620,17.164143,16.585669,-0.608043,0.387682,...,-0.462152,-1.608380,-0.797903,-0.219429,-1.146228,-0.335751,0.242723,0.810477,1.388951,0.578474
2,13.828032,14.648617,13.586303,13.472961,15.245046,14.608890,13.802773,13.481914,-0.820585,0.241729,...,-1.772085,-1.135929,-0.329812,-0.008953,0.636156,1.442273,1.763132,0.806117,1.126976,0.320859
3,17.111893,17.397250,17.074059,16.979103,16.385063,17.409624,17.408487,17.508982,-0.285357,0.037834,...,0.594040,-0.430521,-0.429384,-0.529879,-1.024561,-1.023424,-1.123919,0.001137,-0.099358,-0.100495
4,19.361853,19.980090,19.016304,18.795696,20.733458,19.546267,19.256630,18.891642,-0.618237,0.345549,...,-1.937762,-0.750571,-0.460934,-0.095946,1.187191,1.476828,1.841816,0.289637,0.654625,0.364988
5,17.221579,18.326470,16.830046,16.648365,19.242409,18.357973,17.074144,16.697836,-1.104891,0.391533,...,-2.594044,-1.709608,-0.425779,-0.049471,0.884436,2.168265,2.544573,1.283829,1.660137,0.376308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1749373,18.763690,19.661716,18.455479,18.277122,20.111898,19.648975,18.638176,18.348700,-0.898026,0.308211,...,-1.834776,-1.371853,-0.361054,-0.071578,0.462923,1.473722,1.763198,1.010799,1.300275,0.289476
1749374,18.142593,18.686644,17.960854,17.891920,19.070671,18.506151,18.122720,17.877625,-0.544051,0.181739,...,-1.178751,-0.614231,-0.230800,0.014295,0.564520,0.947951,1.193046,0.383431,0.628526,0.245095
1749375,18.275616,19.581072,17.622740,17.352358,20.124254,19.617146,18.030436,17.426100,-1.305456,0.652876,...,-2.771896,-2.264788,-0.678078,-0.073742,0.507108,2.093818,2.698154,1.586710,2.191046,0.604336
1749382,14.789453,15.418627,14.624699,14.555744,15.922544,15.268383,14.743403,14.579458,-0.629174,0.164754,...,-1.366800,-0.712639,-0.187658,-0.023714,0.654161,1.179142,1.343086,0.524980,0.688925,0.163944


In [141]:
# Drop magnitudes
df_colors_all = df_colors_mag_all.drop(columns=columns)
df_colors_all

Unnamed: 0,r - g,r - i,r - z,r - F430,r - F515,r - F660,r - F861,g - i,g - z,g - F430,...,z - F430,z - F515,z - F660,z - F861,F430 - F515,F430 - F660,F430 - F861,F515 - F660,F515 - F861,F660 - F861
0,-0.608043,0.387682,0.703924,0.241772,-0.904456,-0.093979,0.484495,0.995725,1.311967,0.849815,...,-0.462152,-1.608380,-0.797903,-0.219429,-1.146228,-0.335751,0.242723,0.810477,1.388951,0.578474
2,-0.820585,0.241729,0.355071,-1.417014,-0.780858,0.025259,0.346118,1.062314,1.175656,-0.596429,...,-1.772085,-1.135929,-0.329812,-0.008953,0.636156,1.442273,1.763132,0.806117,1.126976,0.320859
3,-0.285357,0.037834,0.132790,0.726830,-0.297731,-0.296594,-0.397089,0.323191,0.418147,1.012187,...,0.594040,-0.430521,-0.429384,-0.529879,-1.024561,-1.023424,-1.123919,0.001137,-0.099358,-0.100495
4,-0.618237,0.345549,0.566157,-1.371605,-0.184414,0.105223,0.470211,0.963786,1.184394,-0.753368,...,-1.937762,-0.750571,-0.460934,-0.095946,1.187191,1.476828,1.841816,0.289637,0.654625,0.364988
5,-1.104891,0.391533,0.573214,-2.020830,-1.136394,0.147435,0.523743,1.496424,1.678105,-0.915939,...,-2.594044,-1.709608,-0.425779,-0.049471,0.884436,2.168265,2.544573,1.283829,1.660137,0.376308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1749373,-0.898026,0.308211,0.486568,-1.348208,-0.885285,0.125514,0.414990,1.206237,1.384594,-0.450182,...,-1.834776,-1.371853,-0.361054,-0.071578,0.462923,1.473722,1.763198,1.010799,1.300275,0.289476
1749374,-0.544051,0.181739,0.250673,-0.928078,-0.363558,0.019873,0.264968,0.725790,0.794724,-0.384027,...,-1.178751,-0.614231,-0.230800,0.014295,0.564520,0.947951,1.193046,0.383431,0.628526,0.245095
1749375,-1.305456,0.652876,0.923258,-1.848638,-1.341530,0.245180,0.849516,1.958332,2.228714,-0.543182,...,-2.771896,-2.264788,-0.678078,-0.073742,0.507108,2.093818,2.698154,1.586710,2.191046,0.604336
1749382,-0.629174,0.164754,0.233709,-1.133091,-0.478930,0.046050,0.209995,0.793928,0.862883,-0.503917,...,-1.366800,-0.712639,-0.187658,-0.023714,0.654161,1.179142,1.343086,0.524980,0.688925,0.163944


In [142]:
# Ensure X_sample is transformed using the same scaler as X_train
X_sample_scaled = scaler.transform(df_colors_all)

In [143]:
# Predict with the trained model
y_pred_sample = learner.predict(X_sample_scaled)

In [144]:
# Predict probabilities with the trained model
y_prob_sample = learner.predict_proba(X_sample_scaled)

In [145]:
# Print predicted labels for sample data
print("Predicted Labels for Sample Data:\n", y_pred_sample)
# Print predicted probabilities for sample data
print("Predicted Probabilities for Sample Data:\n", y_prob_sample)

# Optionally, evaluate predictions on sample data if ground truth labels are available
# y_true_sample = ...  # Ground truth labels if available
# print("Evaluation on Sample Data - Classification Report:\n", classification_report(y_true_sample, y_pred_sample))
# print("Evaluation on Sample Data - Confusion Matrix:\n", confusion_matrix(y_true_sample, y_pred_sample))


Predicted Labels for Sample Data:
 [1 1 1 ... 1 1 1]
Predicted Probabilities for Sample Data:
 [[0.   1.  ]
 [0.   1.  ]
 [0.02 0.98]
 ...
 [0.   1.  ]
 [0.   1.  ]
 [0.   1.  ]]


In [146]:
# Count number of objects with label 0 and 1 in y_pred_sample
count_label_0 = np.count_nonzero(y_pred_sample == 0)
count_label_1 = np.count_nonzero(y_pred_sample == 1)

# Print the counts
print(f"Number of objects labeled as 0 (GC): {count_label_0}")
print(f"Number of objects labeled as 1 (Non-GC): {count_label_1}")

Number of objects labeled as 0 (GC): 65
Number of objects labeled as 1 (Non-GC): 327014


In [147]:
df_all_clean["Label"] = y_pred_sample

In [148]:
df_all_clean['Prob(GC)'] = y_prob_sample[:,0]
df_all_clean['Prob(Non-CG)'] = y_prob_sample[:,1]

In [149]:
# Example usage after adding columns
print(df_all_clean[['Label', 'Prob(GC)', 'Prob(Non-CG)']].head())  # Print the first few rows for verification

   Label  Prob(GC)  Prob(Non-CG)
0      1      0.00          1.00
2      1      0.00          1.00
3      1      0.02          0.98
4      1      0.00          1.00
5      1      0.00          1.00


In [150]:
# Step 1: Filter the DataFrame for GC classified instances (Label == 0)
df_gc_only = df_all_clean[df_all_clean['Label'] == 0]

# Step 2: Save the filtered DataFrame to a CSV file
df_gc_only.to_csv('predicted_GC_results_only.csv', index=False)  # Adjust the filename as needed
