- # Merge, clean, analyze, save dataset

In [1]:
import pandas as pd
import numpy as np

# Read both source tables straight from their raw GitHub URLs.
# df1 <- GBM_Known_Redshift.csv, df2 <- GBM_data_full.csv
url_known_redshift = 'https://raw.githubusercontent.com/Adrita-Khan/GRB-ML/main/Data/GBM_Known_Redshift.csv'
url_gbm_full = 'https://raw.githubusercontent.com/Adrita-Khan/GRB-ML/main/Data/GBM_data_full.csv'
df1 = pd.read_csv(url_known_redshift)
df2 = pd.read_csv(url_gbm_full)

# Summarise each table: (rows, columns) followed by the list of column names.
shape_1, columns_1 = df1.shape, df1.columns.tolist()
shape_2, columns_2 = df2.shape, df2.columns.tolist()
print("Dataset 1 (CSV) shape:", shape_1)
print("Dataset 1 columns:", columns_1)
# The leading "\n" separates the second summary from the first.
print("\nDataset 2 (CSV) shape:", shape_2)
print("Dataset 2 columns:", columns_2)

Dataset 1 (CSV) shape: (127, 2)
Dataset 1 columns: ['name', 'Redshift']

Dataset 2 (CSV) shape: (4076, 306)
Dataset 2 columns: ['name', 'ra', 'dec', 'trigger_time', 't90', 't90_error', 't90_start', 'fluence', 'fluence_error', 'flux_1024', 'flux_1024_error', 'flux_1024_time', 'flux_64', 'flux_64_error', 'flnc_band_ampl', 'flnc_band_ampl_pos_err', 'flnc_band_ampl_neg_err', 'flnc_band_epeak', 'flnc_band_epeak_pos_err', 'flnc_band_epeak_neg_err', 'flnc_band_alpha', 'flnc_band_alpha_pos_err', 'flnc_band_alpha_neg_err', 'flnc_band_beta', 'flnc_band_beta_pos_err', 'flnc_band_beta_neg_err', 'flnc_spectrum_start', 'flnc_spectrum_stop', 'pflx_best_fitting_model', 'pflx_best_model_redchisq', 'flnc_best_fitting_model', 'flnc_best_model_redchisq', 'actual_1024ms_interval', 'actual_256ms_interval', 'actual_64ms_interval', 'back_interval_high_start', 'back_interval_high_stop', 'back_interval_low_start', 'back_interval_low_stop', 'bcat_detector_mask', 'bcatalog', 'bii', 'duration_energy_high', 'durati

In [2]:
# Verify that the merge key 'name' is present in each table. A missing key
# only produces a warning plus the list of columns that are available; it
# does not stop execution.
for frame, warning in (
    (df1, "\nWarning: 'name' column not found in dataset 1"),
    (df2, "\nWarning: 'name' column not found in dataset 2"),
):
    if 'name' in frame.columns:
        continue
    print(warning)
    print("Available columns:", frame.columns.tolist())

In [3]:
# Inner-join the two tables on 'name', so only rows whose name occurs in both
# tables are kept. Any other column label shared by both tables would be
# disambiguated with the '_df1' / '_df2' suffixes.
merged_df = df1.merge(df2, on='name', how='inner', suffixes=('_df1', '_df2'))

print("\nAfter merging on common 'name' values:")
print(f"Merged dataset shape: {merged_df.shape}")

# Discard every row that has at least one missing value in any column.
# NOTE(review): this check spans all columns, including the object-dtype
# columns that a later cell drops, so a row can be discarded because of a gap
# in a column that never reaches the final dataset — confirm this is intended.
cleaned_df = merged_df.dropna(axis=0, how='any')

print("\nAfter removing rows with missing values:")
print(f"Cleaned dataset shape: {cleaned_df.shape}")


After merging on common 'name' values:
Merged dataset shape: (127, 307)

After removing rows with missing values:
Cleaned dataset shape: (124, 307)


In [4]:
# Drop all text columns so that only non-string data remains.
#
# 'object' matches the classic NumPy-object storage of strings; 'string'
# additionally matches pandas' dedicated string dtype (StringDtype). Selecting
# on 'object' alone would leave dedicated-string columns in place, and a later
# `to_numpy()` on the frame would then fall back to an object array.
# On a frame whose text columns are all object-dtype the result is unchanged.
# NOTE(review): the merge key 'name' is presumably a text column and is
# therefore removed here as well — confirm that the identifier is not needed
# in the saved dataset.
text_columns = cleaned_df.select_dtypes(include=['object', 'string']).columns
cleaned_df = cleaned_df.drop(columns=text_columns)

print("\nAfter dropping string columns:")
print(f"Final dataset shape: {cleaned_df.shape}")


After dropping string columns:
Final dataset shape: (124, 293)


In [5]:
# NOTE: this cell is disabled. Every statement below is commented out, so
# running the cell has no effect; uncomment the statements to enable it.
#
# # --- Remove duplicate columns ---
# # Find duplicate column names
# duplicates = cleaned_df.columns[cleaned_df.columns.duplicated()]
# print("Duplicate column names:", duplicates.tolist())

# # Transpose the DataFrame and look for duplicated columns
# # (`.T.duplicated()` compares the column *values*: it flags a column whose
# # contents repeat an earlier column, regardless of the column's name.)
# duplicate_cols = cleaned_df.T.duplicated()

# # Drop duplicated columns
# cleaned_df = cleaned_df.loc[:, ~duplicate_cols]

# # Print remaining columns
# print("\nAfter removing duplicate columns:")
# print(f"Current number of columns: {cleaned_df.shape[1]}")

In [6]:
# NOTE: this cell is disabled. Every statement below is commented out, so
# running the cell prints nothing; uncomment the statements to inspect the data.
#
# Display first few rows and summary info
# print(f"\nFirst 5 rows of final dataset:")
# print(cleaned_df.head())

# print(f"\nDataset info:")
# (`DataFrame.info()` prints its report itself and returns None, so wrapping
# it in print() also emits a trailing "None" line.)
# print(cleaned_df.info())

# print(f"\nMissing values check:")
# print(cleaned_df.isnull().sum().sum())  # Should be 0

In [7]:
# Convert the cleaned frame to a NumPy array (the column labels are not part
# of the array) and write it to disk in NumPy's binary .npy format.
cleaned_array = cleaned_df.to_numpy()
np.save(file='merged_cleaned_dataset.npy', arr=cleaned_array)
print("\nFinal dataset saved as 'merged_cleaned_dataset.npy'")

# Write the same frame as CSV; index=False keeps the row index out of the file.
cleaned_df.to_csv(path_or_buf='merged_cleaned_dataset.csv', index=False)
print("\nFinal dataset saved as 'merged_cleaned_dataset.csv'")


Final dataset saved as 'merged_cleaned_dataset.npy'

Final dataset saved as 'merged_cleaned_dataset.csv'
