In [247]:
import os
import re

import numpy as np
import numpy.char as char
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [248]:
# Read the TOI CSV export into a DataFrame
toi_csv_path = '../data/TOI_2025.10.03_12.20.22.csv'
with open(toi_csv_path) as toi:
    toi_df = pd.read_csv(toi)

In [249]:
# A column survives only if at least 10% of its rows are non-null,
# i.e. columns that are more than 90% empty are discarded.
min_non_null = len(toi_df) * 0.1
toi_df = toi_df.dropna(axis=1, thresh=min_non_null)

In [250]:
# Discard the listed columns from the frame
unused_columns = ["toi", "tid", "toi_created", "rowupdate"]
toi_df = toi_df.drop(unused_columns, axis=1)

In [251]:
# Right Ascension parsing: sexagesimal string -> total seconds
def time_to_seconds(time_str):
    """Turn a Right Ascension string (``HHhMMmSS.sss``) into total seconds.

    Parameters
    ----------
    time_str : str or missing value
        Text such as ``"07h29m25.85s"``. Anything ``pd.isna`` treats as
        missing yields ``None`` without a warning.

    Returns
    -------
    float or None
        ``hours * 3600 + minutes * 60 + seconds``, or ``None`` when the
        value is missing, does not match the expected layout, or has a
        component outside 0-24h / 0-60m / 0-60s. A message is printed for
        every non-missing value that is rejected.
    """
    if pd.isna(time_str):
        return None
    try:
        parsed = re.match(r'(\d+)h(\d+)m([\d.]+)s', str(time_str))
        if parsed is None:
            print(f"Warning: Could not parse RA value: {time_str}")
            return None

        # The three captured groups are hours, minutes, seconds in that order
        hh, mm, ss = (float(field) for field in parsed.groups())

        components_valid = 0 <= hh < 24 and 0 <= mm < 60 and 0 <= ss < 60
        if not components_valid:
            print(f"Warning: RA values out of range: {time_str}")
            return None

        return hh * 3600 + mm * 60 + ss
    except (ValueError, AttributeError) as e:
        # e.g. a seconds field like "1.2.3" matches the pattern but is not a float
        print(f"Error converting {time_str}: {str(e)}")
        return None

# Overwrite the RA strings in 'rastr' with their value in seconds
ra_seconds = toi_df['rastr'].apply(time_to_seconds)
toi_df['rastr'] = ra_seconds


In [252]:
# Function to convert Declination string to decimal degrees
def dec_to_degrees(dec_str):
    """Convert a Declination string (``[+-]DDdMMmSS.sss``) to signed decimal degrees.

    The sign applies to the whole angle, so ``-12d41m45.46s`` is
    ``-(12 + 41/60 + 45.46/3600)``. The sign is parsed separately from the
    degree digits so that values such as ``-00d30m00s`` keep their sign.

    Parameters
    ----------
    dec_str : str or missing value
        Text such as ``"-12d41m45.46s"``. Anything ``pd.isna`` treats as
        missing yields ``None`` without a warning.

    Returns
    -------
    float or None
        Decimal degrees in ``[-90, 90]``, or ``None`` when the value is
        missing, does not match the expected layout, or is out of range.
        A message is printed for every non-missing value that is rejected.
    """
    if pd.isna(dec_str):
        return None
    try:
        # Capture the sign on its own: folding it into the degree number
        # would leave minutes/seconds positive and would lose it for "-00".
        pattern = r'([+-]?)(\d+)d(\d+)m([\d.]+)s'
        match = re.match(pattern, str(dec_str))
        if not match:
            print(f"Warning: Could not parse DEC value: {dec_str}")
            return None

        sign = -1.0 if match.group(1) == '-' else 1.0
        degrees = float(match.group(2))
        minutes = float(match.group(3))
        seconds = float(match.group(4))

        # Validate the unsigned components (|DEC| <= 90 degrees)
        if not (degrees <= 90 and 0 <= minutes < 60 and 0 <= seconds < 60):
            print(f"Warning: DEC values out of range: {dec_str}")
            return None

        # Apply the sign to the full magnitude, not just to the degrees
        dec_degrees = sign * (degrees + (minutes / 60) + (seconds / 3600))
        if dec_degrees < -90 or dec_degrees > 90:
            # e.g. 90d30m00s passes the per-component check but exceeds 90
            print(f"Warning: Final DEC value out of range: {dec_degrees}")
            return None

        return dec_degrees

    except (ValueError, AttributeError) as e:
        print(f"Error converting {dec_str}: {str(e)}")
        return None

# Show a few of the raw declination strings, then overwrite the
# 'decstr' column with the parsed decimal-degree values.
print("Converting DEC strings to decimal degrees...")
print("\nSample of original values:")
dec_preview = toi_df['decstr'].head()
print(dec_preview)

dec_converted = toi_df['decstr'].apply(dec_to_degrees)
toi_df['decstr'] = dec_converted


Converting DEC strings to decimal degrees...

Sample of original values:
0    -12d41m45.46s
1    -05d30m49.87s
2    -10d34m49.64s
3    -25d12m25.26s
4    -48d48m10.12s
Name: decstr, dtype: object


In [253]:

# Split the columns by dtype: object columns get one-hot encoded,
# int64/float64 columns are carried over unchanged.
categorical_columns = list(toi_df.select_dtypes(include=['object']).columns)
numeric_columns = toi_df.select_dtypes(include=['int64', 'float64']).columns

# NaN in an object column is turned into its own 'missing' category
cat_data = toi_df[categorical_columns].fillna('missing')

# Dense one-hot encoding of the categorical block
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
one_hot_encoded = encoder.fit_transform(cat_data)
feature_names = encoder.get_feature_names_out(categorical_columns)

# Wrap the encoded array in a frame aligned to the source rows
one_hot_df = pd.DataFrame(one_hot_encoded, index=toi_df.index, columns=feature_names)

# Numeric columns first, encoded categorical columns after them
df_encoded = pd.concat([toi_df[numeric_columns], one_hot_df], axis=1)

print("\nShape of encoded DataFrame:", df_encoded.shape)
print("\nColumns in encoded DataFrame:", df_encoded.columns.tolist())


Shape of encoded DataFrame: (7703, 60)

Columns in encoded DataFrame: ['rastr', 'ra', 'decstr', 'dec', 'st_pmra', 'st_pmraerr1', 'st_pmraerr2', 'st_pmralim', 'st_pmdec', 'st_pmdecerr1', 'st_pmdecerr2', 'st_pmdeclim', 'pl_tranmid', 'pl_tranmiderr1', 'pl_tranmiderr2', 'pl_tranmidlim', 'pl_orbper', 'pl_orbpererr1', 'pl_orbpererr2', 'pl_orbperlim', 'pl_trandurh', 'pl_trandurherr1', 'pl_trandurherr2', 'pl_trandurhlim', 'pl_trandep', 'pl_trandeperr1', 'pl_trandeperr2', 'pl_trandeplim', 'pl_rade', 'pl_radeerr1', 'pl_radeerr2', 'pl_radelim', 'pl_insol', 'pl_eqt', 'st_tmag', 'st_tmagerr1', 'st_tmagerr2', 'st_tmaglim', 'st_dist', 'st_disterr1', 'st_disterr2', 'st_distlim', 'st_teff', 'st_tefferr1', 'st_tefferr2', 'st_tefflim', 'st_logg', 'st_loggerr1', 'st_loggerr2', 'st_logglim', 'st_rad', 'st_raderr1', 'st_raderr2', 'st_radlim', 'tfopwg_disp_APC', 'tfopwg_disp_CP', 'tfopwg_disp_FA', 'tfopwg_disp_FP', 'tfopwg_disp_KP', 'tfopwg_disp_PC']


In [254]:
# One-hot target columns, listed once so that x and y cannot drift apart
target_columns = ['tfopwg_disp_APC', 'tfopwg_disp_CP', 'tfopwg_disp_FA',
                  'tfopwg_disp_FP', 'tfopwg_disp_KP', 'tfopwg_disp_PC']

# A feature is dropped when |correlation| with an earlier feature exceeds this
correlation_threshold = 0.7

# Screen only candidate features: leaving the targets out of the matrix
# guarantees a target column can never be reported as a dropped feature.
feature_df = df_encoded.drop(columns=target_columns)
numeric_df = feature_df.select_dtypes(include=['float64', 'int64'])
corr_matrix = numeric_df.corr()

# Keep the strict upper triangle so each pair is examined once; a column is
# flagged when it is highly correlated with any column that precedes it.
upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_triangle.columns
           if (upper_triangle[column].abs() > correlation_threshold).any()]

# Feature matrix: everything except the targets and the redundant features
x = feature_df.drop(columns=to_drop)

# Define target variables
y = df_encoded[target_columns]

print("\nFeatures dropped due to high correlation:", to_drop)
print("\nShape of X after dropping correlated features:", x.shape)
print("Shape of y:", y.shape)


Features dropped due to high correlation: ['ra', 'dec', 'st_pmraerr2', 'st_pmdecerr1', 'st_pmdecerr2', 'pl_tranmiderr2', 'pl_orbpererr2', 'pl_trandurherr2', 'pl_trandeperr2', 'pl_radeerr2', 'st_tmagerr2', 'st_disterr1', 'st_disterr2', 'st_tefferr2', 'st_loggerr2', 'st_rad', 'st_raderr2']

Shape of X after dropping correlated features: (7703, 37)
Shape of y: (7703, 6)


In [255]:
# Recombine features and targets into a single frame for export
dataframe = pd.concat([x, y], axis=1)

# Write to <parent of cwd>/data/processed_toi.csv, the same sibling 'data'
# directory the input was read from ('../data/...'). The path is built with
# os.path so it does not depend on the cwd containing the substring 'src'
# or on Windows-style separators.
working_dir = os.getcwd()
dir_path = os.path.join(os.path.dirname(working_dir), 'data', 'processed_toi.csv')
dataframe.to_csv(dir_path, index=False)