# Preprocessing (Lean)
This lean variant exposes the preprocessing functions used by the sister notebooks and omits all visualisations so that it runs faster.

In [1]:
'''
Utilities
'''

# Passed as random_state to both train_test_split calls so the splits are reproducible.
SEED = 42
# Fraction of all (outlier-filtered) rows held out as the test set.
TEST_SIZE = 0.2
# Fraction of the remaining non-test rows held out as the validation set.
VAL_SIZE = 0.2
# A row is kept only if every feature's absolute z-score is strictly below this value.
Z_SCORE_THRESHOLD = 3

In [2]:
import pandas as pd

# Load the raw dataset; the relative path is resolved against the current
# working directory, so the CSV must sit next to wherever this is run from.
stellar_df = pd.read_csv('star_classification.csv')

In [3]:
'''
Keeping desired features
'''

# Columns used as model inputs. The "class" column is carried along as the
# target; every other column of the raw table is discarded here.
desired_features = ["u", "g", "r", "i", "z", "redshift"]
stellar_df = stellar_df.loc[:, [*desired_features, "class"]]

In [4]:
'''
Outlier removal
'''

import numpy as np
from scipy.stats import zscore

# Standardise each feature column independently, then keep only the rows in
# which every feature's |z-score| is strictly below Z_SCORE_THRESHOLD.
z_scores = stellar_df[desired_features].apply(zscore)
stellar_df = stellar_df.loc[
    z_scores.abs().lt(Z_SCORE_THRESHOLD).all(axis="columns")
]

In [5]:
'''
Renaming Features for Readability
'''
# Map the single-letter band columns to descriptive names; columns not listed
# here (e.g. "redshift", "class") keep their original names.
new_names = dict(
    zip(
        ("u", "g", "r", "i", "z"),
        ("ultraviolet", "green", "red", "near_infrared", "infrared"),
    )
)
stellar_df = stellar_df.rename(new_names, axis="columns")

In [6]:
'''
Preprocessing
'''

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

from dataclasses import dataclass

# Original class names in encoded-label order (entry i names encoded label i).
# Empty placeholder until get_full_data() has been called at least once.
CLASS_LABELS = {}


def get_full_data():
    """Build scaled train/validation/test splits from the module-level ``stellar_df``.

    Steps:
      1. Copy the six feature columns and the ``class`` target out of
         ``stellar_df``.
      2. Integer-encode the target with ``LabelEncoder`` and publish the
         encoder's class names in the module-level ``CLASS_LABELS``.
      3. Hold out ``TEST_SIZE`` of the rows as the test set, then ``VAL_SIZE``
         of the remaining rows as the validation set. Both splits are
         stratified on the label and seeded with ``SEED``.
      4. Fit a ``StandardScaler`` on the training split only and apply it to
         all three splits, so no validation/test statistics leak into scaling.

    Returns:
        ``(X_trn, y_trn), (X_val, y_val), (X_tst, y_tst)`` -- for each split,
        the scaled features as returned by ``StandardScaler.transform`` and the
        integer-encoded labels.
    """
    # Without this declaration the assignment below would only bind a local
    # name and the module-level CLASS_LABELS would silently stay empty.
    global CLASS_LABELS

    X = stellar_df[['ultraviolet', 'green', 'red', 'near_infrared', 'infrared', 'redshift']].copy()
    y = stellar_df["class"].copy()

    le = LabelEncoder()
    y_labeled = le.fit_transform(y)
    CLASS_LABELS = le.classes_

    # First carve off the test set from the full data...
    X_trn, X_tst, y_trn, y_tst = train_test_split(
        X, y_labeled, test_size=TEST_SIZE, random_state=SEED, stratify=y_labeled
    )
    # ...then take the validation set out of what is left for training.
    X_trn, X_val, y_trn, y_val = train_test_split(
        X_trn, y_trn, test_size=VAL_SIZE, random_state=SEED, stratify=y_trn
    )

    # Scaling statistics come from the training split alone.
    scaler = StandardScaler()
    scaler.fit(X_trn)
    X_trn_scaled = scaler.transform(X_trn)
    X_val_scaled = scaler.transform(X_val)
    X_tst_scaled = scaler.transform(X_tst)

    return (X_trn_scaled, y_trn), (X_val_scaled, y_val), (X_tst_scaled, y_tst)


In [7]:
'''
Isolated retrieval step to not need to rerun it often.
'''

# Run the full pipeline once and unpack the (features, labels) pair of the
# train, validation and test splits into notebook-level variables.
(X_trn, y_trn), (X_val, y_val), (X_tst, y_tst) = get_full_data()