In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Train and evaluate a Gradient Boosting classifier that predicts the object
# class (star / galaxy / qso) from photometric and positional columns.

# Load the dataset
df = pd.read_csv('/content/sample_data/updated_sky_server.csv')

# Feature columns: we'll use magnitudes (u, g, r, i, z), ra, dec, and redshift
features = ['u', 'g', 'r', 'i', 'z', 'ra', 'dec', 'redshift']

# Handle missing data in the target column 'class' (drop rows with NaN in 'class')
df.dropna(subset=['class'], inplace=True)

# Map the 'class' column into integers (0 = star, 1 = galaxy, 2 = qso).
# Series.map yields NaN for any value that is not a key of the mapping.
class_labels = {'STAR': 0, 'GALAXY': 1, 'QSO': 2}
y_true = df['class'].map(class_labels)

# Select features and labels with the SAME row mask so they stay aligned.
# Filtering only one of them would leave X and y_true with different lengths
# and make train_test_split fail.
has_label = y_true.notna()
y_true = y_true[has_label].astype(int)  # NaNs force float dtype; restore ints
X = df.loc[has_label, features].copy()  # explicit copy: safe to modify below

# Split the data into training and testing sets (80% train, 20% test).
# Splitting happens before imputation/scaling so that the preprocessing
# statistics are learned from the training rows only (no test-set leakage).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_true, test_size=0.2, random_state=42
)

# Handle missing data in features: fill NaNs with the training-set column means
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)
X = X.fillna(train_means)

# Preprocessing: standardize the features using statistics fitted on the
# training set, then apply the same transform to the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Scaled version of the full feature matrix, kept so that code relying on
# the `X_scaled` name continues to work.
X_scaled = scaler.transform(X)

# Initialize and train the Gradient Boosting model
model = GradientBoostingClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model against the true class labels.
# `labels` is passed explicitly so the report still works (and the names stay
# matched to the right integers) when a class is absent from the test split.
print(classification_report(
    y_test,
    y_pred,
    labels=list(class_labels.values()),
    target_names=['star', 'galaxy', 'qso'],
))
