In [6]:
# Install required libraries
!pip install catboost scikit-learn

import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from google.colab import files

# Loading the dataset
uploaded = files.upload()
train_data = pd.read_csv("cosmicclassifierTraining.csv")
uploaded = files.upload()
test_data = pd.read_csv("cosmicclassifierFinal.csv")

# Preprocess the data
# Remove spaces in column names
train_data.columns = train_data.columns.str.replace(" ", "")
test_data.columns = test_data.columns.str.replace(" ", "")

# Handle blank values and noise values
# Remove rows with large negative values (-999999) or NaN values
train_data = train_data[(train_data != -999999).all(axis=1)]  # Remove rows with -999999
train_data = train_data.dropna()  # Remove rows with NaN values
test_data = test_data[(test_data != -999999).all(axis=1)]  # Remove rows with -999999
test_data = test_data.dropna()  # Remove rows with NaN values

# Print the number of rows that survived the cleaning
print(f"Number of rows in training data after cleaning: {len(train_data)}")
print(f"Number of rows in test data after cleaning: {len(test_data)}")

# Remove rows with empty prediction values in the training data
train_data = train_data.dropna(subset=["Prediction"])  # Fix typo: "Prediction" not "Prediction"

# Separate features and target
X = train_data.drop(columns=["Prediction"], errors='ignore')
y = train_data["Prediction"]

# Identify categorical columns
cat_cols = [col for col in X.columns if "Category" in col or X[col].dtype == "object"]

# Get indices of categorical columns
cat_indices = [X.columns.get_loc(col) for col in cat_cols]

# Normalize numerical features
scaler = MinMaxScaler()
num_cols = [col for col in X.columns if col not in cat_cols]
X[num_cols] = scaler.fit_transform(X[num_cols])
test_data[num_cols] = scaler.transform(test_data[num_cols])

# Convert all columns to numeric values (except categorical columns)
for col in num_cols:
    X[col] = pd.to_numeric(X[col], errors="coerce")
    test_data[col] = pd.to_numeric(test_data[col], errors="coerce")

# Fill any remaining NaN values
X[num_cols] = X[num_cols].fillna(0)
test_data[num_cols] = test_data[num_cols].fillna(0)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Define the CatBoost model with the provided hyperparameters
params = {
    'iterations': 415,
    'learning_rate': 0.20776868309972968,
    'depth': 9,
    'l2_leaf_reg': 5.683520927912665,
    'border_count': 236,
    'random_strength': 6.20026949122612,
    'bagging_temperature': 0.716276329480144,
    'verbose': 100,  # Print progress every 100 iterations
    'random_state': 42
}

# Initialize the CatBoostClassifier
model = CatBoostClassifier(**params)

# Train the model
model.fit(X_train, y_train, eval_set=(X_val, y_val), cat_features=cat_indices, early_stopping_rounds=50)

# Evaluate the model on the validation set
y_val_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

# Train the final model on the full dataset
model.fit(X, y, cat_features=cat_indices)

# Predict on the test set
# Create a Pool object for the test data to handle categorical features
test_pool = Pool(data=test_data, cat_features=cat_indices)
test_predictions = model.predict(test_pool)

# Map numerical predictions to planet categories
planet_categories = {
    0.0: "Bewohnbar",
    1.0: "Terraformierbar",
    2.0: "Rohstoffreich",
    3.0: "Wissenschaftlich",
    4.0: "Gasriese",
    5.0: "Wüstenplanet",
    6.0: "Eiswelt",
    7.0: "Toxischetmosäre",
    8.0: "Hohestrahlung",
    9.0: "Toterahswelt"
}

# Convert numerical predictions to category names
test_predictions_categories = [planet_categories[float(pred)] for pred in test_predictions]

# Save predictions in the required format
submission = pd.DataFrame({
    "Planet_ID": test_data.index,
    "Predicted_Class": test_predictions_categories
})
submission.to_csv("submission.csv", index=False)

# Download the submission file
files.download("submission.csv")



Saving cosmicclassifierTraining.csv to cosmicclassifierTraining (5).csv


Saving cosmicclassifierFinal.csv to cosmicclassifierFinal (2).csv
Number of rows in training data after cleaning: 34059
Number of rows in test data after cleaning: 10000
0:	learn: 1.7332550	test: 1.7373639	best: 1.7373639 (0)	total: 984ms	remaining: 6m 47s
100:	learn: 0.2473657	test: 0.3209542	best: 0.3209542 (100)	total: 1m 33s	remaining: 4m 51s
200:	learn: 0.1610075	test: 0.2731812	best: 0.2731812 (200)	total: 3m 15s	remaining: 3m 27s
300:	learn: 0.1206362	test: 0.2586852	best: 0.2586784 (299)	total: 4m 57s	remaining: 1m 52s
400:	learn: 0.0900847	test: 0.2514518	best: 0.2514518 (400)	total: 6m 40s	remaining: 14s
414:	learn: 0.0865667	test: 0.2500624	best: 0.2500624 (414)	total: 6m 54s	remaining: 0us

bestTest = 0.2500623738
bestIteration = 414

Validation Accuracy: 0.9149
0:	learn: 1.6634372	total: 1.16s	remaining: 8m
100:	learn: 0.2504339	total: 1m 51s	remaining: 5m 47s
200:	learn: 0.1599746	total: 3m 54s	remaining: 4m 9s
300:	learn: 0.1179396	total: 6m	remaining: 2m 16s
400:	learn:

  test_predictions_categories = [planet_categories[float(pred)] for pred in test_predictions]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>