In [13]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split


In [2]:
# 1.1: Mount Google Drive
# Colab-only step: exposes the user's Drive under /content/drive, which is the
# root that base_dir (and therefore every dataset path in this notebook) lives under.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# 1.2: Set base directory
# Dataset root on the mounted Drive; the solutions CSV and the image folders
# are all resolved relative to this path with os.path.join.
base_dir = '/content/drive/MyDrive/galaxy-zoo-the-galaxy-challenge'

In [4]:
import zipfile
import os

# Unpack the test-image archive next to the other dataset files.
# Both paths are derived from base_dir so the dataset root is configured in
# exactly one place (the resulting strings are the same as before).
zip_file_path_3 = os.path.join(base_dir, 'images_test_rev1.zip')

extract_folder_3 = os.path.join(base_dir, 'images_test')

# Extracting a large archive onto Drive is slow, so skip it when the target
# folder already has content; this keeps "Restart & Run All" cheap.
# NOTE(review): a previously interrupted extraction also leaves a non-empty
# folder - delete extract_folder_3 manually to force a clean re-extract.
if os.path.isdir(extract_folder_3) and os.listdir(extract_folder_3):
    print("Test images already extracted, skipping:", extract_folder_3)
else:
    with zipfile.ZipFile(zip_file_path_3, 'r') as zip_ref:
        zip_ref.extractall(extract_folder_3)

In [5]:
# 3.1: Load training solutions
# Read the per-galaxy solutions table (GalaxyID plus the ClassX.Y columns)
# from the dataset root and show its first rows as plain text.
solutions_path = os.path.join(base_dir, 'training_solutions_rev1.csv')
df = pd.read_csv(solutions_path)
print("Head of solutions file:", df.head(), sep="\n")

Head of solutions file:
   GalaxyID  Class1.1  Class1.2  Class1.3  Class2.1  Class2.2  Class3.1  \
0    100008  0.383147  0.616853  0.000000  0.000000  0.616853  0.038452   
1    100023  0.327001  0.663777  0.009222  0.031178  0.632599  0.467370   
2    100053  0.765717  0.177352  0.056931  0.000000  0.177352  0.000000   
3    100078  0.693377  0.238564  0.068059  0.000000  0.238564  0.109493   
4    100090  0.933839  0.000000  0.066161  0.000000  0.000000  0.000000   

   Class3.2  Class4.1  Class4.2  ...  Class9.3  Class10.1  Class10.2  \
0  0.578401  0.418398  0.198455  ...  0.000000   0.279952   0.138445   
1  0.165229  0.591328  0.041271  ...  0.018764   0.000000   0.131378   
2  0.177352  0.000000  0.177352  ...  0.000000   0.000000   0.000000   
3  0.129071  0.189098  0.049466  ...  0.000000   0.094549   0.000000   
4  0.000000  0.000000  0.000000  ...  0.000000   0.000000   0.000000   

   Class10.3  Class11.1  Class11.2  Class11.3  Class11.4  Class11.5  Class11.6  
0   0.00000

In [18]:
# Image folders under the dataset root.
# train_images is where load_image looks up '<GalaxyID>.jpg'.
# NOTE(review): this notebook only extracts the *test* archive; the training
# folder is assumed to already exist on Drive - confirm.
train_images = os.path.join(base_dir, 'images_training_rev1')
# Same folder the test zip is extracted into.
# NOTE(review): if the archive has its own top-level directory, the JPEGs will
# sit one level below this path - verify before reading test images.
test_images = os.path.join(base_dir, 'images_test')

In [6]:
# 3.2: Turn the three Class1.x probabilities into one hard label per galaxy
# (Class1.1 -> 0 elliptical, Class1.2 -> 1 spiral, Class1.3 -> 2 irregular).
# NOTE(review): in the Galaxy Zoo decision tree Class1.3 is presumably
# "star or artifact" rather than "irregular" - confirm the intended naming.
class_map = {'Class1.1': 0, 'Class1.2': 1, 'Class1.3': 2}
# The dict keys double as the column list; insertion order keeps the same
# column order, so ties in idxmax resolve exactly as before.
df['max_class'] = df[list(class_map)].idxmax(axis=1)
df['label'] = df['max_class'].map(class_map)

In [7]:
# Preview the first rows to check the new 'max_class' and 'label' columns.
df.head()

Unnamed: 0,GalaxyID,Class1.1,Class1.2,Class1.3,Class2.1,Class2.2,Class3.1,Class3.2,Class4.1,Class4.2,...,Class10.2,Class10.3,Class11.1,Class11.2,Class11.3,Class11.4,Class11.5,Class11.6,max_class,label
0,100008,0.383147,0.616853,0.0,0.0,0.616853,0.038452,0.578401,0.418398,0.198455,...,0.138445,0.0,0.0,0.092886,0.0,0.0,0.0,0.325512,Class1.2,1
1,100023,0.327001,0.663777,0.009222,0.031178,0.632599,0.46737,0.165229,0.591328,0.041271,...,0.131378,0.45995,0.0,0.591328,0.0,0.0,0.0,0.0,Class1.2,1
2,100053,0.765717,0.177352,0.056931,0.0,0.177352,0.0,0.177352,0.0,0.177352,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Class1.1,0
3,100078,0.693377,0.238564,0.068059,0.0,0.238564,0.109493,0.129071,0.189098,0.049466,...,0.0,0.094549,0.189098,0.0,0.0,0.0,0.0,0.0,Class1.1,0
4,100090,0.933839,0.0,0.066161,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Class1.1,0


In [9]:
# Keep only rows labelled 0 (elliptical) or 1 (spiral); label 2 is dropped.
keep_mask = df['label'].isin([0, 1])
df_filtered = df.loc[keep_mask]
print(
    "Class distribution after removing irregular galaxies:",
    df_filtered['label'].value_counts(),
    sep="\n",
)

Class distribution after removing irregular galaxies:
label
1    34826
0    26693
Name: count, dtype: int64


In [10]:
# Draw 500 rows from each remaining class (0 = elliptical, then 1 = spiral),
# using the same fixed seed for both draws, and stack them in that order.
balanced_df = pd.concat([
    df_filtered[df_filtered['label'] == class_id].sample(n=500, random_state=42)
    for class_id in (0, 1)
])

In [11]:
# Mix the two class blocks together (seeded permutation of all rows) and
# renumber the index from 0.
shuffled_rows = balanced_df.sample(frac=1, random_state=42)
balanced_df = shuffled_rows.reset_index(drop=True)

# Report how many rows each class contributes.
print(
    "Balanced subset class distribution (elliptical and spiral only):",
    balanced_df['label'].value_counts(),
    sep="\n",
)

Balanced subset class distribution (elliptical and spiral only):
label
1    500
0    500
Name: count, dtype: int64


In [12]:
# 4.1: Pull the GalaxyID and label columns out as NumPy arrays
# (row order is kept, so galaxy_ids[i] pairs with labels[i]).
galaxy_ids = balanced_df['GalaxyID'].to_numpy()
labels = balanced_df['label'].to_numpy()

In [16]:
# Hold out 20% of the balanced subset for validation; stratifying on the
# labels keeps the class ratio the same in both parts, and the seed is fixed.
train_ids, val_ids, train_labels, val_labels = train_test_split(
    galaxy_ids,
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=42,
)
print(f"Number of training samples: {len(train_ids)}")
print(f"Number of validation samples: {len(val_ids)}")


Number of training samples: 800
Number of validation samples: 200


In [20]:
IMG_SIZE = (224, 224)  # (height, width) every image is resized to
BATCH_SIZE = 32        # samples per batch in the tf.data pipelines

# Function to load images
def load_image(galaxy_id, label):
    """Read one training JPEG and one-hot encode its label.

    The file path is assembled with TensorFlow string ops rather than Python
    ``str()``: when this function is passed to ``tf.data.Dataset.map`` it is
    traced with a symbolic ``galaxy_id`` tensor, and ``str()`` of that tensor
    is its repr (``Tensor("args_0:0", ...)``), not the ID, so the file would
    never be found.

    Args:
        galaxy_id: Integer GalaxyID (Python/NumPy integer or scalar integer
            tensor); the image is read from ``train_images/<galaxy_id>.jpg``.
        label: Integer class index (0 or 1).

    Returns:
        Tuple ``(image, label)``: ``image`` is a float32 tensor of shape
        ``IMG_SIZE + (3,)`` scaled to [0, 1]; ``label`` is a one-hot vector
        of length 2.
    """
    # os.path.join(dir, '') yields the directory with a trailing separator,
    # so the prefix is a plain Python string computed once at trace time.
    dir_prefix = os.path.join(train_images, '')
    filepath = tf.strings.join(
        [dir_prefix, tf.strings.as_string(galaxy_id), '.jpg']
    )
    image = tf.io.read_file(filepath)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, IMG_SIZE)
    image = tf.cast(image, tf.float32) / 255.0  # Normalize to [0, 1]
    label = tf.one_hot(label, depth=2)  # Two classes only
    return image, label

In [None]:
# 4.4: Create tf.data Datasets
# Training pipeline: shuffle the (id, label) pairs *before* decoding so the
# shuffle buffer holds small scalars instead of decoded float32 images, then
# decode in parallel, batch and prefetch.
train_ds = tf.data.Dataset.from_tensor_slices((train_ids, train_labels))
train_ds = (
    train_ds
    # Buffer covers the whole training set, i.e. a full uniform shuffle;
    # max(..., 1) keeps the buffer size valid for an empty id list.
    .shuffle(max(len(train_ids), 1))
    .map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation pipeline: same decoding, but kept in its original order.
val_ds = tf.data.Dataset.from_tensor_slices((val_ids, val_labels))
val_ds = (
    val_ds
    .map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)