In [35]:
import sys,os 
sys.path.append('/home/benr/ACT/CW2/py')
import numpy as np 
import pandas as pd 
from PIL import Image, UnidentifiedImageError
from sklearn.preprocessing import LabelEncoder
from functions import get_data, galaxy_type,split_data_torch,train
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Question 2, Convolutional Neural Network for image recognition 
In this section the aim is to repeat the same process as question 1 but replace the traditional random forest method with a convolutional neural network. The expectation is that, due to the CNN's ability to correlate spatial features, it will be a more robust method of recognising galaxy morphology.

The first step will be to load the data in exactly the same way as Q1. 

In [36]:

# Column names for sky position and for the object id that identifies
# each galaxy image from SDSS.
RA_COL = 'ra'
DEC_COL = 'dec'
ID_COL = 'dr7objid'

# Load the table through the get_data helper imported from functions.py.
df = get_data()


In [37]:
# Drop any row with a missing value in the position or object-id columns.
df_clean = df.dropna(
    subset=[RA_COL, DEC_COL, ID_COL],
)

# Draw a random sample of 15,000 rows; the fixed random_state makes the
# selection repeatable.
df_subset = df_clean.sample(
    n=15000,
    random_state=11,
)

# Show (rows, columns) of the sampled frame.
print(df_subset.shape)

(15000, 231)


In [38]:
# Columns picked as the most informative for classifying galaxies.
labels = [
    't01_smooth_or_features_a01_smooth_debiased',
    't01_smooth_or_features_a02_features_or_disk_debiased',
    't02_edgeon_a04_yes_debiased',
    't04_spiral_a08_spiral_debiased',
    't03_bar_a06_bar_debiased',
]

# Add a 'hubble_class' column by applying galaxy_type (from functions.py) to
# every row, then keep only the rows for which a class was assigned.
df_subset = (
    df_subset
    .assign(hubble_class=lambda frame: frame.apply(galaxy_type, axis=1))
    .loc[lambda frame: frame['hubble_class'].notna()]
)

# Class balance of the remaining galaxies (displayed as the cell output).
df_subset['hubble_class'].value_counts()

hubble_class
Spiral    5894
E         3366
Disk      2436
Name: count, dtype: int64

In [39]:
# Load the grayscale image of every labelled galaxy and build the CNN input.
# After this cell:
#   x_images - one stacked float tensor holding all loaded images
#   y_labels - list of the matching 'hubble_class' values (same order)
x_images = []
y_labels = []

# folder that stores the images
img_dir = '/home/benr/ACT/CW2/sdss_images'

# Walk over just the two columns that are needed. DataFrame.iterrows() would
# build a full Series for each row of this wide frame (slow) and does not
# preserve the per-column dtypes.
for obid, lbl in zip(df_subset[ID_COL], df_subset['hubble_class']):
    # find the corresponding galaxy in the image folder
    img_path = os.path.join(img_dir, f"{obid}.jpg")
    if not os.path.exists(img_path):
        continue
    try:
        # The context manager releases the file handle. convert("L") decodes
        # the pixel data here, so a damaged file fails inside this try block
        # rather than later on.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("L")
    except (UnidentifiedImageError, OSError):
        # UnidentifiedImageError: file is not a readable image;
        # OSError: decoding failed part-way (e.g. a truncated download).
        print("Skipping corrupted image:", img_path)
        continue
    # scale pixel values into [0, 1] as float32
    px_arr = np.array(img, dtype=np.float32) / 255.0
    # add a leading channel axis -> (1, H, W)
    px_tns = torch.tensor(px_arr).unsqueeze(0)
    x_images.append(px_tns)
    y_labels.append(lbl)

# Fail with a clear message instead of torch.stack's error on an empty list.
if not x_images:
    raise RuntimeError(f"No usable images found in {img_dir}")

# Stack into a single tensor of shape (N, 1, H, W). Rebinding x_images lets the
# per-image list be freed instead of holding a second copy in memory.
x_images = torch.stack(x_images)

# Standardise with a single global mean / standard deviation.
# NOTE(review): these statistics are computed over all images, i.e. before the
# train/test split performed later, so the test images influence them.
m = x_images.mean()
std = x_images.std()
x_images = (x_images - m) / std
print(x_images.shape)

torch.Size([11693, 1, 256, 256])


For the CNN the labels will need to be encoded into numerical values. 
Documentation can be found here: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

In [40]:
# Encoder that maps each distinct class name to an integer code.
lb_enc = LabelEncoder()

# Fit on the label list, encode it, and wrap the integer codes in a tensor.
y_labels = torch.tensor(lb_enc.fit_transform(y_labels))

# Number of distinct classes present after encoding.
print(len(y_labels.unique()))

3


Now we set up the class that will hold the structure of the network. Galaxy_CNN takes the number of feature channels produced by the first convolution and the number of classes (prediction categories), and passes them to the different layers of the network. 
Throughout the process of data being passed through the different layers of the CNN, we will need to keep track of how the network changes the size of each tensor. This can be done with the following formulas:

* Convolution layer output - $$\left\lfloor\frac{H + 2P - K}{s}\right\rfloor + 1$$
* Pooling output - $$\left\lfloor\frac{H - K}{s}\right\rfloor + 1$$

H = input height or width of image 


K = kernel size

P = padding size

s = stride 

In [41]:

class Galaxy_CNN(nn.Module):
    """Two-convolution classifier for single-channel, square images.

    Layer sequence:
    conv(3x3, pad 2) -> ReLU -> 2x2 max-pool -> dropout ->
    conv(6x6, pad 2) -> ReLU -> 2x2 max-pool -> flatten -> dropout -> linear.

    Parameters
    ----------
    inCH : int
        Number of feature maps produced by the first convolution. The image
        itself is always read as one channel; the second convolution
        produces ``3 * inCH`` feature maps.
    nCL : int
        Number of classes, i.e. the length of the returned logit vector.
    img_size : int, default 256
        Height (= width) of the input images. It is used to size the final
        linear layer; the default of 256 gives a 64x64 feature map.

    Raises
    ------
    ValueError
        If ``img_size`` is so small that no feature map is left after the
        second pooling step.
    """

    @staticmethod
    def _pooled_side(side, kernel, padding):
        """Side length after a stride-1 convolution followed by 2x2 max-pooling."""
        conv_side = side + 2 * padding - kernel + 1
        return conv_side // 2

    def __init__(self, inCH, nCL, img_size=256):

        super(Galaxy_CNN, self).__init__()

        # Track the spatial size through both conv + pool stages so the linear
        # layer matches the flattened feature map (256 -> 129 -> 64).
        side = self._pooled_side(img_size, kernel=3, padding=2)
        side = self._pooled_side(side, kernel=6, padding=2)
        if side < 1:
            raise ValueError(
                f"img_size={img_size} is too small for this network"
            )

        # first convolution layer: 1 input channel -> inCH feature maps
        self.conv1 = nn.Conv2d(1, inCH, kernel_size=3, stride=1, padding=2)
        # second convolution layer: inCH -> 3 * inCH feature maps
        self.conv2 = nn.Conv2d(inCH, inCH * 3, kernel_size=6, stride=1, padding=2)
        # linear output layer acting on the flattened feature maps
        self.lin = nn.Linear(inCH * 3 * side ** 2, nCL)
        # 2x2 max-pooling and dropout modules, each reused after both stages
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        """Return raw class scores of shape (batch, nCL) for a (batch, 1, H, W) input."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.dropout(x)
        x = self.pool(F.relu(self.conv2(x)))
        # flatten everything except the batch dimension
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        x = self.lin(x)
        return x

In [42]:
# Index array passed to split_data_torch (see functions.py) together with the
# image tensor and the encoded labels.
I = np.arange(len(y_labels))
xtrn, xtst, ytrn, ytst = split_data_torch(x_images, y_labels, I)

# Pair every image with its label for the training and the test portion.
train_ds, test_ds = (
    TensorDataset(images, targets)
    for images, targets in ((xtrn, ytrn), (xtst, ytst))
)

# Batches of 16; only the training loader shuffles its samples.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=16)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=16)




In [43]:
# Seed torch's global RNG before the model is built so that the weight
# initialisation is repeatable between runs. The value matches the
# random_state used when sampling the catalogue.
torch.manual_seed(11)

# One output unit per distinct encoded label.
num_classes = len(y_labels.unique())

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Galaxy_CNN(64, num_classes).to(device)

# Cross-entropy loss on the raw class scores, optimised with NAdam using a
# small learning rate and light weight decay.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.NAdam(model.parameters(), lr=1e-4, weight_decay=1e-5)




In [44]:
# Run the training routine imported from functions.py with n_epoch set to 60.
train(
    model,
    train_loader,
    test_loader,
    criterion,
    optimizer,
    device,
    n_epoch=60,
)


Epoch 1/60 Train loss: 0.9436  Train acc: 0.585  Test acc: 0.645
Epoch 2/60 Train loss: 0.7440  Train acc: 0.689  Test acc: 0.625
Epoch 3/60 Train loss: 0.6210  Train acc: 0.742  Test acc: 0.650
Epoch 4/60 Train loss: 0.4633  Train acc: 0.819  Test acc: 0.627


KeyboardInterrupt: 