In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [None]:
from PIL import Image
from data.utils import show_random

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.losses import binary_crossentropy, categorical_crossentropy
from keras.optimizers import Adadelta

# Read the data: morphological labels

Labels, assigned visually by astronomers in the GAMA collaboration:

In [None]:
morph = pd.read_csv(os.path.join("data","morphology.txt"), sep=" ")

There are two distinct labels, `HubbleType` and `isElliptical`; we have no information on how consistent they are with each other:

In [None]:
morph.head()

2451 galaxies do not have a HubbleType:

In [None]:
morph.HubbleType.value_counts()

In [None]:
morph.isElliptical.value_counts()

# Process the labels

Our goal will be to develop a model which can predict a correct label given a galaxy image.

Let's focus on predicting the `isElliptical` label, and take a random sample of 2500 galaxies with the label "Elliptical" and 2500 with the label "NotElliptical".  We will also need to select the corresponding images.

In [None]:
# Boolean selector for the rows labelled "NotElliptical"
mask = morph["isElliptical"].eq("NotElliptical")
# Reproducible random draw of 2500 such galaxies (fixed seed)
df0 = morph.loc[mask].sample(n=2500, random_state=0)
df0.head()

In [None]:
# Boolean selector for the rows labelled "Elliptical"
mask = morph["isElliptical"].eq("Elliptical")
# Reproducible random draw of 2500 such galaxies (fixed seed)
df1 = morph.loc[mask].sample(n=2500, random_state=0)
df1.head()

Merge the data frames and check it is sensible:

In [None]:
# Stack the two class samples row-wise: every row of df0 first, then every row of df1.
# NOTE: the result is therefore grouped by class (not shuffled); anything derived
# from `data` row by row inherits that ordering.
data = pd.concat( (df0,df1) )

In [None]:
data.isElliptical.value_counts()

Create an array of integer labels, i.e. convert the string labels 'Elliptical' and 'NotElliptical' to integers

In [None]:
# Integer encoding of the string labels: 0 = NotElliptical, 1 = Elliptical
labdict = dict( NotElliptical=0, Elliptical=1 )
# Look every row's label up in the table (an unknown label raises KeyError)
labels = np.array( list( map( labdict.__getitem__, data.isElliptical ) ) )

# Read the data: galaxy images

Read the images associated with our subset of the label data (with IDs lining up row by row)

In [None]:
def _read_png(galaxy_id):
    """Load data/images/<galaxy_id>_giH.png and return its pixels as a uint8 array."""
    png_path = os.path.join("data", "images", "{}_giH.png".format(galaxy_id))
    return np.array(Image.open(png_path), dtype=np.uint8)

# One array per galaxy, in the same row order as `data`
loa = [ _read_png(galaxy_id) for galaxy_id in data.id ]
# Stack the per-galaxy arrays along a new leading axis
images = np.array( loa )

There are 5000 images in total, and each one is 28x28 pixels with 3 colour channels (array shape 28x28x3):

In [None]:
images.shape

Currently, the image data is stored as integer values in the range of 0 to 255.  For machine learning applications, we need to rescale this data to the range 0 to 1 and convert to float.

In [None]:
print( images.min(), images.max() )

In [None]:
# Rescale the 8-bit pixel values to float32 in the range [0, 1].
# The dtype guard keeps this cell idempotent: once `images` has been converted,
# re-running the cell will not divide by 255 a second time.
if images.dtype == np.uint8:
    images = np.float32(images)/255.

In [None]:
print( images.min(), images.max() )

# Inspect the data

To recap, our data has been processed into two numpy arrays: `images` and `labels`.

Let's look at some random galaxies in the dataset along with their label (0=NotElliptical, 1=Elliptical)

In [None]:
show_random(images, labels )

# Build the CNN

In [None]:
images.shape[1:]

In [None]:
def build( input_shape=images.shape[1:], num_classes=len(np.unique(labels)) ):
    """Build and compile a small convolutional classifier.

    The network consists of three Conv2D + Dropout stages, a Flatten layer and a
    single Dense output layer. It is compiled with the Adadelta optimizer and
    reports accuracy as a metric.

    Parameters
    ----------
    input_shape : tuple
        Shape of a single sample, i.e. the shape of the image array without its
        first (number-of-samples) dimension. Defaults to ``images.shape[1:]``.
    num_classes : int
        Number of distinct classes. Defaults to the number of unique values in
        ``labels``.
        * 2   -> one sigmoid output unit, binary cross-entropy loss
        * > 2 -> ``num_classes`` softmax outputs, categorical cross-entropy loss

    Returns
    -------
    keras.models.Sequential
        The compiled model.

    Raises
    ------
    ValueError
        If ``num_classes`` is smaller than 2 (no output layer is defined for that case).

    Notes
    -----
    Both defaults are evaluated once, when this ``def`` statement is executed.
    Re-run this cell (or pass the arguments explicitly) after changing
    ``images`` or ``labels``.
    """
    # Fail early instead of silently returning a model with no output layer
    # that has never been compiled.
    if num_classes < 2:
        raise ValueError(
            "num_classes must be at least 2, got {}".format(num_classes) )

    model = Sequential()

    # Layers:
    model.add(Conv2D(3, input_shape=input_shape, kernel_size=(3, 3), activation='relu'))
    #model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    model.add(Conv2D(3, (3, 3), activation='relu'))
    #model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    model.add(Conv2D(4, (2, 2), activation='relu'))
    #model.add(MaxPooling2D(pool_size=(3, 3)))
    model.add(Dropout(0.25))

    model.add(Flatten())
    #model.add(Dense(128, activation='relu'))
    #model.add(Dropout(0.5))

    # NOTE(review): Adadelta's default learning rate differs between Keras
    # releases -- confirm that the installed version trains at the expected speed.

    # Final layer (fully connected)
    if num_classes == 2:
        # Single output unit: the predicted probability of class 1
        model.add( Dense(1, activation='sigmoid') )
        model.compile( optimizer=Adadelta(), loss=binary_crossentropy, metrics=['accuracy'] )
    else:
        # NOTE(review): categorical_crossentropy expects one-hot encoded targets;
        # integer class labels must be converted first -- confirm at the call site.
        model.add(Dense(num_classes, activation='softmax'))
        model.compile(optimizer=Adadelta(), loss=categorical_crossentropy, metrics=['accuracy'])

    return model

In [None]:
model = build()
model.summary()

# Train the model

Be sure to reserve some of the data for validation

In [None]:
# Keras' `validation_split` holds out the *last* 20% of the rows, taken before any
# shuffling. `images` and `labels` are ordered by class (all NotElliptical rows,
# then all Elliptical rows), so without a shuffle the validation set would contain
# Elliptical galaxies only and the training set would be unbalanced.
# Shuffle both arrays with the same fixed-seed permutation to keep them aligned
# and the split reproducible.
shuffle_idx = np.random.RandomState(0).permutation(len(images))

model = build()
history = model.fit( images[shuffle_idx], labels[shuffle_idx], batch_size=128, epochs=30, verbose=1, validation_split=0.2 )

# Watch as the training accuracy begins at 50% and slowly climbs to around 90%.  Validation accuracy is similar.

# Plot the training history 

In [None]:
import matplotlib.pyplot as plt
import pylab
history_dict = history.history

# The accuracy metric is recorded under 'acc' by older Keras releases and under
# 'accuracy' by newer ones; use whichever key is present so the cell works with both.
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'

# Two stacked panels: loss on top, accuracy below
f, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12), dpi= 80)

ax1.plot(history_dict['loss'], 'o--', label='Training')
ax1.plot(history_dict['val_loss'], 'o--', label='Validation')
ax1.set_xlabel('Number of Epochs')
ax1.set_ylabel('Loss')
ax1.legend()
ax2.plot(history_dict[acc_key], 'o--', label='Training')
ax2.plot(history_dict['val_' + acc_key], 'o--', label='Validation')
ax2.set_xlabel('Number of Epochs')
ax2.set_ylabel('Accuracy')
ax2.legend();  # trailing ';' hides the Legend repr in the cell output

# Inspect the predictions

- The predictions are probabilities between 0 and 1 that the given galaxy is an Elliptical.

In [None]:
predictions = model.predict( images )[:,0]   # need to subset to get the correct shape

In [None]:
predictions

In [None]:
show_random(images, labels, predictions)

## Your turn:
- confusion matrix
- plot the distribution of predicted probabilities for each class
- plot which images are misclassified; develop intuition for improving the model
- consider tweaking the architecture, e.g. see how different Conv2D or Dropout or MaxPooling affects the result

- the dataframe you loaded has more granular data, i.e. the `HubbleType`. See if you can implement a CNN that predicts more than two classes
