In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import zipfile

# Read in data: one row per galaxy, with the galaxy ID in the first column
# followed by the label columns. pandas infers the zip compression from the
# file extension.
df = pd.read_csv('../input/galaxy-zoo-the-galaxy-challenge/training_solutions_rev1.zip')

# Hold out 20% of the labelled galaxies for evaluation. The seed is fixed so
# that "Restart Kernel -> Run All" reproduces exactly the same split.
df_train, df_test = train_test_split(df, test_size=.2, random_state=42)

# Unpack the *training* images: the IDs in `df` come from the training
# solutions and the loader further down reads 'images_training_rev1/<id>.jpg',
# so extracting the test archive here would leave those files missing.
# NOTE(review): assumes the archive unpacks into an 'images_training_rev1/'
# folder in the working directory - confirm against the dataset.
with zipfile.ZipFile("../input/galaxy-zoo-the-galaxy-challenge/images_training_rev1.zip", "r") as z:
    z.extractall(".")

# Checking shape of training and testing dataframes
# (kept as the last expression of the cell so the notebook displays it)
df_train.shape, df_test.shape


In [None]:
from skimage.transform import resize
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
          
# Image dimensions for preprocessing.
# Inspecting the images showed that every galaxy is centred in its frame and
# surrounded by a large amount of black space. Training on that empty border
# would be unnecessarily expensive, so it is cropped out before downscaling.
ORIG_SHAPE = (424, 424)  # (rows, cols) of the images as stored on disk
CROP_SIZE = (256, 256)   # central window kept by the crop
IMG_SHAPE = (64, 64)     # size each crop is resized to

# Processes each image
def get_image(path, x1, y1, shape, crop_size):
    """Load one image, crop a window out of it and downscale the crop.

    Parameters
    ----------
    path : str
        Location of the image file; read with ``plt.imread``.
    x1, y1 : int
        Start index of the crop window along axis 0 and axis 1 respectively.
    shape : tuple of int
        Output shape handed to ``skimage.transform.resize``.
    crop_size : tuple of int
        Extent of the crop window along axis 0 and axis 1.

    Returns
    -------
    numpy.ndarray
        The cropped and resized image as returned by ``resize``.
    """
    x = plt.imread(path)
    x = x[x1:x1+crop_size[0], y1:y1+crop_size[1]]
    # resize() (preserve_range=False by default) already converts integer
    # images to floats in [0, 1]. Dividing by 255 afterwards would normalise
    # twice and squash every pixel into [0, 1/255], so no further scaling is
    # applied here.
    x = resize(x, shape)
    return x
    
# Processes every image listed in a dataframe and returns images plus labels
def get_all_images(dataframe, shape=IMG_SHAPE, crop_size=CROP_SIZE):
    """Load, crop and resize the image of every row in ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        First column holds the image IDs (used as file names under
        ``images_training_rev1/``); all remaining columns are returned as
        labels.
    shape : tuple of int, optional
        Size each crop is resized to. Defaults to ``IMG_SHAPE``.
    crop_size : tuple of int, optional
        Size of the centred crop window. Defaults to ``CROP_SIZE``.

    Returns
    -------
    x_batch : numpy.ndarray
        Stack of processed images, in the same order as the dataframe rows.
    y_batch : numpy.ndarray
        All columns of ``dataframe`` except the first.
    """
    # Centre the crop window inside the original frame. The offsets are derived
    # from the `crop_size` argument (not the global default) so that a caller
    # passing a different crop size still gets a centred crop.
    x1 = (ORIG_SHAPE[0]-crop_size[0])//2
    y1 = (ORIG_SHAPE[1]-crop_size[1])//2

    sel = dataframe.values
    # Cast through int before str so the file name has no decimal suffix
    # (presumably .values upcasts the IDs to float alongside the label
    # columns - verify against the dataframe dtypes).
    ids = sel[:,0].astype(int).astype(str)
    y_batch = sel[:,1:]
    x_batch = [
        get_image('images_training_rev1/'+i+'.jpg',
                  x1, y1, shape=shape, crop_size=crop_size)
        for i in tqdm(ids)
    ]
    return np.array(x_batch), y_batch
        
# Materialise the processed image stacks and label matrices for both splits
# (training split first, then the held-out split).
X_train, y_train = get_all_images(dataframe=df_train)
X_test, y_test = get_all_images(dataframe=df_test)

