# Chargement des id et des probas

In [1]:
import pandas as pd
df = pd.read_csv("data/training_solutions_rev1.csv", usecols=[0,1,2,3])
df

Unnamed: 0,GalaxyID,Class1.1,Class1.2,Class1.3
0,100008,0.383147,0.616853,0.000000
1,100023,0.327001,0.663777,0.009222
2,100053,0.765717,0.177352,0.056931
3,100078,0.693377,0.238564,0.068059
4,100090,0.933839,0.000000,0.066161
...,...,...,...,...
61573,999948,0.510379,0.489621,0.000000
61574,999950,0.901216,0.098784,0.000000
61575,999958,0.202841,0.777376,0.019783
61576,999964,0.091000,0.909000,0.000000


# Suppression de la classe 3 qui n'a que 73 images...

In [2]:
df = df.drop(df[(df['Class1.3'] > df['Class1.2']) & (df['Class1.3'] > df['Class1.1'])].index)
df = df.drop(['Class1.3'], axis=1)
df.shape

(61519, 3)

# Échantillons test et d'entraînement

In [3]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=.35)
df_train.shape, df_test.shape

((39987, 3), (21532, 3))

# Création d'un second dossier data en modifiant l'arborescence pour pouvoir utiliser ImageDataGenerator

In [5]:
import os
import shutil

# Si existant on supprime le dossier
if os.path.exists('data2'):
    shutil.rmtree('data2')

# Création des dossiers avec l'arborescence souhaitée
os.mkdir('data2')

os.mkdir('data2/training_set')
os.mkdir('data2/training_set/class1')
os.mkdir('data2/training_set/class2')

os.mkdir('data2/test_set')
os.mkdir('data2/test_set/class1')
os.mkdir('data2/test_set/class2')

# copie des images du test set   
df_test.GalaxyID[(df['Class1.1'] > df['Class1.2'])].apply(lambda x: shutil.copy(f'data/images_training_rev1/{x}.jpg',
                                                                                f'data2/test_set/class1/{x}.jpg'))
df_test.GalaxyID[(df['Class1.2'] > df['Class1.1'])].apply(lambda x: shutil.copy(f'data/images_training_rev1/{x}.jpg',
                                                                                f'data2/test_set/class2/{x}.jpg'))

# copie des images du training set
df_train.GalaxyID[(df['Class1.1'] > df['Class1.2'])].apply(lambda x: shutil.copy(f'data/images_training_rev1/{x}.jpg',
                                                                                 f'data2/training_set/class1/{x}.jpg'))
df_train.GalaxyID[(df['Class1.2'] > df['Class1.1'])].apply(lambda x: shutil.copy(f'data/images_training_rev1/{x}.jpg',
                                                                                 f'data2/training_set/class2/{x}.jpg'))

51856    data2/training_set/class2/855595.jpg
27222    data2/training_set/class2/498801.jpg
6729     data2/training_set/class2/199427.jpg
53879    data2/training_set/class2/885753.jpg
352      data2/training_set/class2/105117.jpg
                         ...                 
11935    data2/training_set/class2/276836.jpg
12957    data2/training_set/class2/291905.jpg
53064    data2/training_set/class2/873872.jpg
60239    data2/training_set/class2/980469.jpg
21664    data2/training_set/class2/416436.jpg
Name: GalaxyID, Length: 22532, dtype: object

# CNN par Transfer Learning

In [6]:
from keras.applications.vgg16 import VGG16
from keras.layers import Dense, Flatten
from keras.models import Model
from keras.preprocessing.image import ImageDataGenerator

classifier_vgg16 = VGG16(weights='imagenet', include_top=False, input_shape=(64, 64, 3))

for layer in classifier_vgg16.layers:
    layer.trainable = False

flat = Flatten()(classifier_vgg16.output)
classif = Dense(128, activation='relu', kernel_initializer='he_uniform')(flat)
output = Dense(1, activation='sigmoid')(classif)

classifier_vgg16 = Model(inputs=classifier_vgg16.inputs, outputs=output)

classifier_vgg16.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Entraînement
train_datagen = ImageDataGenerator(rescale = 1./255,
                                   shear_range = 0.2,
                                   zoom_range = 0.2,
                                   horizontal_flip = True)

test_datagen = ImageDataGenerator(rescale = 1./255)

training_set = train_datagen.flow_from_directory('data2/training_set',
                                                 target_size = (64, 64),
                                                 color_mode = 'rgb',
                                                 batch_size = 64,
                                                 class_mode = 'binary')

test_set = test_datagen.flow_from_directory('data2/test_set',
                                            target_size = (64, 64),
                                            color_mode = 'rgb',
                                            batch_size = 64,
                                            class_mode = 'binary')

classifier_vgg16.fit(training_set,
                     steps_per_epoch = 625, #39987/64
                     epochs = 8,
                     validation_data = test_set,
                     validation_steps = 337) #21532/64

Found 39978 images belonging to 2 classes.
Found 21528 images belonging to 2 classes.
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x7f76b0035700>