<a href="https://colab.research.google.com/github/CharlotteY2003/MIT-Futuremakers/blob/main/7_21%20-%20Kaggle's_Gender_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import *
import matplotlib.pyplot as plt
from keras.preprocessing.image import load_img
from sklearn.model_selection import train_test_split, KFold
from keras.optimizers import SGD, Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.preprocessing.image import ImageDataGenerator
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from numpy.random import seed

In [None]:
# Read the dataset CSV from the working directory and preview its first rows.
data = pd.read_csv(filepath_or_buffer='age_gender.csv')
data.head(n=5)

In [3]:
def string_to_arr(X, width, height):
  """Turn a Series of space-separated pixel strings into one image array.

  Every element of X is split on whitespace, parsed as float32 and reshaped
  to (width, height, 1). The per-image arrays are then stacked along a new
  leading axis. Pixel values are not rescaled here.

  Args:
    X: pandas Series of strings, one image per element.
    width: size of the first image axis.
    height: size of the second image axis.

  Returns:
    numpy array of shape (len(X), width, height, 1).
  """
  pixel_strings = X.reset_index(drop=True)
  images = []
  for text in pixel_strings:
    flat = np.array(text.split(), dtype='float32')
    images.append(flat.reshape(width, height, 1))
  return np.array(images)

In [4]:
def data_preprocess(data):
  """Filter the raw frame to adults and build the arrays used for training.

  Rows with age < 18 or with any missing value are removed. The 'pixels'
  column (space-separated pixel strings) is converted to image arrays and the
  'gender' column is used as the label. All returned image arrays hold the
  raw parsed pixel values; rescaling is left to the ImageDataGenerator
  objects (rescale=1/255) that consume them, so nothing is scaled twice.

  Args:
    data: DataFrame with at least 'age', 'gender' and 'pixels' columns.

  Returns:
    Tuple (X, y_gender, X_train, X_test, X_val, y_train, y_test, y_val):
      X, y_gender: every remaining image / label as numpy arrays.
      X_train, X_val, X_test: image arrays for the three partitions.
      y_train, y_val, y_test: matching label Series from train_test_split.

  Raises:
    ValueError: if no rows remain after filtering, or the pixel count of the
      first image is not a perfect square.
  """
  # Build a fresh frame rather than mutating a slice of the caller's frame.
  # Drop NaNs before resetting the index so the index is contiguous from 0.
  data = data[data['age'] >= 18].dropna().reset_index(drop=True)
  if data.empty:
    raise ValueError('No rows left after filtering on age >= 18 and dropping NaNs')

  # Images are assumed square: the side length is derived from the first row.
  num_pixels = len(data['pixels'].iloc[0].split())
  dimensions = int(np.sqrt(num_pixels))
  if dimensions * dimensions != num_pixels:
    raise ValueError('Pixel count ' + str(num_pixels) + ' is not a perfect square')
  width = dimensions
  height = dimensions
  print(num_pixels, width, height)

  X_img = data['pixels']
  y_gender = data['gender']

  # 70% train / 30% hold-out; the hold-out is then split 85% validation /
  # 15% test. NOTE(review): this leaves a test set of ~4.5% of the rows —
  # confirm that the val/test proportions are intended.
  X_train, X_te, y_train, y_te = train_test_split(X_img, y_gender, test_size = .3, random_state = 11)
  X_val, X_test, y_val, y_test = train_test_split(X_te, y_te, test_size = .15, random_state=11)

  X_train = string_to_arr(X_train, width, height)
  X_test = string_to_arr(X_test, width, height)
  X_val = string_to_arr(X_val, width, height)
  print(X_train.shape)

  # Full-dataset arrays, built with the same helper and the computed image
  # size (not a hard-coded 48x48) so they match the partition arrays.
  X = string_to_arr(X_img, width, height)
  y_gender = np.array(y_gender)

  return X, y_gender, X_train, X_test, X_val, y_train, y_test, y_val

In [5]:
def data_aug(X_val, y_val, X_test, y_test):
  """Create the augmenting training generator and the val/test iterators.

  The training generator applies random rotation, width shift, brightness
  and zoom changes and is configured with rescale=1/255. The validation and
  test generators are configured with rescale=1/255 only, and their
  iterators are created with shuffle=False.

  Args:
    X_val, y_val: validation images and labels.
    X_test, y_test: test images and labels.

  Returns:
    Tuple (val_data, test_data, train_data_gen): the two iterators and the
    (not yet flowed) training ImageDataGenerator.
  """
  augment_options = {
      'rotation_range': 30,
      'width_shift_range': 1,
      'brightness_range': [.8,1.2],
      'zoom_range': [.8,1.2],
      'rescale': 1/255,
  }
  train_data_gen = ImageDataGenerator(**augment_options)

  # Seed numpy's global RNG before the iterators are created.
  np.random.seed(11)

  val_data = ImageDataGenerator(rescale=1/255).flow(X_val, y_val, shuffle = False, seed= 11)
  test_data = ImageDataGenerator(rescale=1/255).flow(X_test, y_test, shuffle=False, seed=11)

  return val_data, test_data, train_data_gen

In [6]:
def build_model(num_classes=2, input_shape=(48,48,1)):
  """Build and compile the CNN classifier.

  Four convolutional stages (64, 128, 256, 512 filters), each followed by max
  pooling and batch normalisation (with dropout from the second stage on),
  then a 128-unit dense layer and a softmax output layer.

  Args:
    num_classes: number of output classes. The softmax layer has exactly this
      many units so that argmax over the predictions is always a valid class
      index (two classes for the gender labels used in this notebook).
    input_shape: shape of one input image as (rows, cols, channels).

  Returns:
    A compiled keras Sequential model (Adam optimizer, accuracy metric).
  """
  model = Sequential()
  model.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu', kernel_initializer='he_uniform', input_shape = input_shape))
  model.add(MaxPooling2D())
  model.add(BatchNormalization())

  model.add(Conv2D(filters=128, kernel_size=(3,3), activation='relu', kernel_initializer='he_uniform', padding='same'))
  model.add(Conv2D(filters=128, kernel_size=(3,3), activation='relu', kernel_initializer='he_uniform'))
  model.add(MaxPooling2D())
  model.add(Dropout(.3))
  model.add(BatchNormalization())

  model.add(Conv2D(filters=256, kernel_size=(3,3), activation='relu', kernel_initializer='he_uniform', padding='same'))
  model.add(Conv2D(filters=256, kernel_size=(3,3), activation='relu', kernel_initializer='he_uniform'))
  model.add(MaxPooling2D())
  model.add(Dropout(.3))
  model.add(BatchNormalization())

  model.add(Conv2D(filters=512, kernel_size=(3,3), activation='relu', kernel_initializer='he_uniform', padding='same'))
  model.add(MaxPooling2D())
  model.add(Dropout(.5))
  model.add(BatchNormalization())

  model.add(Flatten())
  model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
  # One output unit per class. A wider layer would still train with sparse
  # labels, but could predict class ids that do not exist in the data.
  model.add(Dense(num_classes, activation='softmax', kernel_initializer='he_uniform'))

  # sparse_categorical_crossentropy takes integer class ids (0, 1, ...) as
  # labels, which matches a softmax output with one unit per class.
  model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics=['accuracy'])

  return model

In [7]:
def train_test(X, y_gender, train_data_gen, nsplit=10):
  """Cross-validate the model with K-fold splits.

  For every fold a fresh model is built, trained for up to 50 epochs with
  early stopping on validation loss, and then evaluated on the fold's
  held-out part.

  Args:
    X: image array, indexed along the first axis.
    y_gender: label array aligned with X.
    train_data_gen: ImageDataGenerator used to create the batch iterators.
    nsplit: number of folds.

  Returns:
    Tuple (scores, histories, loss_per_fold): per-fold accuracy, the keras
    History object of each fold, and per-fold loss.
  """
  # KFold yields positional indices; plain arrays make X[idx] / y[idx]
  # positional even if a pandas Series was passed in.
  X = np.asarray(X)
  y_gender = np.asarray(y_gender)

  loss_per_fold = []
  histories = []
  scores = []
  # Fixed random_state so the fold assignment is reproducible between runs.
  kf = KFold(n_splits = nsplit, shuffle=True, random_state=11)

  es = EarlyStopping(min_delta=.01, monitor='val_loss', patience=5, mode='min')

  for traini, testi in kf.split(X, y_gender):
    model = build_model()
    np.random.seed(11)

    # The batch size belongs to the iterator, not to fit(), when the input
    # is a generator.
    train_data = train_data_gen.flow(X[traini], y_gender[traini], batch_size=32, seed=11)
    # NOTE(review): the held-out fold goes through the same generator as the
    # training data, so validation metrics are computed on randomly augmented
    # images — confirm this is intended.
    temp_val_data = train_data_gen.flow(X[testi], y_gender[testi], batch_size=32, seed=11)
    history = model.fit(train_data, epochs = 50, validation_data = temp_val_data,
                        verbose = 1, callbacks=[es])

    # evaluate() returns [loss, accuracy] for the metrics compiled in
    # build_model.
    score = model.evaluate(temp_val_data, verbose=0)
    loss_per_fold.append(score[0])
    scores.append(score[1])
    histories.append(history)

  return scores, histories, loss_per_fold

In [8]:
def evaluate_performance(scores, histories, loss_per_fold):
  """Plot per-fold training curves and print summary statistics.

  For each History object a loss figure and an accuracy figure are shown
  (training vs. validation curve). Afterwards the per-fold scores are plotted
  and the mean/standard deviation of the scores and the mean loss are printed.

  Args:
    scores: sequence of per-fold scores.
    histories: sequence of objects exposing a `.history` dict with the keys
      'loss', 'val_loss', 'accuracy' and 'val_accuracy'.
    loss_per_fold: sequence of per-fold losses.
  """
  # (training key, validation key, title suffix) for each figure of a fold.
  curves = (
      ('loss', 'val_loss', " loss"),
      ('accuracy', 'val_accuracy', " accuracy"),
  )

  for fold_no, fold_history in enumerate(histories, start=1):
    record = fold_history.history
    for train_key, val_key, suffix in curves:
      plt.plot(record[train_key], label='train', color='blue')
      plt.plot(record[val_key], label='test', color='orange')
      plt.title('Model ' + str(fold_no) + suffix)
      plt.legend()
      plt.show()

  plt.plot(scores)
  plt.title('Overall scores')
  plt.show()

  divider = '----------------------------------------------'
  print(divider)
  print('Mean scores: ' + str(np.mean(scores)))
  print('Standard deviation of scores: ' + str(np.std(scores)))
  print(divider)
  print('Mean loss: ' + str(np.mean(loss_per_fold)))

In [9]:
def run_test(data):
  """Run the whole experiment: preprocess, cross-validate, train and test.

  Steps:
    1. Preprocess `data` into image arrays and labels.
    2. Cross-validate the architecture and plot/print the fold results.
    3. Train a final model on the train + validation partitions only.
    4. Evaluate it on the test partition and show a classification report
       and a confusion-matrix heatmap.

  Args:
    data: raw DataFrame as read from the dataset CSV.

  Returns:
    The trained final keras model, so that it can be reused for inference.
  """
  X, y_gender, X_train, X_test, X_val, y_train, y_test, y_val = data_preprocess(data)

  val_data, test_data, train_data_gen = data_aug(X_val, y_val, X_test, y_test)

  scores, histories, loss_per_fold = train_test(X, y_gender, train_data_gen)
  evaluate_performance(scores, histories, loss_per_fold)

  # The final model is trained on train + validation rows only. Fitting on
  # the full X would include the test rows and make the test metrics
  # meaningless.
  final_train_images = np.append(X_train, X_val, axis=0)
  final_train_labels = np.append(y_train, y_val, axis=0)
  final_training_data = train_data_gen.flow(final_train_images, final_train_labels,
                                            batch_size=32, seed=11)

  np.random.seed(11)
  model = build_model()

  np.random.seed(11)
  # The validation rows are part of the training set here, so the val_*
  # values reported during this fit are only a sanity monitor.
  model.fit(final_training_data, epochs = 20, validation_data = val_data, verbose = 1)

  model.evaluate(test_data)

  # predict_classes() is no longer available in current Keras; argmax over the
  # softmax output gives the same class ids. test_data was created with
  # shuffle=False, so predictions stay aligned with y_test.
  y_pred = np.argmax(model.predict(test_data), axis=-1)

  print(classification_report(y_test, y_pred))

  cm = confusion_matrix(y_test, y_pred)
  sns.heatmap(cm, cmap = 'Greens', cbar=False, annot=True, fmt='d')

  return model

In [None]:
# Run the experiment and keep whatever run_test returns under the name
# `model`, which the inference cell further down refers to.
model = run_test(data)

2304 48 48
(13630, 48, 48, 1)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50

In [None]:
import cv2

# Classify one external photo.
# NOTE(review): this cell relies on `model` being bound at notebook scope by
# an earlier cell — confirm before running on a fresh kernel.
image_path = '../input/testset/mind-long.jpg'
img = cv2.imread(image_path, 0)  # flag 0: load as single-channel grayscale
if img is None:
  # cv2.imread returns None instead of raising when the file cannot be read.
  raise FileNotFoundError('Could not read image: ' + image_path)
plt.imshow(img, cmap="gray")

# Bring the photo to the network's input format: 48x48, one channel, batch of
# one, pixel values scaled to [0, 1].
img = cv2.resize(img, (48,48))
img = np.reshape(img,[1,48,48,1])
img_pixels = img.astype("float32") / 255.0

# predict_classes() is no longer available in current Keras; take the argmax
# over the predicted class probabilities instead.
classes = np.argmax(model.predict(img_pixels), axis=-1)

mapper=['male','female']
print(mapper[classes[0]])