In [1]:
import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation notices from the
# libraries imported below are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from collections import Counter
import os
import sys
import re
import cv2
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from multiprocessing import Pool

import keras
from keras.layers import Dense, ReLU, Input, GlobalAveragePooling2D
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.applications.vgg16 import VGG16
from keras.models import Model
import tensorflow as tf

Using TensorFlow backend.


In [3]:
# --- Plot fonts -------------------------------------------------------------
# Use the sans-serif family and give matplotlib a fallback list of fonts
# (presumably chosen so Japanese text in figures renders on any OS).
from matplotlib import rcParams
rcParams['font.sans-serif'] = [
    'Hiragino Maru Gothic Pro',
    'Yu Gothic',
    'Meirio',
    'Takao',
    'IPAexGothic',
    'IPAPGothic',
    'VL PGothic',
    'Noto Sans CJK JP',
]
rcParams['font.family'] = 'sans-serif'

# --- pandas display ---------------------------------------------------------
# Show up to 150 rows and 50 columns before pandas truncates a frame.
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 50)

# --- progress bars ----------------------------------------------------------
# Hook tqdm into pandas.
tqdm.pandas()

In [4]:
# Input locations, relative to the notebook's working directory.
filePath = "data/tinder.xlsx"  # spreadsheet with one row per profile
imagePath = "data/photos"      # directory holding the profile photos

In [5]:
# Read the profile sheet, keep only the first row seen for each profile id,
# and use that id as the index.
df = (
    pd.read_excel(filePath)
    .drop_duplicates(subset="id")
    .set_index("id")
)
# Profiles without a bio get an empty string instead of NaN.
df["bio"] = df["bio"].fillna("")

In [6]:
# Build the image array X and the label array y from the photo directory.
#
# A file is used only when
#   * its name looks like "<id>-<digit>.jpg" or "<id>-<digit> (<digit>).jpg",
#   * "<id>" is present in the index of `df`, and
#   * OpenCV can decode it.
# Every other file is skipped and counted, so the amount of dropped data is
# visible instead of being silently swallowed. Unexpected errors (and
# Ctrl-C / kernel interrupt) now propagate instead of being ignored.
#
# Raw string: "\d" and "\(" are regex escapes, not Python string escapes.
# The dot before "jpg" is escaped so it only matches a literal ".".
photoNamePattern = re.compile(r"([a-z0-9]*)-\d( \(\d\))?\.jpg")

X = []
y = []
skipped = 0
for fileName in tqdm(os.listdir(imagePath)):
    nameMatch = photoNamePattern.match(fileName)
    if nameMatch is None:
        # Not a profile photo (unexpected naming scheme).
        skipped += 1
        continue

    id_ = nameMatch.group(1)
    if id_ not in df.index:
        # Photo of a profile that is not in the spreadsheet.
        skipped += 1
        continue
    match = df.loc[id_, "match"]

    # Use a dedicated name here: the spreadsheet path lives in `filePath`
    # and must not be overwritten by the loop.
    photoPath = os.path.join(imagePath, fileName)
    img = cv2.imread(photoPath)
    if img is None:
        # cv2.imread signals an unreadable/corrupt file by returning None.
        skipped += 1
        continue

    img = cv2.resize(img, (120, 120))
    X.append(img)
    y.append(match)

X = np.asarray(X)
y = np.asarray(y)
print(f"Loaded {len(X)} photos, skipped {skipped} files")

HBox(children=(FloatProgress(value=0.0, max=26455.0), HTML(value='')))




In [7]:
# Scale pixel values from the 0-255 range to 0-1 and store them as float32.
#
# * The dtype guard makes the cell safe to re-run: once X is float32 it has
#   already been scaled and must not be divided by 255 a second time.
# * Casting to float32 *before* dividing avoids the float64 temporary that
#   `X / 255` would allocate for the whole image array.
#
# NOTE(review): Keras' ImageNet VGG16 weights are normally paired with
# keras.applications.vgg16.preprocess_input rather than a plain 1/255
# scaling - worth confirming that this scaling is intentional.
if X.dtype != np.float32:
    X = X.astype(np.float32) / np.float32(255)

In [8]:
# Random hold-out split with a fixed seed; test_size is left at the
# scikit-learn default.
# NOTE(review): photo file names appear to follow "<id>-<n>.jpg", so one
# profile may contribute several photos. A purely random split could place
# photos of the same profile in both sets - consider a group-wise split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=8888,
)

In [9]:
def getModel():
    """Build a VGG16-based network that outputs a single linear score.

    The ImageNet-pretrained VGG16 convolutional base (without its
    classification top) is followed by global average pooling and a
    one-unit ``Dense`` layer with linear activation. All layers except the
    last three of the combined model are marked non-trainable.

    Returns:
        The assembled, not yet compiled, Keras ``Model``.
    """
    backbone = VGG16(weights="imagenet", include_top=False)

    # Head: collapse the spatial dimensions, then regress one value.
    pooled = GlobalAveragePooling2D()(backbone.output)
    score = Dense(1, activation="linear")(pooled)
    net = Model(inputs=backbone.input, outputs=score)

    # Freeze everything but the final three layers of the combined model.
    frozen_layers = net.layers[:-3]
    for frozen in frozen_layers:
        frozen.trainable = False

    return net

In [10]:
# Instantiate the VGG16-based network defined by getModel().
model = getModel()

In [11]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None, None, 3)     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0   

In [12]:
# Train the single linear output with mean squared error, using Adam with
# its default settings. MSE is also reported as a metric, so the logged
# metric mirrors the loss.
model.compile(
    optimizer=Adam(),
    loss="mse",
    metrics=["mse"],
)

In [13]:
# Fit for 10 epochs. The held-out split is passed as validation data, so
# each epoch logs its loss on (X_test, y_test); the call's History object is
# the cell's displayed value.
model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

Train on 19473 samples, validate on 6491 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f05d1808990>

In [14]:
# Raw model outputs (one linear score per image) for the held-out photos.
y_pred=model.predict(X_test)

In [15]:
# ROC AUC of the predicted scores against the held-out `match` labels
# (presumably binary - roc_auc_score is called without a multi-class option).
roc_auc_score(y_test,y_pred)

0.6025131864722308