In [1]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment and deprecation warnings
# raised by later cells -- consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from collections import Counter
import os
import sys
import re
import subprocess
import MeCab
import glob
import xgboost as xgb
import cv2
from multiprocessing import Pool
import copy
import shelve

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.linear_model import TheilSenRegressor

from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

import keras
from keras.layers import Dense, GlobalAveragePooling2D
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.applications.vgg16 import VGG16
from keras.models import Model, Sequential
import tensorflow as tf

Using TensorFlow backend.


In [3]:
# Register Series.progress_apply (used when the bio_length feature is built).
tqdm.pandas()
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 150)
from matplotlib import rcParams
# Fallback list of sans-serif families (presumably chosen for Japanese glyph coverage).
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Hiragino Maru Gothic Pro', 'Yu Gothic', 'Meirio', 'Takao', 'IPAexGothic', 'IPAPGothic', 'VL PGothic', 'Noto Sans CJK JP']

# Ask mecab-config for the dictionary directory and point MeCab at the NEologd
# dictionary inside it. check_output raises if mecab-config is missing or fails,
# and strip() removes the trailing newline the command prints, so `path` is a
# clean directory path.
cmd = ['mecab-config', '--dicdir']
path = subprocess.check_output(cmd).decode('utf-8').strip() + "/mecab-ipadic-neologd"
m = MeCab.Tagger("-d {0}".format(path))
# -Owakati: output the input as space-separated tokens.
m_wakati = MeCab.Tagger("-d {0} -Owakati".format(path))
# Persistent store for every fitted model produced by this notebook.
sf = shelve.open("data/models.db")

In [4]:
# Spreadsheet of profiles, read with pd.read_excel in the next cell.
filePath = "data/tinder.xlsx"
# Directory of profile photos named "<id>-<index>.jpg" (see getPhotoNum / getImage).
imagePath = "data/photos"

In [5]:
# Load the profile table, keep the first row for each profile id, and drop the
# "Unnamed: 0" column.
df = (
    pd.read_excel(filePath)
    .drop_duplicates(subset="id")
    .drop(["Unnamed: 0"], axis=1)
)

In [6]:
def _bio_log_length(text):
    """Return log(1 + number of characters) for a single bio."""
    return np.log1p(len(str(text)))


# A missing bio is treated as an empty string before its length is measured.
df["bio"] = df["bio"].fillna("")
df["bio_length"] = df["bio"].progress_apply(_bio_log_length)

def getPhotoNum(_id):
    """Count the files in imagePath whose name matches "<_id>-*.jpg"."""
    pattern = os.path.join(imagePath, "{0}-*.jpg".format(_id))
    matches = glob.glob(pattern)
    return len(matches)

# Count each profile's photos in worker processes. Pool.imap yields results in the
# order of df["id"], so the resulting array lines up row-for-row with df.
with Pool() as p:
    imap=p.imap(getPhotoNum, df["id"])
    df["photo_num"]=np.asarray(list(tqdm(imap, total=df["id"].shape[0])))

HBox(children=(FloatProgress(value=0.0, max=75341.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75341.0), HTML(value='')))




In [7]:
# 80/20 split with a fixed seed.
df_train, df_test = train_test_split(df, train_size=0.8, random_state=8000)
# NOTE(review): df_train is immediately rebound to the full frame, so every df_test row
# is also a training row and df_test is not a held-out set -- confirm this is intentional.
# df_train is the same object as df, so columns added to df_train later also appear on df.
df_train = df

In [8]:
# Tokenise every training bio with MeCab in wakati mode and strip the surrounding
# whitespace from the parser output.
bios = [m_wakati.parse(str(text)).strip() for text in df_train.bio]

# One TaggedDocument per bio, tagged with its row position, so the document
# vector with tag k belongs to the k-th row of df_train.
trainings = [
    TaggedDocument(words=tokens.split(), tags=[position])
    for position, tokens in enumerate(bios)
]
doc2vec = Doc2Vec(documents=trainings, vector_size=16, window=5, min_count=3, workers=64, seed=8888)
sf["doc2vec"] = doc2vec

In [9]:
# Training rows reuse the vectors learned during Doc2Vec training (tag i == row i).
X_train_bio = np.array([doc2vec.docvecs[i] for i in range(df_train.shape[0])])
# Test rows are inferred. Tokenise exactly like the training bios (strip, then
# split on any whitespace) so inference never receives stray newline or
# empty-string tokens that the training vocabulary was never built from.
X_test_bio = np.array([doc2vec.infer_vector(m_wakati.parse(str(bio)).strip().split()) for bio in df_test.bio])

In [10]:
# Target values: the "match" column of each split as NumPy arrays.
y_train = df_train["match"].values
y_test = df_test["match"].values

In [11]:
def train(X_train, y_train, model, n=5,**kwargs):
    """Fit n copies of `model`, each on all rows except one random fold.

    Every row is assigned a random fold id in [0, n). Copy i is fitted on the
    rows whose fold id is not i and then predicts the rows whose fold id is i,
    so each row gets exactly one out-of-fold prediction. The ROC AUC of every
    fold is printed.

    Parameters:
        X_train: feature container supporting `.shape[0]` and boolean-mask row selection.
        y_train: array of targets, aligned with the rows of X_train.
        model: estimator exposing fit(X, y, **kwargs) and predict(X); it is
            deep-copied, never fitted itself.
        n: number of folds / model copies.
        **kwargs: forwarded unchanged to every fit() call.

    Returns:
        (y_pred, models): float32 out-of-fold predictions (one per row) and the
        list of n fitted copies.

    Side effect: reseeds NumPy's global random generator with 32 on every call.
    """
    np.random.seed(seed=32)
    models = [copy.deepcopy(model) for _ in range(n)]
    labels = np.random.randint(0,n,X_train.shape[0])
    y_pred = np.zeros(labels.shape, dtype=labels.dtype).astype(np.float32)
    for fold, fold_model in enumerate(models):
        held_out = labels == fold
        fitted_on = labels != fold
        fold_X, fold_y = X_train[held_out], y_train[held_out]
        fold_model.fit(X_train[fitted_on], y_train[fitted_on], **kwargs)
        fold_pred = fold_model.predict(fold_X).reshape(-1,)
        print(roc_auc_score(fold_y, fold_pred))
        y_pred[held_out] = fold_pred
    return y_pred, models

def predict(X_test, models):
    """Average the flattened predictions of every model in `models` on X_test.

    Parameters:
        X_test: feature container exposing `.shape[0]`, accepted by each model's predict().
        models: sequence of fitted models (for example the list returned by train()).

    Returns:
        1-D float array with one averaged prediction per row of X_test
        (all zeros when `models` is empty).
    """
    ensemble_size = len(models)
    averaged = np.zeros(X_test.shape[0])
    for member in models:
        member_pred = member.predict(X_test).reshape((-1,))
        averaged += member_pred / ensemble_size
    return averaged

In [12]:
def getMLPModel(n=16):
    """Build an uncompiled MLP: n inputs -> Dense(64, relu) -> Dense(1, linear)."""
    hidden = Dense(64,input_shape=(n,),activation="relu")
    output = Dense(1,activation="linear")
    return Sequential([hidden, output])

In [13]:
def _fit_bio_feature(column, estimator, **fit_kwargs):
    """Cross-fit `estimator` on the bio vectors and store everything under `column`.

    Writes the out-of-fold predictions to df_train[column], the fold-averaged
    predictions for the test bios to df_test[column], and the fitted fold
    models to sf[column]. Returns (out-of-fold predictions, fitted models).
    """
    oof_pred, fold_models = train(X_train_bio, y_train, estimator, **fit_kwargs)
    df_train[column] = oof_pred
    df_test[column] = predict(X_test_bio, fold_models)
    sf[column] = fold_models
    return oof_pred, fold_models


# Each estimator is constructed right before it is trained, in this order.
y_pred, models = _fit_bio_feature("bio_TheilSen", TheilSenRegressor())
y_pred, models = _fit_bio_feature("bio_xgb", xgb.XGBRegressor(n_jobs=-1))
y_pred, models = _fit_bio_feature("bio_rf", RandomForestRegressor(n_estimators=1024, n_jobs=-1))

model = getMLPModel()
model.compile(optimizer=Adam(), loss="mse", metrics=["mse"])
y_pred, models = _fit_bio_feature("bio_mlp", model, epochs=10)

0.5038494336948558
0.5390815454504565
0.6436126996772286
0.5892795952360379
0.5558343073809049
0.6095413108492775
0.5784269629229823
0.586353979950873
0.5665687334936418
0.4831927582447867
0.5495147671723177
0.5582632423274712
0.5668802142114231
0.5474211096642958
0.5421677751124399
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.50357917300167
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.6444294011560854
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.5912837540559862
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.6596138767934587
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.5744978069523643


In [14]:
def getImage(_id):
    """Load the photos of one profile as 120x120 RGB arrays.

    Looks up the profile's photo_num in the global df and tries to read
    "<_id>-0.jpg" ... "<_id>-<photo_num - 1>.jpg" from imagePath.

    Parameters:
        _id: profile id; must occur in df["id"].

    Returns:
        List of uint8 arrays of shape (120, 120, 3) in RGB channel order.
        Files that cv2 cannot read are skipped after printing their path, so
        the list may be SHORTER than photo_num -- callers that rely on
        photo_num for alignment must account for this.
    """
    # .iloc[0] extracts the scalar explicitly; calling int() on a one-row
    # Series is deprecated in recent pandas releases.
    num_photos = int(df.loc[df["id"] == _id, "photo_num"].iloc[0])
    imgs = []
    for i in range(num_photos):
        # Same "<id>-<index>.jpg" naming that getPhotoNum globs for; format()
        # also works when the id is not a str.
        fileName = "{0}-{1}.jpg".format(_id, i)
        photo_path = os.path.join(imagePath, fileName)
        img = cv2.imread(photo_path)
        if img is None:
            # cv2.imread signals an unreadable or missing file by returning None.
            print(photo_path)
            continue
        img = cv2.resize(img, (120,120))
        # OpenCV decodes to BGR; convert to RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        imgs.append(img)
    return imgs

# Load the photos
X_train_images = []
loaded_photo_counts = []
with Pool() as p:
    imap=p.imap(getImage, df_train["id"])
    for photos in tqdm(imap, total=df_train.shape[0]):
        loaded_photo_counts.append(len(photos))
        X_train_images.extend(photos)

# getImage skips files it cannot read, while the labels here and the per-profile
# aggregation in a later cell both slice by photo_num. Fail now, with a clear
# message, instead of hitting a shape mismatch deep inside train().
expected_photo_counts = df_train["photo_num"].values
if not np.array_equal(np.asarray(loaded_photo_counts), expected_photo_counts):
    raise ValueError(
        "loaded {0} photos but photo_num expects {1}; some image files could not be read".format(
            int(np.sum(loaded_photo_counts)), int(np.sum(expected_photo_counts))))

# Scale to [0, 1] in place to avoid holding a second full-size float32 array.
X_train_images = np.asarray(X_train_images,dtype=np.float32)
X_train_images /= 255

# Every photo inherits the match label of the profile it belongs to.
y_train_image = np.repeat(y_train, expected_photo_counts)

HBox(children=(FloatProgress(value=0.0, max=75341.0), HTML(value='')))




In [15]:
def getVGG16BasedModel():
    """Build an uncompiled regressor on top of an ImageNet-pretrained VGG16.

    The VGG16 convolutional base (no classifier top) feeds a global average
    pooling layer and a single linear output unit. All layers of the assembled
    model except its last three are frozen.
    """
    backbone = VGG16(weights="imagenet", include_top=False)
    pooled = GlobalAveragePooling2D()(backbone.output)
    predictions = Dense(1, activation="linear")(pooled)
    model = Model(inputs=backbone.input, outputs=predictions)
    # The last three layers are the final backbone layer, the pooling layer and
    # the Dense head; everything before them keeps its pretrained weights.
    for frozen_layer in model.layers[:-3]:
        frozen_layer.trainable=False
    return model


# Cross-fit the VGG16-based regressor on individual photos; each photo carries the
# match label of its profile (y_train_image).
model = getVGG16BasedModel()
model.compile(optimizer=Adam(), loss="mse", metrics=["mse"])
# y_pred holds one out-of-fold prediction per photo; the next cell aggregates it per profile.
y_pred, models = train(X_train_images, y_train_image, model, epochs=5)
sf["VGG16_based"] = models

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.5856919607456319
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.502004977744652
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.5470958709173954
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.49594159058720577
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.5369133793209898


In [16]:
# Collapse the per-photo predictions in y_pred into two per-profile features.
# y_pred is laid out profile by profile, so a profile with n photos owns the
# slice [s, s + n), where s is the running photo offset.
s = 0
y_pred_photo_mean = []
y_pred_photo_max = []
# Profiles without photos get the global mean prediction for BOTH features.
# It does not change inside the loop, so compute it once instead of once per
# photo-less profile.
photo_fallback = np.mean(y_pred)
for n in tqdm(df_train["photo_num"]):
    if n>0:
        profile_pred = y_pred[s:s+n]
        y_pred_photo_mean.append(np.mean(profile_pred))
        y_pred_photo_max.append(np.max(profile_pred))
    else:
        y_pred_photo_mean.append(photo_fallback)
        y_pred_photo_max.append(photo_fallback)
    s+=n
df_train["photo_mean"] = np.array(y_pred_photo_mean)
df_train["photo_max"] = np.array(y_pred_photo_max)

HBox(children=(FloatProgress(value=0.0, max=75341.0), HTML(value='')))




In [17]:
# Load the test photos, remembering how many were ACTUALLY loaded per profile:
# getImage skips unreadable files, so this can be lower than photo_num.
X_test_images = []
test_photo_counts = []
with Pool() as p:
    imap=p.imap(getImage, df_test["id"])
    for photos in tqdm(imap, total=df_test.shape[0]):
        test_photo_counts.append(len(photos))
        X_test_images.extend(photos)
# Scale to [0, 1] in place to avoid holding a second full-size float32 array.
X_test_images = np.asarray(X_test_images, dtype=np.float32)
X_test_images /= 255

# `models` is whatever the most recent train() call returned (the VGG16-based
# fold models when the cells run in order).
y_pred = predict(X_test_images, models)

# Aggregate per profile using the loaded counts, so the slices always line up
# with y_pred even when some files were skipped.
s = 0
y_pred_photo_mean = []
y_pred_photo_max = []
# Profiles without any loaded photo get the global mean prediction for both features.
photo_fallback = np.mean(y_pred)
for n in tqdm(test_photo_counts):
    if n>0:
        profile_pred = y_pred[s:s+n]
        y_pred_photo_mean.append(np.mean(profile_pred))
        y_pred_photo_max.append(np.max(profile_pred))
    else:
        y_pred_photo_mean.append(photo_fallback)
        y_pred_photo_max.append(photo_fallback)
    s+=n
df_test["photo_mean"] = np.array(y_pred_photo_mean)
df_test["photo_max"] = np.array(y_pred_photo_max)

HBox(children=(FloatProgress(value=0.0, max=15069.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=15069.0), HTML(value='')))




In [18]:
# Columns fed to the second-stage model: raw profile features plus the
# first-stage bio and photo predictions.
labels = ["age","photo_num","bio_TheilSen","bio_xgb","bio_length","bio_rf","bio_mlp","photo_mean","photo_max"]
# Explicit copies: both frames are modified column by column in the next cell,
# and must not be treated as (or warned about as) slices of df_train / df_test.
df2 = df_train[labels].copy()
df2_test = df_test[labels].copy()

In [19]:
# Mean age of the training rows with a known age, taken from the RAW column
# before any normalisation. The same raw value fills the missing ages of both
# frames; filling the test frame with an already normalised mean and then
# normalising it again would put those rows far outside the training range.
train_age_missing = df2["age"]=="undefined"
train_age_mean = np.mean(df2.loc[~train_age_missing, "age"])

# --- training features ---
df2 = df2.copy()
df2["photo_mean"] = df2["photo_mean"].fillna(df2["photo_mean"].mean())
df2["photo_max"] = df2["photo_max"].fillna(df2["photo_max"].mean())
# Single .loc assignment instead of chained indexing, which may write to a
# temporary object and leave df2 unchanged.
df2.loc[train_age_missing, "age"] = train_age_mean
df2 = df2.astype(np.float32)
# Fixed shift/scale constants (presumably 18 is the minimum age and 6 a typical
# photo count -- TODO confirm).
df2["age"] = (df2["age"]-18)/10
df2["photo_num"] = df2["photo_num"]/6

# --- test features: same steps, with fill values taken from the training frame ---
df2_test = df2_test.copy()
df2_test["photo_mean"] = df2_test["photo_mean"].fillna(df2["photo_mean"].mean())
df2_test["photo_max"] = df2_test["photo_max"].fillna(df2["photo_max"].mean())
df2_test.loc[df2_test["age"]=="undefined", "age"] = train_age_mean
df2_test = df2_test.astype(np.float32)
df2_test["age"] = (df2_test["age"]-18)/10
df2_test["photo_num"] = df2_test["photo_num"]/6

In [24]:
# Second-stage model: gradient-boosted trees on the stacked feature frame.
# y_pred holds the out-of-fold predictions, clf the fitted fold models.
y_pred, clf = train(df2, y_train, xgb.XGBRegressor(max_depth=3, n_jobs=-1))
sf["clf_final"] = clf
# The shelf is never closed in the visible part of this notebook; flush it so
# the stored models reach disk even if the kernel is stopped without a close().
sf.sync()

0.5223074982884732
0.6348033703653728
0.6264735728985877
0.6074538467574286
0.5126755152270448


In [25]:
# ROC AUC of the second-stage model's out-of-fold predictions over all training rows.
roc_auc_score(y_train, y_pred)

0.570400152513389