In [1]:
import warnings
# Silence every warning for the rest of the session. NOTE(review): this also
# hides deprecation and pandas chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from collections import Counter
import os
import sys
import re
import subprocess
import MeCab
import glob
import xgboost as xgb
import cv2
from multiprocessing import Pool
import copy

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import SGDRegressor, HuberRegressor, TheilSenRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

import keras
from keras.layers import Dense, ReLU, Input, GlobalAveragePooling2D
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.applications.vgg16 import VGG16
from keras.models import Model
import tensorflow as tf

Using TensorFlow backend.


In [3]:
# Register tqdm's progress_apply helpers on pandas objects.
tqdm.pandas()
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 150)
from matplotlib import rcParams
# Font fallback list for figures (presumably chosen so Japanese labels render).
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Hiragino Maru Gothic Pro', 'Yu Gothic', 'Meirio', 'Takao', 'IPAexGothic', 'IPAPGothic', 'VL PGothic', 'Noto Sans CJK JP']

# Ask mecab-config for the dictionary directory and point MeCab at the
# "mecab-ipadic-neologd" dictionary inside it. check_output raises if the
# command fails, and strip() removes the trailing newline that would otherwise
# end up inside the "-d" argument.
cmd = ["mecab-config", "--dicdir"]
path = os.path.join(subprocess.check_output(cmd).decode('utf-8').strip(),
                    "mecab-ipadic-neologd")
# m: default output format; m_wakati: space-separated tokens (-Owakati).
m = MeCab.Tagger("-d {0}".format(path))
m_wakati = MeCab.Tagger("-d {0} -Owakati".format(path))

In [4]:
# Excel workbook loaded with pd.read_excel in the next cell.
filePath = "data/tinder.xlsx"
# Directory holding the profile photos, named "<id>-<index>.jpg".
imagePath = "data/photos"

In [5]:
# Read the raw sheet and keep only the first row seen for each profile id.
df = pd.read_excel(filePath).drop_duplicates(subset="id")

In [6]:
# Replace missing bios with the empty string.
df["bio"] = df["bio"].fillna("")

def getPhotoNum(_id):
    """Count the photo files stored for one profile.

    Files are looked up as "<imagePath>/<_id>-*.jpg". The directory and the
    id are escaped so that glob metacharacters in either one ("[", "*", "?")
    are matched literally instead of being treated as a pattern.

    Args:
        _id: profile identifier used as the file-name prefix.

    Returns:
        int: number of matching .jpg files (0 when there are none).
    """
    prefix = glob.escape(os.path.join(imagePath, str(_id)))
    return len(glob.glob(prefix + "-*.jpg"))

# Count every profile's photos in worker processes; imap keeps the results in
# the same order as df["id"], so they can be stored directly as a column.
with Pool() as pool:
    photo_counts = pool.imap(getPhotoNum, df["id"])
    df["photo_num"] = np.asarray(list(tqdm(photo_counts, total=len(df))))

HBox(children=(FloatProgress(value=0.0, max=10590.0), HTML(value='')))




In [7]:
# 80/20 split with a fixed seed. Later cells add columns to both parts, so take
# explicit copies instead of mutating slices of df.
df_train, df_test = (
    part.copy()
    for part in train_test_split(df, train_size=0.8, random_state=8000)
)

In [8]:
# Tokenise each training bio with MeCab into a space-separated string.
bios = [m_wakati.parse(str(text)).strip() for text in df_train.bio]

# One tagged document per training row; the tag is the row's position, which
# is what the feature cell uses to look the vector up again.
trainings = [
    TaggedDocument(words=tokens.split(), tags=[position])
    for position, tokens in enumerate(bios)
]
doc2vec = Doc2Vec(documents=trainings, dm=1, vector_size=32, window=4, min_count=1)

In [9]:
# Training vectors come straight from the fitted model, indexed by row position.
X_train_bio = np.array([doc2vec.docvecs[i] for i in range(df_train.shape[0])])
# Test vectors are inferred. Tokenise exactly as for training
# (strip + whitespace split) so no stray "\n" or empty token reaches the model.
X_test_bio = np.array([
    doc2vec.infer_vector(m_wakati.parse(str(bio)).strip().split())
    for bio in df_test.bio
])
y_train = df_train["match"].values
y_test = df_test["match"].values

In [10]:
def train(X_train, y_train, model, n=4,**kwargs):
    """Fit ``n`` copies of ``model`` with contiguous n-fold cross-validation.

    The rows are cut into ``n`` contiguous folds. Each copy of ``model`` is
    fitted on all rows outside one fold and predicts that held-out fold; the
    fold's ROC AUC is printed. Fold boundaries are balanced and the last fold
    always ends at the final row, so every input row receives exactly one
    prediction even when the row count is not a multiple of ``n``.

    Args:
        X_train: array of features, sliced along the first axis.
        y_train: array of targets aligned with ``X_train``.
        model: estimator exposing ``fit`` and ``predict``; it is deep-copied
            once per fold and the passed instance itself is never fitted.
        n: number of folds (at least 2, at most the number of rows).
        **kwargs: forwarded unchanged to every ``fit`` call.

    Returns:
        tuple: (held-out predictions concatenated in row order, list of the
        ``n`` fitted models).

    Raises:
        ValueError: if ``n`` is smaller than 2 or larger than the row count.
    """
    n_samples = X_train.shape[0]
    if n < 2 or n > n_samples:
        raise ValueError(
            "n must be between 2 and the number of samples ({0}), got {1}".format(n_samples, n)
        )
    # Balanced boundaries; identical to (n_samples // n) * i when n divides
    # n_samples, but never drops the trailing remainder rows.
    bounds = [(n_samples * k) // n for k in range(n + 1)]
    models = [copy.deepcopy(model) for _ in range(n)]
    y_preds = []
    for fold_model, start, stop in zip(models, bounds[:-1], bounds[1:]):
        _X_train = np.concatenate([X_train[:start], X_train[stop:]])
        _y_train = np.concatenate([y_train[:start], y_train[stop:]])
        _X_test = X_train[start:stop]
        _y_test = y_train[start:stop]
        fold_model.fit(_X_train, _y_train, **kwargs)
        _y_pred = fold_model.predict(_X_test)
        print(roc_auc_score(_y_test, _y_pred))
        y_preds.append(_y_pred)
    return np.concatenate(y_preds), models

def predict(X_test, models):
    """Average the predictions of several fitted models.

    Each model's output is flattened to one dimension and contributes an equal
    1/len(models) share to the result.

    Args:
        X_test: array of inputs; its first dimension sets the output length.
        models: sequence of objects exposing ``predict``.

    Returns:
        1-D array of averaged predictions (all zeros when ``models`` is empty).
    """
    count = len(models)
    averaged = np.zeros(X_test.shape[0])
    for fitted in models:
        share = fitted.predict(X_test).reshape((-1,)) / count
        averaged += share
    return averaged
    

In [11]:
# For each regressor: store its held-out cross-validation predictions on the
# training rows, and the fold-averaged prediction on the test rows.
for column, regressor in (
    ("bio_svr", SVR(C=100)),
    ("bio_xgb", xgb.XGBRegressor(n_jobs=-1)),
):
    y_pred, models = train(X_train_bio, y_train, regressor)
    df_train[column] = y_pred
    df_test[column] = predict(X_test_bio, models)

0.46947404754713723
0.40435172014119386
0.3798396802688648
0.4435359181731684
0.5690555291884195
0.516648227174543
0.4831732406839703
0.43865960989533775


In [12]:
def getImage(_id, size=(120, 120)):
    """Load and resize every photo of one profile.

    The number of photos is read from ``df["photo_num"]`` for the row whose
    ``id`` equals ``_id``; files are expected at
    "<imagePath>/<_id>-<i>.jpg" for i = 0 .. photo_num - 1.

    Args:
        _id: profile identifier.
        size: (width, height) passed to ``cv2.resize``.

    Returns:
        list: one resized image array per photo, in index order.

    Raises:
        FileNotFoundError: if an expected photo cannot be read. Failing here,
            with the path in the message, replaces the opaque error that
            resizing a missing image would raise.
    """
    num_photos = int(df.loc[df["id"] == _id, "photo_num"].iloc[0])
    imgs = []
    for i in range(num_photos):
        # format() accepts non-string ids, matching getPhotoNum.
        photoPath = os.path.join(imagePath, "{0}-{1}.jpg".format(_id, i))
        img = cv2.imread(photoPath)
        if img is None:
            raise FileNotFoundError("could not read image: {0}".format(photoPath))
        imgs.append(cv2.resize(img, size))
    return imgs

def loadImages(ids):
    """Load every photo of the given profiles into one scaled array.

    Photos are read in worker processes through ``getImage``; ``imap`` yields
    results in the order of ``ids``, so photos stay grouped by profile in
    that order.

    Args:
        ids: sized iterable of profile ids.

    Returns:
        Array of all photos stacked along the first axis, divided by 255.
    """
    images = []
    with Pool() as p:
        for photos in tqdm(p.imap(getImage, ids), total=len(ids)):
            images.extend(photos)
    return np.asarray(images)/255


# Load the photos
X_train_images = loadImages(df_train["id"])

# One label per photo: repeat each profile's label photo_num times so the
# labels line up with the rows of X_train_images.
y_train_image = np.repeat(y_train, df_train["photo_num"].values)

X_test_images = loadImages(df_test["id"])

HBox(children=(FloatProgress(value=0.0, max=8472.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2118.0), HTML(value='')))




In [13]:
def getVGG16BasedModel():
    """Build a single-output regressor on top of VGG16.

    VGG16 is created with ImageNet weights and without its top layers; global
    average pooling and a linear ``Dense(1)`` are attached to its output. In
    the assembled model every layer except the last three is marked
    non-trainable.

    Returns:
        The uncompiled Keras ``Model``.
    """
    backbone = VGG16(weights="imagenet", include_top=False)
    pooled = GlobalAveragePooling2D()(backbone.output)
    score = Dense(1, activation="linear")(pooled)
    regressor = Model(inputs=backbone.input, outputs=score)
    for frozen_layer in regressor.layers[:-3]:
        frozen_layer.trainable = False
    return regressor


# Build and compile the image regressor (mean-squared-error loss, Adam).
model = getVGG16BasedModel()
model.compile(optimizer=Adam(), loss="mse", metrics=["mse"])
# train() deep-copies the compiled model once per fold; epochs=10 is forwarded
# to each fit call. y_pred holds one held-out prediction per training photo.
y_pred, models = train(X_train_images, y_train_image, model, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.5794453552671798
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.547918288884563
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.5838020407437694
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.5200201966128091


In [14]:
# Photo predictions are laid out profile by profile, so consecutive runs of
# photo_num rows belong to one profile. Average each run to get one score per
# training profile (the mean of an empty run, i.e. zero photos, is NaN).
bounds = np.concatenate([[0], np.cumsum(df_train["photo_num"].values)])
y_pred_photo = np.array([
    np.mean(y_pred[start:stop])
    for start, stop in zip(bounds[:-1], bounds[1:])
])
df_train["photo"] = y_pred_photo

In [15]:
# X_test_images was already built from df_test["id"] in the image-loading cell
# and has not been modified since, so it is reused here instead of reading and
# resizing every test photo a second time.
y_pred = predict(X_test_images, models)

# Average the per-photo predictions back to one score per test profile; runs of
# photo_num consecutive rows belong to one profile (zero photos gives NaN).
bounds = np.concatenate([[0], np.cumsum(df_test["photo_num"].values)])
y_pred_photo = np.array([
    np.mean(y_pred[start:stop])
    for start, stop in zip(bounds[:-1], bounds[1:])
])
df_test["photo"] = y_pred_photo

HBox(children=(FloatProgress(value=0.0, max=2118.0), HTML(value='')))




In [16]:
# Columns fed to the stacked model: raw profile fields plus the first-stage
# bio and photo scores.
feature_columns = ["age", "distance_mi", "photo_num", "bio_svr", "bio_xgb", "photo"]
df2 = df_train[feature_columns]
df2_test = df_test[feature_columns]

In [17]:
# "age" and "distance_mi" mark missing values with the string "undefined".
# Convert them to numbers (anything unparsable becomes NaN) by building new
# frames rather than assigning through chained indexing, which pandas does not
# guarantee will write to the frame.
sentinel_columns = ["distance_mi", "age"]
df2 = df2.assign(**{c: pd.to_numeric(df2[c], errors="coerce") for c in sentinel_columns})
df2_test = df2_test.assign(**{c: pd.to_numeric(df2_test[c], errors="coerce") for c in sentinel_columns})

# Fill the gaps in both splits with the TRAINING means so no test statistic
# leaks into the features; "photo" is NaN for profiles without photos.
train_means = df2[["photo", "distance_mi", "age"]].mean()
df2 = df2.fillna(train_means).astype(np.float32)
df2_test = df2_test.fillna(train_means).astype(np.float32)

In [20]:
# First-level regressors; their predictions become the inputs of the final
# estimator inside StackingRegressor.
estimators = [
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
    ('svr', make_pipeline(StandardScaler(), SVR())),
    ("xgb", xgb.XGBRegressor(n_jobs=-1)),
    ("ada", AdaBoostRegressor()),
    ("bag", BaggingRegressor()),
    ("gauss", GaussianProcessRegressor()),
    ("Sgd", SGDRegressor()),
    ("Huber", HuberRegressor()),
    ("TheilSen", TheilSenRegressor()),
]
clf = StackingRegressor(estimators=estimators, final_estimator=xgb.XGBRegressor())

# Fit on the training features, then score the held-out test features.
clf.fit(df2, y_train)
y_pred = clf.predict(df2_test)



In [21]:
# ROC AUC of the stacked model's predictions against the test-split labels.
roc_auc_score(y_test, y_pred)

0.6672531304485656