In [None]:
# http://www.imooc.com/article/43784

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

import data
import lightgbm as lgb
import numpy as np
import os
import sys
import pandas as pd
import matplotlib
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score
from gensim.models import word2vec, keyedvectors
import logging

from model import lgb_model

%matplotlib inline
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Load the five pre-trained word2vec tables (binary word2vec format), one per
# id column. The loader is aliased once so each line shows only what varies.
load_w2v = keyedvectors.KeyedVectors.load_word2vec_format

creative_model = load_w2v("checkpoints/creative_model.w2v", binary=True)
ad_model = load_w2v("checkpoints/ad_model.w2v", binary=True)
product_model = load_w2v("checkpoints/product_model.w2v", binary=True)
advertiser_model = load_w2v("checkpoints/advertiser_model.w2v", binary=True)
industry_model = load_w2v("checkpoints/industry_model.w2v", binary=True)

2020-06-03 02:53:37,558 : INFO : loading projection weights from checkpoints/creative_model.w2v
2020-06-03 02:54:05,066 : INFO : loaded (3412772, 200) matrix from checkpoints/creative_model.w2v
2020-06-03 02:54:05,067 : INFO : loading projection weights from checkpoints/ad_model.w2v
2020-06-03 02:54:30,774 : INFO : loaded (3027360, 200) matrix from checkpoints/ad_model.w2v
2020-06-03 02:54:30,775 : INFO : loading projection weights from checkpoints/product_model.w2v
2020-06-03 02:54:31,500 : INFO : loaded (39056, 200) matrix from checkpoints/product_model.w2v
2020-06-03 02:54:31,501 : INFO : loading projection weights from checkpoints/advertiser_model.w2v
2020-06-03 02:54:31,919 : INFO : loaded (57870, 100) matrix from checkpoints/advertiser_model.w2v
2020-06-03 02:54:31,920 : INFO : loading projection weights from checkpoints/industry_model.w2v
2020-06-03 02:54:32,060 : INFO : loaded (331, 100) matrix from checkpoints/industry_model.w2v


In [3]:
# Tables returned by the project-local data.load_data(), unpacked in order.
(train_ad, train_click, train_user,
 test_ad, test_click) = data.load_data()

# Disabled user-level train/validation split, kept for reference:
# train_user, valid_user = train_test_split(train_user, test_size=0.33, random_state=42)
# train_record = data.get_part_click(train_click, train_user)
# valid_record = data.get_part_click(train_click, valid_user)

# Attach the ad attributes to every click row by joining on creative_id.
train_record = train_click.merge(train_ad, on="creative_id")
test_record = test_click.merge(test_ad, on="creative_id")

In [54]:
# Per-user count of non-null values in each column of the merged click records.
records_per_user = train_record.groupby(by="user_id").count()
records_per_user

Unnamed: 0_level_0,time,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,13,13,13,13,6,13,13,12
2,45,45,45,45,31,45,45,45
3,30,30,30,30,20,30,30,22
4,29,29,29,29,23,29,29,20
5,33,33,33,33,11,33,33,32
...,...,...,...,...,...,...,...,...
899996,14,14,14,14,5,14,14,14
899997,18,18,18,18,15,18,18,18
899998,14,14,14,14,4,14,14,11
899999,22,22,22,22,7,22,22,20


In [4]:
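# Disabled feature/target split, kept for reference.
# NOTE(review): the later cells that use train_features, train_age, train_gender and the
# valid_* names need these assignments; while they stay commented out those cells raise
# NameError on a fresh kernel. The second line also splits train_record again; it
# presumably should split a separate validation record set (confirm).
# train_features, train_age, train_gender = data.split_feature_target(train_record, keep_user=True)
# valid_features, valid_age, valid_gender = data.split_feature_target(train_record, keep_user=True)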

In [5]:
# Small working sample: the first 1000 rows of the merged training records.
sample_record = train_record.head(1000)
sample_record

# TODO train embedding
# Group the sample by user so each user's click rows can be embedded together.
train_grouped = sample_record.groupby(by="user_id")

Unnamed: 0,time,user_id,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,9,30920,567330,1,504423,30673.0,3,32638,319.0
1,15,320815,567330,1,504423,30673.0,3,32638,319.0
2,11,355089,567330,1,504423,30673.0,3,32638,319.0
3,9,363442,567330,1,504423,30673.0,3,32638,319.0
4,14,370513,567330,1,504423,30673.0,3,32638,319.0
...,...,...,...,...,...,...,...,...,...
995,61,325322,2361327,1,2035918,1261.0,2,6783,6.0
996,57,325322,2361327,1,2035918,1261.0,2,6783,6.0
997,69,32552,2361327,1,2035918,1261.0,2,6783,6.0
998,53,32552,2361327,1,2035918,1261.0,2,6783,6.0


In [6]:
def get_embedding_from_grouped(user_id, records, column_name, keep_uid=False):
    """Average the word2vec vectors of one id column over a user's click records.

    Parameters
    ----------
    user_id
        Identifier of the user the records belong to. Only used when
        ``keep_uid`` is true.
    records : pandas.DataFrame
        The user's rows; must contain ``column_name``.
    column_name : str
        One of ``"ad_id"``, ``"creative_id"``, ``"industry"``,
        ``"product_id"`` or ``"advertiser_id"``. It selects the matching
        module-level word2vec model.
    keep_uid : bool, default False
        When true, the returned Series starts with a ``"user_id"`` entry
        followed by the averaged vector.

    Returns
    -------
    pandas.Series
        Element-wise mean of the looked-up vectors, one entry per embedding
        dimension. Missing (null) ids contribute an all-zero vector, and an
        empty ``records`` frame yields an all-zero vector.

    Raises
    ------
    ValueError
        If ``column_name`` is not one of the supported columns.
    KeyError
        If a non-null id is not present in the model's vocabulary.
    """
    # Keep the lookup lazy: only the model for the requested column is touched.
    if column_name == "ad_id":
        model = ad_model
    elif column_name == "creative_id":
        model = creative_model
    elif column_name == "industry":
        model = industry_model
    elif column_name == "product_id":
        model = product_model
    elif column_name == "advertiser_id":
        model = advertiser_model
    else:
        raise ValueError("unsupported column_name: {!r}".format(column_name))

    # Take the zero-vector width from the model instead of hard-coding 100/200,
    # so the fallback always matches the real vectors.
    dim = model.vector_size

    # Vocabulary keys are the integer ids rendered as strings; columns that
    # contain nulls are stored as floats, hence the int() round-trip.
    vectors = [
        np.zeros(dim) if pd.isnull(value) else model[str(int(value))]
        for value in records[column_name]
    ]

    if vectors:
        # One stacked array and one mean instead of a per-row pd.Series
        # expansion, which is far slower on large groups.
        embedding = pd.Series(np.mean(np.stack(vectors), axis=0))
    else:
        embedding = pd.Series(np.zeros(dim))

    if keep_uid:
        # A Series has no insert(); prepend the user id by concatenation.
        embedding = pd.concat([pd.Series({"user_id": user_id}), embedding])
    return embedding

In [58]:
def total_embed(grouped, data_type="train", chunk_size=100):
    """Write one averaged-embedding line per user, split across numbered files.

    For every ``(user_id, records)`` pair in ``grouped`` the averaged
    embeddings of the ad, creative, product, advertiser and industry columns
    are concatenated (in that order) and written as one text line:
    the user id, a single space, then the values separated by ``", "``.

    Parameters
    ----------
    grouped
        Iterable of ``(user_id, records)`` pairs, e.g. a pandas groupby on
        ``"user_id"``. Each ``records`` frame needs a ``"time"`` column plus
        the five id columns.
    data_type : str, default "train"
        ``"train"`` writes ``embed/train/train_embedding{N}.csv``; any other
        value writes ``embed/test/test_embedding{N}.csv``.
    chunk_size : int, default 100
        Number of users written to each file before the next numbered file
        is started. File numbering starts at 1.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    if data_type == "train":
        path_template = "embed/train/train_embedding{}.csv"
    else:
        path_template = "embed/test/test_embedding{}.csv"
    # Create the target directory up front so the first open() cannot fail
    # just because the folder is missing.
    os.makedirs(os.path.dirname(path_template), exist_ok=True)

    # Concatenation order of the per-column embeddings within a line.
    embed_columns = ("ad_id", "creative_id", "product_id", "advertiser_id", "industry")

    out_file = None
    file_index = 0
    try:
        for written, (user_id, records) in enumerate(tqdm(grouped)):
            # Open a file only when a row is about to be written to it, so an
            # exact multiple of chunk_size does not leave an empty trailing file.
            if written % chunk_size == 0:
                if out_file is not None:
                    out_file.close()
                file_index += 1
                out_file = open(path_template.format(file_index), "w")

            records = records.sort_values(by="time")

            embed_features = np.concatenate([
                get_embedding_from_grouped(user_id, records, column_name=column)
                for column in embed_columns
            ])
            # Line layout for the vector sizes of the loaded models
            # (200/200/200/100/100); ranges are 0-based, start inclusive,
            # end exclusive:
            #   0          user id
            #   [1:201]    ad embedding
            #   [201:401]  creative embedding
            #   [401:601]  product embedding
            #   [601:701]  advertiser embedding
            #   [701:801]  industry embedding
            # NOTE(review): the user id is separated from the first value by a
            # space while the values are separated by ", "; kept as-is so
            # existing readers of these files keep working. Confirm intended.
            # str() is applied per element because str(list(ndarray)) prints
            # "np.float64(...)" wrappers under NumPy 2.
            values_text = ", ".join(str(value) for value in embed_features)
            out_file.write(str(user_id) + ' ' + values_text + '\n')
    finally:
        # Close the current file even if an embedding lookup raised mid-loop.
        if out_file is not None:
            out_file.close()

In [None]:
# Write the per-user embedding files for the grouped training sample (train_grouped).
total_embed(train_grouped, data_type="train")

In [None]:
# Each id column paired with the word2vec model trained on it; the two
# parallel lists passed to transform_dataframe are derived from these pairs.
column_model_pairs = [
    ("creative_id", creative_model),
    ("ad_id", ad_model),
    ("product_id", product_model),
    ("advertiser_id", advertiser_model),
    ("industry", industry_model),
]
column_names = [column for column, _ in column_model_pairs]
w2v_models = [model for _, model in column_model_pairs]

def transform_dataframe(train_features, column_names, w2v_models, output_path="main_features.csv"):
    """Replace id columns with their word2vec vectors and save the result as CSV.

    Parameters
    ----------
    train_features : pandas.DataFrame
        Feature frame containing every column listed in ``column_names``.
        The caller's frame is not modified.
    column_names : sequence of str
        Id columns to expand, processed in order.
    w2v_models : sequence
        Word2vec models aligned with ``column_names``. Each must support
        ``model[str_id]`` lookup and expose ``vector_size``.
    output_path : str, default "main_features.csv"
        Where the transformed frame is written (without the index).

    Returns
    -------
    pandas.DataFrame
        The transformed frame: each id column is dropped and its embedding
        columns, labelled ``0 .. vector_size - 1``, are appended. The labels
        restart at 0 for every expanded column, so they repeat across columns.

    Raises
    ------
    KeyError
        If a non-null id is not present in the corresponding model.
    """
    for column_name, w2v_model in zip(column_names, w2v_models):
        print(column_name, "START")
        # Width of the zero vector used for missing ids comes from the model
        # rather than a hard-coded 100/200.
        dim = w2v_model.vector_size
        vectors = [
            np.zeros(dim) if pd.isnull(value) else w2v_model[str(int(value))]
            for value in train_features[column_name]
        ]
        # Build the embedding matrix in one step; expanding every row with
        # .apply(pd.Series) is far slower on large frames.
        matrix = np.vstack(vectors) if vectors else np.empty((0, dim))
        # Reuse the source index so the column-wise concat aligns row for row.
        embedding_df = pd.DataFrame(matrix, index=train_features.index)
        train_features = pd.concat([train_features, embedding_df], axis=1).drop(column_name, axis=1)
        print(column_name, "FINISH")
    train_features.to_csv(output_path, index=False)
    print("FINISH save csv!")
    return train_features

In [None]:
# Take the underlying arrays of the feature frames and subtract 1 from every label array.
# NOTE(review): train_features, train_age, train_gender and the valid_* names are only
# assigned in a commented-out cell earlier in this notebook, so on a fresh kernel this
# cell raises NameError until that split is restored.
# NOTE(review): every name is rebound to the result of its own `.values`, so the cell is
# not safe to run twice on the same kernel.
train_features = train_features.values
train_age = train_age.values - 1  # presumably turns 1-based labels into 0-based ones (confirm)
train_gender = train_gender.values - 1

valid_features = valid_features.values
valid_age = valid_age.values - 1
valid_gender = valid_gender.values - 1

In [None]:
# Training sets: the same feature matrix with one label vector per target.
lgb_traindata_gender = lgb.Dataset(train_features, label=train_gender)
lgb_traindata_age = lgb.Dataset(train_features, label=train_age)

# Validation sets, each tied to its training Dataset through `reference`.
lgb_valdata_gender = lgb.Dataset(
    valid_features, label=valid_gender, reference=lgb_traindata_gender
)
lgb_valdata_age = lgb.Dataset(
    valid_features, label=valid_age, reference=lgb_traindata_age
)

# 性别模型的预测

In [None]:
# Build the gender model wrapper, train it on the gender train/validation datasets,
# then save it. lgb_model comes from the project-local `model` module, so what
# train() and save_model() do internally (including where the model is written)
# is not visible in this notebook.
gender_model = lgb_model(model_kind="gender")
gender_model.train(lgb_traindata_gender, lgb_valdata_gender)
gender_model.save_model()

In [None]:
# Join the test clicks with their ad attributes on creative_id. The data-loading
# cell near the top already builds test_record with the same inputs and key.
test_record = test_click.merge(test_ad, on="creative_id")

In [None]:
# Positional column selection: positions 0 and 2-8 are kept, position 1 is skipped.
# NOTE(review): position 1 is presumably user_id, as in the train_record display
# above. Confirm that test_record has the same column order.
feature_positions = [0, 2, 3, 4, 5, 6, 7, 8]
test_features = test_record.iloc[:, feature_positions].values

In [None]:
# Run the trained gender model on the test feature array. predict() is defined by
# the project-local lgb_model class, so the form of its output is not visible here.
test_pred = gender_model.predict(test_features)

In [None]:
# Display the predictions returned by gender_model.predict.
test_pred