In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

import data
import lightgbm as lgb
import numpy as np
import os
import sys
import pandas as pd
import matplotlib
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score
from gensim.models import word2vec
import logging

from model import lgb_model

%matplotlib inline
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Concatenate the train and test click logs and train the word vectors on the combined data

In [2]:
# Load the ad, click and user tables through the project-local `data` module.
# NOTE(review): the meaning of each returned table is inferred from the names
# used at this call site only — confirm against data.load_data().
train_ad, train_click, train_user, test_ad, test_click = data.load_data()

In [11]:
# Attach the ad attributes to every click row by joining on the shared
# "creative_id" key (default join type).
train_record = train_click.merge(train_ad, on="creative_id")
test_record = test_click.merge(test_ad, on="creative_id")

In [12]:
# Stack the train and test records row-wise into a single frame.
# ignore_index is not passed, so each part keeps its own index labels.
all_record = pd.concat([train_record, test_record])

In [15]:
# Group the combined records by user; the MySentences objects built from
# this groupby produce one token sequence per group.
grouped_record = all_record.groupby("user_id")

In [16]:
class MySentences(object):
    """Re-iterable stream of per-group token sequences for Word2Vec training.

    Each iteration walks the groups of ``grouped_record``, sorts the rows of a
    group by the "time" column and yields the values of ``column_name`` as a
    list of strings (one "sentence" per group).

    Args:
        grouped_record: an iterable of ``(key, DataFrame)`` pairs, e.g. the
            result of ``DataFrame.groupby(...)``. Every frame must contain a
            "time" column and the column named by ``column_name``.
        column_name: name of the column whose values become the tokens.

    Yields:
        list[str]: the stringified column values of one group, in "time" order.
    """

    # Columns for which missing values are dropped and the remaining values
    # are cast to int before being stringified (so 12.0 becomes "12").
    _NULLABLE_COLUMNS = ("product_id", "industry")

    def __init__(self, grouped_record, column_name):
        self.grouped_record = grouped_record
        self.column_name = column_name

    def __iter__(self):
        # Iterate over the groups stored on this instance. The previous code
        # read the module-level `grouped_record`, which silently ignored the
        # constructor argument and broke when that global was absent or rebound.
        for user_id, record in self.grouped_record:
            # The sort is redone on every pass over the data.
            record = record.sort_values(by="time")
            values = record[self.column_name]
            if self.column_name in self._NULLABLE_COLUMNS:
                values = values.dropna().astype("int")
            yield list(map(str, list(values)))

In [None]:
# One sentence stream per id column, all built from the same grouped records.
creative_sens, ad_sens, product_sens, advertiser_sens, industry_sens = (
    MySentences(grouped_record, column)
    for column in ("creative_id", "ad_id", "product_id", "advertiser_id", "industry")
)

In [None]:
# Create the output directory before training so the save below cannot fail
# on a missing "checkpoints/" folder after the training run has finished.
os.makedirs("checkpoints", exist_ok=True)

# Train 200-dimensional creative_id embeddings; min_count=1 keeps every id in
# the vocabulary regardless of how rarely it occurs.
creative_model = word2vec.Word2Vec(creative_sens, min_count=1, size=200, workers=4)
creative_model.wv.save_word2vec_format("checkpoints/creative_model.w2v", binary=True)

In [None]:
# Create the output directory before training so the save below cannot fail
# on a missing "checkpoints/" folder after the training run has finished.
os.makedirs("checkpoints", exist_ok=True)

# Train 200-dimensional ad_id embeddings; min_count=1 keeps every id in the
# vocabulary regardless of how rarely it occurs.
ad_model = word2vec.Word2Vec(ad_sens, min_count=1, size=200, workers=4)
ad_model.wv.save_word2vec_format("checkpoints/ad_model.w2v", binary=True)

In [None]:
# Create the output directory before training so the save below cannot fail
# on a missing "checkpoints/" folder after the training run has finished.
os.makedirs("checkpoints", exist_ok=True)

# Train 200-dimensional product_id embeddings; min_count=1 keeps every id in
# the vocabulary regardless of how rarely it occurs.
product_model = word2vec.Word2Vec(product_sens, min_count=1, size=200, workers=4)
product_model.wv.save_word2vec_format("checkpoints/product_model.w2v", binary=True)

In [None]:
# Create the output directory before training so the save below cannot fail
# on a missing "checkpoints/" folder after the training run has finished.
os.makedirs("checkpoints", exist_ok=True)

# Train 100-dimensional advertiser_id embeddings; min_count=1 keeps every id
# in the vocabulary regardless of how rarely it occurs.
advertiser_model = word2vec.Word2Vec(advertiser_sens, min_count=1, size=100, workers=4)
advertiser_model.wv.save_word2vec_format("checkpoints/advertiser_model.w2v", binary=True)

In [None]:
# Create the output directory before training so the save below cannot fail
# on a missing "checkpoints/" folder after the training run has finished.
os.makedirs("checkpoints", exist_ok=True)

# Rebuild the industry sentence stream so this cell can run on its own, then
# train 100-dimensional industry embeddings (min_count=1 keeps every id).
industry_sens = MySentences(grouped_record, "industry")
industry_model = word2vec.Word2Vec(industry_sens, min_count=1, size=100, workers=4)
industry_model.wv.save_word2vec_format("checkpoints/industry_model.w2v", binary=True)

In [None]:
# Reload the raw tables, then hold out 33% of the user table for validation
# (fixed random_state so the split is repeatable).
train_ad, train_click, train_user, test_ad, test_click = data.load_data()
train_user, valid_user = train_test_split(train_user, test_size=0.33, random_state=42)

# For each split, take the click subset returned by data.get_part_click and
# join the ad attributes onto it via "creative_id".
train_record = data.get_part_click(train_click, train_user).merge(train_ad, on="creative_id")
valid_record = data.get_part_click(train_click, valid_user).merge(train_ad, on="creative_id")

In [None]:
# Split each record frame into features plus the age and gender targets.
# NOTE(review): keep_user=True presumably keeps the user_id column in the
# features (the next cells group by it) — confirm in data.split_feature_target.
train_features, train_age, train_gender = data.split_feature_target(train_record, keep_user=True)
valid_features, valid_age, valid_gender = data.split_feature_target(valid_record, keep_user=True)

In [None]:
# Preview the first rows of the training features.
train_features.head()

In [None]:
# Rebind grouped_record to the training features only; earlier in the
# notebook the same name held the groupby over the combined train+test records.
grouped_record = train_features.groupby("user_id")

In [None]:
# NOTE(review): a class with this name is already defined earlier in the
# notebook; this definition shadows it. Consider keeping a single definition.
class MySentences(object):
    """Re-iterable stream of per-group token sequences for Word2Vec training.

    Each iteration walks the groups of ``grouped_record``, sorts the rows of a
    group by the "time" column and yields the values of ``column_name`` as a
    list of strings (one "sentence" per group).

    Args:
        grouped_record: an iterable of ``(key, DataFrame)`` pairs, e.g. the
            result of ``DataFrame.groupby(...)``. Every frame must contain a
            "time" column and the column named by ``column_name``.
        column_name: name of the column whose values become the tokens.

    Yields:
        list[str]: the stringified column values of one group, in "time" order.
    """

    # Columns for which missing values are dropped and the remaining values
    # are cast to int before being stringified (so 12.0 becomes "12").
    _NULLABLE_COLUMNS = ("product_id", "industry")

    def __init__(self, grouped_record, column_name):
        self.grouped_record = grouped_record
        self.column_name = column_name

    def __iter__(self):
        # Iterate over the groups stored on this instance. The previous code
        # read the module-level `grouped_record`, which silently ignored the
        # constructor argument and broke when that global was absent or rebound.
        for user_id, record in self.grouped_record:
            # The sort is redone on every pass over the data.
            record = record.sort_values(by="time")
            values = record[self.column_name]
            if self.column_name in self._NULLABLE_COLUMNS:
                values = values.dropna().astype("int")
            yield list(map(str, list(values)))

In [None]:
# One sentence stream per id column, all built from the current grouped records.
creative_sens, ad_sens, product_sens, advertiser_sens, industry_sens = (
    MySentences(grouped_record, column)
    for column in ("creative_id", "ad_id", "product_id", "advertiser_id", "industry")
)

In [None]:
# Create the output directory before training so the save below cannot fail
# on a missing "checkpoints/" folder after the training run has finished.
os.makedirs("checkpoints", exist_ok=True)

# NOTE(review): this writes to the same path as the earlier creative_model
# cell, so that checkpoint is overwritten — confirm this is intended.
creative_model = word2vec.Word2Vec(creative_sens, min_count=1, size=200, workers=4)
creative_model.wv.save_word2vec_format("checkpoints/creative_model.w2v", binary=True)

In [None]:
# Create the output directory before training so the save below cannot fail
# on a missing "checkpoints/" folder after the training run has finished.
os.makedirs("checkpoints", exist_ok=True)

# NOTE(review): this writes to the same path as the earlier ad_model cell,
# so that checkpoint is overwritten — confirm this is intended.
ad_model = word2vec.Word2Vec(ad_sens, min_count=1, size=200, workers=4)
ad_model.wv.save_word2vec_format("checkpoints/ad_model.w2v", binary=True)

In [None]:
# Create the output directory before training so the save below cannot fail
# on a missing "checkpoints/" folder after the training run has finished.
os.makedirs("checkpoints", exist_ok=True)

# NOTE(review): this writes to the same path as the earlier product_model
# cell, so that checkpoint is overwritten — confirm this is intended.
product_model = word2vec.Word2Vec(product_sens, min_count=1, size=200, workers=4)
product_model.wv.save_word2vec_format("checkpoints/product_model.w2v", binary=True)

In [None]:
# Create the output directory before training so the save below cannot fail
# on a missing "checkpoints/" folder after the training run has finished.
os.makedirs("checkpoints", exist_ok=True)

# NOTE(review): this writes to the same path as the earlier advertiser_model
# cell, so that checkpoint is overwritten — confirm this is intended.
advertiser_model = word2vec.Word2Vec(advertiser_sens, min_count=1, size=100, workers=4)
advertiser_model.wv.save_word2vec_format("checkpoints/advertiser_model.w2v", binary=True)

In [None]:
# Create the output directory before training so the save below cannot fail
# on a missing "checkpoints/" folder after the training run has finished.
os.makedirs("checkpoints", exist_ok=True)

# NOTE(review): this writes to the same path as the earlier industry_model
# cell, so that checkpoint is overwritten — confirm this is intended.
industry_sens = MySentences(grouped_record, "industry")
industry_model = word2vec.Word2Vec(industry_sens, min_count=1, size=100, workers=4)
industry_model.wv.save_word2vec_format("checkpoints/industry_model.w2v", binary=True)

In [None]:
# NOTE(review): the preceding cell already saves industry_model to this exact
# path with the same arguments; this cell repeats that save.
industry_model.wv.save_word2vec_format("checkpoints/industry_model.w2v", binary=True)

In [None]:
# Sanity check: show the first group's key, its time-sorted rows and the
# resulting creative_id sequence, then stop.
for user_id, record in grouped_record:
    record = record.sort_values(by="time")
    sentence = list(record["creative_id"])
    print(user_id, record, sentence, sep="\n")
    break

In [None]:
# Drop user_id and use age and gender as the labels.
# NOTE(review): the bare string below is an expression statement, not a
# comment; with ast_node_interactivity = "all" it is likely echoed as cell output.
'''
features中各列的含义
1. time
2. creative_id
3. click_times
4. ad_id
5. product_id
6. product_category
7. advertiser_id
8. industry_id
'''
train_features, train_age, train_gender = data.split_feature_target(train_record)
valid_features, valid_age, valid_gender = data.split_feature_target(valid_record)

# Features become plain arrays.
train_features, valid_features = train_features.values, valid_features.values

# Labels become arrays shifted down by one (presumably 1-based -> 0-based; verify).
train_age, train_gender = train_age.values - 1, train_gender.values - 1
valid_age, valid_gender = valid_age.values - 1, valid_gender.values - 1

In [None]:
# Show the first five rows of the feature array.
train_features[:5]

In [None]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Toy corpus: two raw sentences.
raw_sentences = [
    "the quick brown fox jumps over the lazy dogs",
    "yoyoyo you go home now to sleep",
]

# Tokenize each sentence on whitespace.
sentences = list(map(str.split, raw_sentences))

In [None]:
# Display the tokenized toy sentences.
sentences

In [None]:
# Build the model: train Word2Vec on the toy sentences, keeping every word (min_count=1).
model = word2vec.Word2Vec(sentences, min_count=1)

In [None]:
# Look the vector up through `model.wv`: indexing the model object directly is
# deprecated in gensim 3.x and removed in 4.0, and the rest of this notebook
# already goes through `.wv` when saving vectors.
a = model.wv['dogs']
# Show the smallest and largest component of the embedding.
a.min()
a.max()