In [35]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not just the last one
# (this is why cells such as `a.min()` / `a.max()` further down show two outputs).
InteractiveShell.ast_node_interactivity = "all" 

import data
import lightgbm as lgb
import numpy as np
import os
import sys
import pandas as pd
import matplotlib
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score
from gensim.models import word2vec
import logging

from model import lgb_model

# Render matplotlib figures inside the notebook output.
%matplotlib inline
# Emit INFO-level log records as "<timestamp> : <level> : <message>"; the progress
# lines captured under the Word2Vec training cells are printed in this format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Load the ad, click and user tables returned by the project's data module.
train_ad, train_click, train_user, test_ad, test_click = data.load_data()

# Hold out 33% of the users for validation; the fixed random_state keeps the split reproducible.
train_user, valid_user = train_test_split(train_user, random_state=42, test_size=0.33)

# Click records selected for each user subset (presumably the clicks made by those
# users -- verify against data.get_part_click), joined with the ad table on creative_id.
train_record = pd.merge(
    data.get_part_click(train_click, train_user),
    train_ad,
    on="creative_id",
)
valid_record = pd.merge(
    data.get_part_click(train_click, valid_user),
    train_ad,
    on="creative_id",
)

In [3]:
# Split each merged record table into a feature table plus age and gender targets.
# keep_user=True is passed so that user_id stays among the features (it appears in the
# head() preview below); the per-user grouping in a later cell depends on that column.
train_features, train_age, train_gender = data.split_feature_target(train_record, keep_user=True)
valid_features, valid_age, valid_gender = data.split_feature_target(valid_record, keep_user=True)

In [4]:
# Preview the first rows of the training feature table.
train_features.head()

Unnamed: 0,time,user_id,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,6,309204,325532,1,292523,27081.0,3,32066,242.0
1,5,801758,325532,1,292523,27081.0,3,32066,242.0
2,5,126648,325532,1,292523,27081.0,3,32066,242.0
3,59,309204,2746730,1,2362208,,18,14682,88.0
4,68,317233,2746730,1,2362208,,18,14682,88.0


In [5]:
# Group the click records per user; each group is later turned into one
# time-ordered "sentence" of ids for Word2Vec training.
grouped_record = train_features.groupby("user_id")

In [31]:
class MySentences(object):
    """Restartable iterable yielding one token list ("sentence") per group.

    Each group of ``grouped_record`` is sorted by its ``time`` column and the
    values of ``column_name`` are emitted, in that order, as a list of strings.
    Because ``__iter__`` starts a fresh pass every time it is called, an
    instance can be iterated repeatedly (Word2Vec scans its corpus more than
    once).

    Parameters
    ----------
    grouped_record : iterable of (key, DataFrame)
        Typically the result of ``DataFrame.groupby("user_id")``. Every frame
        must contain a ``time`` column and the ``column_name`` column.
    column_name : str
        Column whose values become the tokens of each sentence.
    """

    # Columns that contain missing values and are therefore stored as floats:
    # NaNs are dropped and the remaining values cast to int so that tokens
    # read "27081" rather than "27081.0".
    _NULLABLE_COLUMNS = ("product_id", "industry")

    def __init__(self, grouped_record, column_name):
        self.grouped_record = grouped_record
        self.column_name = column_name

    def __iter__(self):
        # Iterate over the groups handed to the constructor. The previous code
        # read the notebook-global `grouped_record`, silently ignoring the
        # argument and failing with NameError when that global was absent.
        for _key, record in self.grouped_record:
            values = record.sort_values(by="time")[self.column_name]
            if self.column_name in self._NULLABLE_COLUMNS:
                values = values[~pd.isnull(values)].astype("int")
            yield [str(value) for value in values]

In [32]:
# One sentence iterable per id column, all built over the same per-user groups.
# The "industry" sentences are not created here; they are built in the industry
# training cell further down.
creative_sens, ad_sens, product_sens, advertiser_sens = (
    MySentences(grouped_record, column)
    for column in ("creative_id", "ad_id", "product_id", "advertiser_id")
)

In [36]:
# Create the output directory before the long training run so the save below
# cannot fail on a missing "checkpoints/" folder once training has finished.
os.makedirs("checkpoints", exist_ok=True)

# Train 200-dimensional embeddings over the per-user creative_id sentences;
# min_count=1 keeps every id in the vocabulary.
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in gensim 4).
creative_model = word2vec.Word2Vec(creative_sens, min_count=1, size=200, workers=4)
# Persist only the word vectors, in the binary word2vec format.
creative_model.wv.save_word2vec_format("checkpoints/creative_model.w2v", binary=True)

2020-05-31 04:18:32,186 : INFO : collecting all words and their counts
2020-05-31 04:18:38,999 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-05-31 04:18:44,272 : INFO : PROGRESS: at sentence #10000, processed 333194 words, keeping 150020 word types
2020-05-31 04:18:49,676 : INFO : PROGRESS: at sentence #20000, processed 671088 words, keeping 251747 word types
2020-05-31 04:18:55,012 : INFO : PROGRESS: at sentence #30000, processed 1005460 words, keeping 336019 word types
2020-05-31 04:19:00,290 : INFO : PROGRESS: at sentence #40000, processed 1342094 words, keeping 412046 word types
2020-05-31 04:19:05,562 : INFO : PROGRESS: at sentence #50000, processed 1678855 words, keeping 480241 word types
2020-05-31 04:19:10,950 : INFO : PROGRESS: at sentence #60000, processed 2016766 words, keeping 542489 word types
2020-05-31 04:19:16,246 : INFO : PROGRESS: at sentence #70000, processed 2352157 words, keeping 599654 word types
2020-05-31 04:19:21,612 : INFO : P

In [37]:
# Create the output directory before the long training run so the save below
# cannot fail on a missing "checkpoints/" folder once training has finished.
os.makedirs("checkpoints", exist_ok=True)

# Train 200-dimensional embeddings over the per-user ad_id sentences;
# min_count=1 keeps every id in the vocabulary.
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in gensim 4).
ad_model = word2vec.Word2Vec(ad_sens, min_count=1, size=200, workers=4)
# Persist only the word vectors, in the binary word2vec format.
ad_model.wv.save_word2vec_format("checkpoints/ad_model.w2v", binary=True)

2020-05-31 04:57:37,145 : INFO : collecting all words and their counts
2020-05-31 04:57:43,172 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-05-31 04:57:48,141 : INFO : PROGRESS: at sentence #10000, processed 333194 words, keeping 146942 word types
2020-05-31 04:57:53,123 : INFO : PROGRESS: at sentence #20000, processed 671088 words, keeping 245365 word types
2020-05-31 04:57:58,134 : INFO : PROGRESS: at sentence #30000, processed 1005460 words, keeping 326586 word types
2020-05-31 04:58:03,126 : INFO : PROGRESS: at sentence #40000, processed 1342094 words, keeping 399420 word types
2020-05-31 04:58:08,150 : INFO : PROGRESS: at sentence #50000, processed 1678855 words, keeping 464293 word types
2020-05-31 04:58:13,153 : INFO : PROGRESS: at sentence #60000, processed 2016766 words, keeping 523457 word types
2020-05-31 04:58:18,165 : INFO : PROGRESS: at sentence #70000, processed 2352157 words, keeping 577690 word types
2020-05-31 04:58:23,238 : INFO : P

In [38]:
# Create the output directory before the long training run so the save below
# cannot fail on a missing "checkpoints/" folder once training has finished.
os.makedirs("checkpoints", exist_ok=True)

# Train 200-dimensional embeddings over the per-user product_id sentences;
# min_count=1 keeps every id in the vocabulary.
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in gensim 4).
product_model = word2vec.Word2Vec(product_sens, min_count=1, size=200, workers=4)
# Persist only the word vectors, in the binary word2vec format.
product_model.wv.save_word2vec_format("checkpoints/product_model.w2v", binary=True)

2020-05-31 05:35:36,012 : INFO : collecting all words and their counts
2020-05-31 05:35:41,564 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-05-31 05:35:51,524 : INFO : PROGRESS: at sentence #10000, processed 192550 words, keeping 6439 word types
2020-05-31 05:36:01,436 : INFO : PROGRESS: at sentence #20000, processed 388925 words, keeping 8641 word types
2020-05-31 05:36:11,356 : INFO : PROGRESS: at sentence #30000, processed 584073 words, keeping 10339 word types
2020-05-31 05:36:21,351 : INFO : PROGRESS: at sentence #40000, processed 780431 words, keeping 11694 word types
2020-05-31 05:36:31,324 : INFO : PROGRESS: at sentence #50000, processed 975160 words, keeping 12787 word types
2020-05-31 05:36:41,305 : INFO : PROGRESS: at sentence #60000, processed 1171911 words, keeping 13738 word types
2020-05-31 05:36:51,271 : INFO : PROGRESS: at sentence #70000, processed 1368322 words, keeping 14638 word types
2020-05-31 05:37:01,263 : INFO : PROGRESS: at 

In [39]:
# Create the output directory before the long training run so the save below
# cannot fail on a missing "checkpoints/" folder once training has finished.
os.makedirs("checkpoints", exist_ok=True)

# Train 100-dimensional embeddings over the per-user advertiser_id sentences;
# min_count=1 keeps every id in the vocabulary.
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in gensim 4).
advertiser_model = word2vec.Word2Vec(advertiser_sens, min_count=1, size=100, workers=4)
# Persist only the word vectors, in the binary word2vec format.
advertiser_model.wv.save_word2vec_format("checkpoints/advertiser_model.w2v", binary=True)

2020-05-31 06:37:28,043 : INFO : collecting all words and their counts
2020-05-31 06:37:33,419 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-05-31 06:37:38,353 : INFO : PROGRESS: at sentence #10000, processed 333194 words, keeping 16302 word types
2020-05-31 06:37:43,337 : INFO : PROGRESS: at sentence #20000, processed 671088 words, keeping 20419 word types
2020-05-31 06:37:48,290 : INFO : PROGRESS: at sentence #30000, processed 1005460 words, keeping 23289 word types
2020-05-31 06:37:53,232 : INFO : PROGRESS: at sentence #40000, processed 1342094 words, keeping 25404 word types
2020-05-31 06:37:58,205 : INFO : PROGRESS: at sentence #50000, processed 1678855 words, keeping 27104 word types
2020-05-31 06:38:03,146 : INFO : PROGRESS: at sentence #60000, processed 2016766 words, keeping 28532 word types
2020-05-31 06:38:08,111 : INFO : PROGRESS: at sentence #70000, processed 2352157 words, keeping 29777 word types
2020-05-31 06:38:13,093 : INFO : PROGRESS

In [None]:
# Create the output directory before the long training run so the save below
# cannot fail on a missing "checkpoints/" folder once training has finished.
os.makedirs("checkpoints", exist_ok=True)

# Per-user sentences of industry ids (MySentences drops missing values for this column).
industry_sens = MySentences(grouped_record, "industry")
# Train 100-dimensional embeddings; min_count=1 keeps every id in the vocabulary.
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in gensim 4).
industry_model = word2vec.Word2Vec(industry_sens, min_count=1, size=100, workers=4)
# Persist only the word vectors, in the binary word2vec format.
industry_model.wv.save_word2vec_format("checkpoints/industry_model.w2v", binary=True)

In [20]:
# Sanity check: show the first user's id, their time-ordered click records and
# the creative_id sequence derived from them, then stop.
for user_id, record in grouped_record:
    record = record.sort_values(by="time")
    sentence = record["creative_id"].tolist()
    print(user_id, record, sentence, sep="\n")
    break

2
          time  user_id  creative_id  click_times    ad_id  product_id  \
3398990     10        2        63441            1    58788        87.0   
18079728    11        2       155822            1   139702        80.0   
1617395     14        2        39714            1    38066       129.0   
3529924     17        2       609050            1   541125       129.0   
272400      28        2        13069            1    14495      1400.0   
1242952     28        2      1266180            1  1107111         NaN   
2241262     28        2       441462            1   392680        87.0   
7302729     38        2      1657530            1  1436687      1261.0   
1861396     38        2      1696925            1  1469873         NaN   
7291835     39        2       769749            1   680028         NaN   
9265604     41        2      1074235            1   942883       111.0   
4230871     42        2      1252062            1  1095314       129.0   
6261336     42        2      1662244

In [None]:
def word_embedding(intput_record):
    """Placeholder for turning click records into embedding features.

    The function was left without a body, which is a syntax error and stops
    the notebook from running top to bottom. Until it is implemented it fails
    loudly instead.

    Parameters
    ----------
    intput_record
        The records to embed. NOTE(review): the name looks like a typo for
        ``input_record``; it is kept unchanged so the signature stays as written.

    Raises
    ------
    NotImplementedError
        Always, until the embedding logic is written.
    """
    raise NotImplementedError("word_embedding is not implemented yet")

In [15]:
# Drop user_id and use age and gender as the labels.
#
# Columns of the feature matrix, in order:
#   1. time
#   2. creative_id
#   3. click_times
#   4. ad_id
#   5. product_id
#   6. product_category
#   7. advertiser_id
#   8. industry
#
# (This list is written as comments rather than a bare triple-quoted string: a
# string literal is an expression statement and gets echoed as cell output when
# ast_node_interactivity is set to "all".)
#
# NOTE: this cell rebinds train_features / valid_features (and the targets), replacing
# the earlier DataFrames that still contained user_id.
train_features, train_age, train_gender = data.split_feature_target(train_record)
valid_features, valid_age, valid_gender = data.split_feature_target(valid_record)

# Convert to NumPy arrays. The targets are shifted down by one -- presumably from
# 1-based codes to 0-based class indices (TODO confirm against the raw data).
train_features = train_features.values
train_age = train_age.values - 1
train_gender = train_gender.values - 1

valid_features = valid_features.values
valid_age = valid_age.values - 1
valid_gender = valid_gender.values - 1

In [16]:
# Preview the first five rows of the feature array (missing product_id values show up as nan).
train_features[:5]

array([[6.000000e+00, 3.255320e+05, 1.000000e+00, 2.925230e+05,
        2.708100e+04, 3.000000e+00, 3.206600e+04, 2.420000e+02],
       [5.000000e+00, 3.255320e+05, 1.000000e+00, 2.925230e+05,
        2.708100e+04, 3.000000e+00, 3.206600e+04, 2.420000e+02],
       [5.000000e+00, 3.255320e+05, 1.000000e+00, 2.925230e+05,
        2.708100e+04, 3.000000e+00, 3.206600e+04, 2.420000e+02],
       [5.900000e+01, 2.746730e+06, 1.000000e+00, 2.362208e+06,
                 nan, 1.800000e+01, 1.468200e+04, 8.800000e+01],
       [6.800000e+01, 2.746730e+06, 1.000000e+00, 2.362208e+06,
                 nan, 1.800000e+01, 1.468200e+04, 8.800000e+01]])

In [21]:
# Emit INFO-level log records as "<timestamp> : <level> : <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')

# Tiny toy corpus used to try out Word2Vec.
raw_sentences = [
    "the quick brown fox jumps over the lazy dogs",
    "yoyoyo you go home now to sleep",
]

# Split every sentence on whitespace into a list of word tokens.
sentences = list(map(str.split, raw_sentences))

In [22]:
# Show the tokenised toy corpus.
sentences

[['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dogs'],
 ['yoyoyo', 'you', 'go', 'home', 'now', 'to', 'sleep']]

In [23]:
# Build the model on the toy corpus; min_count=1 keeps every word in the vocabulary
# (the log below reports that all 15 unique words are retained).
model = word2vec.Word2Vec(sentences, min_count=1)

2020-05-30 15:16:37,285 : INFO : collecting all words and their counts
2020-05-30 15:16:37,286 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-05-30 15:16:37,286 : INFO : collected 15 word types from a corpus of 16 raw words and 2 sentences
2020-05-30 15:16:37,287 : INFO : Loading a fresh vocabulary
2020-05-30 15:16:37,287 : INFO : effective_min_count=1 retains 15 unique words (100% of original 15, drops 0)
2020-05-30 15:16:37,288 : INFO : effective_min_count=1 leaves 16 word corpus (100% of original 16, drops 0)
2020-05-30 15:16:37,288 : INFO : deleting the raw counts dictionary of 15 items
2020-05-30 15:16:37,289 : INFO : sample=0.001 downsamples 15 most-common words
2020-05-30 15:16:37,289 : INFO : downsampling leaves estimated 2 word corpus (13.7% of prior 16)
2020-05-30 15:16:37,290 : INFO : estimated required memory for 15 words and 100 dimensions: 19500 bytes
2020-05-30 15:16:37,290 : INFO : resetting layer weights
2020-05-30 15:16:37,293 : INFO :

In [20]:
# Look the embedding up through `model.wv`, as the save calls in this notebook already
# do. Indexing the model object directly is deprecated in gensim 3.x (it produced the
# warning captured in this cell's output) and no longer works in gensim 4.
a = model.wv['dogs']
# Smallest and largest component of the vector.
a.min()
a.max()

  """Entry point for launching an IPython kernel.


-0.004990018

0.0048909504