In [None]:
# http://www.imooc.com/article/43784

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one;
# cells below that end with two bare expressions (e.g. head() then shape) rely on it.
InteractiveShell.ast_node_interactivity = "all" 

import data
import lightgbm as lgb
import numpy as np
import os
import sys
import re
import pandas as pd
import matplotlib
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score
from gensim.models import word2vec, keyedvectors
import logging

from model import lgb_model

%matplotlib inline
# Timestamped INFO-level logging; the checkpoint-loading progress lines printed
# by the next cell come through this configuration.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Load the five pre-trained word2vec tables (binary word2vec format), one per
# ad attribute. They are used as module-level lookups by the embedding helpers.
_load_w2v = keyedvectors.KeyedVectors.load_word2vec_format

creative_model = _load_w2v("checkpoints/creative_model.w2v", binary=True)
ad_model = _load_w2v("checkpoints/ad_model.w2v", binary=True)
product_model = _load_w2v("checkpoints/product_model.w2v", binary=True)
advertiser_model = _load_w2v("checkpoints/advertiser_model.w2v", binary=True)
industry_model = _load_w2v("checkpoints/industry_model.w2v", binary=True)

2020-06-03 02:53:37,558 : INFO : loading projection weights from checkpoints/creative_model.w2v
2020-06-03 02:54:05,066 : INFO : loaded (3412772, 200) matrix from checkpoints/creative_model.w2v
2020-06-03 02:54:05,067 : INFO : loading projection weights from checkpoints/ad_model.w2v
2020-06-03 02:54:30,774 : INFO : loaded (3027360, 200) matrix from checkpoints/ad_model.w2v
2020-06-03 02:54:30,775 : INFO : loading projection weights from checkpoints/product_model.w2v
2020-06-03 02:54:31,500 : INFO : loaded (39056, 200) matrix from checkpoints/product_model.w2v
2020-06-03 02:54:31,501 : INFO : loading projection weights from checkpoints/advertiser_model.w2v
2020-06-03 02:54:31,919 : INFO : loaded (57870, 100) matrix from checkpoints/advertiser_model.w2v
2020-06-03 02:54:31,920 : INFO : loading projection weights from checkpoints/industry_model.w2v
2020-06-03 02:54:32,060 : INFO : loaded (331, 100) matrix from checkpoints/industry_model.w2v


In [3]:
# Raw tables as returned by the project-local `data` module.
train_ad, train_click, train_user, test_ad, test_click = data.load_data()

# Disabled experiment: split users first and take each part's clicks.
# train_user, valid_user = train_test_split(train_user, test_size=0.33, random_state=42)
# train_record = data.get_part_click(train_click, train_user)
# valid_record = data.get_part_click(train_click, valid_user)

# Attach the ad attributes to every click row by joining on creative_id.
train_record = train_click.merge(train_ad, on="creative_id")
test_record = test_click.merge(test_ad, on="creative_id")

In [54]:
# Per-user count of non-null values in each column of the merged click/ad table.
train_record.groupby("user_id").count()

Unnamed: 0_level_0,time,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,13,13,13,13,6,13,13,12
2,45,45,45,45,31,45,45,45
3,30,30,30,30,20,30,30,22
4,29,29,29,29,23,29,29,20
5,33,33,33,33,11,33,33,32
...,...,...,...,...,...,...,...,...
899996,14,14,14,14,5,14,14,14
899997,18,18,18,18,15,18,18,18
899998,14,14,14,14,4,14,14,11
899999,22,22,22,22,7,22,22,20


In [4]:
# train_features, train_age, train_gender = data.split_feature_target(train_record, keep_user=True)
# valid_features, valid_age, valid_gender = data.split_feature_target(train_record, keep_user=True)

In [5]:
# Prototype on a small slice: the first 1000 rows of the merged training table.
sample_record = train_record.head(1000)
sample_record

# TODO train embedding
# Group the sampled rows per user; total_embed consumes this (user_id, rows) iterable.
train_grouped = sample_record.groupby(by="user_id")

Unnamed: 0,time,user_id,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,9,30920,567330,1,504423,30673.0,3,32638,319.0
1,15,320815,567330,1,504423,30673.0,3,32638,319.0
2,11,355089,567330,1,504423,30673.0,3,32638,319.0
3,9,363442,567330,1,504423,30673.0,3,32638,319.0
4,14,370513,567330,1,504423,30673.0,3,32638,319.0
...,...,...,...,...,...,...,...,...,...
995,61,325322,2361327,1,2035918,1261.0,2,6783,6.0
996,57,325322,2361327,1,2035918,1261.0,2,6783,6.0
997,69,32552,2361327,1,2035918,1261.0,2,6783,6.0
998,53,32552,2361327,1,2035918,1261.0,2,6783,6.0


In [6]:
def get_embedding_from_grouped(user_id, records, column_name, keep_uid=False):
    """Average the word2vec vectors of one column over a user's click records.

    Parameters
    ----------
    user_id : scalar
        Identifier of the user the rows belong to. Only used when
        ``keep_uid`` is true.
    records : pandas.DataFrame
        Rows of a single user; must contain ``column_name``.
    column_name : str
        One of ``"ad_id"``, ``"creative_id"``, ``"industry"``,
        ``"product_id"`` or ``"advertiser_id"``. Selects the matching
        module-level word2vec model.
    keep_uid : bool, optional
        When true, the returned Series gets an extra leading entry labelled
        ``"user_id"``.

    Returns
    -------
    pandas.Series
        Element-wise mean of the looked-up vectors, indexed ``0..dim-1``
        (preceded by ``"user_id"`` when ``keep_uid`` is true). Missing ids
        contribute an all-zero vector; an empty ``records`` yields zeros so
        the feature width stays constant.

    Raises
    ------
    ValueError
        If ``column_name`` has no associated model.
    """
    if column_name == "ad_id":
        model = ad_model
    elif column_name == "creative_id":
        model = creative_model
    elif column_name == "industry":
        model = industry_model
    elif column_name == "product_id":
        model = product_model
    elif column_name == "advertiser_id":
        model = advertiser_model
    else:
        # Previously this fell through and failed later with an unbound `model`.
        raise ValueError("no word2vec model for column {!r}".format(column_name))

    # Take the width from the model instead of hard-coding 100 / 200.
    dim = model.vector_size

    def lookup(value):
        # Missing ids (NaN) count as a zero vector in the average.
        if pd.isnull(value):
            return np.zeros(dim)
        # Columns containing NaN are float-typed (e.g. 30673.0) while the
        # model keys are integer strings, hence the int() round-trip.
        return np.asarray(model[str(int(value))], dtype=np.float64)

    vectors = [lookup(value) for value in records[column_name]]
    if vectors:
        mean_vector = np.stack(vectors).mean(axis=0)
    else:
        mean_vector = np.zeros(dim)
    embedding = pd.Series(mean_vector)

    if keep_uid:
        # pandas Series has no insert(); prepend the id by concatenation.
        embedding = pd.concat([pd.Series([user_id], index=["user_id"]), embedding])
    return embedding

In [58]:
def total_embed(grouped, data_type="train", chunk_size=100):
    """Write one averaged-embedding row per user into numbered CSV chunk files.

    Each output line is ``user_id`` followed by the concatenated mean
    embeddings, separated by ``", "``. Column layout (0-based, half-open
    ranges, for the checkpoints loaded in this notebook):

        0          user_id
        [1:201]    ad_id embedding
        [201:401]  creative_id embedding
        [401:601]  product_id embedding
        [601:701]  advertiser_id embedding
        [701:801]  industry embedding

    Parameters
    ----------
    grouped : iterable of (user_id, DataFrame)
        Typically ``records.groupby("user_id")``; each frame needs a ``time``
        column and the five id columns listed above.
    data_type : str, optional
        ``"train"`` writes ``embed/train/train_embedding{n}.csv``; any other
        value writes ``embed/test/test_embedding{n}.csv``. ``n`` starts at 1.
    chunk_size : int, optional
        Number of users per output file (default 100, the previous fixed value).

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.

    Notes
    -----
    A file is opened only when there is a row to put in it, so no trailing
    empty chunk is produced and an empty ``grouped`` writes nothing.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    if data_type == "train":
        path_template = "embed/train/train_embedding{}.csv"
    else:
        path_template = "embed/test/test_embedding{}.csv"
    os.makedirs(os.path.dirname(path_template), exist_ok=True)

    # Concatenation order defines the column layout documented above.
    columns = ("ad_id", "creative_id", "product_id", "advertiser_id", "industry")

    out_file = None
    file_index = 0
    try:
        for written, (user_id, records) in enumerate(tqdm(grouped)):
            if written % chunk_size == 0:
                # Roll over to the next numbered chunk file.
                if out_file is not None:
                    out_file.close()
                file_index += 1
                out_file = open(path_template.format(file_index), "w")

            records = records.sort_values(by="time")
            embed_features = np.concatenate([
                get_embedding_from_grouped(user_id, records, column_name=name)
                for name in columns
            ])
            # Format through Python floats: str(list(ndarray)) yields
            # "np.float64(...)" items on NumPy >= 2 and would break the CSV.
            values = ', '.join(repr(float(v)) for v in embed_features)
            out_file.write(str(user_id) + ', ' + values + '\n')
    finally:
        # Close the current chunk even if an embedding lookup raised.
        if out_file is not None:
            out_file.close()

In [None]:
# Writes chunked embedding CSVs under embed/train/. Note that train_grouped was
# built from sample_record (the first 1000 merged rows), not the full table.
total_embed(train_grouped, data_type="train")

In [32]:
def place_zero(value):
    """Rebuild a three-character regex match with ", " as its middle part.

    ``value`` is a match object (as handed over by ``re.sub``). The first and
    third characters of the matched text are kept and joined by a comma and a
    space, so a lone whitespace separator becomes ``", "``.
    """
    text = value.group()
    head, tail = text[0], text[2]
    return "{}, {}".format(head, tail)

In [12]:
root_path = "embed/test/"

# Entries that must never be treated as input chunks: the notebook checkpoint
# folder and the aggregate files written into this same directory (this cell's
# output and the "_1" file produced by the follow-up clean-up cell). Without
# the latter, re-running this cell would merge the aggregate into itself.
skip_names = {".ipynb_checkpoints", "test_embedding_all.csv", "test_embedding_all_1.csv"}

# os.listdir returns entries in arbitrary order; sort for a reproducible result.
files = sorted(name for name in os.listdir(root_path) if name not in skip_names)
files = [root_path + i for i in files]

with open(root_path + "test_embedding_all.csv", "w") as ff:
    for ii, file in enumerate(files):
        print(ii)
        # Context manager so the chunk is closed even if a write fails.
        with open(file, "r") as f:
            persons = f.readlines()
        for p in tqdm(persons):
            # Turn a lone whitespace between two digits into ", ".
            p1 = re.sub(r'\d\s\d', place_zero, p)
            ff.write(p1)

0


100%|██████████| 45000/45000 [00:22<00:00, 1957.82it/s]


1


100%|██████████| 45000/45000 [00:22<00:00, 1980.31it/s]


2


100%|██████████| 45000/45000 [00:22<00:00, 2006.73it/s]


3


100%|██████████| 45000/45000 [00:22<00:00, 2039.23it/s]


4


100%|██████████| 45000/45000 [00:22<00:00, 2009.51it/s]


5


100%|██████████| 45000/45000 [00:22<00:00, 2014.63it/s]


6


100%|██████████| 45000/45000 [00:22<00:00, 2016.28it/s]


7


100%|██████████| 45000/45000 [00:22<00:00, 2025.34it/s]


8


100%|██████████| 45000/45000 [00:22<00:00, 2032.74it/s]


9


100%|██████████| 45000/45000 [00:22<00:00, 2006.88it/s]


10


100%|██████████| 45000/45000 [00:22<00:00, 2023.38it/s]


11


100%|██████████| 45000/45000 [00:22<00:00, 1996.56it/s]


12


100%|██████████| 45000/45000 [00:22<00:00, 2009.72it/s]


13


100%|██████████| 45000/45000 [00:22<00:00, 2017.91it/s]


14


100%|██████████| 45000/45000 [00:22<00:00, 2018.59it/s]


15


100%|██████████| 45000/45000 [00:22<00:00, 2011.08it/s]


16


100%|██████████| 45000/45000 [00:22<00:00, 1999.76it/s]


17


100%|██████████| 45000/45000 [00:22<00:00, 2000.01it/s]


18


100%|██████████| 10000/10000 [00:04<00:00, 2001.23it/s]


19


100%|██████████| 45000/45000 [00:23<00:00, 1926.05it/s]


20


100%|██████████| 45000/45000 [00:22<00:00, 2012.33it/s]


21


100%|██████████| 45000/45000 [00:22<00:00, 1999.56it/s]


22


100%|██████████| 45000/45000 [00:21<00:00, 2134.02it/s]


In [33]:
# Probe the r'\d\s-' pattern on `p`, the last line left over from the merge
# loop in the previous cell. Raises if that loop has not run, or if the line
# has no such match (re.search then returns None).
a = re.search(r'\d\s-', p)
print(len(a.group()))

3


In [34]:
# Switch to these three lines to clean the training file instead:
# root_path = "embed/train/"
# i_file = root_path + "train_embedding_all.csv"
# o_file = root_path + "train_embedding_all_1.csv"

root_path = "embed/test/"
i_file = root_path + "test_embedding_all.csv"
o_file = root_path + "test_embedding_all_1.csv"

# Second clean-up pass: a lone whitespace between a digit and a minus sign
# becomes ", ". The input is opened first so a missing input file cannot leave
# a truncated output behind, and both handles are closed on any error.
with open(i_file, "r") as f, open(o_file, "w") as ff:
    persons = f.readlines()
    for p in tqdm(persons):
        # re.sub returns the line unchanged when nothing matches, so no
        # separate re.search pre-check is needed.
        ff.write(re.sub(r'\d\s-', place_zero, p))

100%|██████████| 900000/900000 [08:29<00:00, 1767.54it/s]


In [None]:
# Parallel lists: column_names[i] is embedded with w2v_models[i]
# (they are zipped together inside transform_dataframe).
column_names = ["creative_id", "ad_id", "product_id", "advertiser_id", "industry"]
w2v_models = [creative_model, ad_model, product_model, advertiser_model, industry_model]

def transform_dataframe(train_features, column_names, w2v_models, output_path="main_features.csv"):
    """Replace id columns by their word2vec vectors and save the result as CSV.

    Parameters
    ----------
    train_features : pandas.DataFrame
        Frame containing every column listed in ``column_names``.
    column_names : sequence of str
        Id columns to expand, paired positionally with ``w2v_models``.
    w2v_models : sequence
        Lookup objects supporting ``model[str_key]`` and exposing
        ``vector_size``.
    output_path : str, optional
        Destination CSV (default ``"main_features.csv"``, the previous fixed
        location).

    Returns
    -------
    pandas.DataFrame
        The transformed frame that was written to ``output_path``. Each id
        column is replaced by ``"<column>_0" .. "<column>_{dim-1}"``; the
        prefix keeps headers unique, whereas plain ``0..dim-1`` labels
        collided between attributes. Missing ids become all-zero vectors.
    """
    for column_name, w2v_model in zip(column_names, w2v_models):
        print(column_name, "START")
        # Width comes from the model rather than a hard-coded 100 / 200.
        dim = w2v_model.vector_size
        vectors = [
            np.zeros(dim) if pd.isnull(x) else np.asarray(w2v_model[str(int(x))])
            for x in train_features[column_name]
        ]
        matrix = np.vstack(vectors) if vectors else np.empty((0, dim))
        embedding_df = pd.DataFrame(
            matrix,
            index=train_features.index,
            columns=["{}_{}".format(column_name, i) for i in range(dim)],
        )
        train_features = pd.concat(
            [train_features.drop(column_name, axis=1), embedding_df], axis=1
        )
        print(column_name, "FINISH")
    train_features.to_csv(output_path, index=False)
    print("FINISH save csv!")
    return train_features

In [None]:
# The cleaned embedding file has no header row, hence header=None.
train_df = pd.read_csv("embed/train/train_embedding_all_1.csv", header=None)

In [None]:
# Preview the first rows of the loaded embedding table.
train_df.head()

In [2]:
# Parse the cleaned embedding CSV into a numeric array; the ", " delimiter
# matches the separator that total_embed writes between values.
train_np = np.loadtxt("embed/train/train_embedding_all_1.csv", delimiter=", ")
train_np.shape

(900000, 801)

In [13]:
# Load the user label table, indexed by user_id so rows can be fetched
# with .loc using the ids stored in the embedding matrix.
train_root = "dataset/train/"
train_user_path = os.path.join(train_root, "user.csv")
train_user = pd.read_csv(train_user_path, index_col="user_id")
train_user.head()
train_user.shape

Unnamed: 0_level_0,age,gender
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4,1
2,10,1
3,7,2
4,5,1
5,4,1


(900000, 2)

In [49]:
# Column 0 of the embedding matrix is the user id (total_embed writes it first).
train_user_id = train_np[:, 0]
# loadtxt yields floats; cast back to int to match the user_id index.
uid = train_user_id.astype(int)
# Fetch labels in the same row order as the embedding matrix.
train_age = train_user.loc[uid, "age"]
train_gender = train_user.loc[uid, "gender"]

In [53]:
# Everything after the user-id column is the feature vector.
train_features = train_np[:, 1:]
# Shift both labels down by one - presumably so classes start at 0 for the
# models; TODO confirm against model.py.
# Not idempotent: re-running this cell subtracts 1 again from the rebound names.
train_age = train_age.values - 1
train_gender = train_gender.values - 1

In [60]:
# Hold out 33% of the rows for validation with a fixed seed.
# Not idempotent: train_features / train_age / train_gender are rebound to the
# training part, so re-running this cell splits the already-reduced arrays.
train_features, valid_features,\
train_age, valid_age,\
train_gender, valid_gender = train_test_split(train_features, train_age, train_gender, test_size=0.33, random_state=42)

In [66]:
# Number of entries in the whole matrix (id column included) that are exactly 0.
(train_np == 0).sum()

1526600

In [77]:
# Disabled checks: zero counts inside the product_id and industry column ranges.
# (train_np[:, 401:601] == 0).sum()
# (train_np[:, 701:801] == 0).sum()
# Mark every exact zero as missing, in place. This also hits genuine zero values.
# NOTE(review): train_features / valid_features were derived from train_np in
# earlier cells; confirm whether they see this in-place change.
train_np[train_np == 0] = np.nan

In [None]:
# Wrap the training split as LightGBM datasets, one per target.
lgb_traindata_gender = lgb.Dataset(train_features, train_gender)
lgb_traindata_age = lgb.Dataset(train_features, train_age)

# Validation datasets are tied to their training dataset through reference=.
lgb_valdata_gender = lgb.Dataset(valid_features, valid_gender, reference=lgb_traindata_gender)
lgb_valdata_age = lgb.Dataset(valid_features, valid_age, reference=lgb_traindata_age)

# Gender model prediction

In [None]:
# Fit the gender model on the train/validation datasets and persist it.
# lgb_model is the project wrapper from model.py; where save_model() writes
# is not visible from this notebook.
gender_model = lgb_model(model_kind="gender")
gender_model.train(lgb_traindata_gender, lgb_valdata_gender)
gender_model.save_model()

In [None]:
# Rebuild the merged test click/ad table (same join as in the data-loading cell).
test_record = pd.merge(test_click, test_ad, on="creative_id")

In [None]:
# Keep every merged column except position 1 and convert to a plain array.
# NOTE(review): these are raw click/ad columns, whereas gender_model was fitted
# on the embedding matrix (train_np[:, 1:]); confirm predict() can accept them.
test_features = test_record.iloc[:, [0, 2, 3, 4, 5, 6, 7, 8]]
test_features = test_features.values

In [None]:
# Raw gender-model output for the test feature array built in the previous cell.
test_pred = gender_model.predict(test_features)

In [None]:
# Display the raw predictions.
test_pred

In [2]:
# Parse the cleaned test embedding CSV (", "-separated) and show its shape.
test_np = np.loadtxt("embed/test/test_embedding_all_1.csv", delimiter=", ")
test_np.shape

(1000000, 801)

In [3]:
cd tencent_algo_2020/

/home/sayhi/workspaces/tencent_algo_2020


In [8]:
import data
import lightgbm as lgb
import numpy as np
import os
import sys
import re
import pandas as pd
import matplotlib
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score
from gensim.models import word2vec, keyedvectors
import logging

from model import lgb_model

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


# ---- Stage 1: load the user embeddings and the label table -----------------
print("START loading train embedding and train user info")
train_np = np.loadtxt("embed/train/train_embedding_all_1.csv", delimiter=", ")
# Exact zeros are treated as missing values.
train_np[train_np == 0] = np.nan

train_root = "dataset/train/"
train_user_path = os.path.join(train_root, "user.csv")
train_user = pd.read_csv(train_user_path, index_col="user_id")
print("FINISH load train_np, train_user")
print("===========================================================================")

# ---- Stage 2: align labels with the embedding rows, then split -------------
print("START get train_features, train_age, train_gender, and random split train/valid data")
# Column 0 holds the user id; the remaining columns are the features.
uid = train_np[:, 0].astype(int)
train_features = train_np[:, 1:]
# Labels are looked up per row and shifted down by one.
train_age = train_user.loc[uid, "age"].values - 1
train_gender = train_user.loc[uid, "gender"].values - 1

(train_features, valid_features,
 train_age, valid_age,
 train_gender, valid_gender) = train_test_split(
    train_features,
    train_age,
    train_gender,
    test_size=0.33,
    random_state=42,
)
print("FINISH random split train/valid data")
print("===========================================================================")

# ---- Stage 3: wrap both splits as LightGBM datasets -------------------------
print("START construct lgb train valid data")
lgb_traindata_gender = lgb.Dataset(train_features, label=train_gender)
lgb_traindata_age = lgb.Dataset(train_features, label=train_age)

lgb_valdata_gender = lgb.Dataset(valid_features, label=valid_gender, reference=lgb_traindata_gender)
lgb_valdata_age = lgb.Dataset(valid_features, label=valid_age, reference=lgb_traindata_age)
print("FINISH construct lgb train valid data")
print("===========================================================================")


START loading train embedding and train user info
FINISH load train_np, train_user
START get train_features, train_age, train_gender, and random split train/valid data
FINISH random split train/valid data
START construct lgb train valid data
FINISH construct lgb train valid data


In [9]:
# Load the previously saved models (one wrapper per target).
gender_model = lgb_model(model_kind="gender")
gender_model.load_model()
age_model = lgb_model(model_kind="age")
age_model.load_model()

In [10]:
print("START valid acc of predict")
# Gender model: predict on the held-out split, then post-process with the
# wrapper's transform_pred (presumably raw scores -> class labels; see model.py).
valid_gender_predict = gender_model.predict(valid_features)
valid_gender_predict = gender_model.transform_pred(valid_gender_predict)
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that
# order for both targets so the two calls read the same way.
acc_gender = accuracy_score(valid_gender, np.array(valid_gender_predict))

# Age model: same procedure.
valid_age_predict = age_model.predict(valid_features)
valid_age_predict = age_model.transform_pred(valid_age_predict)
acc_age = accuracy_score(valid_age, np.array(valid_age_predict))

print("In valid data, accuracy of gender is {}, accuracy of age is {}".format(acc_gender, acc_age))
print("FINISH")
print("===========================================================================")

START valid acc of predict
In valid data, accuracy of gender is 0.9190875420875421, accuracy of age is 0.36722895622895624
FINISH


In [None]:
print("START test predict")
# Same preprocessing as for training: load, mark exact zeros as missing,
# split off the user-id column.
test_np = np.loadtxt("embed/test/test_embedding_all_1.csv", delimiter=", ")
test_np[test_np == 0] = np.nan
test_uid = test_np[:, 0].astype(int)
test_features = test_np[:, 1:]

# Gender model prediction
test_gender_predict = gender_model.predict(test_features)
test_gender_predict = gender_model.transform_pred(test_gender_predict)
# Age model prediction
test_age_predict = age_model.predict(test_features)
test_age_predict = age_model.transform_pred(test_age_predict)

# NOTE(review): training labels were shifted by -1 before fitting; confirm that
# transform_pred restores the original label range before this file is used.
result = pd.DataFrame({"user_id": test_uid, "predicted_age": test_age_predict, "predicted_gender": test_gender_predict})
result.to_csv("results.csv", index=False)

print("FINISH ALL and save result to results.csv")
print("===========================================================================")

In [14]:
# Reload the raw tables (this rebinds train_user to whatever load_data returns).
train_ad, train_click, train_user, test_ad, test_click = data.load_data()

# Attach ad attributes to each click row, joining on creative_id.
train_record = train_click.merge(train_ad, on="creative_id")
test_record = test_click.merge(test_ad, on="creative_id")

In [17]:
# Group the merged test clicks per user.
grouped = test_record.groupby("user_id")

In [23]:
# Reload the cleaned test embeddings (", "-separated; no zero-to-NaN step here).
test_np = np.loadtxt("embed/test/test_embedding_all_1.csv", delimiter=", ")

In [21]:
# Peek at the first (user_id, rows) pair only; the break stops after one group.
for user_id, record in grouped:
    print(user_id)
    print(record)
    break

3000001
          time  user_id  creative_id  click_times    ad_id  product_id  \
5220406     49  3000001      1711578            1  1482336     27031.0   
10618442    49  3000001      1797787            1  1556702         NaN   
10620073    29  3000001      1508864            1  1312021         NaN   
12887100    11  3000001       665090            1   589862      1701.0   
13559831    82  3000001      3899689            1  3350034         NaN   
15495357    11  3000001       103064            1    93662      1794.0   
15509685    49  3000001      2259756            1  1949993      8593.0   
18030179    23  3000001       593698            1   527764      8938.0   
24379686     3  3000001       351878            1   315858         NaN   
24805472    54  3000001      2168054            1  1871684     32368.0   
26704009    54  3000001      2477740            1  2134287         NaN   

          product_category  advertiser_id  industry  
5220406                  3          32735     317