In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

In [3]:
# import data
import lightgbm as lgb
import numpy as np
import os
import sys
import pandas as pd
import matplotlib
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score

%matplotlib inline

In [25]:
def get_data():
    train_root = "dataset/train/"
    train_user_path = os.path.join(train_root, "user.csv")
    train_ad_path = os.path.join(train_root, "ad.csv")
    train_click_path = os.path.join(train_root, "click_log.csv")
    test_root = "dataset/test/"
    test_ad_path = os.path.join(test_root, "ad.csv")
    test_click_path = os.path.join(test_root, "click_log.csv")
    
    train_ad = pd.read_csv(train_ad_path, na_values="\\N")
    train_click = pd.read_csv(train_click_path, na_values="\\N")
    train_user = pd.read_csv(train_user_path, na_values="\\N")
    test_ad = pd.read_csv(test_ad_path, na_values="\\N")
    test_click = pd.read_csv(test_click_path, na_values="\\N")
    
    return train_ad, train_click, train_user, test_ad, test_click


def get_part_click(total_click, list_user_id, on="user_id"):
    part_record = pd.merge(total_click, list_user_id, on=on)
    return part_record


def get_ad_inform(creative_id, data_ad):
    ad_inform = data_ad[data_ad["creative_id"] == creative_id]
#     print(ad_inform.astype(int))
    return ad_inform.astype(int)


def split_feature_target(raw_features):
    train_features = raw_features.iloc[:, [0, 2, 3, 6, 7, 8, 9, 10]]
    train_age = raw_features.iloc[:, 4]
    train_gender = raw_features.iloc[:, 5]
    
    return train_features, train_age, train_gender

def measure_unique_user(record_pred, data_record, data_user, column_name="gender"):
    df_pred = pd.DataFrame(data_record.user_id)
    df_pred[column_name] = np.array(record_pred)
    
    uni_user_pred = df_pred.groupby("user_id").agg({column_name: lambda x: x.value_counts().index[0]})
    pred = uni_user_pred.iloc[:, 0].values + 1
    target = data_user.sort_values("user_id")[column_name].values
    acc_score = accuracy_score(pred, target)
    
    return uni_user_pred, acc_score

In [4]:
train_ad, train_click, train_user, test_ad, test_click = get_data()

In [5]:
train_user, valid_user = train_test_split(train_user, test_size=0.33, random_state=42)

In [6]:
train_record = get_part_click(train_click, train_user)
valid_record = get_part_click(train_click, valid_user)

In [7]:
# train_record
train_raw_features = pd.merge(train_record, train_ad, on="creative_id")
# valid_record
valid_raw_features = pd.merge(valid_record, train_ad, on="creative_id")

In [8]:
train_features, train_age, train_gender = split_feature_target(train_raw_features)
valid_features, valid_age, valid_gender = split_feature_target(valid_raw_features)

train_features = train_features.values
train_age = train_age.values - 1
train_gender = train_gender.values - 1

valid_features = valid_features.values
valid_age = valid_age.values - 1
valid_gender = valid_gender.values - 1

In [9]:
train_features.shape

(20151356, 8)

In [41]:
import data

class MODEL:
    def __init__(self, train_dataset, valid_dataset, model_kind="gender"):
        '''
        @train_dataset: lgb.Dataset(X, y)
        @valid_dataset: lgb.Dataset(X, y)
        '''
        self.train_dataset = train_dataset
        self.valid_dataset = valid_dataset
        
        self.params = {
            'task': 'train',
            'boosting_type': 'gbdt',  # 设置提升类型
            'num_leaves': 47,  # 叶子节点数
            'learning_rate': 0.1,  # 学习速率
            'feature_fraction': 0.9,  # 建树的特征选择比例
            'bagging_fraction': 0.8,  # 建树的样本采样比例
            'lambda_l2': 0.01,
            'bagging_freq': 5,  # k 意味着每 k 次迭代执行bagging
            'verbose': 1  # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
        }
        self.categorical_feature = [1, 3, 4, 5, 6, 7]
        self.num_interations = 100
        
        self.model_kind = model_kind
        if self.model_kind == "gender":
            self.params["metric"] = {"binary_logloss", "auc"}
            self.params["objective"] = "binary"
            self.model_save_path = "checkpoints/gender_model.pkl"
        elif self.model_kind == "age":
            self.params["metric"] = {"softmax"}
            self.params["objective"] = "multiclass"
            self.params['num_class'] = 10
            self.model_save_path = "checkpoints/age_model.pkl"

    def train(self):
        self.gbm = lgb.train(self.params,
                        self.train_dataset,
                        num_boost_round=self.num_interations,
                        valid_sets=self.valid_dataset,
                        early_stopping_rounds=10,
                        categorical_feature=self.categorical_feature)        
    
    def get_model(self):
        return self.gbm
    
    def save_model(self):
        self.gbm.save_model(self.model_save_path)
        
    def predict(self, input_features):
        pred = self.gbm.predict(input_features, num_iteration=self.gbm.best_iteration)
        return pred
    
    def transform_pred(self, pred):
        if self.model_kind == "gender":
            record_pred_label = pred.copy()
            record_pred_label[pred >= 0.5] = 1
            record_pred_label[pred < 0.5] = 0
            record_acc = accuracy_score(record_pred_label.astype(int), valid_gender)
        elif self.model_kind == "age":
            record_pred_label = [list(x).index(max(x)) for x in pred]
            record_acc = accuracy_score(np.array(age_pred_label), valid_age)
            
        print("accuracy for {} is: {:.5f}".format(self.model_kind, record_acc))
        
        return record_pred_label
    
    def measure(self, record_pred_label, data_record, data_user, column_name=self.model_kind):
        uni_pred, uni_acc = data.measure_unique_user(record_pred_label, data_record, data_user, column_name)
        return uni_pred, uni_acc

m = MODEL()

In [10]:
lgb_traindata_gender = lgb.Dataset(train_features, train_gender)
lgb_traindata_age = lgb.Dataset(train_features, train_age)

lgb_valdata_gender = lgb.Dataset(valid_features, valid_gender, reference=lgb_traindata_gender)
lgb_valdata_age = lgb.Dataset(valid_features, valid_age, reference=lgb_traindata_age)

In [37]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',  # 设置提升类型
#     'objective': 'regression',  # 目标函数
    'num_leaves': 31,  # 叶子节点数
    'learning_rate': 0.1,  # 学习速率
    'feature_fraction': 0.9,  # 建树的特征选择比例
    'bagging_fraction': 0.8,  # 建树的样本采样比例
    'lambda_l2': 0.01,
    'bagging_freq': 5,  # k 意味着每 k 次迭代执行bagging
    'verbose': 1  # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
}
categorical_feature = [1, 3, 4, 5, 6, 7]

params_gender = params.copy()
params_gender["metric"] = {"binary_logloss", "auc"}
params_gender["objective"] = "binary"
model_save_path = "checkpoints/gender_model.pkl"

gbm_gender = lgb.train(params_gender, 
                       lgb_traindata_gender, 
                       num_boost_round=100, 
                       valid_sets=lgb_valdata_gender, 
                       early_stopping_rounds=10,
                       categorical_feature=categorical_feature)
gbm_gender.save_model(model_save_path)

[1]	valid_0's auc: 0.686178	valid_0's binary_logloss: 0.612375
Training until validation scores don't improve for 10 rounds
[2]	valid_0's auc: 0.69345	valid_0's binary_logloss: 0.603091
[3]	valid_0's auc: 0.702861	valid_0's binary_logloss: 0.595275
[4]	valid_0's auc: 0.70558	valid_0's binary_logloss: 0.588851
[5]	valid_0's auc: 0.707777	valid_0's binary_logloss: 0.583397
[6]	valid_0's auc: 0.709984	valid_0's binary_logloss: 0.578668
[7]	valid_0's auc: 0.71118	valid_0's binary_logloss: 0.574576
[8]	valid_0's auc: 0.713203	valid_0's binary_logloss: 0.570996
[9]	valid_0's auc: 0.715	valid_0's binary_logloss: 0.568381
[10]	valid_0's auc: 0.71598	valid_0's binary_logloss: 0.565319


KeyboardInterrupt: 

In [12]:
# Predicting
gen_pred = gbm_gender.predict(valid_features, num_iteration=gbm_gender.best_iteration)
gen_pred

array([0.01006011, 0.01006011, 0.01006011, ..., 0.05080076, 0.05226905,
       0.04350702])

In [13]:
gen_pred_label = gen_pred.copy()

gen_pred_label[gen_pred >= 0.5] = 1
gen_pred_label[gen_pred < 0.5] = 0

np.unique(gen_pred_label.astype(int))
np.unique(valid_gender)

gen_acc = accuracy_score(gen_pred_label.astype(int), valid_gender)
print("accuracy for gender is: {:.5f}".format(gen_acc))

array([0, 1])

array([0, 1])

accuracy for gender is: 0.73253


In [14]:
uni_user_gender_pred, unique_user_gender_acc = measure_unique_user(gen_pred_label, valid_record, valid_user, column_name="gender")
print("accuracy for unique users' gender: {}".format(unique_user_gender_acc))

accuracy for unique users' gender: 0.6408316498316499


# age model

In [15]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',  # 设置提升类型
#     'objective': 'regression',  # 目标函数
    'num_leaves': 47,  # 叶子节点数
    'learning_rate': 0.1,  # 学习速率
    'feature_fraction': 0.9,  # 建树的特征选择比例
    'bagging_fraction': 0.8,  # 建树的样本采样比例
    'lambda_l2': 0.01,
    'bagging_freq': 5,  # k 意味着每 k 次迭代执行bagging
    'verbose': 1  # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
}

categorical_feature = [1, 3, 4, 5, 6, 7]
params_age = params.copy()
params_age["metric"] = {"softmax"}
params_age['num_class'] = 10
params_age['objective'] = "multiclass"
model_save_path = "checkpoints/age_model.pkl"

gbm_age = lgb.train(params_age, lgb_traindata_age, num_boost_round=100,
                   valid_sets=lgb_valdata_age, early_stopping_rounds=10,
                   categorical_feature=categorical_feature)
gbm_age.save_model(model_save_path)

[1]	valid_0's multi_logloss: 2.0449
Training until validation scores don't improve for 10 rounds
[2]	valid_0's multi_logloss: 2.03802
[3]	valid_0's multi_logloss: 2.03195
[4]	valid_0's multi_logloss: 2.0266
[5]	valid_0's multi_logloss: 2.02195
[6]	valid_0's multi_logloss: 2.01773
[7]	valid_0's multi_logloss: 2.01404
[8]	valid_0's multi_logloss: 2.01066
[9]	valid_0's multi_logloss: 2.00762
[10]	valid_0's multi_logloss: 2.00482
[11]	valid_0's multi_logloss: 2.00222
[12]	valid_0's multi_logloss: 1.99978
[13]	valid_0's multi_logloss: 1.99762
[14]	valid_0's multi_logloss: 1.99558
[15]	valid_0's multi_logloss: 1.99373
[16]	valid_0's multi_logloss: 1.99197
[17]	valid_0's multi_logloss: 1.99037
[18]	valid_0's multi_logloss: 1.98888
[19]	valid_0's multi_logloss: 1.98741
[20]	valid_0's multi_logloss: 1.98605
[21]	valid_0's multi_logloss: 1.98478
[22]	valid_0's multi_logloss: 1.98354
[23]	valid_0's multi_logloss: 1.98241
[24]	valid_0's multi_logloss: 1.98131
[25]	valid_0's multi_logloss: 1.98028


<lightgbm.basic.Booster at 0x7f83b0295c50>

In [16]:
# Predicting
age_pred = gbm_age.predict(valid_features, num_iteration=gbm_age.best_iteration)
age_pred

array([[0.04727743, 0.25087161, 0.29262795, ..., 0.00138141, 0.00166182,
        0.00099429],
       [0.04731405, 0.25029148, 0.29285456, ..., 0.00138248, 0.00166311,
        0.00099506],
       [0.04727743, 0.25087161, 0.29262795, ..., 0.00138141, 0.00166182,
        0.00099429],
       ...,
       [0.03046138, 0.17342223, 0.19229391, ..., 0.03368783, 0.0056943 ,
        0.00106193],
       [0.03132991, 0.18673266, 0.23254126, ..., 0.02966815, 0.00488683,
        0.00099639],
       [0.04264644, 0.21895324, 0.24463094, ..., 0.01690729, 0.00464181,
        0.00098304]])

In [17]:
age_pred.shape
age_pred_label = [list(x).index(max(x)) for x in age_pred]

(9931415, 10)

In [18]:
set(age_pred_label)
np.unique(valid_age)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [19]:
print("record ACC score: {}".format(accuracy_score(np.array(age_pred_label), valid_age)))

record ACC score: 0.237633207352628


In [20]:
uni_user_age_pred, unique_user_age_acc = measure_unique_user(age_pred_label, valid_record, valid_user, column_name="age")
print("unique users' age ACC score: {}".format(unique_user_age_acc))


unique users' age ACC score: 0.21176430976430977


In [36]:
train_user.age.value_counts()
valid_user.age.value_counts()
uni_user_age_pred.iloc[:, 0].value_counts()

3     135790
4     100727
2      99806
5      87711
6      68283
7      44721
1      23604
8      21535
9      13120
10      7703
Name: age, dtype: int64

3     67119
4     49851
2     49465
5     42956
6     33437
7     21990
1     11591
8     10432
9      6354
10     3805
Name: age, dtype: int64

2    245170
1     21381
4     13662
3      7870
5      5668
6      1219
0       954
9       665
8       229
7       182
Name: (age, <lambda>), dtype: int64

In [35]:
train_user.gender.value_counts()

valid_user.gender.value_counts()
uni_user_gender_pred.iloc[:, 0].value_counts()

1    403703
2    199297
Name: gender, dtype: int64

1    198907
2     98093
Name: gender, dtype: int64

0.0    271826
1.0     25174
Name: (gender, <lambda>), dtype: int64

In [5]:
test_uid = np.array([1, 2, 3])
test_age_predict = np.array([1, 5, 8])
test_gender_predict = np.array([1, 0, 1])
result = pd.DataFrame({"user_id": test_uid, "predicted_age": test_age_predict, "predicted_gender": test_gender_predict})
result

Unnamed: 0,user_id,predicted_age,predicted_gender
0,1,1,1
1,2,5,0
2,3,8,1
