In [1]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

from model.booster import GBDT_Muti
from model.dataset import DataSet
from model.configs import Configs

In [2]:
# Read the raw Delicious file with a tab separator and no header row, so the
# very first line of the file ends up as row 0 of the frame.
dt = pd.read_csv(
    "data/Delicious/Delicious_data.txt",
    sep="\t",
    header=None,
)

In [3]:
# Row 0 of column 0 is a space-separated header:
# "<num_points> <num_features> <num_labels>".
info = dt[0][0].split(" ")
N, D, L = int(info[0]), int(info[1]), int(info[2])

In [4]:
# Remove the header row (index label 0) so only sample rows remain.
dt = dt.drop(index=0)

In [5]:
# Preview the first five sample rows.
dt.head(n=5)

Unnamed: 0,0
1,"77,91,315,544,575,621,718,818,819,834,908 60:1..."
2,"82,99,205,357,365,386,387,395,396,398,470,625,..."
3,"76,332,333,453,552,799 6:1.000000 7:1.000000 1..."
4,"332,333,339,353,456,507,573,574,615,731,783,78..."
5,"77,104,108,205,223,275,276,285,286,296,378,381..."


In [6]:
# Turn every raw line "<l1,l2,...> <idx:val> <idx:val> ..." into a dense
# multi-hot label row of length L and a dense feature row of length D.
labels_onehot = []
features_onehot = []
for thing in tqdm(dt[0]):
    # Everything before the first blank is the label field, the rest is features.
    label_part, _, feature_part = str(thing).partition(" ")

    try:
        labels = [int(tok) for tok in label_part.split(",")]
    except ValueError:
        # A label field that does not parse as integers (e.g. an empty field,
        # since int("") raises ValueError) leaves the row with no labels.
        # Only ValueError is caught so that unrelated errors and
        # KeyboardInterrupt are no longer silently swallowed.
        labels = []

    label_onehot = [0] * L
    for label_idx in labels:
        label_onehot[label_idx] = 1

    feature_onehot = [0.0] * D
    # split() with no argument skips repeated/trailing blanks, so a row without
    # feature tokens simply stays all-zero instead of raising.
    for pair in feature_part.split():
        feat_idx, feat_val = pair.split(":")[:2]
        feature_onehot[int(feat_idx)] = float(feat_val)

    labels_onehot.append(label_onehot)
    features_onehot.append(feature_onehot)


100%|██████████| 16105/16105 [00:01<00:00, 15391.54it/s]


In [7]:
from sklearn.decomposition import PCA

# Project the dense feature rows onto 100 principal components. random_state is
# pinned so that a randomized SVD solver (which PCA may pick automatically)
# produces the same projection on every run.
# NOTE(review): the PCA is fitted on all samples before the train/test split, so
# the held-out rows influence the projection — consider fitting on the training
# rows only.
# NOTE(review): this cell overwrites its own input, so it is not idempotent;
# re-run the parsing cell before re-running this one.
pca = PCA(n_components=100, random_state=42)
features_onehot = pca.fit_transform(features_onehot)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    np.array(features_onehot),
    np.array(labels_onehot),
    test_size=0.2,
    random_state=42,
)

In [9]:
# (num_train_samples, num_labels)
np.shape(y_train)

(12884, 983)

In [10]:
# (num_train_samples, num_pca_components)
np.shape(x_train)

(12884, 100)

In [11]:
# Wrap the train and test splits in the project's DataSet container and load the
# run settings from the Delicious JSON config file.
dataset_train = DataSet(x_train, y_train)
dataset_test = DataSet(x_test, y_test)
configs = Configs('configs/configs_delicious.json')

In [12]:
# Build the booster from the loaded configs and fit it on the training set.
# The test set is passed as the second argument; the training log reports a
# "valid loss" next to the train loss for each iteration.
# NOTE(review): presumably the second dataset drives early stopping — if so, the
# test split doubles as the validation split; confirm against GBDT_Muti.fit.
gbdt = GBDT_Muti(configs)
gbdt.fit(dataset_train,dataset_test)

iter1 : valid loss=18.966160
iter1 : train loss=17.103588
iter2 : valid loss=17.190654
iter2 : train loss=16.468521
iter3 : valid loss=16.673671
iter3 : train loss=16.149860
iter4 : valid loss=16.455227
iter4 : train loss=15.944632
iter5 : valid loss=16.335535
iter5 : train loss=15.767733
iter6 : valid loss=16.259470
iter6 : train loss=15.650057
iter7 : valid loss=16.215842
iter7 : train loss=15.542323
iter8 : valid loss=16.188664
iter8 : train loss=15.464611
iter9 : valid loss=16.160058
iter9 : train loss=15.405812
iter10 : valid loss=16.148877
iter10 : train loss=15.351508
iter11 : valid loss=16.138807
iter11 : train loss=15.345163
iter12 : valid loss=16.132874
iter12 : train loss=15.288275
iter13 : valid loss=16.129685
iter13 : train loss=15.240553
iter14 : valid loss=16.122815
iter14 : train loss=15.236802
iter15 : valid loss=16.119557
iter15 : train loss=15.202321
iter16 : valid loss=16.115647
iter16 : train loss=15.199449
iter17 : valid loss=16.112963
iter17 : train loss=15.19709

In [13]:
# Run the fitted booster on the held-out features twice: once via
# predict_set_prob and once via predict_set_label.
# NOTE(review): presumably these are per-label scores and hard 0/1 decisions,
# each shaped (num_test_samples, num_labels) — confirm against the model code.
pred_prob = gbdt.predict_set_prob(dataset_test.X)
pred_labels = gbdt.predict_set_label(dataset_test.X)

In [14]:
# Persist both prediction matrices as CSV files. The output directory is created
# first because DataFrame.to_csv does not create missing parent directories.
os.makedirs('./result', exist_ok=True)
pred_prob_df = pd.DataFrame(pred_prob)
pred_labels_df = pd.DataFrame(pred_labels)
pred_prob_df.to_csv('./result/deli_pred_prob.csv', index=False)
pred_labels_df.to_csv('./result/deli_pred_label.csv', index=False)

In [15]:
from sklearn.metrics import accuracy_score, f1_score, hamming_loss

In [16]:
# Macro-averaged F1: the unweighted mean of the per-label F1 scores.
f1_score_macro = f1_score(y_true=y_test, y_pred=pred_labels_df, average='macro')
f1_score_macro

0.02674511895961375

In [17]:
# Micro-averaged F1: computed from the counts pooled over all labels.
f1_score_micro = f1_score(y_true=y_test, y_pred=pred_labels_df, average='micro')
f1_score_micro

0.1375949590719039

In [18]:
# For multilabel input accuracy_score is subset accuracy: a sample only counts
# as correct when every one of its labels matches.
acc = accuracy_score(y_true=y_test, y_pred=pred_labels_df)
acc

0.0027941633033219497

In [19]:
# Hamming loss: the fraction of individual label assignments that are wrong.
ham = hamming_loss(y_true=y_test, y_pred=pred_labels_df)
ham

0.01850079100056439

In [20]:
# Load the results log that the next cell extends with this run.
record = pd.read_csv(filepath_or_buffer='record.csv')


In [21]:
dataset_name = 'Delicious'

# One row describing this run: hyper-parameters, stopping iteration and metrics.
new_row = pd.DataFrame([{
    "dataset": dataset_name,
    "learn_rate": configs.learn_rate,
    "max_depth": configs.max_depth,
    "stop_iter": gbdt.stop_iter,
    "f1_score_macro": f1_score_macro,
    "f1_score_micro": f1_score_micro,
    "acc": acc,
    "ham": ham,
}])

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# adds the row on every pandas version. ignore_index renumbers the rows, which
# does not affect the saved file because it is written with index=False.
record = pd.concat([record, new_row], ignore_index=True)
record.to_csv('record.csv', index=False)

In [55]:
def get_sorted_top_k(pred_prob, top_k=1, reverse=False):
    """Return, per row, the column indices of the k lowest or k highest scores.

    Args:
        pred_prob: [N*L] ndarray of scores, one row per sample.
        top_k: number of column indices to keep per row.
        reverse: if False, keep the ``top_k`` lowest-scoring columns; if True,
            keep the ``top_k`` highest-scoring columns.

    Returns:
        Integer ndarray with N rows and at most ``top_k`` columns. In both modes
        the indices inside a row follow ascending score order (plain ``argsort``
        order), so with ``reverse=True`` the best-scoring label is the LAST
        column, not the first.
    """
    order = np.argsort(pred_prob, axis=1)
    if not reverse:
        return order[:, :top_k]
    if top_k == 0:
        # order[:, -0:] is the same as order[:, 0:] and would return every column;
        # asking for zero labels must give an empty selection instead.
        return order[:, :0]
    return order[:, -top_k:]

def precision_at_k(pred_prob, pred_label, targets, k):
    """Precision over the predicted-positive labels among each sample's top-k scores.

    For every sample the ``k`` highest-scoring labels are selected. Among those,
    only labels whose ``pred_label`` entry is > 0 are counted: as a true positive
    when the matching ``targets`` entry is > 0, as a false positive otherwise.
    The counts are pooled over all samples. Note that top-k labels that are not
    predicted positive are ignored, so this is not ``hits / k``.

    Args:
        pred_prob: [N*L] ndarray of scores used to rank the labels.
        pred_label: [N*L] ndarray of predicted labels (> 0 means positive).
        targets: [N*L] ndarray of ground-truth labels (> 0 means positive).
        k: number of top-ranked labels per sample, 1 <= k <= L.

    Returns:
        ``tp / (tp + fp)`` as a float, or 0.0 when none of the top-k labels is
        predicted positive (avoids a ZeroDivisionError).

    Raises:
        AssertionError: if ``k`` is outside ``[1, L]``.
    """
    assert k>=1 and k <= np.size(targets, 1)
    pred_label = np.asarray(pred_label)
    targets = np.asarray(targets)

    topkidx = get_sorted_top_k(pred_prob, top_k=k, reverse=True)
    # Column vector of row numbers so that [rows, topkidx] picks, for each sample,
    # the entries at that sample's own top-k label indices.
    rows = np.arange(topkidx.shape[0])[:, None]
    predicted_pos = pred_label[rows, topkidx] > 0   # predicted as positive
    actual_pos = targets[rows, topkidx] > 0         # actually positive

    tp = int(np.count_nonzero(predicted_pos & actual_pos))
    fp = int(np.count_nonzero(predicted_pos & ~actual_pos))
    if tp + fp == 0:
        return 0.0
    return tp / (tp + fp)

In [56]:
# Precision restricted to each sample's single best-scoring label.
precision_at_k(pred_prob=pred_prob, pred_label=pred_labels, targets=y_test, k=1)

0.7048755186721992

In [57]:
# Precision restricted to each sample's three best-scoring labels.
precision_at_k(pred_prob=pred_prob, pred_label=pred_labels, targets=y_test, k=3)


0.6986839067190025