In [2]:
import numpy as np
import pandas as pd
import pickle
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split

#lgmodel = lgb.LGBMClassifier(model_file='model-transR.txt')



### 先训练

In [3]:
all_movie_info = pd.read_csv('/data/data/all_movie_info_1.csv')

In [4]:
for i in all_movie_info.select_dtypes('object').columns:
    lbl = LabelEncoder()
    all_movie_info[i] = lbl.fit_transform(all_movie_info[i].fillna(-1).astype(str))

In [5]:
feature_name = [i for i in all_movie_info if i not in ['star']]
train_data,valid_data,train_label,valid_label = train_test_split(all_movie_info[feature_name],all_movie_info['star'],random_state=1998,test_size=0.2)

In [6]:
dtrain = lgb.Dataset(train_data[feature_name], label=train_label.values)
dval = lgb.Dataset(valid_data[feature_name], label=valid_label.values)

In [7]:
params = {'learning_rate': 0.1,
          'metric': ['auc','binary_logloss'],
          'objective': 'binary',
          'nthread': 32,
          'num_leaves': 16,
          'colsample_bytree': 0.9,
          'bagging_fraction' : 0.9,
          'bagging_freq' : 10,
          'seed' : 2018,
        }

In [8]:
lgb_model = lgb.train(params, dtrain, 1000, dval, verbose_eval=10,early_stopping_rounds=100,)

Training until validation scores don't improve for 100 rounds.
[10]	valid_0's binary_logloss: 0.152222	valid_0's auc: 1
[20]	valid_0's binary_logloss: 0.053906	valid_0's auc: 1
[30]	valid_0's binary_logloss: 0.0201119	valid_0's auc: 1
[40]	valid_0's binary_logloss: 0.00766463	valid_0's auc: 1
[50]	valid_0's binary_logloss: 0.00282877	valid_0's auc: 1
[60]	valid_0's binary_logloss: 0.00110642	valid_0's auc: 1
[70]	valid_0's binary_logloss: 0.000426672	valid_0's auc: 1
[80]	valid_0's binary_logloss: 0.000202648	valid_0's auc: 1
[90]	valid_0's binary_logloss: 0.000106029	valid_0's auc: 0.999999
[100]	valid_0's binary_logloss: 6.84093e-05	valid_0's auc: 0.999999
Early stopping, best iteration is:
[8]	valid_0's binary_logloss: 0.184747	valid_0's auc: 1


### 保存模型

In [9]:
'''
# dump model with pickle
with open('model.pkl', 'wb') as fout:
    pickle.dump(gbm, fout)
# load model with pickle to predict
with open('model.pkl', 'rb') as fin:
    pkl_bst = pickle.load(fin)
# can predict with any iteration when loaded in pickle way
y_pred = pkl_bst.predict(X_test, num_iteration=7)
'''

import pickle

with open('model-transR-more-negative.pkl', 'wb') as fout:
    pickle.dump(lgb_model, fout)


# ---------------remove history, more negative example---

### 读取模型

In [3]:
# load model with pickle to predict
with open('model-transR-more-negative.pkl', 'rb') as fin:
    pkl_bst = pickle.load(fin)

### 测试

In [4]:
def genTest(uid, minfo_canfile):
    '''
    uid: 待测试的user_id
    minfo_canfile: 是候选集电影集合的dataframe，包含除了 uid 、 star %debug影特征
    返回值：返回添加了uid列的dataframe
    '''
    candf = pd.read_csv(minfo_canfile)
    userlist = []
    for i in range(1000):
        userlist.append(uid)
    
    candf['uid'] = userlist
    return candf

### 改进

In [5]:
def isnotin(thelist,element):
    if element in thelist:
        return False
    else:
        return True

In [6]:
def genFinalResult(uidList,destFile,minfo_canfile,model):
    '''
    uidList: test文件，每一行是待测试的 uidid
    destFile: 结果的输出文件，保存格式为每行 uid\tmid1,mid2,...
    minfo_canfile: 是候选集电影集合的dataframe，包含除了 uid 、 star %debug影特征
    model: 已经训练好的model
    '''
    
    # 读取 uidList 中的 uid到 list
    ulist = []
    with open(uidList) as f:
        for line in f:
            uid = line.strip()
            if uid not in ulist:
                ulist.append(int(uid)) # ulist type: str
            
    dest = open(destFile,'w')
    
    # 读取 测试用户的 观影记录
    with open('watched_dict.pickle','rb') as f:
        hisdict = pickle.load(f)
    
    icount = 0
    
    # 对 list 中每一个 uid，生成推荐的50部的电影列表
    for utem in ulist:
        thedf = genTest(int(utem), minfo_canfile)
        hislist = hisdict[utem]
        hislist = set(hislist)
        
        fil_df = thedf[thedf['mid'].map(lambda x:isnotin(hislist,x))]
        
        # 预处理
        for i in fil_df.select_dtypes('object').columns:
            lbl = LabelEncoder()
            fil_df[i] = lbl.fit_transform(fil_df[i].fillna(-1).astype(str))
            
        # 预测，选概率最高的前50    
        result = model.predict(fil_df) 
        fil_df['probability'] = result
        sorted_df = fil_df.sort_values('probability',ascending = False)
        retop50 = pd.DataFrame.head(sorted_df, 50)['mid'].tolist()
        
        rec_str = ""
        
        for mmid in retop50:
            rec_str = rec_str + str(mmid) + ","

        rec_str = rec_str[:-1]
        newline = str(utem) + '\t' + rec_str + '\n'
        dest.write(newline)
        icount = icount + 1
        if icount % 100 == 0 :
            print("The " + str(icount) + " th uid predicted ...")
        
    dest.close()

In [None]:
genFinalResult("UserMovie_test2.txt","result_hot1000_rmhistory_4vs1.txt","minfo_candidate_hottest1000.csv",pkl_bst)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


The 100 th uid predicted ...
The 200 th uid predicted ...
The 300 th uid predicted ...
The 400 th uid predicted ...


### feature importance

In [49]:
from pandas import DataFrame as DF
fi = DF()
feature_name = [i for i in thedf.columns]
fi['name'] = feature_name
fi['split_score'] = pkl_bst.feature_importance(importance_type='split')
fi['gain_score'] = pkl_bst.feature_importance(importance_type='gain')
print(fi)

              name  split_score    gain_score
0              mid         1580  1.326666e+10
1              r_1         1719  1.386412e+08
2              r_2         2211  5.821983e+07
3              r_3          109  9.280557e+06
4              r_4           95  1.250871e+05
5              r_5           84  2.830967e+06
6              r_6           81  1.538176e+05
7              r_7          106  5.359057e+06
8              r_8          135  2.627071e+07
9              r_9           86  2.408912e+06
10            r_10          119  2.974307e+06
11            r_11          106  1.889031e+07
12            r_12          117  7.130919e+05
13            r_13           93  7.876261e+05
14            r_14          112  8.016569e+05
15            r_15           88  2.087294e+06
16            r_16          135  1.639676e+09
17            r_17          106  2.228031e+06
18            r_18          100  2.491491e+06
19            r_19           93  1.376509e+06
20            r_20          102  3

In [6]:
a = ['a','b','6']
print(type(a))

a = set(a)
print(type(a))

<class 'list'>
<class 'set'>
