In [1]:
import pickle
import os
import re
import copy
from collections import Counter

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
%matplotlib inline

# 1.读取数据

In [3]:
data_df = pd.read_pickle('../data/processed/data_df.pkl')

In [4]:
usr_df = pd.read_pickle('../data/processed/usr_df.pkl')

In [5]:
item_df = pd.read_pickle('../data/processed/item_df.pkl')

# 2.抽取用户和物品各自特征并进行处理

针对独立类别特征进行分桶(categories)

相关方法参见:https://www.kaggle.com/shahules/an-overview-of-encoding-techniques

针对数值型特征(年龄)进行scale之后normalization

即先将所有值减去最小值后除以最大最小值之差,在此基础上减去特征的均值并除以标准差

## 2.1处理用户特征

In [6]:
occupation_lst = [i.strip() for i in open('../data/raw_data/u.occupation','r',encoding='utf-8').readlines()]

In [7]:
usr_feature_df = pd.DataFrame()
usr_feature_df['usr_id'] = usr_df['usr_id'].copy()

In [8]:
def scale_and_norm(df,colname):
    '''连续数值型特征归一化和正规化
    '''
    df = df.copy()
    max_ = max(df[colname])
    min_ = min(df[colname])
    df[colname] = df[colname].apply(lambda x:(x-min_)/(max_-min_))

    mean_scaled = np.mean(df[colname])
    std_scaled = np.std(df[colname])

    df[colname] = df[colname].apply(lambda x:(x-mean_scaled)/std_scaled)
    return df

In [9]:
def target_encoding(df,colname,cate_lst=None):
    '''
    此处使用target encoding方法 将类别在总体占比作为其特征值(cate_lst基本无用)
    '''
    df = df.copy()
    col_value = df[colname].tolist()
    col_counter = Counter(col_value)

    col_frac = {k:col_counter[k]/sum(col_counter.values()) for k in col_counter}

    df[colname] = df[colname].apply(lambda x:col_frac[x])
    return df

In [10]:
def onehot_encoding(df,colname,cate_lst=None):
    '''离散型特征分桶类别化(categorization)
    cate_lst为对应类别的既有顺序 默认为None 如果有的话 类别化时参照lst中的顺序进行编码
    '''
    df = df.copy()
    col_value = df[colname].tolist()
    if cate_lst is None:
        cate_lst = list(set(col_value))
    df[colname] = df[colname].apply(lambda x:cate_lst.index(x))
    return df

In [11]:
usr_feature_df['age'] = [int(i) for i in usr_df['age']]
usr_feature_df['gender'] = usr_df['gender']
usr_feature_df['occupation'] = usr_df['occupation']

In [12]:
usr_feature_df = scale_and_norm(usr_feature_df,'age')
usr_feature_df = target_encoding(usr_feature_df,'gender')
usr_feature_df = target_encoding(usr_feature_df,'occupation')

## 2.2 处理物品特征

In [13]:
movie_type = [i.strip().split('|')[0] for i in open('../data/raw_data/u.genre').readlines()]

In [14]:
movie_type = movie_type[:-1]

In [15]:
item_feature_df = pd.DataFrame()
item_feature_df['item_id'] = item_df['item_id']
item_feature_df['movie_title'] = item_df['movie_title'].apply(lambda x:x.split('(')[0])

In [16]:
item_feature_df['release_year'] = item_df['release_date'].apply(lambda x:x.split('-')[-1])

In [17]:
item_feature_df = target_encoding(item_feature_df,'release_year')

In [18]:
item_type_matrix = item_df[movie_type].to_numpy()

In [19]:
item_type_matrix = item_type_matrix.astype(int)

In [20]:
item_feature_df['type_feature'] = [i for i in item_type_matrix]

In [21]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# movie_title_text = item_feature_df['item_id'].tolist()

# tfidf_obj = TfidfVectorizer(max_features=50)

# item_tfidf_feature = tfidf_obj.fit_transform(movie_title_text)

# item_tfidf_feature = item_tfidf_feature.toarray()

# item_feature_df['tfidf_feature'] = [i for i in item_tfidf_feature]

In [22]:
usr_feautre = usr_feature_df[['age','gender','occupation']].copy().to_numpy()
usr_feature_df = pd.DataFrame()
usr_feature_df['usr_id'] = usr_df['usr_id'].copy()
usr_feature_df['features'] = [i for i in usr_feautre]

In [23]:
item_feature = []
for num,_ in enumerate(item_feature_df.itertuples()):
    year = item_feature_df.loc[num,'release_year']
    type_feature = item_feature_df.loc[num,'type_feature']
    feature = np.hstack([np.array([year]),type_feature])
    item_feature.append(feature)

In [24]:
item_feature_df = pd.DataFrame()
item_feature_df['item_id'] = item_df['item_id'].copy()
item_feature_df['features'] = [i for i in item_feature]

In [25]:
def get_Xy(df,
           usr_feautre_df=usr_feature_df,
           item_feature_df=item_feature_df):
    '''给定数据df 按照顺序输出特征和对应标签
    拼接之后的数据
    '''
    df = df.copy()
    uid_lst = [int(i) for i in df['usr_id'].tolist()]
    iid_lst = [int(i) for i in df['item_id'].tolist()]

    u_feature = [usr_feature_df.loc[i-1,'features'] for i in uid_lst]

    u_feature = np.array(u_feature)

    i_feature = [item_feature_df.loc[i-1,'features'] for i in iid_lst]

    i_feature = np.array(i_feature)

    X = np.hstack([u_feature,i_feature])
    y = [int(i) for i in df['rating'].tolist()]
    y = np.array(y)
    return X,y

In [26]:
item_feature_df.to_pickle('../data/processed/item_feature_df.pkl')
usr_feature_df.to_pickle('../data/processed/usr_feature_df.pkl')

# 3.基线模型

评价指标为RMSE,MSE

# 3.1作为回归任务来做

每个用户和物品的交互评分作为一个标注,训练回归模型.测试集上对应进行预测即可.

模型输入:用户特征和物品特征拼接,对应标注为rating分数.
模型输出:rating分数.

In [27]:
from sklearn.metrics import mean_squared_error

In [28]:
import lightgbm

### 3.1.1 五折交叉验证

四折的训练集数据训练，另外一折数据的测试集做测试。

In [29]:
cv_df_lst = []
for i in range(5):
    df_fname = '../data/processed/cv_{0}_df.pkl'.format(i+1)
    df = pd.read_pickle(df_fname)
    cv_df_lst.append(df)

In [30]:
best_param = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'regression',
          'nthread': 8, # Updated from nthread
          'num_leaves': 64,
          'learning_rate': 0.1,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 1,
          'scale_pos_weight': 1,
          'num_class' : 1,}

In [31]:
model = lightgbm.LGBMRegressor(**best_param)

In [32]:
mse_lst = []
for test_idx in range(5):
    print('----------------')
    print('Test on cv_{0}_df'.format(test_idx+1))
    train_idx_lst = [i for i in range(5) if i!=test_idx]
    # 训练过程 其余四份test集合作为训练集 当前作为测试
    for train_idx in train_idx_lst:
        df = cv_df_lst[train_idx]
        train_df = df[df['type']=='test']
        train_X,train_y = get_Xy(train_df)
        model.fit(train_X,train_y)
    # 测试过程
    df = cv_df_lst[test_idx]
    test_df = df[df['type']=='test']
    test_X,test_true_y = get_Xy(test_df)
    test_pred_y = model.predict(test_X)
    mse = mean_squared_error(test_true_y,test_pred_y)
    mse_lst.append(mse)
    print('MSE on cv_{0}_df:{1}'.format(i+1,mse))
print('Average MSE:{0}'.format(np.average(mse_lst)))

----------------
Test on cv_1_df
MSE on cv_5_df:1.1294285794135475
----------------
Test on cv_2_df
MSE on cv_5_df:1.0860408137803688
----------------
Test on cv_3_df
MSE on cv_5_df:1.0485731813061572
----------------
Test on cv_4_df
MSE on cv_5_df:1.0336211280458831
----------------
Test on cv_5_df
MSE on cv_5_df:1.0389689167348395
Average MSE:1.0673265238561593
