In [19]:
import numpy as np
import pandas as pd
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Both Book-Crossing dump files are semicolon-separated.
CSV_SEPARATOR = ';'
users = pd.read_csv('data/BX-CSV-Dump/BX-Users.csv', sep=CSV_SEPARATOR)
book_ratings = pd.read_csv('data/BX-CSV-Dump/BX-Book-Ratings.csv', sep=CSV_SEPARATOR)
# The books table is intentionally not loaded:
# books = pd.read_csv('BX-Books1.csv', sep=';')

In [20]:

# Quick look at both tables. A notebook cell renders only its last expression,
# so the intermediate results are passed to display() explicitly; otherwise
# they are computed and silently thrown away.
display(book_ratings.sample(10))
display(users.sample(10))
# Per-column count of missing values in each table.
display(book_ratings.isnull().sum(), users.isnull().sum())
users.shape


(278858, 3)

In [21]:
# Fill missing values in the ratings table with 0.
book_ratings = book_ratings.fillna(0.0)

# Age has many missing values but is kept as a feature, so it is imputed with
# the column mean. The result is assigned back to the column: calling
# fillna(inplace=True) on the users['Age'] selection operates on a temporary
# object and leaves the frame untouched under pandas Copy-on-Write.
users['Age'] = users['Age'].fillna(users['Age'].mean())

# Location is not used downstream. The original first reduced it to the country
# and then dropped the column anyway, so the reduction is skipped.
# errors='ignore' keeps this cell re-runnable once the column is gone.
users = users.drop(columns='Location', errors='ignore')

dataset = pd.merge(book_ratings, users, on='User-ID')

# Build the binary label: 1 when a rating > 0 was given, 0 otherwise.
dataset['Book-Rating'] = (dataset['Book-Rating'] > 0).astype('int64')

# Identifier columns and the (label, feature) columns are kept separately.
user_item, data = dataset[['User-ID', 'ISBN']], dataset[['Book-Rating', 'Age']]


In [22]:
# Set aside the first 1/10 of the rows for the incremental-training demo later
# on. The size is derived from the data instead of the hard-coded 114978, so
# the split stays at 1/10 if the number of merged rows changes.
n_online = len(data) // 10
online_learning_train, train = data.iloc[:n_online], data.iloc[n_online:]


from sklearn.model_selection import train_test_split

X, y = train.drop(columns=['Book-Rating']), train['Book-Rating']
# A fixed random_state makes the train/test partition, and therefore the
# reported scores, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [31]:
from sklearn.linear_model import SGDClassifier

In [32]:
%%time
lr_sgd = SGDClassifier(loss='log', warm_start=True).fit(X_train, y_train)

CPU times: user 6.13 s, sys: 53.8 ms, total: 6.19 s
Wall time: 6.53 s


In [33]:
# Evaluate the freshly fitted model on the held-out test split; the cell
# displays the returned score.
lr_sgd.score(X_test, y_test)


0.6280796865109851

In [34]:
def get_batch(online_learning_train):
    """Yield one training sample at a time as a ``(label, age)`` pair.

    Parameters
    ----------
    online_learning_train : pandas.DataFrame
        Frame with a 'Book-Rating' column (the label) and an 'Age' column
        (the feature).

    Yields
    ------
    tuple
        ``(label, age)`` for each row, in row order.

    Notes
    -----
    The two columns are iterated directly instead of using ``iterrows()``.
    ``iterrows()`` builds a Series per row, which is slow and coerces the
    row to one common dtype, so an integer label would come back as a float.
    """
    labels = online_learning_train['Book-Rating']
    ages = online_learning_train['Age']
    # Generator: each next() call hands back exactly one sample to train on.
    yield from zip(labels, ages)
        
# Create the generator once; each later next(batch_generator) call consumes the
# following row of online_learning_train.
batch_generator = get_batch(online_learning_train)

In [35]:
# Incremental model update: pull one (label, feature) sample from the generator
# and feed it to the already-fitted classifier.
label, feature = next(batch_generator)
lr_sgd.partial_fit(X=[[feature]], y=[label])

SGDClassifier(loss='log', warm_start=True)

In [36]:
# NOTE(review): this expression constructs a second SGDClassifier and never
# assigns it, so `lr_sgd` is unaffected; the cell only renders the new
# estimator's repr. It looks like a pasted full-parameter repr of the model —
# confirm whether this cell is still needed.
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=True)

SGDClassifier(loss='log', warm_start=True)

In [37]:
# Stream 20 more samples through the model one at a time. After every update,
# print the test-set score followed by the current coefficient.
for i in range(20):
    label, feature = next(batch_generator)
    # Keep rebinding to the object partial_fit hands back, as in the original
    # ("note: this object must be returned").
    lr_sgd = lr_sgd.partial_fit(X=[[feature]], y=[label])
    print(lr_sgd.score(X_test, y_test), lr_sgd.coef_, sep='\n')

0.6280796865109851
[[-0.02085006]]
0.6280796865109851
[[-0.02236489]]
0.6280796865109851
[[-0.01956109]]
0.6280796865109851
[[-0.01680143]]
0.6280796865109851
[[-0.01968254]]
0.6280796865109851
[[-0.01294196]]
0.6280796865109851
[[-0.01064327]]
0.6280796865109851
[[-0.00452945]]
0.6280796865109851
[[-0.00019888]]
0.6280796865109851
[[-0.00421519]]
0.6280796865109851
[[-0.00791561]]
0.6280796865109851
[[-0.01133285]]
0.6280796865109851
[[-0.01449704]]
0.6280796865109851
[[-0.01743536]]
0.6280796865109851
[[-0.02017197]]
0.6280796865109851
[[-0.0155741]]
0.6280796865109851
[[-0.01785704]]
0.6280796865109851
[[-0.02005468]]
0.6280796865109851
[[-0.01546106]]
0.6280796865109851
[[-0.01103756]]
