In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
import itertools
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tabulate import tabulate
from developed_methods import *

In [6]:
df = pd.read_csv("data/train.csv")
dtest_submit = pd.read_csv("data/test.csv")

## mapping 
from sklearn import preprocessing
le_user = preprocessing.LabelEncoder()
le_user.fit(np.append(df['user_id'], dtest_submit["user_id"]))
df['user_id'] = le_user.transform(df["user_id"])
dtest_submit["user_id"] = le_user.transform(dtest_submit["user_id"])

le_item = preprocessing.LabelEncoder()
le_item.fit(np.append(df['item_id'], dtest_submit["item_id"]))
df["item_id"] = le_item.transform(df["item_id"])
dtest_submit["item_id"] = le_item.transform(dtest_submit["item_id"])

## generate train / test dataset
from sklearn.model_selection import train_test_split
dtrain, dtest = train_test_split(df, test_size=0.33, random_state=42)

## save real ratings for test set for evaluation.
test_rating = np.array(dtest['rating'])

## remove the ratings in the test set to simulate prediction
dtest = dtest.drop(columns='rating')

In [7]:
# train_pair, train_rating
train_pair = dtrain[['user_id', 'item_id']].values
train_rating = dtrain['rating'].values

# test_pair
test_pair = dtest[['user_id', 'item_id']].values
n_user, n_item = max(train_pair[:,0].max(), test_pair[:,0].max())+1, max(train_pair[:,1].max(), test_pair[:,1].max())+1

In [9]:
## baseline user mean methods
user_ave = user_mean(n_user=n_user)
user_ave.fit(train_pair=train_pair, train_ratings=train_rating)
pred_user = user_ave.predict(test_pair)
print('RMSE for user_mean: %.3f' %rmse(test_rating, pred_user) )

RMSE for user_mean: 1.978


In [11]:
## baseline item mean methods
item_ave = item_mean(n_item=n_item)
item_ave.fit(train_pair=train_pair, train_ratings=train_rating)
pred_item = item_ave.predict(test_pair)
print('RMSE for item_mean: %.3f' %rmse(test_rating, pred_item) )

RMSE for item_mean: 2.877


In [14]:
## CV based on `LFM_CV`
## Baseline + LFM
glb_ave = glb_mean()
glb_ave.fit(train_rating)
pred = glb_ave.predict(test_pair)
# user_mean
train_rating_cm = train_rating - glb_ave.predict(train_pair)
user_ave = user_mean(n_user=n_user)
user_ave.fit(train_pair=train_pair, train_ratings=train_rating_cm)
train_rating_res = train_rating_cm - user_ave.predict(train_pair)
dtrain['res_rating'] = train_rating_res
pred = pred + user_ave.predict(test_pair)
# fit LFM_CV by residual ratings 
Ks, lams = [2, 3, 5], 10**np.arange(-6, -2, .5)
shiing_cv = LFM_CV(n_user, n_item, cv=3, Ks=Ks, lams=lams)
shiing_cv.grid_search(train_pair, train_rating_res)
shiing_cv.plot_grid('valid')
shiing_cv.plot_grid('train')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dtrain['res_rating'] = train_rating_res


3-Fold CV for K: 2; lam: 0.00000: train_rmse: 1.677, valid_rmse: 2.052
3-Fold CV for K: 2; lam: 0.00000: train_rmse: 1.642, valid_rmse: 2.103
3-Fold CV for K: 2; lam: 0.00000: train_rmse: 1.675, valid_rmse: 2.074


KeyboardInterrupt: 

In [None]:
## refit the best model, and make prediction
best_K, best_lam = int(shiing_cv.best_model['K']), shiing_cv.best_model['lam']
print('best K: %d, best lam: %.5f' %(best_K, best_lam))
shiing=LFM(n_user, n_item, K=best_K, lam=best_lam)
shiing.fit(train_pair, train_rating_res)
pred = pred + shiing.predict(test_pair)
print('RMSE for glb + user_mean + LFM: %.3f' %rmse(test_rating, pred))

In [None]:
from sklearn.preprocessing import StandardScaler

user_info = pd.DataFrame({'user_id': list(range(n_user))})
user_info = user_info.set_index('user_id')
user_info['mean'] = dtrain.groupby('user_id')['res_rating'].mean()
user_info['q1'] = dtrain.groupby('user_id')['res_rating'].quantile(.1)
user_info['q3'] = dtrain.groupby('user_id')['res_rating'].quantile(.3)
user_info['q5'] = dtrain.groupby('user_id')['res_rating'].quantile(.5)
user_info['q7'] = dtrain.groupby('user_id')['res_rating'].quantile(.7)
user_info['q7'] = dtrain.groupby('user_id')['res_rating'].quantile(.9)
## fill NAN as the column mean
user_info = user_info.fillna(user_info.mean())
user_scaler = StandardScaler()
user_info = user_scaler.fit_transform(user_info)

item_info = pd.DataFrame({'item_id': list(range(n_item))})
item_info = item_info.set_index('item_id')
item_info['mean'] = dtrain.groupby('item_id')['res_rating'].mean()
item_info['q1'] = dtrain.groupby('item_id')['res_rating'].quantile(.1)
item_info['q3'] = dtrain.groupby('item_id')['res_rating'].quantile(.3)
item_info['q5'] = dtrain.groupby('item_id')['res_rating'].quantile(.5)
item_info['q7'] = dtrain.groupby('item_id')['res_rating'].quantile(.7)
item_info['q7'] = dtrain.groupby('item_id')['res_rating'].quantile(.9)
## fill NAN as the column mean
item_info = item_info.fillna(item_info.mean())
item_scaler = StandardScaler()
item_info = item_scaler.fit_transform(item_info)

In [None]:
print(user_info)
print(item_info)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
user_sim = cosine_similarity(user_info)
item_sim = cosine_similarity(item_info)

In [None]:
top = 5
index_item = [np.where(train_pair[:,1] == i)[0] for i in range(n_item)]
index_user = [np.where(train_pair[:,0] == u)[0] for u in range(n_user)]
## augmented data
fake_pair, fake_rating = [], []
for u in range(n_user):
	print('UserId: %d' %u)
	### find the top closest users for the user u
	top_user_tmp = user_sim[u].argsort()[-top:][::-1]
	valid_user_ind = []
	### extend the records' index for the users
	for u_tmp in top_user_tmp:
		valid_user_ind.extend(index_user[u_tmp])
	### find observed items under top users
	obs_item_tmp = train_pair[valid_user_ind,1]
	for i in range(n_item):
		### find top items 
		top_item_tmp = item_sim[i].argsort()[-top:][::-1]
		### find valid item: intersect with top-items and observed item
		valid_item_tmp = np.intersect1d(top_item_tmp, obs_item_tmp)
		if len(valid_item_tmp) == 0:
			continue
		valid_item_ind = []
		for i_tmp in valid_item_tmp:
			### extend all rating index for valid item
			valid_item_ind.extend(index_item[i_tmp])
		### find index close to (u,i)
		valid_ind = np.intersect1d(valid_user_ind, valid_item_ind)
		if len(valid_ind) > 0:
			fake_pair.append([u,i])
			fake_rating.append(train_rating_res[valid_ind].mean())
fake_pair, fake_rating = np.array(fake_pair), np.array(fake_rating)

In [None]:
aug_pair, aug_rating_res = np.vstack((train_pair, fake_pair)), np.hstack((train_rating_res, fake_rating))

## fit the LFM model with augmentated dataset
K, lam = 5, 0.0001
sSVD=LFM(n_user, n_item, K=K, lam=lam)
sSVD.fit(aug_pair, aug_rating_res)