In [1]:
import pandas as pd
import numpy as np
import warnings
import plotly.express as px
import surprise 
from surprise import SVD,Dataset,Reader,accuracy
from surprise.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.linear_model import Ridge
from math import sqrt
from collections import defaultdict
warnings.filterwarnings("ignore")

## 匯入資料集

In [2]:
user_columns=["user_id","age","sex","occupation","zip_code"]
users=pd.read_csv("D:/自己的論文/ml-100k/u.user",sep="|",names=user_columns)
usernum=users.shape[0]
print("Number of users:",usernum)

Number of users: 943


In [3]:
r_col=["user_id","movie_id","rating","unix_timestamp"]
rating_base=pd.read_csv("D:/自己的論文/ml-100k/ua.base",sep="\t", names=r_col,encoding="latin-1")
rating_test=pd.read_csv("D:/自己的論文/ml-100k/ua.test",sep="\t", names=r_col,encoding="latin-1")

trainrate=rating_base.values
testrate=rating_base.values
print(rating_base[0:20])

print("Number of train rates:",trainrate.shape[0])
print("Number of test rates:",trainrate.shape[0])

    user_id  movie_id  rating  unix_timestamp
0         1         1       5       874965758
1         1         2       3       876893171
2         1         3       4       878542960
3         1         4       3       876893119
4         1         5       3       889751712
5         1         6       5       887431973
6         1         7       4       875071561
7         1         8       1       875072484
8         1         9       5       878543541
9         1        10       3       875693118
10        1        11       2       875072262
11        1        12       5       878542960
12        1        13       5       875071805
13        1        14       5       874965706
14        1        15       5       875071608
15        1        16       5       878543541
16        1        17       3       875073198
17        1        18       4       887432020
18        1        19       5       875071515
19        1        21       1       878542772
Number of train rates: 90570
Numbe

In [4]:
#item
i_col=["movie_id", "movie title", "release date", "video date", "IMDB URL", "unknown", "Action", "Adventure",
       "Animation", "Children", "Comedy", "Crime", "Documentary", "Drama", "Fantasy","Film-Noir", 
       "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]

items=pd.read_csv("D:/自己的論文/ml-100k/u.item",sep="|",names=i_col,encoding="latin-1")

n_items=items.shape[0]
print("Number of items:",n_items)

Number of items: 1682


In [5]:
xitem=items.values

x_train_counts=xitem[:,-19:]
print(x_train_counts.shape)

(1682, 19)


In [6]:
transformer=TfidfTransformer(smooth_idf=True,norm="l2")
tfidf=transformer.fit_transform(x_train_counts.tolist()).toarray()
print(x_train_counts[0])
print(tfidf)

[0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.53676706 0.65097024 ... 0.53676706 0.         0.        ]
 [0.         0.         0.         ... 1.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [7]:
def get_items_rated_by_user(rate_matrix,user_id):
    """
    回傳(item_ids,scores)
    """
    y=rate_matrix[:,0] #users
    #得到使用者評分
    #初始user_id = 1
    ids=np.where(y == user_id +1)[0]
    item_ids=rate_matrix[ids, 1] - 1 #ids - 1
    scores=rate_matrix[ids, 2]
    return (item_ids,scores)

ids,scores=get_items_rated_by_user(trainrate,0)
list_head=np.concatenate((ids.reshape(len(ids),1), scores.reshape(len(scores),1)),axis=1)[0:10]
print(list_head)

[[0 5]
 [1 3]
 [2 4]
 [3 3]
 [4 3]
 [5 5]
 [6 4]
 [7 1]
 [8 5]
 [9 3]]


In [8]:
#接著做一些線性回歸來預測
d=tfidf.shape[1] #數據維度
w=np.zeros((d,usernum))
b=np.zeros((1,usernum))

for n in range(usernum):
    ids,scores=get_items_rated_by_user(trainrate,n)
    clf=Ridge(alpha=0.01,fit_intercept=True)
    xhat=tfidf[ids,:]

    clf.fit(xhat,scores)
    w[:,n]=clf.coef_
    b[0,n]=clf.intercept_

In [9]:
#預測模型
yhat=tfidf.dot(w) + b #課堂公式

In [10]:
n=100
ids,scores=get_items_rated_by_user(testrate,0)
yhat[n,ids]
print("Rate movie ids:",ids)
print("True ratings:",scores)
print("Predicted ratings:",yhat[ids,n])

Rate movie ids: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  20  21  22  23  24  25  26  27  28  29  30  31  33  34  35  36  37
  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55
  56  57  58  59  61  62  63  64  65  66  67  68  69  70  71  72  73  74
  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92
  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110
 111 112 113 114 115 117 118 119 120 121 122 123 124 125 126 127 128 129
 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
 148 149 150 151 152 153 155 156 157 158 160 161 162 163 164 165 166 167
 168 169 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186
 187 189 190 191 192 193 194 195 196 197 198 199 200 202 203 204 205 206
 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
 243 244 245 246 247 248 249 250 25

In [11]:
def evaluate(yhat,rates,W,b):
    se=0
    cnt=0
    for n in range(usernum):
        ids,scores_truth=get_items_rated_by_user(rates,n)
        scores_pred=yhat[ids,n]
        e=scores_truth - scores_pred
        se+=(e*e).sum(axis=0)
        cnt+=e.size
    return sqrt(se/cnt)

In [12]:
print("RMSE for training:",evaluate(yhat,trainrate,w,b))
print("MAE for test:",evaluate(yhat,testrate,w,b))

RMSE for training: 0.9089804562826721
MAE for test: 0.9089804562826721


# 嘗試別的方法

In [13]:
r_cols=["user_id","movie_id","rating","timestamp"]
ratings=pd.read_csv("D:/自己的論文/ml-100k/u.data",sep="\t",names=r_cols,encoding="latin-1")
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [14]:
#畫以前的資料
fig=px.bar(ratings["rating"].value_counts(normalize=True),
           labels={"value":"Count of records (% of total)","index":"Movie Rating"},
           width=800,height=400,
           title="Brealdown of movie ratings count")
fig.update_layout(showlegend=False)
fig.show()

In [15]:
#獲取資料的用戶數和電影數
user_key="user_id"
item_key="movie_id"

N=len(ratings[user_key].unique())
M=len(ratings[item_key].unique())

print(f"Number of users (N): {N}")
print(f"Number of movies (M): {M}")

Number of users (N): 943
Number of movies (M): 1682


In [16]:
print(f"Average number of ratings per user:{round(len(ratings) / N)}")
print(f"Average number of ratings per movie:{round(len(ratings) / M)}")

Average number of ratings per user:106
Average number of ratings per movie:59


In [17]:
#切訓練資料與訓練資料
x=ratings.copy()
y=ratings[user_key]
x_train,x_valid,y_train,y_valid=train_test_split(x,y,test_size=0.2,random_state=42)

x_train.shape,x_valid.shape

((80000, 4), (20000, 4))

In [18]:
#為訓練集與驗證集創建效用矩陣
user_mapper=dict(zip(np.unique(ratings[user_key]),list(range(N))))
item_mapper=dict(zip(np.unique(ratings[item_key]),list(range(M))))

#創建效用矩陣
def create_ymatrix_from_ratings(data,N,M):
    y=np.zeros((N,M))
    y.fill(np.nan)
    for index,val in data.iterrows():
        n=user_mapper[val[user_key]]
        m=item_mapper[val[item_key]]
        y[n,m]=val["rating"]

    return y

In [19]:
#使用函數創造訓練和驗證矩陣
train_mat=create_ymatrix_from_ratings(x_train,N,M)
valid_mat=create_ymatrix_from_ratings(x_valid,N,M)

In [20]:
print(f"Shape of train_mat N x M: {train_mat.shape}")
print(f"Shape of valid_mat N x M: {valid_mat.shape}")

Shape of train_mat N x M: (943, 1682)
Shape of valid_mat N x M: (943, 1682)


In [21]:
#計算RMSE函數
def error(y1,y2):
    #返回RMSE
    return np.sqrt(np.nanmean((y1-y2) ** 2))

def evaluate(pred_y,train_mat,valid_mat,model_name="Global average"):
    #評估訓練和驗證RMSE
    print("%s train RMSE %0.2f" % (model_name,error(pred_y,train_mat)))
    print("%s valid RMSE %0.2f" % (model_name,error(pred_y,valid_mat)))

In [22]:
#預測每個rating的全局平均
global_avg=np.nanmean(train_mat)
pred_g=np.zeros(train_mat.shape) + global_avg

#評估
evaluate(pred_g,train_mat,valid_mat,model_name="Global average")

Global average train RMSE 1.13
Global average valid RMSE 1.12


In [23]:
#將每個用戶預測為每個用戶的用戶平均值
user_avg=np.nanmean(train_mat,axis=1)
pred_n=np.tile(user_avg[:,None],(1,M))

#評估
evaluate(pred_n,train_mat,valid_mat,model_name="Per-user average")

Per-user average train RMSE 1.03
Per-user average valid RMSE 1.04


In [24]:
#將每個評分預測為每部電影的電影平均值
movie_avg=np.nanmean(train_mat,axis=0)
pred_m=np.tile(movie_avg[None,:],(N,1))

#評估
evaluate(pred_m,train_mat,valid_mat,model_name="Per-movie average")

Per-movie average train RMSE 1.00
Per-movie average valid RMSE 1.02


In [25]:
#將每個評分預測為每個用戶和每部電影的平均值
pred_nm=(user_avg[:,None] + movie_avg[None,:]) / 2
evaluate(pred_nm,train_mat,valid_mat,model_name="Per-movie and per-user average")

Per-movie and per-user average train RMSE 0.96
Per-movie and per-user average valid RMSE 0.98


## 使用KNN

In [26]:
#刪掉所有為NAN的列
knn_train_mat=train_mat[:,~np.all(np.isnan(train_mat),axis=0)]
knn_valid_mat=valid_mat[:,~np.all(np.isnan(train_mat),axis=0)]

#用相鄰20個作為補插法法
imputer=KNNImputer(n_neighbors=20)
knn_preds=imputer.fit_transform(knn_train_mat)

#評估
evaluate(knn_preds,knn_train_mat,knn_valid_mat,model_name="Knn")

Knn train RMSE 0.00
Knn valid RMSE 0.97


## 協同過濾

In [27]:
ratings_drop=ratings.drop(columns="timestamp")

reader=Reader()
data=Dataset.load_from_df(ratings_drop,reader) #載入資料

trainset,validset=surprise.model_selection.train_test_split(data,test_size=0.2,random_state=42)

In [28]:
#訓練模型與評估
k=10 #將維度減少到k
s_svd=SVD(n_factors=k,random_state=42)
s_svd.fit(trainset)
s_svd_preds=s_svd.test(validset)

results=cross_validate(s_svd,data,measures=["RMSE","MAE"],cv=5,verbose=True)
pd.DataFrame(results).mean()

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9346  0.9474  0.9393  0.9430  0.9354  0.9399  0.0048  
MAE (testset)     0.7367  0.7493  0.7406  0.7445  0.7403  0.7423  0.0043  
Fit time          0.53    0.61    0.54    0.73    0.59    0.60    0.07    
Test time         0.13    0.23    0.15    0.13    0.22    0.17    0.04    


test_rmse    0.939930
test_mae     0.742253
fit_time     0.599112
test_time    0.171661
dtype: float64

In [29]:
#餵進模型
trainset,validset=surprise.model_selection.train_test_split(
    data,test_size=0.2,random_state=42
)

k=10
algo=SVD(n_factors=k,random_state=42)
algo.fit(trainset)
svd_preds=algo.test(validset)
accuracy.rmse(svd_preds,verbose=True)

RMSE: 0.9328


0.932839700199596

In [30]:
def get_top_n(predictions,n=10):
    #首先將預測map到每個用戶
    top_n=defaultdict(list)
    for uid,iid,true_r,est, _ in predictions:
        top_n[uid].append((iid,est))

    #然後對每個用戶的預測進行排序，並檢索k個最高的
    for uid,user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1],reverse=True)
        top_n[uid]=user_ratings[:n]

    return top_n

def top_n_recs(user_id,n=5):
    top_n=get_top_n(svd_preds,n=n)
    return pd.DataFrame(top_n[user_id],columns=["movie_id","pred"])

In [31]:
#獲取5個隨機用戶的前5個推薦
user_id_sample=ratings["user_id"].sample(5).to_list()
n=5
for user_id in user_id_sample:
    print("\nTop %d recommendations for user %d" % (n,user_id))
    print(top_n_recs(user_id))


Top 5 recommendations for user 355
   movie_id      pred
0       306  4.758351
1       882  4.202545
2       288  4.188433
3       271  3.954318
4       689  3.691379

Top 5 recommendations for user 586
   movie_id      pred
0        22  3.914017
1       195  3.854375
2       186  3.763902
3       117  3.738260
4       735  3.725849

Top 5 recommendations for user 95
   movie_id      pred
0        22  4.621665
1       144  4.212069
2        95  4.146178
3        71  4.092669
4        28  4.063005

Top 5 recommendations for user 856
   movie_id      pred
0       316  4.016693
1       749  3.143291
2       322  2.932883
3       678  2.527667

Top 5 recommendations for user 268
   movie_id      pred
0        50  4.023189
1       114  3.931475
2       238  3.844342
3       223  3.765577
4       525  3.738543


## 內容過濾

In [32]:
#只取電影的類型
genres=[
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]
movie_genres=items[genres]
movie_genres.head()

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [33]:
z=movie_genres.to_numpy()
z.shape
print("Average number of genres per movie: %.1f" % (z.sum()/M))

Average number of genres per movie: 1.7


In [34]:
def get_x_y_per_user(ratings_df, d=z.shape[1]):
    """
    Returns X and y for each user.

    Parameters:
    ----------
    ratings_df : pandas.DataFrame
         ratings data as a dataframe

    d : int
        number of item features

    Return:
    ----------
        dictionaries containing X and y for all users
    """
    lr_y = defaultdict(list)
    lr_X = defaultdict(list)

    for index, val in ratings_df.iterrows():
        n = user_mapper[val[user_key]]
        m = item_mapper[val[item_key]]
        lr_X[n].append(z[m])
        lr_y[n].append(val["rating"])

    for n in lr_X:
        lr_X[n] = np.array(lr_X[n])
        lr_y[n] = np.array(lr_y[n])

    return lr_X, lr_y

In [35]:
x_train_user,y_train_user=get_x_y_per_user(x_train)
x_valid_user,y_valid_user=get_x_y_per_user(x_valid)

In [36]:
print("User 1:")
print(f"Shape of x_train_user for one user (movies x genres): {pd.DataFrame(x_train_user[1]).shape}")
print(f"Shape of x_train_user for one user (movies x rating): {pd.DataFrame(y_train_user[1]).shape}")
print("")
print("User 25:")
print(f"shape of x_train_user for another user (movie x genres): {pd.DataFrame(x_train_user[25]).shape}")
print(f"shape of x_train_user for another user (movie x user rating): {pd.DataFrame(y_train_user[25]).shape}")


User 1:
Shape of x_train_user for one user (movies x genres): (46, 18)
Shape of x_train_user for one user (movies x rating): (46, 1)

User 25:
shape of x_train_user for another user (movie x genres): (85, 18)
shape of x_train_user for another user (movie x user rating): (85, 1)


In [37]:
def train_user(user_name,model=Ridge()):
    x=x_train_user[user_name]
    y=y_train_user[user_name]
    model.fit(x,y)
    return model

def predict_user(model):
    feat_vecs=movie_genres
    preds=model.predict(feat_vecs)
    return preds

In [38]:
#為第一個user訓練和預測
model_1=train_user(0)
preds=predict_user(model_1)
recon_x=pd.DataFrame(preds)

recon_x.head()

Unnamed: 0,0
0,2.399218
1,2.863537
2,3.549176
3,3.892106
4,3.755807


In [39]:
train_mat

array([[nan,  3.,  4., ..., nan, nan, nan],
       [ 4., nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [ 5., nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan,  5., nan, ..., nan, nan, nan]])

In [40]:
#為每個使用者訓練和預測
users=range(1,train_mat.shape[0])

for i in users:
    model=train_user(i)
    scores=predict_user(model)
    recon_x[i]=pd.DataFrame(scores)

In [41]:
#在效用矩陣中所有電影的預測
recon_x.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,2.399218,2.863537,3.549176,3.892106,3.755807,4.043851,4.691223,2.828793,4.043851,3.88062,...,3.325411,4.043851,4.043851,4.043851,4.043851,4.043851,3.870601,4.365276,3.629682,4.043851
1,3.478037,4.368664,3.162929,4.69979,3.860238,3.819291,3.459575,3.703311,3.819291,3.94057,...,3.550115,3.819291,3.819291,3.819291,3.819291,3.819291,3.612328,4.26869,3.565369,3.819291
2,2.513182,2.525054,2.101808,3.048204,3.000602,2.934748,2.71418,2.732854,2.934748,2.909098,...,2.417158,2.934748,2.934748,2.934748,2.934748,2.934748,2.633232,3.466172,2.513182,2.934748
3,5.031081,3.461739,4.403304,4.966239,4.753188,4.818288,4.48491,5.214774,4.818288,4.818288,...,4.154768,4.818288,4.818288,4.818288,4.818288,4.818288,4.022303,4.437287,5.031081,4.818288
4,3.459727,2.511443,2.216249,3.459372,3.918273,2.736089,3.5408,2.591529,2.736089,2.90319,...,2.330173,2.736089,2.736089,2.736089,2.736089,2.736089,1.403143,1.922983,2.977611,2.736089


In [42]:
evaluate(recon_x.T,train_mat,valid_mat,model_name="Content-Based filtering")

Content-Based filtering train RMSE 0.91
Content-Based filtering valid RMSE 1.05
