In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Matrix Factorization
I will give two vertions: Rank and Scores.

## Rank

In [2]:
df_scores = pd.read_csv("/home/zhangqiyuan/bench_test/all_benchmark_score.csv", index_col=0)
df_scores.replace("-", np.nan, inplace=True)
df_ranks = df_scores.rank(axis=1, method='min', ascending=False, na_option='keep')

rank_data = np.genfromtxt("/home/zhangqiyuan/bench_test/all_bencmark_rank.csv", delimiter=',', skip_header=1)
non_nan_positions = [(idt, col) for idt, (idx, row) in enumerate(df_ranks.iterrows()) for col in df_ranks.columns if pd.notna(row[col])]
#train_set, validate_set = train_test_split(filtered_df, test_size=0.2, random_state=42)
positions_df = pd.DataFrame(non_nan_positions, columns=['idx', 'idy'])
train_positions, validate_positions = train_test_split(positions_df, test_size=0.1, random_state=42)
column_name_to_index = {name: index for index, name in enumerate(df_ranks.columns)}
train_array_positions = np.array([(row['idx'], column_name_to_index[row['idy']]) for idx, row in train_positions.iterrows()])
validate_array_positions = np.array([(row['idx'], column_name_to_index[row['idy']]) for idx, row in validate_positions.iterrows()])
rank_data = rank_data[:, 1:]


In [3]:


def matrix_factorization(R, P, Q, K, train, steps=5000, alpha=0.0002, beta=0.00002):
    '''
    R: rating matrix
    P: |U| * K (User features matrix)
    Q: |D| * K (Item features matrix)
    K: latent features
    steps: iterations
    alpha: learning rate
    beta: regularization parameter'''
    Q = Q.T
    previous_step_e = 100000000
    for step in range(steps):
        print("step:", step)
        for i in range(len(R)):
            for j in range(len(R[i])):
                if not np.isnan(R[i][j]) and np.any(np.isin(train, np.array([i,j]))):
                    # calculate error
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])

                    for k in range(K):
                        # calculate gradient with a and beta parameter
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])

        eR = np.dot(P,Q)

        e = 0

        for i in range(len(R)):

            for j in range(len(R[i])):

                if not np.isnan(R[i][j]) and np.any(np.isin(train, np.array([i,j]))):

                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)

                    for k in range(K):

                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        # 0.001: local minimum
        print("error:", e)
        
        if e < 0.001 or e > previous_step_e:
            break
        previous_step_e = e
    return P, Q.T

In [4]:
R = rank_data
# N: num of User
N = len(R)
print(N)
# M: num of Movie
M = len(R[0])
print(M)
# Num of Features
K = 20

 
P = np.random.rand(N,K)
Q = np.random.rand(M,K)

 
nP, nQ = matrix_factorization(R, P, Q, K, train_positions)

nR = np.dot(nP, nQ.T)

56
88
step: 0
error: 1614519.4091840878
step: 1
error: 707902.1093277158
step: 2
error: 411771.41999234166
step: 3
error: 364227.3257982541
step: 4
error: 354100.6971995164
step: 5
error: 349301.47340142255
step: 6
error: 345969.18743608164
step: 7
error: 343254.8308939683
step: 8
error: 340823.6147565229
step: 9
error: 338502.5654254955
step: 10
error: 336185.9163430118
step: 11
error: 333799.85994135204
step: 12
error: 331287.19100832724
step: 13
error: 328599.8491114044
step: 14
error: 325695.07378496166
step: 15
error: 322533.4601057029
step: 16
error: 319078.15575747
step: 17
error: 315294.8325126355
step: 18
error: 311152.23885334336
step: 19
error: 306623.2154485277
step: 20
error: 301686.0765409869
step: 21
error: 296326.24768740055
step: 22
error: 290538.0154427203
step: 23
error: 284326.19858057017
step: 24
error: 277707.5071116647
step: 25
error: 270711.3318689381
step: 26
error: 263379.7219865946
step: 27
error: 255766.37423414597
step: 28
error: 247934.57967217368
step: 29

In [5]:
def rmse(R, P, Q, validate):
    '''
    Calculates the root mean square error (RMSE) between the actual ratings and the predicted ratings.
    
    Parameters:
    - R: numpy array, the actual ratings matrix.
    - P: numpy array, the user feature matrix.
    - Q: numpy array, the item feature matrix.
    
    Returns:
    - float, the RMSE between the actual and predicted ratings.
    '''
    Q = Q.T
    predicted_R = np.dot(P, Q)
    error = 0
    count = 0
    for i in range(len(R)):
        for j in range(len(R[i])):
            if not np.isnan(R[i][j]) and np.any(np.isin(validate, np.array([i,j]))):
                error += pow(R[i][j] - predicted_R[i][j], 2)
                count += 1
    return np.sqrt(error / count)

In [6]:
rmse(R, nP, nQ, validate_array_positions)

1.87106783966533

In [None]:
#visualize prediction rank
predicted_R=np.dot(nP, nQ.T)
for position in validate_array_positions:
    print(predicted_R[position[0], position[1]], R[position[0], position[1]])

26.866998434949117 29.0
29.439857442229115 28.0
80.39688326907519 79.0
26.937282923081025 28.0
11.087354535540292 14.0
22.062960213564814 20.0
37.37085839139263 40.0
17.280718706887765 17.0
27.535025847749953 27.0
24.81789186869069 25.0
55.09874350954693 55.0
5.319900394626908 5.0
24.114960219247784 21.0
30.542251685624933 33.0
56.11489598804599 58.0
39.578683861455374 42.0
52.76607800098174 50.0
34.70780045336622 35.0
64.80128305391233 68.0
3.7534111939143115 4.0
12.087418573482815 11.0
43.09761644183478 42.0
18.230432295527294 18.0
11.662785811037994 13.0
64.96123692667149 65.0
24.27589405067017 24.0
62.57839340359225 65.0
60.577482048933824 60.0
39.61792852967501 40.0
10.455991717792033 11.0
21.862736655759598 21.0
52.20358159503278 52.0
22.257598301660884 21.0
22.8002750536842 22.0
25.841546428478285 27.0
70.01652553017371 66.0
65.5956963083352 65.0
16.242615362432606 12.0
7.002734277434562 7.0
25.8761859519022 26.0
67.34065805200308 69.0
5.456300252054056 6.0
66.03940010486315 66.