In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Matrix Factorization
I will give two vertions: Rank and Scores.

## Rank

In [2]:
df_scores = pd.read_csv("/home/zhangqiyuan/bench_test/all_benchmark_score.csv", index_col=0)
df_scores.replace("-", np.nan, inplace=True)
#df_ranks = df_scores.rank(axis=1, method='min', ascending=False, na_option='keep')

score_data = np.genfromtxt("/home/zhangqiyuan/bench_test/all_benchmark_score.csv", delimiter=',', skip_header=1)
non_nan_positions = [(idt, col) for idt, (idx, row) in enumerate(df_scores.iterrows()) for col in df_scores.columns if pd.notna(row[col])]
#train_set, validate_set = train_test_split(filtered_df, test_size=0.2, random_state=42)
positions_df = pd.DataFrame(non_nan_positions, columns=['idx', 'idy'])
train_positions, validate_positions = train_test_split(positions_df, test_size=0.1, random_state=42)
column_name_to_index = {name: index for index, name in enumerate(df_scores.columns)}
train_array_positions = np.array([(row['idx'], column_name_to_index[row['idy']]) for idx, row in train_positions.iterrows()])
validate_array_positions = np.array([(row['idx'], column_name_to_index[row['idy']]) for idx, row in validate_positions.iterrows()])
score_data = score_data[:, 1:]
score_data

array([[0.238, 0.243, 0.264, ..., 0.525, 0.53 , 0.386],
       [0.464, 0.581, 0.457, ...,   nan,   nan,   nan],
       [0.238, 0.326, 0.294, ..., 0.749, 0.702, 0.629],
       ...,
       [  nan,   nan,   nan, ..., 0.578, 0.519, 0.397],
       [  nan,   nan,   nan, ..., 0.445, 0.497, 0.312],
       [  nan,   nan,   nan, ..., 0.088, 0.117, 0.023]])

In [3]:
#normalization
original_nan_positions = np.isnan(score_data)
array_temp = np.where(original_nan_positions, 0, score_data)
row_norms = np.linalg.norm(array_temp, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1

normalized_array = array_temp / row_norms

# Restore original NaN values
normalized_array[original_nan_positions] = np.nan

score_data = normalized_array

In [4]:


def matrix_factorization(R, P, Q, K, train, steps=7000, alpha=0.0002, beta=0.00002):
    '''
    R: rating matrix
    P: |U| * K (User features matrix)
    Q: |D| * K (Item features matrix)
    K: latent features
    steps: iterations
    alpha: learning rate
    beta: regularization parameter'''
    Q = Q.T
    previous_step_e = 100000000
    for step in range(steps):
        print("step:", step)
        for i in range(len(R)):
            for j in range(len(R[i])):
                if not np.isnan(R[i][j]) and np.any(np.isin(train, np.array([i,j]))):
                    # calculate error
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])

                    for k in range(K):
                        # calculate gradient with a and beta parameter
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])

        eR = np.dot(P,Q)

        e = 0

        for i in range(len(R)):

            for j in range(len(R[i])):

                if not np.isnan(R[i][j]) and np.any(np.isin(train, np.array([i,j]))):

                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)

                    for k in range(K):

                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        # 0.001: local minimum
        print("error:", e)
        
        if e < 0.001 or e > previous_step_e:
            break
        previous_step_e = e
    return P, Q.T

In [5]:
R = score_data
# N: num of User
N = len(R)
print(N)
# M: num of Movie
M = len(R[0])
print(M)
# Num of Features
K = 15

 
P = np.random.rand(N,K)
Q = np.random.rand(M,K)

 

nP, nQ = matrix_factorization(R, P, Q, K, train_positions)

nR = np.dot(nP, nQ.T)

56
88
step: 0
error: 29106.930184700446
step: 1
error: 22636.398112916995
step: 2
error: 18091.284016177655
step: 3
error: 14777.107967024484
step: 4
error: 12286.87905553045
step: 5
error: 10369.094968003343
step: 6
error: 8861.461201287942
step: 7
error: 7655.405320798406
step: 8
error: 6676.045765077997
step: 9
error: 5870.370228964955
step: 10
error: 5199.98925778595
step: 11
error: 4636.546423591105
step: 12
error: 4158.726849463272
step: 13
error: 3750.2577677933004
step: 14
error: 3398.5419352826193
step: 15
error: 3093.7047534030166
step: 16
error: 2827.917813452794
step: 17
error: 2594.9108156259977
step: 18
error: 2389.6141707407983
step: 19
error: 2207.8937468381546
step: 20
error: 2046.3515596891161
step: 21
error: 1902.1743040497925
step: 22
error: 1773.0170307326027
step: 23
error: 1656.9129445599644
step: 24
error: 1552.2028256254894
step: 25
error: 1457.4793406475278
step: 26
error: 1371.5427585977557
step: 27
error: 1293.3654771525269
step: 28
error: 1222.0634119634701

In [6]:
def rmse(R, P, Q, validate):
    '''
    Calculates the root mean square error (RMSE) between the actual ratings and the predicted ratings.
    
    Parameters:
    - R: numpy array, the actual ratings matrix.
    - P: numpy array, the user feature matrix.
    - Q: numpy array, the item feature matrix.
    
    Returns:
    - float, the RMSE between the actual and predicted ratings.
    '''
    Q = Q.T
    predicted_R = np.dot(P, Q)
    error = 0
    count = 0
    for i in range(len(R)):
        for j in range(len(R[i])):
            if not np.isnan(R[i][j]) and np.any(np.isin(validate, np.array([i,j]))):
                error += pow(R[i][j] - predicted_R[i][j], 2)
                count += 1
    return np.sqrt(error / count)

In [7]:
rmse(R, nP, nQ, validate_array_positions)
with open('norm_score_nP.npy', 'wb') as f:
    np.save(f, nP)
with open('norm_score_nQ.npy', 'wb') as f:
    np.save(f, nQ)

In [9]:
#visualization predicted score
predicted_R=np.dot(nP, nQ.T)
for position in validate_array_positions:
    print(predicted_R[position[0], position[1]], R[position[0], position[1]])

0.059776822258974696 0.04087906282998202
0.09975565584713321 0.12056795581660754
0.053094527349905406 0.041545480537934616
0.1100602042229059 0.1213977395138023
0.19391846013411512 0.1739523740198809
0.14533888531684805 0.1287797268250796
-0.061294685288476335 -0.03751057082015702
0.17676549778318176 0.18117219884780283
0.15247376962391077 0.12839519600720245
0.14513356746574252 0.16104947925902854
0.12600152870418801 0.10879970252293641
0.1279992498390616 0.13971546498847026
0.11491972312575682 0.12294200839246197
0.11045258529336967 0.11232887703042732
0.06711517698313663 0.07342710143547414
0.07534141545521157 0.11378494845702344
0.06027932463227989 0.04477797729367133
0.12988495878258954 0.12128491428786353
0.002037538393946842 0.00553862933853129
0.14448196963936957 0.17074191120975285
0.1718948864179305 0.1847448637460305
0.08494914306019949 0.11447442300220795
0.1278128940097449 0.17640568586860553
0.1347759045128479 0.13555372773349456
0.07414363162136488 0.029028622963186843
0

In [10]:
# convert predicted score to rank
nan_mask = np.isnan(score_data)
predict_score = np.dot(nP, nQ.T)
predict_score[nan_mask] = np.nan
predict_score = pd.DataFrame(predict_score)
predict_rank = predict_score.rank(axis=1, method='min', ascending=False, na_option='keep')

In [11]:
# evaluation
rank_data = np.genfromtxt("/home/zhangqiyuan/bench_test/all_bencmark_rank.csv", delimiter=',', skip_header=1)
rank_data = rank_data[:, 1:]
error = 0
count = 0
error_1 = []
for position in validate_array_positions:
    print(position, predict_rank.iloc[position[0], position[1]], rank_data[position[0], position[1]])
    error += pow(rank_data[position[0], position[1]] - predict_rank.iloc[position[0], position[1]], 2)
    error_1.append(abs(rank_data[position[0], position[1]] - predict_rank.iloc[position[0], position[1]]))
    count += 1
print(np.sqrt(error / count), sum(error_1)/count)
print(error_1)

[27 50] 35.0 29.0
[ 2 45] 44.0 28.0
[3 3] 79.0 79.0
[38  6] 33.0 28.0
[48 13] 4.0 14.0
[33  0] 17.0 20.0
[43  4] 2.0 40.0
[ 6 15] 13.0 17.0
[13 25] 10.0 27.0
[37  0] 30.0 25.0
[29 41] 27.0 55.0
[29 35] 24.0 5.0
[ 2 19] 26.0 21.0
[ 0 44] 27.0 33.0
[ 0 56] 70.0 58.0
[ 2 24] 63.0 42.0
[26 57] 54.0 50.0
[29 55] 19.0 35.0
[25 17] 83.0 68.0
[20 40] 13.0 4.0
[36 52] 15.0 11.0
[5 5] 56.0 42.0
[36 15] 26.0 18.0
[29 25] 16.0 13.0
[32 66] 62.0 65.0
[ 7 25] 23.0 24.0
[21 12] 62.0 65.0
[21 18] 50.0 60.0
[46 16] 36.0 40.0
[20 23] 9.0 11.0
[ 7 24] 28.0 21.0
[24 20] 40.0 52.0
[13 24] 47.0 21.0
[30 18] 19.0 22.0
[26 44] 15.0 27.0
[25 65] 78.0 66.0
[31  2] 62.0 65.0
[ 2 22] 35.0 12.0
[40 26] 7.0 7.0
[45  5] 10.0 26.0
[23  0] 65.0 69.0
[ 2 37] 37.0 6.0
[ 2 60] 12.0 66.0
[17 27] 16.0 10.0
[31 52] 48.0 58.0
[26 27] 52.0 24.0
[31  1] 66.0 64.0
[34  6] 25.0 26.0
[33 20] 22.0 14.0
[ 4 70] 3.0 2.0
[ 5 46] 19.0 18.0
[13 42] 11.0 12.0
[14 56] 34.0 32.0
[39  0] 34.0 27.0
[30 35] 24.0 13.0
[7 4] 46.0 45.0
[10 31] 