In [1]:
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import ctypes
from pycuda.compiler import SourceModule
import time
#define CLIP(x, min, max) (fminf(fmaxf(x, min), max))

N = 100  # Adjust as needed

# Define the Row structure
class Row(ctypes.Structure):
    _fields_ = [("col1", ctypes.c_int), ("col2", ctypes.c_float)]



In [2]:
# Create some example row data
rows_data = []

# Define the parallel matrix multiplication kernel
kernel_code = """
struct Row {
    int col1;
    float col2;
};

__global__ void matmul(float *A, float *B, int N, int raters, int movies, Row *rows,float learning_rate) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;  //movies
    int col = blockIdx.x * blockDim.x + threadIdx.x;  //raters
    
    // Access Row data and print it
    //int workingID;
    //float workingRating;
    if (row < movies && col < raters && col>0 && row>0) {
        int found=0;
        float actual_rating = 0.0;
        for(int i=0;i<100;i++){
            int col1_value = rows[(col-1)*100+i].col1;
            //float col2_value = rows[(col-1)*100+i].col2;
            if (col1_value == row){
                found=1;
                //workingID=col1_value;
                //int index=(col-1)*100+i;
                actual_rating=rows[(col-1)*100+i].col2;

                break;
            }
        }
    
        if (found) {
            float predicted_rating = 0.0;
            for (int k = 0; k < N; k++) {
                predicted_rating += A[row * N + k] * B[k * raters + col];
            }

            float error = actual_rating - predicted_rating;
            
            for (int k = 0; k < N; k++) {
                float grad_A = -2.0 * error * B[k * raters + col];
                float grad_B = -2.0 * error * A[row * N + k];

                atomicAdd(&A[row * N + k], -learning_rate * grad_A);
                atomicAdd(&B[k * raters + col], -learning_rate * grad_B);

                


            }
            

        }
    }
}
"""

mod = SourceModule(kernel_code)

def read_csv_to_list_rating(file_path):
    """Read a CSV file into a list of lists with specific column types."""
    with open(file_path, 'r', encoding='utf-8') as file:
        matrix = []
        header = file.readline().strip().split(',')
        for line in file:
            values = line.strip().split(',')
            row = [int(values[0]), int(values[1]), float(values[2]), int(values[3])]
            matrix.append(row)
        return header, matrix

def read_csv_to_list_movies(file_path):
    """Read a CSV file into a list of lists with specific column types."""
    with open(file_path, 'r', encoding='utf-8') as file:
        matrix = []
        header = file.readline().strip().split(',')
        for line in file:
            values = line.strip().split(',')
            row = [int(values[0]), str(values[1]), str(values[2])]
            matrix.append(row)
        return header, matrix



In [3]:
headerRating, matrixrating = read_csv_to_list_rating('ratings.csv')
headerMovies, matrixMovies = read_csv_to_list_movies('movies.csv')

totalRaters = int(matrixrating[-1][0])+1  # total rows
totalMovies = int(matrixMovies[-1][0])+1
print(totalRaters)
print(totalMovies)



200949
292758


In [4]:
d = {}
# row 0 is userID

for row in matrixrating:
    if row[0] not in d:
        d[row[0]] = []
    else:
        d[row[0]].append(list(row[1:4]))

for key, val in d.items():
    d[key] = sorted(val, key=lambda x: x[2])

for key, val in d.items():
    d[key] = d[key][::-1]

newD = {}

for key, val in d.items():
    newD[key] = d[key][:100]

totalUsers = len(newD)

rows_data = []
for i in range(1, (len(newD) + 1)):
    row = newD[i]
    for j in range(100):
        if j < len(row):
            rows_data.append(Row(row[j][0], row[j][1]))
        else:
            rows_data.append(Row(0, 0.0))

matrixAMovies = np.zeros((totalMovies, N), dtype=np.float32)
matrixBUsers = np.zeros((N, totalRaters), dtype=np.float32)
print()
print(len(matrixAMovies))
print(len(matrixAMovies[0]))
print()
print(len(matrixBUsers))
print(len(matrixBUsers[0]))




292758
100

100
200949


In [5]:
# Fill matrices with random values in the range -0.01 to 0.01
matrixAMovies = np.random.uniform(-0.01, 0.01, size=(totalMovies, N)).astype(np.float32)
matrixBUsers = np.random.uniform(-0.01, 0.01, size=(N, totalRaters)).astype(np.float32)
matrixAMovies = np.ascontiguousarray(matrixAMovies, dtype=np.float32)
matrixBUsers = np.ascontiguousarray(matrixBUsers, dtype=np.float32)

A_gpu_matrixAMovies = cuda.mem_alloc(matrixAMovies.nbytes)
B_gpu_matrixBUsers = cuda.mem_alloc(matrixBUsers.nbytes)

if A_gpu_matrixAMovies is None or B_gpu_matrixBUsers is None:
    print("Damn")

rows = (Row * len(rows_data))(*rows_data)

# Allocate memory on the GPU for the rows
rows_gpu = cuda.mem_alloc(ctypes.sizeof(rows))
if rows_gpu is None:
    print("damn")

# Transfer the rows data to the GPU
cuda.memcpy_htod(rows_gpu, rows)

cuda.memcpy_htod(A_gpu_matrixAMovies, matrixAMovies)
cuda.memcpy_htod(B_gpu_matrixBUsers, matrixBUsers)


matmul = mod.get_function("matmul")

# Define grid and block sizes
block_size = (16, 16, 1)  # 16x16 threads per block
grid_size = ((totalRaters + block_size[1] - 1) // block_size[1], 
             (totalMovies + block_size[0] - 1) // block_size[0])


print(f"Matrix A shape: {matrixAMovies.shape}")
print(f"Matrix B shape: {matrixBUsers.shape}")

# Calculate the shared memory size
shared_mem_size = ctypes.sizeof(Row) * min(totalRaters * 100, block_size[0] * block_size[1])
# Run the parallel CUDA kernel
cuda.Context.synchronize()  # Ensure synchronization before execution]
print("A GPU Memory Address:", int(A_gpu_matrixAMovies))
print("B GPU Memory Address:", int(B_gpu_matrixBUsers))
print("Rows GPU Memory Address:", int(rows_gpu))

# start_time = time.time()
# matmul(A_gpu_matrixAMovies, B_gpu_matrixBUsers, np.int32(N), np.int32(totalRaters), np.int32(totalMovies), rows_gpu,learning_rate,
#        block=block_size, grid=grid_size, shared=shared_mem_size)
# cuda.Context.synchronize()  # Ensure synchronization after execution
# end_time = time.time()
# # Copy result back to CPU

# # 🚀 First, copy updated A and B back to CPU
# matrixAMovies_result = np.empty_like(matrixAMovies)
# matrixBUsers_result = np.empty_like(matrixBUsers)

# cuda.memcpy_dtoh(matrixAMovies_result, A_gpu_matrixAMovies)
# cuda.memcpy_dtoh(matrixBUsers_result, B_gpu_matrixBUsers)
# rows_result = np.zeros(len(rows_data), dtype=[('col1', np.int32), ('col2', np.float32)])
# cuda.memcpy_dtoh(rows_result, rows_gpu)




Matrix A shape: (292758, 100)
Matrix B shape: (100, 200949)
A GPU Memory Address: 47343206400
B GPU Memory Address: 47460646912
Rows GPU Memory Address: 47542435840


In [6]:
epochs = 12  # Number of iterations
learning_rate = np.float32(0.001)  # Lower learning rate


for epoch in range(epochs):
    
    matrixAMovies_result = np.empty_like(matrixAMovies)
    matrixBUsers_result = np.empty_like(matrixBUsers)
    rows_result = np.zeros(len(rows_data), dtype=[('col1', np.int32), ('col2', np.float32)])
    start_time = time.time()
    # if epoch % 5 == 0:  # Every 5 epochs, reduce learning rate
    #     learning_rate *= 0.9  # Reduce by 10%

    matmul(A_gpu_matrixAMovies, B_gpu_matrixBUsers, np.int32(N), np.int32(totalRaters),
           np.int32(totalMovies), rows_gpu, learning_rate, block=block_size, grid=grid_size,
           shared=shared_mem_size)
    
    cuda.Context.synchronize()  # Wait for CUDA execution

    # Copy updated matrices and rows back to CPU
    cuda.memcpy_dtoh(matrixAMovies_result, A_gpu_matrixAMovies)
    cuda.memcpy_dtoh(matrixBUsers_result, B_gpu_matrixBUsers)
    cuda.memcpy_dtoh(rows_result, rows_gpu)


    if np.isnan(matrixAMovies_result).any() or np.isnan(matrixBUsers_result).any():
        print("Error: NaN detected in A or B matrix!")
        exit()

    if np.isinf(matrixAMovies_result).any() or np.isinf(matrixBUsers_result).any():
        print("Error: Inf detected in A or B matrix!")
        exit()

    # Compute loss
    mse = 0
    count = 0
    for i in range(len(rows_result)):
        if rows_result[i]['col1'] > 0:
            actual = rows_result[i]['col2']
            predicted = np.dot(matrixAMovies_result[rows_result[i]['col1'], :], matrixBUsers_result[:, i // 100])
            
            if np.isnan(predicted) or np.isinf(predicted):
                print(f"Error in prediction: row {i}, actual {actual}, predicted {predicted}")
                exit()

            mse += (actual - predicted) ** 2
            count += 1

    mse = mse / count if count > 0 else float('inf')  # Avoid division by zero

    end_time = time.time()
    print(f"Epoch {epoch+1}/{epochs}, MSE: {mse:.5f}, Time: {end_time - start_time:.5f} sec")





# Track execution time for CUDA

print(f"Parallel GPU Execution Time (CUDA): {end_time - start_time:.5f} seconds")

Epoch 1/12, MSE: 8.12269, Time: 107.02863 sec
Epoch 2/12, MSE: 2.42787, Time: 107.28975 sec
Epoch 3/12, MSE: 1.83072, Time: 108.48514 sec
Epoch 4/12, MSE: 1.63940, Time: 109.62068 sec
Epoch 5/12, MSE: 1.54110, Time: 109.73906 sec
Epoch 6/12, MSE: 1.48234, Time: 107.81931 sec
Epoch 7/12, MSE: 1.44399, Time: 106.77328 sec
Epoch 8/12, MSE: 1.41750, Time: 104.14234 sec
Epoch 9/12, MSE: 1.39879, Time: 104.32989 sec
Epoch 10/12, MSE: 1.38598, Time: 104.06038 sec
Epoch 11/12, MSE: 1.37795, Time: 104.14620 sec
Epoch 12/12, MSE: 1.37366, Time: 104.37414 sec
Parallel GPU Execution Time (CUDA): 104.37414 seconds


In [7]:
user_id = 1  # Example user (change as needed)
movie_id = rows_result[17]['col1']  # Get the movie ID from rows_result

predicted = np.dot(matrixAMovies_result[movie_id, :], matrixBUsers_result[:, user_id])
print(f"Predicted rating for user {user_id} and movie {movie_id}: {predicted:.2f}")


Predicted rating for user 1 and movie 1357: 3.42


In [8]:
predicted

3.4202788

In [9]:
for i in range(100):
    print(f'movie {rows_data[i].col1} rating {rows_data[i].col2}')

movie 2125 rating 4.0
movie 2020 rating 5.0
movie 1392 rating 4.0
movie 926 rating 5.0
movie 2712 rating 1.0
movie 2329 rating 5.0
movie 1952 rating 4.0
movie 1810 rating 3.0
movie 971 rating 3.0
movie 645 rating 4.0
movie 302 rating 4.0
movie 2313 rating 5.0
movie 1228 rating 5.0
movie 1056 rating 5.0
movie 307 rating 5.0
movie 1693 rating 5.0
movie 1406 rating 2.0
movie 1357 rating 1.0
movie 1041 rating 5.0
movie 2520 rating 5.0
movie 1296 rating 3.0
movie 1183 rating 4.0
movie 923 rating 5.0
movie 908 rating 5.0
movie 25 rating 1.0
movie 2312 rating 1.0
movie 1961 rating 1.0
movie 1721 rating 5.0
movie 1247 rating 5.0
movie 1221 rating 5.0
movie 1213 rating 5.0
movie 1150 rating 4.0
movie 1120 rating 1.0
movie 30 rating 5.0
movie 2025 rating 5.0
movie 1719 rating 5.0
movie 1263 rating 5.0
movie 111 rating 5.0
movie 36 rating 1.0
movie 2268 rating 1.0
movie 1276 rating 3.0
movie 1172 rating 3.0
movie 608 rating 2.0
movie 80 rating 5.0
movie 1225 rating 4.0
movie 1203 rating 5.0
movie

In [10]:
listUnused =[]
for m in (matrixrating[0:142]):
    hit=0
    for i in range(100):
        if int(m[1]) ==int(rows_data[i].col1):
            
            hit=1
            break
    if hit==0:
        listUnused.append([int(m[1]),int(m[2])])

In [11]:
len(listUnused)

42

In [12]:
for i in range(len(listUnused)):
    user_id = 1  # Example user (change as needed)
    movie_id = listUnused[i][0]  # Get the movie ID from rows_result
    actualPred = listUnused[i][1]
    predicted = np.dot(matrixAMovies_result[movie_id, :], matrixBUsers_result[:, user_id])
    print(f"Predicted rating for user {user_id} and movie {movie_id}: {predicted:.2f}     Actual {actualPred}")

Predicted rating for user 1 and movie 17: 3.38     Actual 4
Predicted rating for user 1 and movie 29: 3.41     Actual 2
Predicted rating for user 1 and movie 32: 2.92     Actual 5
Predicted rating for user 1 and movie 34: 2.85     Actual 2
Predicted rating for user 1 and movie 110: 2.73     Actual 3
Predicted rating for user 1 and movie 166: 2.59     Actual 5
Predicted rating for user 1 and movie 232: 3.53     Actual 5
Predicted rating for user 1 and movie 260: 3.13     Actual 5
Predicted rating for user 1 and movie 356: 2.88     Actual 2
Predicted rating for user 1 and movie 541: 3.33     Actual 5
Predicted rating for user 1 and movie 838: 3.36     Actual 5
Predicted rating for user 1 and movie 1136: 3.55     Actual 1
Predicted rating for user 1 and movie 1196: 3.47     Actual 5
Predicted rating for user 1 and movie 1197: 3.66     Actual 1
Predicted rating for user 1 and movie 1199: 3.49     Actual 2
Predicted rating for user 1 and movie 1210: 3.43     Actual 2
Predicted rating for us

In [13]:
print(listUnused)

[[17, 4], [29, 2], [32, 5], [34, 2], [110, 3], [166, 5], [232, 5], [260, 5], [356, 2], [541, 5], [838, 5], [1136, 1], [1196, 5], [1197, 1], [1199, 2], [1210, 2], [1211, 2], [1236, 4], [1288, 4], [1653, 4], [1748, 5], [1784, 1], [1923, 2], [1965, 3], [2232, 3], [2324, 1], [2336, 5], [2352, 3], [2396, 5], [2529, 5], [2599, 5], [2640, 5], [2724, 1], [2882, 1], [2890, 4], [2918, 4], [2966, 1], [2985, 5], [2997, 4], [3030, 4], [3078, 2], [31, 5]]
