In [11]:
import pandas as pd
import numpy as np
import os
from surprise import SVD, SVDpp, Reader
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import accuracy

In [3]:
# # Change working directory to the directory of this script
# os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Load the dataset
data = pd.read_csv('ml-100k/u.data', sep='\t', names=['user', 'item', 'rating', 'timestamp'])

# Pivot the data to get the user-item matrix
matrix = data.pivot(index='user', columns='item', values='rating').fillna(0)
print(matrix)

# Dataset selection
local_file = 'Gau_biased_data_n10.csv'

item  1     2     3     4     5     6     7     8     9     10    ...  1673  \
user                                                              ...         
1      5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   0.0   
2      4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0  ...   0.0   
3      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
4      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
5      4.0   3.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
...    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
939    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   5.0   0.0  ...   0.0   
940    0.0   0.0   0.0   2.0   0.0   0.0   4.0   5.0   3.0   0.0  ...   0.0   
941    5.0   0.0   0.0   0.0   0.0   0.0   4.0   0.0   0.0   0.0  ...   0.0   
942    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
943    0.0   5.0   0.0   0.0   0.0   0.0   0.0   0.0

Gaussian-Biased

In [46]:
# Calculate the overall average rating
mu = data['rating'].mean()

# Calculate the bias for each user and item
user_bias_mean = data.groupby('user')['rating'].mean() - mu
item_bias_mean = data.groupby('item')['rating'].mean() - mu

# Define the standard deviation for the Gaussian distribution
std_dev = 0.1  # You can adjust this value as needed

# Initialize the bias terms using a Gaussian distribution
user_bias = np.random.normal(loc=user_bias_mean, scale=std_dev, size=matrix.shape[0])
item_bias = np.random.normal(loc=item_bias_mean, scale=std_dev, size=matrix.shape[1])



Stochastic Gradient Descent

In [47]:
# # only update the elements we known

# Define the lambda regularization parameter and the learning rate
lambda_reg = 0.01
learning_rate = 0.005
n_iter = 30

# Perform gradient descent
for _ in range(n_iter):
    for i in range(matrix.shape[0]):
        for j in range(matrix.shape[1]):
            if matrix.iloc[i, j] > 0:  # only for rated items
                error = matrix.iloc[i, j] - (mu + user_bias[i] + item_bias[j])
                user_bias[i] += learning_rate * (error - lambda_reg * user_bias[i])
                item_bias[j] += learning_rate * (error - lambda_reg * item_bias[j])



In [48]:
print(item_bias)
print(user_bias)

[ 0.27736995 -0.29886959 -0.53054703 ... -1.55193715 -0.63243901
 -0.54727525]
[ 8.37862973e-02  1.40548085e-01 -4.07560597e-01  9.68840454e-01
 -3.73246275e-01 -1.40268608e-01  4.81361161e-01  2.58647218e-01
  5.27866945e-01  4.15889138e-01  4.26250598e-03  6.13762639e-01
 -2.31731788e-01  3.00422702e-01 -4.34882511e-01  6.18529666e-01
 -5.14955590e-01  9.69745099e-02 -5.36181590e-02 -4.47258964e-01
 -3.38255428e-01 -1.12497222e-02 -4.46958371e-02  6.02467888e-01
  1.93609775e-01 -4.13266478e-01 -1.49107216e-01  1.84882048e-01
  1.29228485e-01  4.10025530e-01  1.85824116e-01 -1.21734508e-01
  4.72278938e-01  8.22160527e-01 -1.51017301e-02  7.20516608e-01
  2.59229797e-01  5.90275144e-01  2.63435598e-01 -5.29239255e-01
 -1.19214122e-01  2.13004945e-01  2.05416408e-01  5.27697891e-02
  3.48124494e-01  5.04842527e-01  2.15855624e-01 -7.35154783e-03
 -6.00838008e-01  6.79871649e-02 -3.64100114e-01  6.41362307e-01
  2.54857553e-01  3.78426590e-01 -2.41244944e-01  2.87136754e-01
  2.1875452

Update the dataset.

In [49]:
updated_data = matrix.copy()

# Add the biases to the original ratings, only where a rating exists
for i in range(matrix.shape[0]):
    for j in range(matrix.shape[1]):
        if matrix.iloc[i, j] > 0:
            updated_rating = matrix.iloc[i, j] + user_bias[i] + item_bias[j]
            # Clip the rating to the range 0-5
            updated_data.iloc[i, j] = np.clip(updated_rating, 0, 5)

# Reset the index to prepare for melting
updated_data_reset = updated_data.reset_index()

# Melt the DataFrame
melted_df = updated_data_reset.melt(id_vars='user', var_name='item', value_name='rating')

# # Add the timestamps from the original data
# melted_df['timestamp'] = data['timestamp']

# Save to a CSV file
melted_df.to_csv(local_file, index=False, header=False, sep='\t')

Revise the dataset, remove all zero elements to meet the required format. (Please run this only once for each new csv file. Because we have revised all the csv files in the folder, we just comment this code. You can bring them back if you want.)

In [6]:
# df = pd.read_csv(local_file, sep='\t', header=None, names=['user_id', 'movie_id', 'rating'])

# # Convert the 'rating' column to numeric, and set invalid parsing to be set as NaN
# df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# # Remove the rows where 'rating' is 0
# df = df[df['rating'] != 0]

# df.to_csv(local_file, index=False, sep=',', header=False)

SVD

In [8]:
#SVD


# Load CSV file into pandas DataFrame
df = pd.read_csv(local_file, sep=',', header=None, names=['user_id', 'movie_id', 'rating'])

# Initialize a Reader with rating scale of 0 to 5
reader = Reader(rating_scale=(0, 5))

# Load the DataFrame into surprise Dataset
data = Dataset.load_from_df(df, reader)

# Split the Dataset into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Initialize the SVD algorithm
algo = SVD()

# Train the algorithm on the training set
algo.fit(trainset)

# Predict ratings for the test set
predictions = algo.test(testset)

# Compute RMSE of the predictions
accuracy.rmse(predictions)


RMSE: 0.8708


0.8708097841478214

SVD++

In [9]:
# SVD++

# Load CSV file into pandas DataFrame
df = pd.read_csv(local_file, sep=',', header=None, names=['user_id', 'movie_id', 'rating'])

# Initialize a Reader with rating scale of 0 to 5
reader = Reader(rating_scale=(0, 5))

# Load the DataFrame into surprise Dataset
data = Dataset.load_from_df(df, reader)

# Split the Dataset into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Initialize the SVD++ algorithm
svdpp_model = SVDpp()

# Train the algorithm on the training set
svdpp_model.fit(trainset)

# Predict ratings for the test set
predictions = svdpp_model.test(testset)

# Compute RMSE of the predictions
accuracy.rmse(predictions)

# Set the user_id and movie_id for which a prediction is desired
user_id = '196'
movie_id = '302'

# Predict the rating of the user for the movie
predicted_rating = svdpp_model.predict(user_id, movie_id).est

# Print out the predicted rating
print(f"user {user_id} for movie {movie_id} predicted rating：{predicted_rating:.2f}")

RMSE: 0.8521
user 196 for movie 302 predicted rating：3.52


RBM

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.python.ops import control_flow_ops

In [None]:

# Initialize the RBM class
class RBM():
    def __init__(self, input_size, output_size, learning_rate=1.0, batch_size=100):
        # Define hyperparameters, weights and biases
        self._input_size = input_size
        self._output_size = output_size
        self.learning_rate = learning_rate
        self.batch_size = batch_size

        # Initialize weights and biases
        self.w = tf.Variable(tf.random.normal((input_size, output_size), mean=0., stddev=0.01), name='weights')
        self.hb = tf.Variable(tf.zeros([output_size]), name='hidden_bias')
        self.vb = tf.Variable(tf.zeros([input_size]), name='visible_bias')
    
    # Calculate the probabilities of the hidden units given the visible units
    def prob_h_given_v(self, visible, w, hb):
        return tf.nn.sigmoid(tf.matmul(visible, w) + hb)

     # Calculate the probabilities of the visible units given the hidden units
    def prob_v_given_h(self, hidden, w, vb):
        return tf.nn.sigmoid(tf.matmul(hidden, tf.transpose(w)) + vb)

     # Sample a probability distribution
    def sample_prob(self, probs):
        return tf.nn.relu(tf.sign(probs - tf.random.uniform(tf.shape(probs))))

     # Train the RBM
    def train(self, X, epochs=20):
        loss = []
        for epoch in range(epochs):
            for start, end in zip(range(0, len(X), self.batch_size), range(self.batch_size, len(X), self.batch_size)):
                batch = X[start:end]
                h0 = self.sample_prob(self.prob_h_given_v(batch, self.w, self.hb))
                v1 = self.sample_prob(self.prob_v_given_h(h0, self.w, self.vb))
                h1 = self.prob_h_given_v(v1, self.w, self.hb)

                positive_grad = tf.matmul(tf.transpose(batch), h0)
                negative_grad = tf.matmul(tf.transpose(v1), h1)

                self.w = self.w + self.learning_rate * (positive_grad - negative_grad) / tf.dtypes.cast(tf.shape(batch)[0], tf.float32)
                self.vb = self.vb +  self.learning_rate * tf.reduce_mean(batch - v1, 0)
                self.hb = self.hb +  self.learning_rate * tf.reduce_mean(h0 - h1, 0)
            loss.append(tf.reduce_mean(tf.square(batch - v1)))
            print('Epoch %d: loss is %.3f' % (epoch, loss[-1]))
        return loss

# Load Data
df = pd.read_csv(local_file, sep=',', header=None, names=['user_id', 'movie_id', 'rating'])
print(df.columns)

# Preprocessing
n_users = df.user_id.unique().shape[0]
n_movies = df['movie_id'].max()

# Create a user-item matrix which can be used to train the model
ratings = np.zeros((n_users, n_movies))
ratings = ratings.astype(np.float32)
print(ratings.shape)
for row in df.itertuples():
    ratings[row[1]-1, row[2]-1] = row[3]

# Split into training and test datasets
training_set, test_set = train_test_split(ratings, test_size=0.2, random_state=42)

# Initialize the RBM with the right number of visible and hidden units
rbm = RBM(n_movies, 50)

# Train the RBM
rbm.train(training_set)
