In [None]:
import numpy as np
import pandas as pd
import random
from sklearn.utils import shuffle

In [None]:
# Load the ratings table from CSV into a DataFrame.
# NOTE(review): the absolute path looks like a Colab-mounted Drive folder --
# confirm it exists (or make it configurable) before running elsewhere.
data=pd.read_csv("/content/drive/MyDrive/data/data1.csv")

In [None]:
# Preview the first rows to check which columns the CSV actually contains.
data.head()

Unnamed: 0.1,Unnamed: 0,users,movieId,rating
0,1,9,1,1
1,2,12,1,1
2,3,10,1,-1
3,4,13,1,1
4,5,14,1,1


In [None]:
# Keep only the three columns the rest of the notebook relies on, in the
# order (user, item, rating) that the positional lookups further down expect.
# Selecting by name instead of dropping "the first column" keeps this cell
# idempotent: re-running it no longer strips one more column each time, and
# any number of leftover `Unnamed: ...` index columns from the CSV is handled.
data = data[['users', 'movieId', 'rating']]

In [None]:
# Shuffle all ratings once with a fixed seed, then cut the shuffled frame
# into a leading train part and a trailing test part.
TRAIN_SIZE = 0.75
ratings = shuffle(data, random_state=1)
cutoff = int(len(ratings) * TRAIN_SIZE)
ratings_train, ratings_test = ratings.iloc[:cutoff], ratings.iloc[cutoff:]

In [None]:
##### CF recommendation algorithm >>>>>>>>>>>>>>>

# User x movie matrix of the training ratings (NaN where a user has no rating).
rating_matrix = ratings_train.pivot(index='users', columns='movieId', values='rating')

# Cosine similarity between every pair of training users, with missing
# ratings counted as 0 for the purpose of the similarity only.
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.fillna(0)
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=rating_matrix.index,
    columns=rating_matrix.index,
)

# Per-user mean training rating, and each rating's deviation from that mean.
rating_mean = rating_matrix.mean(axis=1)
rating_bias = rating_matrix.sub(rating_mean, axis=0)

def CF_knn_bias(user_id, movie_id, neighbor_size=0):
    """Predict one rating with user-based, mean-centred collaborative filtering.

    The prediction is the target user's mean training rating plus the
    similarity-weighted average of the other raters' deviations from their own
    means for ``movie_id``. Reads the module-level ``rating_mean``,
    ``rating_bias`` and ``user_similarity`` tables.

    Parameters
    ----------
    user_id
        Label of the target user in ``rating_mean`` / ``user_similarity``.
    movie_id
        Column label of the movie in ``rating_bias``.
    neighbor_size : int, default 0
        0 uses every user who rated the movie; any other value keeps only that
        many raters with the highest similarity to ``user_id``.

    Returns
    -------
    float
        The predicted rating. Falls back to the user's mean rating when the
        movie has no column in ``rating_bias``, when a neighbour limit is set
        but at most one rater exists, or when the similarity weights sum to
        zero (the weighted average is undefined and used to come out as NaN).

    Raises
    ------
    KeyError
        If ``user_id`` is not present in the training tables.
    """
    user_mean = rating_mean[user_id]
    if movie_id not in rating_bias:
        return user_mean

    # Restrict both vectors to the users who actually rated this movie.
    deviations = rating_bias[movie_id].dropna()
    similarities = user_similarity[user_id].loc[deviations.index].to_numpy()
    deviations = deviations.to_numpy()

    if neighbor_size != 0:
        if len(similarities) <= 1:
            return user_mean
        # argsort is ascending, so the tail holds the most similar raters.
        keep = min(neighbor_size, len(similarities))
        nearest = np.argsort(similarities)[-keep:]
        similarities = similarities[nearest]
        deviations = deviations[nearest]

    weight_sum = similarities.sum()
    if weight_sum == 0:
        # No usable neighbour signal: avoid 0/0 and predict the user's mean.
        return user_mean
    return np.dot(similarities, deviations) / weight_sum + user_mean


In [None]:
##### MF recommendation algorithm >>>>>>>>>>>>>>>

class NEW_MF():
    """Biased matrix factorisation trained with stochastic gradient descent.

    A rating for user row ``i`` and item column ``j`` is modelled as
    ``b + b_u[i] + b_d[j] + P[i] . Q[j]``. Cells of the rating matrix that are
    equal to 0 are treated as "no rating" everywhere in this class.
    """

    def __init__(self, ratings, K, alpha, beta, iterations, verbose=True):
        """Store the rating matrix, the id lookups and the hyper-parameters.

        Parameters
        ----------
        ratings : pandas.DataFrame
            Rating matrix with users as rows and items as columns; 0 = missing.
        K : int
            Number of latent factors.
        alpha : float
            SGD learning rate.
        beta : float
            Regularisation strength.
        iterations : int
            Number of SGD passes over the training samples.
        verbose : bool, default True
            Print train/test RMSE every 10th iteration.
        """
        # np.array() makes its own copy by default, so zeroing held-out
        # entries in set_test() leaves the caller's frame alone.
        self.R = np.array(ratings)

        # Label <-> positional-index lookups. Iterating a DataFrame yields its
        # column labels; iterating its transpose yields the row labels.
        item_labels = list(ratings)
        user_labels = list(ratings.T)
        self.item_id_index = {label: pos for pos, label in enumerate(item_labels)}
        self.index_item_id = {pos: label for pos, label in enumerate(item_labels)}
        self.user_id_index = {label: pos for pos, label in enumerate(user_labels)}
        self.index_user_id = {pos: label for pos, label in enumerate(user_labels)}

        self.num_users, self.num_items = np.shape(self.R)
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.verbose = verbose
        # Filled by set_test(); starts empty so test() also works when no
        # hold-out set has been registered.
        self.test_set = []

    def rmse(self):
        """Return the RMSE over the non-zero cells of ``self.R``.

        As a side effect the per-cell predictions and errors are stored in
        ``self.predictions`` and ``self.errors``.
        """
        xs, ys = self.R.nonzero()
        self.predictions = np.array([self.get_prediction(x, y) for x, y in zip(xs, ys)])
        self.errors = self.R[xs, ys] - self.predictions
        return np.sqrt(np.mean(self.errors**2))

    def get_prediction(self, i, j):
        """Return the predicted rating for user row ``i`` and item column ``j``."""
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    def sgd(self):
        """Run one SGD pass over ``self.samples``, updating biases, P and Q in place."""
        for i, j, r in self.samples:
            prediction = self.get_prediction(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (e - self.beta * self.b_d[j])

            # Note: the Q update below uses the P row that was just updated.
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])

    def set_test(self, ratings_test):
        """Register a hold-out set and remove it from the training matrix.

        Parameters
        ----------
        ratings_test : pandas.DataFrame
            Its first three columns are read positionally as user label, item
            label and rating.

        Returns
        -------
        list
            ``[user_index, item_index, rating]`` triples, also kept in
            ``self.test_set``. The matching cells of ``self.R`` are set to 0.

        Raises
        ------
        KeyError
            If a user or item label is not part of the rating matrix.
        """
        test_set = []
        # Iterate the columns directly instead of three scalar .iloc lookups
        # per row, which is far slower on large frames.
        rows = zip(ratings_test.iloc[:, 0], ratings_test.iloc[:, 1], ratings_test.iloc[:, 2])
        for user_label, item_label, rating in rows:
            x = self.user_id_index[user_label]
            y = self.item_id_index[item_label]
            test_set.append([x, y, rating])
            self.R[x, y] = 0                    # held-out cell becomes "missing"
        self.test_set = test_set
        return test_set

    def test_rmse(self):
        """Return the RMSE over ``self.test_set``, or NaN when it is empty."""
        if not self.test_set:
            return np.nan
        error = 0
        for x, y, rating in self.test_set:
            error += pow(rating - self.get_prediction(x, y), 2)
        return np.sqrt(error/len(self.test_set))

    def test(self):
        """Train the model and track train/test RMSE after every iteration.

        Initialises P, Q and the bias terms, then runs ``self.iterations`` SGD
        passes over the shuffled non-zero cells of ``self.R``. Uses the global
        ``np.random`` state for the initial factors and the shuffling.

        Returns
        -------
        list of tuple
            ``(iteration, train_rmse, test_rmse)`` for every iteration;
            ``test_rmse`` is NaN when no hold-out set was registered.
        """
        # Initializing user-feature and item-feature matrix
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initializing the bias terms; the global bias is the mean observed rating
        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R.nonzero()])

        # List of training samples: (user index, item index, rating)
        rows, columns = self.R.nonzero()
        self.samples = [(i, j, self.R[i,j]) for i, j in zip(rows, columns)]

        # Stochastic gradient descent for given number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse1 = self.rmse()
            rmse2 = self.test_rmse()
            training_process.append((i+1, rmse1, rmse2))
            if self.verbose:
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.4f ; Test RMSE = %.4f" % (i+1, rmse1, rmse2))
        return training_process

    def get_one_prediction(self, user_id, item_id):
        """Return the predicted rating for the given user and item labels."""
        prediction = self.get_prediction(self.user_id_index[user_id], self.item_id_index[item_id])
        return prediction

    def full_prediction(self):
        """Return the full predicted user x item rating matrix."""
        return self.b + self.b_u[:,np.newaxis] + self.b_d[np.newaxis,:] + self.P.dot(self.Q.T)

# Build the MF model on the full user x movie matrix and train it.
# Unrated cells are filled with 0, which NEW_MF treats as "missing".
R_temp = ratings.pivot(index='users', columns='movieId', values='rating').fillna(0)
mf = NEW_MF(R_temp, K=200, alpha=0.001, beta=0.02, iterations=250, verbose=True)
# Register the hold-out ratings; they are zeroed in the model's rating matrix.
test_set = mf.set_test(ratings_test)
# NEW_MF.test() draws its initial factors and shuffles the samples through the
# global np.random state, so seed it to make the run reproducible.
np.random.seed(1)
result = mf.test()


Iteration: 10 ; Train RMSE = 0.8609 ; Test RMSE = 0.7942
Iteration: 20 ; Train RMSE = 0.8380 ; Test RMSE = 0.8002
Iteration: 30 ; Train RMSE = 0.8255 ; Test RMSE = 0.8046
Iteration: 40 ; Train RMSE = 0.8173 ; Test RMSE = 0.8070
Iteration: 50 ; Train RMSE = 0.8115 ; Test RMSE = 0.8082
Iteration: 60 ; Train RMSE = 0.8073 ; Test RMSE = 0.8089
Iteration: 70 ; Train RMSE = 0.8041 ; Test RMSE = 0.8093
Iteration: 80 ; Train RMSE = 0.8017 ; Test RMSE = 0.8096
Iteration: 90 ; Train RMSE = 0.7998 ; Test RMSE = 0.8100
Iteration: 100 ; Train RMSE = 0.7984 ; Test RMSE = 0.8104
Iteration: 110 ; Train RMSE = 0.7972 ; Test RMSE = 0.8108
Iteration: 120 ; Train RMSE = 0.7961 ; Test RMSE = 0.8111
Iteration: 130 ; Train RMSE = 0.7953 ; Test RMSE = 0.8115
Iteration: 140 ; Train RMSE = 0.7945 ; Test RMSE = 0.8118
Iteration: 150 ; Train RMSE = 0.7937 ; Test RMSE = 0.8122
Iteration: 160 ; Train RMSE = 0.7929 ; Test RMSE = 0.8125
Iteration: 170 ; Train RMSE = 0.7921 ; Test RMSE = 0.8127
Iteration: 180 ; Train 

In [None]:
##### Hybrid recommendation algorithm

def recommender0(recomm_list, mf):
    """Return the MF prediction for every (user, movie) pair as a NumPy array.

    ``mf`` must provide ``get_one_prediction(user, movie)``.
    """
    scores = []
    for user, movie in recomm_list:
        scores.append(mf.get_one_prediction(user, movie))
    return np.array(scores)

def recommender1(recomm_list, neighbor_size=0):
    """Return the CF prediction for every (user, movie) pair as a NumPy array.

    ``neighbor_size`` is passed through to ``CF_knn_bias`` unchanged.
    """
    scores = []
    for user, movie in recomm_list:
        scores.append(CF_knn_bias(user, movie, neighbor_size))
    return np.array(scores)

def RMSE2(y_true, y_pred):
    """Return the root-mean-squared error between two equal-length rating sequences."""
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred)) ** 2))

# (user, movie) pairs and true ratings of the hold-out set, selected by column
# name so the result does not depend on the frame's column order.
recomm_list = np.array(ratings_test[['users', 'movieId']])
y_true = ratings_test['rating']

# Score each model on its own. The values are printed explicitly because a
# bare expression in the middle of a cell is not displayed.
predictions0 = recommender0(recomm_list, mf)
print("MF RMSE = %.7f" % RMSE2(y_true, predictions0))
predictions1 = recommender1(recomm_list, 37)
print("CF RMSE = %.7f" % RMSE2(y_true, predictions1))

# One fixed blend of the two models (MF weight first, CF weight second).
weight = [0.8, 0.2]
predictions = predictions0 * weight[0] + predictions1 * weight[1]
print("Hybrid (%.2f : %.2f) RMSE = %.7f" % (weight[0], weight[1], RMSE2(y_true, predictions)))

# Sweep the MF weight over [0, 1) in steps of 0.01 and report the blend's RMSE.
for i in np.arange(0, 1, 0.01):
    weight = [i, 1.0 - i]
    predictions = predictions0 * weight[0] + predictions1 * weight[1]
    print("Weights - %.2f : %.2f ; RMSE = %.7f" % (weight[0], 
           weight[1], RMSE2(y_true, predictions)))


Weights - 0.00 : 1.00 ; RMSE = 0.7492532
Weights - 0.01 : 0.99 ; RMSE = 0.7493927
Weights - 0.02 : 0.98 ; RMSE = 0.7495428
Weights - 0.03 : 0.97 ; RMSE = 0.7497035
Weights - 0.04 : 0.96 ; RMSE = 0.7498746
Weights - 0.05 : 0.95 ; RMSE = 0.7500563
Weights - 0.06 : 0.94 ; RMSE = 0.7502485
Weights - 0.07 : 0.93 ; RMSE = 0.7504511
Weights - 0.08 : 0.92 ; RMSE = 0.7506643
Weights - 0.09 : 0.91 ; RMSE = 0.7508880
Weights - 0.10 : 0.90 ; RMSE = 0.7511221
Weights - 0.11 : 0.89 ; RMSE = 0.7513667
Weights - 0.12 : 0.88 ; RMSE = 0.7516218
Weights - 0.13 : 0.87 ; RMSE = 0.7518873
Weights - 0.14 : 0.86 ; RMSE = 0.7521632
Weights - 0.15 : 0.85 ; RMSE = 0.7524495
Weights - 0.16 : 0.84 ; RMSE = 0.7527463
Weights - 0.17 : 0.83 ; RMSE = 0.7530535
Weights - 0.18 : 0.82 ; RMSE = 0.7533710
Weights - 0.19 : 0.81 ; RMSE = 0.7536989
Weights - 0.20 : 0.80 ; RMSE = 0.7540372
Weights - 0.21 : 0.79 ; RMSE = 0.7543858
Weights - 0.22 : 0.78 ; RMSE = 0.7547447
Weights - 0.23 : 0.77 ; RMSE = 0.7551140
Weights - 0.24 :