In [1]:
import csv
from collections import defaultdict
import numpy as np

In [2]:
import math

def sigmoid(x):
    return 1 / (1 + math.exp(-x))

In [3]:
uKeys = ['username', 'userId', 'name', 'tags', 'followers', 'following', 'bio', 'followerCount', 'followCount', 'postCount', 'posts', 'recommends']
pKeys = ["id", "clapCount", "wordCount", "readingTime", "tags", "title", "userId", "time", "subtitle"]

In [4]:
def readCSV(filename, keys):
    objects = []
    
    with open(filename) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        
        for row in csv_reader:
            if line_count == -1:
                print(f'Column names are {", ".join(row)}')
                line_count += 1
            else:

                u = {}
                for i,k in enumerate(keys): 
                    u[k] = row[i]
                objects.append(u)

                line_count += 1
                
    print(f'Processed {line_count} lines.')
    return objects

In [5]:
readUsers = readCSV('users.csv', uKeys)
users = defaultdict(dict)

for u in readUsers:
    users[u['userId']] = u

Processed 4131 lines.


In [6]:
readPosts = readCSV('posts.csv', pKeys)
posts = defaultdict(dict)

for p in readPosts:
    posts[p['id']] = p

Processed 25163 lines.


In [7]:
len(users)

3544

In [8]:
len(posts)

20265

In [9]:
import ast

testUsers = list(users.keys())
testPosts_ = [ast.literal_eval(users[u]['recommends']) for u in testUsers]
testPosts = []
for arr in testPosts_:
    for p in arr: testPosts.append(p)
        

In [10]:
import random
testPosts = list(set(testPosts))
random.shuffle(testPosts)
len(testPosts)

11521

In [11]:
matrix = np.zeros((len(testUsers), len(testPosts)))

In [66]:
pairs = []
for i,u in enumerate(testUsers):
    for j,p in enumerate(testPosts):
        if p in ast.literal_eval(users[u]['recommends']):
            matrix[i][j] = 1
            pairs.append((i,j))

In [68]:
### randomly remove some user-article claps
### maintain list and compare with output
len(pairs)
random.shuffle(pairs)
hidden_claps = pairs[int(len(pairs) * 0.9):]

In [69]:
len(hidden_claps)

1355

In [70]:
# hide claps
for (i,j) in hidden_claps:
    matrix[i][j] = 0

In [71]:
class OneClassRecommendation():

    def __init__(self, R, K, alpha, beta, iterations):
        """
        Perform matrix factorization to predict empty
        entries in a matrix.

        Arguments
        - R (ndarray)   : user-item rating matrix
        - K (int)       : number of latent dimensions
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        """

        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        # Initialize user and item latent feature matrice
        self.gamma_u = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.gamma_i = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initialize the biases
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # Create a list of training samples
#         self.samples = [
#             (i, j, self.R[i, j])
#             for i in range(self.num_users)
#             for j in range(self.num_items)
#             if self.R[i, j] > 0
#         ]

        # sample user, article-clapped and article-not-clapped
        self.samples = [] # u, i, j
        for u in range(self.num_users):
            claps = []
            non_claps = []
            
            for i in range(self.num_items):
                if self.R[u][i] == 1:
                    claps.append(i)
                else:
                    non_claps.append(i)
            
            random.shuffle(non_claps)
            non_claps[:len(claps)]
            
            for i in range(len(claps)):
                self.samples.append((u, claps[i], non_claps[i]))

        # Perform stochastic gradient descent for number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            gain = self.sga()
#             mse = self.mse()
            training_process.append((i, gain))
            if (i+1) % 10 == 0:
                print("Iteration: %d ; gain = %.4f" % (i+1, gain))

        return training_process

    def mse(self):
        """
        A function to compute the total mean square error
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)

    def dot(self, K, L):
        """
        A function to compute the dot product of two lists
        """
        if len(K) != len(L):
            return 0
        
        return sum(i[0] * i[1] for i in zip(K, L))
    
    def sga(self):
        """
        Perform stochastic graident ascent
        """
        gain = 0
        
        for u, i, j in self.samples:
            # Compute gradient
            dot_prod = self.dot(self.gamma_u[u], self.gamma_i[i]) - self.dot(self.gamma_u[u], self.gamma_i[j])
            exp_sum = math.exp(-1*dot_prod)
            
            for k in range(self.K):
                gr = (self.gamma_i[i][k] - self.gamma_i[j][k]) * (exp_sum) / (1 + exp_sum)

                # Update biases
                self.b_u[u] += self.alpha * (gr - self.beta * self.b_u[u])
                self.b_i[i] += self.alpha * (gr - self.beta * self.b_i[i])
                self.b_i[j] += self.alpha * (gr - self.beta * self.b_i[j])

                # Update user and item latent feature matrices
                self.gamma_u[u][k] += self.alpha * (gr - self.beta * sum(self.gamma_u[u]))
                self.gamma_i[i][k] += self.alpha * (gr - self.beta * sum(self.gamma_i[i]))
                self.gamma_i[j][k] += self.alpha * (gr - self.beta * sum(self.gamma_i[j]))
            
    
            gain +=  sigmoid(dot_prod)
        
        return gain / len(self.samples)

    def get_rating(self, u, i):
        """
        Get the predicted rating of user u and item i
        """
        prediction = self.b + self.b_u[u] + self.b_i[i] + self.gamma_u[u].dot(self.gamma_i[i].T)
        return prediction 

    def full_matrix(self):
        """
        Computer the full matrix using the resultant biases, P and Q
        """
        return self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis:,] + self.gamma_u.dot(self.gamma_i.T)

In [72]:
mf = OneClassRecommendation(matrix, K=100, alpha=0.1, beta=0.01, iterations=200)

In [73]:
mf

<__main__.OneClassRecommendation at 0x1295ade48>

In [74]:
mf.R

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
mf.train()

Iteration: 10 ; gain = 0.5098
Iteration: 20 ; gain = 0.5300
Iteration: 30 ; gain = 0.5543
Iteration: 40 ; gain = 0.5836
Iteration: 50 ; gain = 0.6176
Iteration: 60 ; gain = 0.6546
Iteration: 70 ; gain = 0.6910
Iteration: 80 ; gain = 0.7236
Iteration: 90 ; gain = 0.7512
Iteration: 100 ; gain = 0.7740
Iteration: 110 ; gain = 0.7926
Iteration: 120 ; gain = 0.8081
Iteration: 130 ; gain = 0.8208
Iteration: 140 ; gain = 0.8310
Iteration: 150 ; gain = 0.8391
Iteration: 160 ; gain = 0.8456
Iteration: 170 ; gain = 0.8507


In [None]:
pairs = set(pairs)

In [None]:
err = corr = 0
THRESH = 0.8
for i in range(len(matrix)):
    for j in range(len(matrix[0])):
        if matrix[i][j] == 0: continue
        if matrix[i][j] == 1 and (i,j) not in pairs and mf.get_rating(i, j) > THRESH:
            corr += 1
        else:
            err += 1

In [None]:
# training accuracy
corr / (corr + err)

In [None]:
corr = err = 0
for (i,j) in pairs:
    if mf.get_rating(i,j) > THRESH:
        corr += 1
    else:
        err += 1

corr / (corr + err)

In [29]:
matrix[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [36]:
[i for i in range(len(matrix[0])) if matrix[0][i] == 1]

[3990, 5929, 9058]

In [39]:
def train_validate(testUsers, testPosts):
    num_claps = 0
    for u in testUsers:
        for p in testPosts:
            

8.218833184504723