In [19]:
import csv
from collections import defaultdict
import numpy as np

In [20]:
import math

def sigmoid(x):
    return 1 / (1 + math.exp(-x))

In [21]:
uKeys = ['username', 'userId', 'name', 'tags', 'followers', 'following', 'bio', 'followerCount', 'followCount', 'postCount', 'posts', 'recommends']
pKeys = ["id", "clapCount", "wordCount", "readingTime", "tags", "title", "userId", "time", "subtitle"]

In [22]:
def readCSV(filename, keys):
    objects = []
    
    with open(filename) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        
        for row in csv_reader:
            if line_count == -1:
                print(f'Column names are {", ".join(row)}')
                line_count += 1
            else:

                u = {}
                for i,k in enumerate(keys): 
                    u[k] = row[i]
                objects.append(u)

                line_count += 1
                
    print(f'Processed {line_count} lines.')
    return objects

In [23]:
readUsers = readCSV('users.csv', uKeys)
users = defaultdict(dict)

for u in readUsers:
    users[u['userId']] = u

Processed 2894 lines.


In [24]:
readPosts = readCSV('posts.csv', pKeys)
posts = defaultdict(dict)

for p in readPosts:
    posts[p['id']] = p

Processed 16876 lines.


In [25]:
len(users)

2489

In [26]:
posts['f12baa81d5c6']

{'id': 'f12baa81d5c6',
 'clapCount': '32',
 'wordCount': '2499',
 'readingTime': '10.263522012578617',
 'tags': "['Public Speaking', 'Speaking', 'Comfort Zone', 'Mentorship', 'Ignite']",
 'title': 'Growth Happens Outside Our Comfort Zone: How I Prepared to Speak in Front of 500+ People & Why I’m…',
 'userId': 'b2d986206d08',
 'time': '1475463567034',
 'subtitle': 'A few months ago I stood on stage at Town Hall Seattle and delivered a five-minute talk at IGNITE Seattle #30 to more than 500 people.'}

In [43]:
import ast

testUsers = list(users.keys())
testPosts_ = [ast.literal_eval(users[u]['recommends']) for u in testUsers]
testPosts = []
for arr in testPosts_:
    for p in arr: testPosts.append(p)
        

In [44]:
import random
testPosts = list(set(testPosts))
random.shuffle(testPosts)
len(testPosts)

7561

In [45]:
matrix = np.zeros((len(testUsers), len(testPosts)))
matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [46]:
for i,u in enumerate(testUsers):
    for j,p in enumerate(testPosts):
        if p in ast.literal_eval(users[u]['recommends']):
            matrix[i][j] = 1
            print(i,j)
#         else:
#             matrix[i][j] = 0

matrix

0 2582
0 4080
0 6395
1 338
1 1606
1 2036
1 3110
1 6259
1 6907
2 831
2 3026
2 3144
2 3684
2 5520
2 5767
2 6764
2 7284
2 7542
5 1026
6 3871
6 4025
6 4524
6 5132
6 5442
6 6383
6 6451
6 6855
6 6888
7 1176
7 1437
7 2083
7 2410
7 2666
7 5423
7 5442
7 5503
7 5656
7 7203
11 6461
12 44
12 1073
12 1649
12 3271
12 4626
12 5383
12 5709
12 6755
12 6987
12 7516
14 263
14 6260
16 1136
16 2053
16 3794
16 5072
16 5224
16 5808
16 6016
16 6705
16 6728
16 7060
17 260
17 1629
17 2743
17 4326
17 7063
18 2838
18 4386
18 5238
19 663
19 1595
19 3411
19 3689
19 4357
19 4415
19 5147
19 6278
19 6833
20 643
20 2488
20 3989
20 4557
20 5254
20 5677
21 348
21 380
21 3070
21 3509
21 3669
21 3899
21 4559
21 5659
21 6836
21 7411
22 521
22 1756
22 2155
22 3822
22 4764
22 5594
22 5613
22 6646
22 6810
23 2147
27 1971
27 2952
27 3351
27 3655
27 3755
27 3914
27 4668
27 4674
27 6840
28 1506
28 5362
29 110
29 1102
29 1186
29 1212
29 1589
29 3501
29 5863
29 6379
29 7156
31 4696
32 867
32 2313
32 2332
32 2789
32 2874
32 3963
32 

KeyboardInterrupt: 

In [47]:
users[testUsers[8]]['recommends']

'[]'

In [48]:
import ast
testPosts = [ast.literal_eval(users[u]['recommends']) for u in testUsers]

In [49]:
testPosts

[['4ccbfbb14314', 'e1b39c8c38ea', '7898ab6bc23e'],
 ['b6ffa82198ee',
  '299fa795d710',
  '5bf9c3b2ec4c',
  'afbed0679311',
  '79104e62e930',
  '5ecd3df24f58'],
 ['ca044cdb7715',
  'db78356063f2',
  '90c75eb7c5b0',
  'c72b1235a564',
  '7e7d2094b77f',
  '3bd8da6f3504',
  '3fe4bf63aa67',
  '98dc3473f085',
  '68a4fa399352'],
 [],
 [],
 ['426f8210933c'],
 ['5c42dab36b6e',
  'c9159b626dd8',
  '89ddec30b01b',
  '36dd12db89ec',
  '2c5966ef720b',
  '686939cb4c3a',
  '7565780ab5b1',
  'f7da3f434723',
  'abc4e6df799a'],
 ['7fa12c31c59d',
  '394a630c75b7',
  '697fa2ca3ddf',
  'd93539b06966',
  '995e1803d88',
  'd86b1786fbbe',
  '68dc8da50b7c',
  'abc4e6df799a',
  'f85b151e45c3',
  '82a7d01441d1'],
 [],
 [],
 [],
 ['3417048ccde2'],
 ['cfe901591f88',
  '602339175087',
  '786de2885187',
  '297ace2e864f',
  '1ed8b6db489d',
  '81385bfd4b4e',
  'fddac6607d35',
  '17a1ae7fc5d3',
  '8816bc5ce8f0',
  '8189e8d5e760'],
 [],
 ['e558ac991513', '235adb58ce40'],
 [],
 ['c719add3ba8c',
  'fe29e3d5b6e2',
  'f5470b

In [77]:
class OneClassRecommendation():

    def __init__(self, R, K, alpha, beta, iterations):
        """
        Perform matrix factorization to predict empty
        entries in a matrix.

        Arguments
        - R (ndarray)   : user-item rating matrix
        - K (int)       : number of latent dimensions
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        """

        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        # Initialize user and item latent feature matrice
        self.gamma_u = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.gamma_i = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initialize the biases
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # Create a list of training samples
#         self.samples = [
#             (i, j, self.R[i, j])
#             for i in range(self.num_users)
#             for j in range(self.num_items)
#             if self.R[i, j] > 0
#         ]

        # sample user, article-clapped and article-not-clapped
        self.samples = [] # u, i, j
        for u in range(self.num_users):
            claps = []
            non_claps = []
            
            for i in range(self.num_items):
                if self.R[u][i] == 1:
                    claps.append(i)
                else:
                    non_claps.append(i)
            
            random.shuffle(non_claps)
            non_claps[:len(claps)]
            
            for i in range(len(claps)):
                self.samples.append((u, claps[i], non_claps[i]))

        # Perform stochastic gradient descent for number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            gain = self.sga()
#             mse = self.mse()
            training_process.append((i, gain))
            if (i+1) % 10 == 0:
                print("Iteration: %d ; gain = %.4f" % (i+1, gain))

        return training_process

    def mse(self):
        """
        A function to compute the total mean square error
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)

    def dot(self, K, L):
        """
        A function to compute the dot product of two lists
        """
        if len(K) != len(L):
            return 0
        
        return sum(i[0] * i[1] for i in zip(K, L))
    
    def sga(self):
        """
        Perform stochastic graident ascent
        """
        gain = 0
        
        for u, i, j in self.samples:
            # Compute gradient
            dot_prod = self.dot(self.gamma_u[u], self.gamma_i[i]) - self.dot(self.gamma_u[u], self.gamma_i[j])
            exp_sum = math.exp(-1*dot_prod)
            
            for k in range(self.K):
                gr = (self.gamma_i[i][k] - self.gamma_i[j][k]) * (exp_sum) / (1 + exp_sum)

                # Update biases
                self.b_u[u] += self.alpha * (gr - self.beta * self.b_u[u])
                self.b_i[i] += self.alpha * (gr - self.beta * self.b_i[i])
                self.b_i[j] += self.alpha * (gr - self.beta * self.b_i[j])

                # Update user and item latent feature matrices
                self.gamma_u[u][k] += self.alpha * (gr - self.beta * sum(self.gamma_u[u]))
                self.gamma_i[i][k] += self.alpha * (gr - self.beta * sum(self.gamma_i[i]))
                self.gamma_i[j][k] += self.alpha * (gr - self.beta * sum(self.gamma_i[j]))
            
    
            gain +=  sigmoid(dot_prod)
        
        return gain / len(self.samples)

    def get_rating(self, u, i):
        """
        Get the predicted rating of user u and item i
        """
        prediction = self.b + self.b_u[u] + self.b_i[i] + self.gamma_u[u].dot(self.gamma_i[i].T)
        return prediction 

    def full_matrix(self):
        """
        Computer the full matrix using the resultant biases, P and Q
        """
        return self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis:,] + self.gamma_u.dot(self.gamma_i.T)

In [86]:
mf = OneClassRecommendation(matrix, K=100, alpha=0.1, beta=0.01, iterations=100)

In [87]:
mf

<__main__.OneClassRecommendation at 0x11a8ecef0>

In [88]:
mf.R

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [89]:
mf.train()

Iteration: 10 ; gain = 0.5032
Iteration: 20 ; gain = 0.5120
Iteration: 30 ; gain = 0.5212
Iteration: 40 ; gain = 0.5336
Iteration: 50 ; gain = 0.5509
Iteration: 60 ; gain = 0.5706
Iteration: 70 ; gain = 0.5876
Iteration: 80 ; gain = 0.5999
Iteration: 90 ; gain = 0.6087
Iteration: 100 ; gain = 0.6153


[(0, 0.5000004606858549),
 (1, 0.5002534231165138),
 (2, 0.500509355145622),
 (3, 0.5007715273248616),
 (4, 0.5010459145772601),
 (5, 0.5013436153713209),
 (6, 0.5016773391976393),
 (7, 0.5020697193256188),
 (8, 0.502553087645362),
 (9, 0.5031673706650254),
 (10, 0.5039345270452907),
 (11, 0.5048278275318062),
 (12, 0.5057843250850524),
 (13, 0.5067387430546624),
 (14, 0.507655712189539),
 (15, 0.5085425103135224),
 (16, 0.5094157544796512),
 (17, 0.5102870959619757),
 (18, 0.5111457571746825),
 (19, 0.5120049155965708),
 (20, 0.5128558978445442),
 (21, 0.5137099006304011),
 (22, 0.51457257631835),
 (23, 0.5154484226605635),
 (24, 0.5163483121166514),
 (25, 0.517260876546104),
 (26, 0.5182062866132217),
 (27, 0.5191592440408508),
 (28, 0.5201594043154192),
 (29, 0.5211788970262629),
 (30, 0.5222420030792836),
 (31, 0.5233235974010332),
 (32, 0.5244612207295818),
 (33, 0.5256361392574089),
 (34, 0.5268366268533926),
 (35, 0.5280984794191882),
 (36, 0.5294047004425901),
 (37, 0.530743358

In [None]:
mf.get_rating(0,9)

In [56]:
matrix[0][:10]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
for i in range(len(matrix)):
    for j in range(len(matrix[0])):
        if mf.get_rating(i,j) < 0.9:
            print(mf.get_rating(i,j))