In [1]:
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
import scipy
import scipy.io as sio
import sklearn.metrics as metrics
import csv

%matplotlib inline

In [2]:
data = scipy.io.loadmat("data_hw6_cs189_fa16/joke_data/joke_train.mat")
train_R = data['train']
validation_set = np.loadtxt("data_hw6_cs189_fa16/joke_data/validation.txt", delimiter = ",")
validation_idx = validation_set[:,:-1]
validation_R = validation_set[:,-1]

In [3]:
print(train_R.shape)

(24983, 100)


In [4]:
zeroed_R = np.nan_to_num(train_R)
print(train_R[0][4])

nan


In [5]:
def train_svd(train_R, d):
    U, s, V = np.linalg.svd(train_R, full_matrices=False)
    new_s = [0] * len(s)
    new_s[:d] = s[:d]
    S = np.diag(new_s)
    return np.matmul(np.matrix(U), np.matrix(np.sqrt(S))), np.matmul(np.matrix(np.sqrt(S)), (np.matrix(V)))

In [6]:
def predict(U, X, indices):
    predictions = []
    for user, joke in indices:
        rate = U[user - 1].dot(X.T[joke-1].T)
        if rate > 0:
            predictions.append(1)
        else:
            predictions.append(0)
    return predictions

In [7]:
def mse(U, X, R):
    result = 0
    prediction = np.matmul(U, X)
    k,d = prediction.shape
    assert prediction.shape == R.shape
    for i in range(k):
        for j in range(d):
            if not np.isnan(R[i][j]):
                result += (prediction[i,j]-R[i][j])**2
    return result

In [8]:
def validate(new_R, train_R, d):
    U, X = train_svd(new_R, d)
    error = mse(U, X, train_R)
    print("when d = ",d,", MSE is ", error)
    prediction = predict(U, X, validation_idx)
    validation_score = metrics.accuracy_score(validation_R, prediction)
    print("Validation accuracy: {0}".format(validation_score))

# 4.2.2

## d = 2

In [9]:
validate(zeroed_R, train_R, 2)

when d =  2 , MSE is  18441623.0179
Validation accuracy: 0.7051490514905149


## d = 5

In [10]:
validate(zeroed_R, train_R, 5)

when d =  5 , MSE is  16333384.4202
Validation accuracy: 0.7154471544715447


## d = 10

In [11]:
validate(zeroed_R, train_R, 10)

when d =  10 , MSE is  14165432.758
Validation accuracy: 0.7165311653116531


## d = 20

In [12]:
validate(zeroed_R, train_R, 20)

when d =  20 , MSE is  11304007.4397
Validation accuracy: 0.6859078590785908


# 4.2.3


In [13]:
def new_gradient(train_R, alpha = 0.01, max_iter = 10, dim = 10, svd_U = None, svd_V = None):
    k,d = train_R.shape
    U = np.random.normal(size=(k, dim))
    V = np.random.normal(size=(d, dim))
    if svd_U != None:
        U = svd_U
    if svd_V != None:
        V = svd_V
    for iteration in range(max_iter):
        for i in range(k):
#             U[i] = train_R[i].dot(V).dot(np.linalg.inv(V.T.dot(V) + alpha * np.identity(d)))
#             print(V.T.dot(train_R[i].T))
#             print((np.linalg.inv(V.T.dot(V)+ alpha * np.identity(d))).shape)
            U[i] = (np.linalg.inv(V.T.dot(V) + alpha * np.identity(dim))).dot(V.T).dot(train_R[i].T).T
        for j in range(d):
#             V[j] = train_R.T[j].dot(U).dot(np.linalg.inv(U.T.dot(U) + alpha * np.identity(d)))
            V[j] = (np.linalg.inv(U.T.dot(U) + alpha * np.identity(dim))).dot(U.T).dot(train_R.T[j].T).T
    return U, V

In [16]:
def new_update(train_R, alpha = 0.01, max_iter = 10, dim = 10, svd_U = None, svd_V = None):
    U, X = new_gradient(train_R,alpha = alpha, max_iter = max_iter)
    error = mse(U, X.T, train_R)
    print("MSE is ", error)
    prediction = predict(U, X.T, validation_idx)
    validation_score = metrics.accuracy_score(validation_R, prediction)
    print("Validation accuracy: {0}".format(validation_score))
    return U,X

## d = 2

In [17]:
d = 2
U, V = train_svd(zeroed_R, d)
U2, X2 = new_update(zeroed_R, alpha = 1.5, max_iter = 100, dim = d, svd_U= U, svd_V = V)

MSE is  17415318.6796
Validation accuracy: 0.7168021680216802




## d = 5

In [42]:
d = 5
U, V = train_svd(zeroed_R, d)
U5, X5 = new_update(zeroed_R, alpha = 1, max_iter = 100, dim = d, svd_U= U, svd_V = V)

MSE is  17415308.0412
Validation accuracy: 0.7165311653116531




## d = 10

In [37]:
d = 10
U, V = train_svd(zeroed_R, d)
U10, V10 = new_update(zeroed_R, alpha = 1, max_iter = 100, dim = d, svd_U= U, svd_V = V)

MSE is  17415308.7834
Validation accuracy: 0.7168021680216802




## d = 20

In [38]:
d = 20
U, V = train_svd(zeroed_R, d)
U20, V20 = new_update(zeroed_R, alpha = 1, max_iter = 100, dim = d, svd_U= U, svd_V = V)

MSE is  17415402.4997
Validation accuracy: 0.7173441734417344




## d = 50

In [44]:
d = 50
U, V = train_svd(zeroed_R, d)
U50, V50 = new_update(zeroed_R, alpha = 2, max_iter = 100, dim = d, svd_U= U, svd_V = V)

MSE is  17415323.5534
Validation accuracy: 0.7165311653116531




# 4.3

In [41]:
query_data = np.loadtxt("data_hw6_cs189_fa16/joke_data/query.txt", delimiter = ",")
query_idx = query_data[:,1:]
query_id = query_data[:,0]
predictions = predict(U20, V20.T, query_idx)
assert len(predictions) == len(query_id)
with open('wenjing_kang.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Id','Category'])
    for i in range(len(query_id)):
        writer.writerow([int(query_id[i]), int(predictions[i])])

