In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
%install_ext https://raw.github.com/cpcloud/ipython-autotime/master/autotime.py
%load_ext autotime



Installed autotime.py. To use it, type:
  %load_ext autotime


# Problem 1 (K-means)

In [None]:
np.random.seed(42)

N = 500
weights = [0.2, 0.5, 0.3]
mean = np.array([[0,0], [3,0], [0,3]])
cov = np.array([[1,0],[0,1]])

data =  np.empty((0,2), int)
index = np.random.choice([0,1,2], N, p= weights)

for i in index:
    sample = np.random.multivariate_normal(mean[index[i]], cov, 1)
    data = np.append(data, sample, axis=0)

In [None]:
plt.scatter(data[:,0],data[:,1])

### K-means funcitons

In [None]:
def initialize_centroids(data, k):
    """returns k centroids from the initial points"""
    centroids = data.copy()
    np.random.shuffle(centroids)
    return centroids[:k]

In [None]:
def kmeans(data, k, centroids, iteration):

    L = []
    centroids_array = np.zeros([0,2])
    for i in range(iteration+1):
        distance = np.zeros([data.shape[0],])
        n = np.zeros([1,])

        for i in range(k):
            d = np.linalg.norm((data - centroids[i,:]), axis=1)
            distance = np.vstack((distance, d))
        distance = np.delete(distance, (0), axis=0)

        for i in np.arange(500):
            a = np.where(distance[:,i] == distance[:,i].min())[0]
            n = np.vstack((n,a))
        n = np.delete(n, (0), axis=0)
        n = n.reshape(500,)

        centroids = np.array([data[n==k].mean(axis=0) for k in range(centroids.shape[0])])
#         centroids_array = np.vstack((centroids_array, centroids))

        L.append(np.sum([np.sum(np.linalg.norm((data[n==k] - centroids[k]), axis=1), axis=0) for k in range(centroids.shape[0])]))
        
    return n, centroids, L

In [None]:
centroids = initialize_centroids(data, 5)

clusters_5, centroids_poisition_5, L_5 = kmeans(data,5, centroids, 20)
clusters_4, centroids_poisition_4, L_4 = kmeans(data,4, centroids, 20)
clusters_3, centroids_poisition_3, L_3 = kmeans(data,3, centroids, 20)
clusters_2, centroids_poisition_2, L_2 = kmeans(data,2, centroids, 20)

## 1-a

In [None]:
plt.figure(num=None, figsize=(10, 10), dpi=80)

plt.plot(L_5, label="5 clusters", linewidth=5)
plt.plot(L_4, label="4 clusters", linewidth=5)
plt.plot(L_3, label="3 clusters", linewidth=5)
plt.plot(L_2, label="2 clusters", linewidth=5)

plt.xlabel("iteration")
plt.ylabel("L")
plt.legend(loc=1)

## 1-b

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10,5))
figsize=(15, 10)
ax[1].scatter(data[:, 0], data[:, 1], c=clusters_5)
ax[1].scatter(centroids[:, 0], centroids_poisition_5[:, 1], c='r', s=100)

ax[0].scatter(data[:, 0], data[:, 1], c=clusters_3)
ax[0].scatter(centroids[:, 0], centroids_poisition_3[:, 1], c='r', s=100)
fig.tight_layout()

# Problem 2 (Matrix factorization)

In [2]:
#load train data as numpy array
X = np.genfromtxt('/Users/Amiros/GitHub/Machine Learning for Data Science/hw4-kmeans/data/ratings.csv', 
          delimiter=',')
y = np.genfromtxt('/Users/Amiros/GitHub/Machine Learning for Data Science/hw4-kmeans/data/ratings_test.csv', 
          delimiter=',')

time: 629 ms


In [3]:
# X_sorted = X[X[:,0].argsort()]
# y_sorted = y[y[:,0].argsort()]

time: 21.5 ms


In [4]:
# problem intialization
d = 10
lmbda = 1
variance = 0.25
steps = 100

time: 1.77 ms


In [5]:
# update user location

# number of users
N_1 = np.unique(X_sorted[:,0]).shape[0]
# number of movies
N_2 = np.unique(X_sorted[:,1]).shape[0]

time: 14.5 ms


In [6]:
u = np.random.randn(N_1, d)
v = np.random.randn(N_2, d)

# the index set of objects rated by user i
movies_for_each_u = [np.where(i==X[:,0]) for i in np.unique(X[:,0])]

objects_rated_by_u = []
for i in range(N_1):
    d = {
        'user_id': i+1,
        'rated_movies': np.take(X[:,1], movies_for_each_u[i][0])
    }
    objects_rated_by_u.append(d)
    
# the index set of users who rated object j
users_for_each_v = [np.where(i==X_sorted[:,1]) for i in np.unique(X_sorted[:,1])]

users_who_rated_v = []
for i in range(N_2):
    d = {
        'movie_id': i+1,
        'users_who_rated': np.take(X[:,1], users_for_each_v[i][0])
    }
    users_who_rated_v.append(d)

time: 4.11 ms


In [7]:
np.linalg.inv((lmbda * variance * np.eye(d)) + v[0,:].dot(v[0,:].T))

array([[ 3.60242139, -0.39757861, -0.39757861, -0.39757861, -0.39757861,
        -0.39757861, -0.39757861, -0.39757861, -0.39757861, -0.39757861],
       [-0.39757861,  3.60242139, -0.39757861, -0.39757861, -0.39757861,
        -0.39757861, -0.39757861, -0.39757861, -0.39757861, -0.39757861],
       [-0.39757861, -0.39757861,  3.60242139, -0.39757861, -0.39757861,
        -0.39757861, -0.39757861, -0.39757861, -0.39757861, -0.39757861],
       [-0.39757861, -0.39757861, -0.39757861,  3.60242139, -0.39757861,
        -0.39757861, -0.39757861, -0.39757861, -0.39757861, -0.39757861],
       [-0.39757861, -0.39757861, -0.39757861, -0.39757861,  3.60242139,
        -0.39757861, -0.39757861, -0.39757861, -0.39757861, -0.39757861],
       [-0.39757861, -0.39757861, -0.39757861, -0.39757861, -0.39757861,
         3.60242139, -0.39757861, -0.39757861, -0.39757861, -0.39757861],
       [-0.39757861, -0.39757861, -0.39757861, -0.39757861, -0.39757861,
        -0.39757861,  3.60242139, -0.39757861

time: 10.8 ms


In [33]:
a = objects_rated_by_u[0]['rated_movies']

numpy.ndarray

time: 10.6 ms


time: 418 ms


[{'movie_id': 1,
  'users_who_rated': array([  2.08000000e+02,   2.37000000e+02,   5.05000000e+02,
           6.90000000e+01,   1.02200000e+03,   9.05000000e+02,
           3.13000000e+02,   1.83000000e+02,   4.82000000e+02,
           9.10000000e+01,   2.04000000e+02,   1.63000000e+02,
           5.29000000e+02,   5.19000000e+02,   4.27000000e+02,
           6.85000000e+02,   7.42000000e+02,   2.07000000e+02,
           2.69000000e+02,   4.26000000e+02,   2.09000000e+02,
           7.22000000e+02,   8.31000000e+02,   2.16000000e+02,
           3.57000000e+02,   5.44000000e+02,   1.87000000e+02,
           8.49000000e+02,   1.32000000e+02,   3.82000000e+02,
           4.33000000e+02,   9.00000000e+02,   1.83000000e+02,
           1.20000000e+02,   1.82000000e+02,   7.35000000e+02,
           4.50000000e+02,   6.90000000e+01,   9.88000000e+02,
           9.19000000e+02,   5.91000000e+02,   3.33000000e+02,
           1.64000000e+02,   5.97000000e+02,   7.39000000e+02,
           1.110000

time: 2.24 s
