In [None]:
def KMeans_Test(model, df, n, alpha=1.):
    """
    Score a fitted KMeans clustering model by re-predicting known ratings.

    ``n`` movies (columns of ``df``) are drawn at random, with replacement.
    For every user who rated a drawn movie a prediction is built as

        user_avg + mean(neighbor_rating - neighbor_avg) * alpha * share

    where the neighbors are the rows of ``df`` assigned to the same cluster
    as the user, ``neighbor_avg`` is each neighbor's mean rating over all
    movies they rated, and ``share`` is the number of cluster ratings for the
    tested movie divided by the number of cluster ratings over every movie
    the user has not rated (the tested movie counts as "not rated").
    Predictions are rounded to the nearest 0.5 and compared with the true
    ratings using the sum of squared residuals.
    TODO: consider a different loss (the original author suggested softmax).

    Parameters:
        model (KMeans): The fitted model; only ``model.labels_`` is read and
                        it must hold one label per row of ``df``
        df (DataFrame): The sparse ratings dataframe (users as rows, movies
                        as columns, NaN where unrated) the model was fit on
        n (int): The number of movies to test on
        alpha (float): Scale applied to the neighbor-offset term

    Returns:
        score (float): Sum of squared residuals (lower is better)
        predictions (list): A flattened list of all predictions made
        actual (ndarray): A flattened array of all the true ratings, in the
                          same order as ``predictions``
    """
    labels = np.asarray(model.labels_)
    # Choose column positions for the n random movies (sampling with
    # replacement, so the same movie may be tested more than once).
    movie_cols = np.random.randint(0, df.shape[1], n)
    actual, predictions = np.array([]), list()
    # Rows, row means and per-movie rating counts of a cluster do not depend
    # on the movie or user under test, so compute them once per cluster.
    cluster_cache = {}

    for col in movie_cols:
        movie_ratings = df.iloc[:, col]
        # Users (by row position) that have rated this movie
        m_rated_mask = movie_ratings.notnull().values
        # Store their true ratings in the flattened actual array
        actual = np.concatenate((actual, movie_ratings.values[m_rated_mask]))

        for row in np.flatnonzero(m_rated_mask):
            cluster = labels[row]
            if cluster not in cluster_cache:
                cluster_df = df.iloc[labels == cluster]
                cluster_cache[cluster] = (
                    cluster_df,
                    cluster_df.mean(axis=1),
                    cluster_df.notnull().sum().values,
                )
            cluster_df, neighbor_avgs, rating_counts = cluster_cache[cluster]

            # NOTE(review): the user's own row (including the rating being
            # predicted) is part of cluster_df and of user_avg, so the target
            # leaks into the prediction. The original author chose not to
            # drop it; confirm this is intended for the evaluation.
            user_ratings = df.iloc[row]
            user_avg = user_ratings.mean()

            # Average amount by which neighbors rate this movie above/below
            # their own mean (neighbors who have not rated it are skipped).
            neighbor_offset = (cluster_df.iloc[:, col] - neighbor_avgs).mean()

            # Movies the user has not seen, plus the movie under test
            not_seen = user_ratings.isnull().values.copy()
            not_seen[col] = True
            # Share of the cluster's ratings on those movies that belong to
            # the tested movie
            total = rating_counts[not_seen].sum()
            share = rating_counts[col] / total

            raw = user_avg + neighbor_offset * alpha * share
            # If no neighbor rating is available the estimate is NaN; fall
            # back to the user's average. This must happen before rounding
            # because round() raises on NaN.
            if np.isnan(raw):
                raw = user_avg
            # Snap to the nearest half-star rating
            predictions.append(round(2 * raw) / 2)

    return np.sum((np.array(predictions) - actual)**2), predictions, actual