In [6]:
from google.colab import drive
drive.mount('/content/drive')
import csv
import scipy.spatial
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import time
import multiprocessing as mp

# Locations of the input CSV files on the Google Drive mounted at /content/drive.
movies_csv_path = '/content/drive/My Drive/CS 145 Group Project/Data/Original/movies.csv'
genome_scores_path = '/content/drive/My Drive/CS 145 Group Project/Data/Original/genome-scores.csv'
val_ratings_binary_path = '/content/drive/My Drive/CS 145 Group Project/Data/Original/val_ratings_binary.csv'
train_ratings_binary_path = '/content/drive/My Drive/CS 145 Group Project/Data/Original/train_ratings_binary.csv'
test_ratings_path = '/content/drive/My Drive/CS 145 Group Project/Data/Original/test_ratings.csv'

def getGenres():
    """Return the sorted list of distinct genre labels found in movies.csv.

    The third column of every row is split on '|' and each piece is
    collected once.  Any label containing the substring "genres" is dropped;
    that removes the header cell and any placeholder label that contains
    that word.

    Returns:
        list[str]: the remaining genre labels in ascending order.
    """
    seen = set()
    # newline='' is what the csv module expects for files it parses itself.
    with open(movies_csv_path, encoding='utf-8', newline='') as csv_file:
        for row in csv.reader(csv_file, delimiter=','):
            # Skip blank / short rows instead of failing on row[2].
            if len(row) > 2:
                seen.update(row[2].split('|'))
    # Filter into a new list: removing entries from a list while iterating
    # over it skips the element that follows each removal.
    return sorted(genre for genre in seen if "genres" not in genre)

def buildMovieInfo():
    """Build the per-movie feature table used for movie-to-movie distances.

    Reads movies.csv for the genre flags and genome-scores.csv for the tag
    relevance scores.

    Returns:
        dict: maps the movie id (as the string found in the first column of
        movies.csv) to a two-element list ``[genre_flags, tag_scores]``:

        * ``genre_flags`` - list of 0/1 values, one slot per entry of
          ``getGenres()``, set to 1 for every genre listed for the movie.
        * ``tag_scores`` - sequence of ``NUM_TAGS`` floats where slot
          ``tag_id - 1`` holds the score from genome-scores.csv.  Movies with
          no rows in that file share one all-zero tuple, so every movie has
          a tag vector of the same length.
    """
    NUM_TAGS = 1128  # length of each tag vector; tag ids in the file are 1-based

    genres = getGenres()
    genre_slot = {name: position for position, name in enumerate(genres)}
    # Immutable, shared placeholder: avoids allocating a full vector for
    # every movie that never appears in genome-scores.csv.
    no_tag_scores = (0.0,) * NUM_TAGS

    movie_info = {}
    with open(movies_csv_path, encoding='utf-8', newline='') as csv_file:
        movie_reader = csv.reader(csv_file, delimiter=',')
        next(movie_reader, None)  # skip the header row (tolerates an empty file)
        for row in movie_reader:
            genre_flags = [0] * len(genres)
            for genre in row[2].split('|'):
                # Same filter as getGenres(): such labels have no slot.
                if "genres" not in genre:
                    genre_flags[genre_slot[genre]] = 1
            movie_info[row[0]] = [genre_flags, no_tag_scores]

    with open(genome_scores_path, encoding='utf-8', newline='') as genome_scores:
        genome_scores_reader = csv.reader(genome_scores, delimiter=',')
        next(genome_scores_reader, None)  # skip the header row
        for row in genome_scores_reader:
            entry = movie_info.get(row[0])
            if entry is None:
                # Scores for a movie that is not in movies.csv cannot be used.
                continue
            if entry[1] is no_tag_scores:
                # First score seen for this movie: give it its own vector.
                entry[1] = [0.0] * NUM_TAGS
            # Store by (movie, tag) directly, so the result does not depend
            # on row order or on every movie having exactly NUM_TAGS rows.
            entry[1][int(row[1]) - 1] = float(row[2])
    return movie_info
                            
def getDistanceMovies(movieA, movieB, movieDict):
    """Return the distance between two movies as genre distance + tag distance.

    Args:
        movieA, movieB: keys into ``movieDict``.
        movieDict: mapping whose values hold the genre vector at index 0 and
            the tag-score vector at index 1.

    Returns:
        float: sum of the two cosine distances.  A component for which either
        movie has no usable vector (a scalar placeholder, a length mismatch,
        or an all-zero vector) contributes 1.0 - the same as two orthogonal
        vectors - instead of raising or producing NaN.

    Raises:
        KeyError: if either movie id is not in ``movieDict``.
    """
    def component_distance(vec_a, vec_b):
        vec_a = np.atleast_1d(np.asarray(vec_a, dtype=float))
        vec_b = np.atleast_1d(np.asarray(vec_b, dtype=float))
        # Cosine distance is undefined for a zero vector (0/0), and cannot be
        # computed at all for vectors of different lengths.
        if vec_a.shape != vec_b.shape or not vec_a.any() or not vec_b.any():
            return 1.0
        return float(scipy.spatial.distance.cosine(vec_a, vec_b))

    features_a = movieDict[movieA]
    features_b = movieDict[movieB]
    genre_distance = component_distance(features_a[0], features_b[0])
    tag_distance = component_distance(features_a[1], features_b[1])
    return genre_distance + tag_distance

def getKNNMovies(currentMovie, k, movieDict):
    """Return the ids of the ``k`` movies closest to ``currentMovie``.

    Args:
        currentMovie: movie id as a string key of ``movieDict``.
        k: number of neighbours to return.
        movieDict: feature table keyed by movie-id string; every key other
            than ``currentMovie`` is a candidate neighbour.

    Returns:
        list[int]: candidate ids converted to int, nearest first according
        to ``getDistanceMovies``.  Ties keep the order of ``movieDict``.
    """
    import heapq  # local import: only this function needs it

    # Iterate over the ids already held in movieDict instead of re-opening
    # and re-parsing movies.csv on every call.
    candidates = (
        (int(movie_id), getDistanceMovies(currentMovie, movie_id, movieDict))
        for movie_id in movieDict
        if movie_id != currentMovie
    )
    # nsmallest is documented as equivalent to sorted(..., key=key)[:k] but
    # does not need to sort the whole candidate list.
    nearest = heapq.nsmallest(k, candidates, key=lambda pair: pair[1])
    return [movie_id for movie_id, _ in nearest]

def cur_time_millis():
    """Return the current ``time.time()`` value scaled to milliseconds (float)."""
    seconds_now = time.time()
    return seconds_now * 1000.0

def print_t(msg):
    """Print ``msg`` followed by the current millisecond timestamp."""
    stamp = str(cur_time_millis())
    line = msg + " (time " + stamp + ")"
    print(line)

def getDistanceUsers(userA, userB, ratings_matrix):
    """Return the cosine distance between two rows of ``ratings_matrix``.

    ``userA`` and ``userB`` are positional row numbers (they are passed to
    ``iloc``), not index labels.
    """
    row_a, row_b = (
        ratings_matrix.iloc[position].values.tolist()
        for position in (userA, userB)
    )
    return scipy.spatial.distance.cosine(row_a, row_b)

def prepareUserInfo():
    """Build the user-by-genres matrix from the movies and training ratings files.

    The two CSV files are joined on ``movieId``, the ``rating`` values are
    summed for every (userId, genres) pair, and the result is pivoted so that
    each row is a user and each column is one distinct value of the
    ``genres`` column.  Pairs with no ratings are filled with 0.  Progress
    messages and the intermediate tables are printed along the way.

    Returns:
        pandas.DataFrame: the pivoted matrix, indexed by ``userId``.
    """
    print_t("Reading in movies.csv...")
    movies = pd.read_csv(
        movies_csv_path, sep=',', names=['movieId', 'title', 'genres'], skiprows=1
    )

    print_t("Description of movies.csv:")
    print(movies.describe())

    print_t("Reading in ratings csv file...")
    ratings = pd.read_csv(
        train_ratings_binary_path, sep=',', names=['userId', 'movieId', 'rating'], skiprows=1
    )

    print_t("Description of ratings csv file:")
    print(ratings.describe())

    print_t("Combining movies and ratings csv tables...")
    merged = pd.merge(movies, ratings, on='movieId').set_index('userId').sort_index()
    merged[['rating']] = merged[['rating']].apply(pd.to_numeric)

    print_t("Before group by:")
    print(merged)

    # Sum the ratings per (user, genres); keep the first movie id/title seen.
    per_user_genres = (
        merged.groupby(['userId', 'genres'])
        .aggregate({'rating': 'sum', 'movieId': 'first', 'title': 'first'})
        .reset_index()
    )

    print_t("After group by:")
    print(per_user_genres)

    ratings_matrix = per_user_genres.pivot(
        index='userId', columns='genres', values='rating'
    ).replace(np.nan, 0)

    print_t("After pivot:")
    print(ratings_matrix)

    return ratings_matrix

def getKNNUsers(userId, ratings_matrix, k):
    """Return the ids of the ``k`` users whose rows are closest to ``userId``'s.

    Args:
        userId: label of the target user in ``ratings_matrix.index``.
        ratings_matrix: numeric DataFrame with one row per user, indexed by
            user id.
        k: number of neighbours to return.

    Returns:
        list: up to ``k`` index labels, nearest first by cosine distance; the
        target user is never included and ties keep row order.  Rows that are
        all zeros (or a target that is all zeros) are treated as distance 1.0
        because the cosine distance is undefined for them.  An empty list is
        returned when ``k <= 0`` or ``userId`` has no row in the matrix.
    """
    user_ids = ratings_matrix.index
    # Look the user up by label: the row position only equals userId - 1 when
    # the ids happen to be contiguous and start at 1.
    matches = np.flatnonzero(np.asarray(user_ids) == userId)
    if k <= 0 or matches.size == 0:
        return []
    target_pos = int(matches[0])

    profiles = ratings_matrix.to_numpy(dtype=float)
    norms = np.linalg.norm(profiles, axis=1)
    dots = profiles @ profiles[target_pos]
    denominators = norms * norms[target_pos]

    # Cosine distance for every row at once; undefined (zero-norm) rows keep 1.0.
    distances = np.ones(len(profiles))
    defined = denominators > 0
    distances[defined] = 1.0 - dots[defined] / denominators[defined]

    # A stable sort keeps row order among equally distant users.
    order = np.argsort(distances, kind='stable')
    nearest = order[order != target_pos][:k]
    return user_ids[nearest].tolist()

def KNN(k, n):
    """Predict a rating for every test case and write them to realsubmission.csv.

    For each (userId, movieId) row of the test file, the ``n`` nearest movies
    and the ``k`` nearest users are looked up, and the prediction is the
    rounded mean of the training ratings those users gave to those movies.
    When no such rating exists the prediction falls back to 1.

    Each output row is ``[test_case_position, prediction]``; no header row is
    written.

    Args:
        k: number of neighbouring users to use.
        n: number of neighbouring movies to use.
    """
    submission_path = '/content/drive/My Drive/CS 145 Group Project/Data/Original/realsubmission.csv'
    # newline='' lets the csv writer control line endings itself, as the csv
    # module documentation requires for files it writes.
    with open(submission_path, mode='w', newline='') as submissionFile:
        print_t("Parsing data...")
        csv_writer = csv.writer(submissionFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        df_test_cases = pd.read_csv(test_ratings_path, sep=',', names=['userId', 'movieId'], skiprows=1)
        movieDict = buildMovieInfo()
        userInfo = prepareUserInfo()
        training = pd.read_csv(train_ratings_binary_path)
        print_t("Finished parsing data, beginning KNN calculations...")

        # Neighbour lists depend only on the id, so compute each one once even
        # when the same user or movie appears in many test cases.
        movie_neighbours = {}
        user_neighbours = {}

        test_pairs = zip(df_test_cases.iloc[:, 0], df_test_cases.iloc[:, 1])
        for i, (UID, MID) in enumerate(test_pairs):
            print("Current UID " + str(UID) + ", Current MID " + str(MID))

            movie_key = str(int(MID))  # movieDict is keyed by id strings
            if movie_key not in movie_neighbours:
                movie_neighbours[movie_key] = getKNNMovies(movie_key, n, movieDict)
            if UID not in user_neighbours:
                user_neighbours[UID] = getKNNUsers(UID, userInfo, k)
            KNNMovies = movie_neighbours[movie_key]
            KNNUsers = user_neighbours[UID]

            # Ratings the neighbouring users gave to the neighbouring movies.
            df_filtered = training[
                training.userId.isin(KNNUsers) & training.movieId.isin(KNNMovies)
            ]
            if df_filtered.empty:
                print("errored")
                print(i)  # print the test case for debugging
                prediction = 1
            else:
                prediction = int(round(df_filtered['rating'].mean()))
            csv_writer.writerow([i, prediction])

        # Logged after the loop, when the calculations have actually finished.
        print_t("Finished KNN calculations")

# Run the pipeline with k=5 and n=5 (the two arguments of KNN).
KNN(5, 5)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Parsing data... (time 1575693617673.0244)
Reading in movies.csv... (time 1575693637137.8416)
Description of movies.csv: (time 1575693637174.4944)
             movieId
count   27278.000000
mean    59855.480570
std     44429.314697
min         1.000000
25%      6931.250000
50%     68068.000000
75%    100293.250000
max    131262.000000
Reading in ratings csv file... (time 1575693637192.5051)
Description of ratings csv file: (time 1575693640892.8904)
             userId       movieId        rating
count  1.194658e+07  1.194658e+07  1.194658e+07
mean   6.935886e+04  2.158727e+03  5.038196e-01
std    3.991005e+04  5.254569e+03  4.999854e-01
min    1.000000e+00  1.000000e+00  0.000000e+00
25%    3.487300e+04  4.740000e+02  0.000000e+00
50%    6.935600e+04  1.241000e+03  1.000000e+00
75%    1.039180e+05  2.329000e+03  1.000000e+00
max    1.384930e+05  1.204660e+05  1

KeyboardInterrupt: ignored