In [1]:
import pandas as pd
import numpy as np
from scipy.stats import bernoulli
import copy
from sklearn.decomposition import TruncatedSVD
training_data = pd.read_csv("training_data.csv")
training_data.head(20)

Unnamed: 0,user_id,anime_id,rating
0,257619,31043,10
1,257619,5958,8
2,257619,28171,9
3,257619,16498,7
4,257619,205,10
5,257619,30015,8
6,257619,13601,8
7,257619,2251,8
8,257619,30654,8
9,257619,24833,8


In [2]:
anime_ids = training_data['anime_id'].unique()
anime_ids = np.sort(anime_ids)

indices = np.zeros(np.amax(anime_ids)+1)

for i in range(anime_ids.size):
    indices[anime_ids[i]] = i
    
data_points = np.zeros([5000,anime_ids.size])

counter = 0

training_data = training_data.append(pd.DataFrame([[0,0,11]], columns = ['user_id','anime_id','rating']),ignore_index = True)

for i in range(5000):
    if training_data['rating'].iloc[counter] == 11:
        break
    while 1:
        data_points[i,int(indices[training_data['anime_id'].iloc[counter]])] = training_data['rating'].iloc[counter]
        counter += 1
        if training_data['user_id'].iloc[counter] != training_data['user_id'].iloc[counter+1]:
            break

In [3]:
average_anime_rating = np.zeros(anime_ids.size)
average_user_rating = np.zeros(5000)
imputed_data_points = copy.deepcopy(data_points)
for i in range(anime_ids.size):
    average_anime_rating[i] = sum(data_points[:,i])/np.count_nonzero(data_points[:,i])
for i in range(5000):
    average_user_rating[i] = sum(data_points[i,:])/np.count_nonzero(data_points[i,:])
for i in range(5000):
    for j in range(anime_ids.size):
        if data_points[i,j] == 0:
            imputed_data_points[i,j] = (average_user_rating[i] * average_anime_rating[j])**(0.5)

In [4]:
components = 252
svd = TruncatedSVD(n_components = components)
svd.fit(imputed_data_points)
decomposed_data_points = svd.transform(imputed_data_points)

In [5]:
print(sum(svd.singular_values_))

19921.683149276672


In [6]:
def recommend(point):
    k = 150
    imputed_point = copy.deepcopy(point)
    average_rating_by_user = sum(point)/np.count_nonzero(point)
    for j in range(anime_ids.size):
        if point[j] == 0:
            imputed_point[j] = (average_rating_by_user * average_anime_rating[j])**(0.5)
    distance = 1000*np.ones(5000)
    decomposed_point = svd.transform([point,point])[0,:]
    for i in range(5000):
        distance[i] = np.linalg.norm(decomposed_point - decomposed_data_points[i,:])
    nearest_neighbours = np.argpartition(distance,-k)[-k:]
    recommendations = np.zeros(point.size)
    for i in range(point.size):
        if point[i] == 0:
            count = 0
            average = 0
            for j in range(k):
                if data_points[nearest_neighbours[j], i] > 0:
                    average += data_points[nearest_neighbours[j], i]
                    count += 1
            if count > 0:
                average = average/count
                if average > 8:
                    recommendations[i] = 1
    return recommendations

In [7]:
benchmark = np.zeros(anime_ids.size)
for i in range(anime_ids.size):
    average = np.sum(data_points[:,i])/np.count_nonzero(data_points[:,i])
    if average > 8:
        benchmark[i] = 1

In [8]:
validation_data = pd.read_csv("validation_data.csv")
validation_points = np.zeros([1000,anime_ids.size])
counter = 0

validation_data = validation_data.append(pd.DataFrame([[0,0,11]], columns = ['user_id','anime_id','rating']),ignore_index = True)

for i in range(1000):
    if validation_data['rating'].iloc[counter] == 11:
        break
    while 1:
        validation_points[i,int(indices[validation_data['anime_id'].iloc[counter]])] = validation_data['rating'].iloc[counter]
        counter += 1
        if validation_data['user_id'].iloc[counter] != validation_data['user_id'].iloc[counter+1]:
            break

masked_validation_points = copy.deepcopy(validation_points)
for i in range(1000):
    a = bernoulli.rvs(p = 0.7, size = anime_ids.size)
    masked_validation_points[i,:] = validation_points[i,:]*a

In [9]:
recommendations = np.zeros([1000,anime_ids.size])
for i in range(1000):
    recommendations[i,:] = recommend(masked_validation_points[i,:])

good_recommendations = 0
bad_recommendations = 0
neutral_recommendations = 0
for i in range(1000):
    for j in range(anime_ids.size):
        if recommendations[i,j] == 1 and validation_points[i,j] > 0:
            if validation_points[i,j] >= 8:
                good_recommendations += 1
            elif validation_points[i,j] < 4:
                bad_recommendations += 1
            elif validation_points[i,j] in range(4,8):
                neutral_recommendations += 1

print("recommender performance:", round(100.0*good_recommendations/(good_recommendations+bad_recommendations+neutral_recommendations)))

good = 0
bad = 0
neutral = 0
for i in range(1000):
    for j in range(anime_ids.size):
        if validation_points[i,j] > 0 and masked_validation_points[i,j] == 0:
            if validation_points[i,j] >= 8:
                good += 1
            elif validation_points[i,j] < 4:
                bad += 1
            elif validation_points[i,j] in range(4,8):
                neutral += 1

print("random performance:", round(100.0*good/(good+bad+neutral)))

benchmark_good = 0
benchmark_bad = 0
benchmark_neutral = 0
for i in range(1000):
    for j in range(anime_ids.size):
        if validation_points[i,j] > 0 and masked_validation_points[i,j] == 0 and benchmark[j] == 1:
            if validation_points[i,j] >= 8:
                benchmark_good += 1
            elif validation_points[i,j] < 4:
                benchmark_bad += 1
            elif validation_points[i,j] in range(4,8):
                benchmark_neutral += 1

print("benchmark performance:", round(100.0*benchmark_good/(benchmark_good+benchmark_bad+benchmark_neutral)))

recommender performance: 68
random performance: 67
benchmark performance: 78


In [10]:
testing_data = pd.read_csv("testing_data.csv")
testing_points = np.zeros([1000,anime_ids.size])
counter = 0

testing_data = testing_data.append(pd.DataFrame([[0,0,11]], columns = ['user_id','anime_id','rating']),ignore_index = True)

for i in range(1000):
    if testing_data['rating'].iloc[counter] == 11:
        break
    while 1:
        testing_points[i,int(indices[testing_data['anime_id'].iloc[counter]])] = testing_data['rating'].iloc[counter]
        counter += 1
        if testing_data['user_id'].iloc[counter] != testing_data['user_id'].iloc[counter+1]:
            break

masked_testing_points = copy.deepcopy(testing_points)
for i in range(1000):
    a = bernoulli.rvs(p = 0.7, size = anime_ids.size)
    masked_testing_points[i,:] = testing_points[i,:]*a

In [11]:
recommendations = np.zeros([1000,anime_ids.size])
for i in range(1000):
    recommendations[i,:] = recommend(masked_testing_points[i,:])

good_recommendations = 0
bad_recommendations = 0
neutral_recommendations = 0
for i in range(1000):
    for j in range(anime_ids.size):
        if recommendations[i,j] == 1 and testing_points[i,j] > 0:
            if testing_points[i,j] >= 8:
                good_recommendations += 1
            elif testing_points[i,j] < 4:
                bad_recommendations += 1
            elif testing_points[i,j] in range(4,8):
                neutral_recommendations += 1

print("recommender performance:", round(100.0*good_recommendations/(good_recommendations+bad_recommendations+neutral_recommendations)))

good = 0
bad = 0
neutral = 0
for i in range(1000):
    for j in range(anime_ids.size):
        if testing_points[i,j] > 0 and masked_testing_points[i,j] == 0:
            if testing_points[i,j] >= 8:
                good += 1
            elif testing_points[i,j] < 4:
                bad += 1
            elif testing_points[i,j] in range(4,8):
                neutral += 1

print("random performance:", round(100.0*good/(good+bad+neutral)))

benchmark_good = 0
benchmark_bad = 0
benchmark_neutral = 0
for i in range(1000):
    for j in range(anime_ids.size):
        if testing_points[i,j] > 0 and masked_testing_points[i,j] == 0 and benchmark[j] == 1:
            if testing_points[i,j] >= 8:
                benchmark_good += 1
            elif testing_points[i,j] < 4:
                benchmark_bad += 1
            elif testing_points[i,j] in range(4,8):
                benchmark_neutral += 1

print("benchmark performance:", round(100.0*benchmark_good/(benchmark_good+benchmark_bad+benchmark_neutral)))

recommender performance: 69
random performance: 69
benchmark performance: 81
