In [1]:
# Importing packages and viewing training data
import pandas as pd
import numpy as np
from scipy.stats import bernoulli
import copy
training_data = pd.read_csv("training_data.csv")
training_data.head(20)

Unnamed: 0,user_id,anime_id,rating
0,257619,31043,10
1,257619,5958,8
2,257619,28171,9
3,257619,16498,7
4,257619,205,10
5,257619,30015,8
6,257619,13601,8
7,257619,2251,8
8,257619,30654,8
9,257619,24833,8


In [2]:
# Converting data points from list format to matrix format
# data_points conntains training data in matrix form. It has 5000 rows each corresponding to one user.
# In the array "indices", if a  cell contains value i != 0 and has index j, then column j of data_points corrresponds to ratings given to anime with anime_id i
anime_ids = training_data['anime_id'].unique()
anime_ids = np.sort(anime_ids)

indices = np.zeros(np.amax(anime_ids)+1)

for i in range(anime_ids.size):
    indices[anime_ids[i]] = i
    
data_points = np.zeros([5000,anime_ids.size])

counter = 0

training_data = training_data.append(pd.DataFrame([[0,0,11]], columns = ['user_id','anime_id','rating']),ignore_index = True)

for i in range(5000):
    if training_data['rating'].iloc[counter] == 11:
        break
    while 1:
        data_points[i,int(indices[training_data['anime_id'].iloc[counter]])] = training_data['rating'].iloc[counter]
        counter += 1
        if training_data['user_id'].iloc[counter] != training_data['user_id'].iloc[counter+1]:
            break

In [3]:
# Defining function to count number of common views a given point has with each training point

def stats(point):
    a = np.zeros(5000)
    for i in range(5000):
        for j in range(point.size):
            if point[j] > 0 and data_points[i, j] > 0:
                a[i] += 1
    return a

In [4]:
# Defining recommmender function
# We find k neareset neighbours with data point which have atleast 12 common views. 
# Then we take average of the anime scores of those k users.
# We recommend any show which has thus calculated average greater than 8
# We do not recommend shows to users which have less than k users with whom common views are greater than 12

def recommend(point):
    common_views = stats(point)
    favourable_users = 0
    k = 7
    for i in range(5000):
        if common_views[i] >= 12:
            favourable_users += 1
    if favourable_users < k:
        return np.zeros(point.size)
    distance = 1000*np.ones(5000)
    for i in range(5000):
        if common_views[i] >= 12:
            distance[i] = 0
            count = 0
            for j in range(point.size):
                if data_points[i, j] > 0:
                    distance[i] += (data_points[i, j] - point[j])**2
                    count += 1
            distance[i] = distance[i]/count
    nearest_neighbours = np.argpartition(distance,-k)[-k:]
    recommendations = np.zeros(point.size)
    for i in range(point.size):
        if point[i] == 0:
            count = 0
            average = 0
            for j in range(k):
                if data_points[nearest_neighbours[j], i] > 0:
                    average += data_points[nearest_neighbours[j], i]
                    count += 1
            if count > 0:
                average = average/count
                if average > 8:
                    recommendations[i] = 1
    return recommendations

In [5]:
# We define a benchmark recommender. 
# We recommend any show which an average score greater than 8 among all data_points
benchmark = np.zeros(anime_ids.size)
for i in range(anime_ids.size):
    average = np.sum(data_points[:,i])/np.count_nonzero(data_points[:,i])
    if average > 8:
        benchmark[i] = 1

In [6]:
# We do processing on validation points to set hyperparameters
# We are masking validation point with probability 0.7 so that we can recommend shows based on 70% data and check with rest 30%
validation_data = pd.read_csv("validation_data.csv")
validation_points = np.zeros([1000,anime_ids.size])
counter = 0

validation_data = validation_data.append(pd.DataFrame([[0,0,11]], columns = ['user_id','anime_id','rating']),ignore_index = True)

for i in range(1000):
    if validation_data['rating'].iloc[counter] == 11:
        break
    while 1:
        validation_points[i,int(indices[validation_data['anime_id'].iloc[counter]])] = validation_data['rating'].iloc[counter]
        counter += 1
        if validation_data['user_id'].iloc[counter] != validation_data['user_id'].iloc[counter+1]:
            break

masked_validation_points = copy.deepcopy(validation_points)
for i in range(1000):
    a = bernoulli.rvs(p = 0.7, size = anime_ids.size)
    masked_validation_points[i,:] = validation_points[i,:]*a

In [7]:
# We give recommendations to each masked validation point and give recommendations
# We then test performance of model
# We also check benchmark performance and performance of a random recommender to see how well our recommender is performing

# Giving recommendations
recommendations = np.zeros([1000,anime_ids.size])
for i in range(1000):
    recommendations[i,:] = recommend(masked_validation_points[i,:])

# Checking recommender performance
good_recommendations = 0
bad_recommendations = 0
neutral_recommendations = 0
for i in range(1000):
    for j in range(anime_ids.size):
        if recommendations[i,j] == 1 and validation_points[i,j] > 0:
            if validation_points[i,j] >= 8:
                good_recommendations += 1
            elif validation_points[i,j] < 4:
                bad_recommendations += 1
            elif validation_points[i,j] in range(4,8):
                neutral_recommendations += 1

print("recommender performance:", round(100.0*good_recommendations/(good_recommendations+bad_recommendations+neutral_recommendations)))

# Checking random performance
good = 0
bad = 0
neutral = 0
for i in range(1000):
    for j in range(anime_ids.size):
        if validation_points[i,j] > 0 and masked_validation_points[i,j] == 0:
            if validation_points[i,j] >= 8:
                good += 1
            elif validation_points[i,j] < 4:
                bad += 1
            elif validation_points[i,j] in range(4,8):
                neutral += 1

print("random performance:", round(100.0*good/(good+bad+neutral)))

# Checking benchmark perfoormance
benchmark_good = 0
benchmark_bad = 0
benchmark_neutral = 0
for i in range(1000):
    for j in range(anime_ids.size):
        if validation_points[i,j] > 0 and masked_validation_points[i,j] == 0 and benchmark[j] == 1:
            if validation_points[i,j] >= 8:
                benchmark_good += 1
            elif validation_points[i,j] < 4:
                benchmark_bad += 1
            elif validation_points[i,j] in range(4,8):
                benchmark_neutral += 1

print("benchmark performance:", round(100.0*benchmark_good/(benchmark_good+benchmark_bad+benchmark_neutral)))

# Checking how many users out of 1000, our recommender gave recommendations to
usefulness = np.count_nonzero(np.sum(recommendations, axis = 1))
print("usefulness:", round(0.1*usefulness))


recommender performance: 74
random performance: 68
benchmark performance: 79
usefulness: 96


In [8]:
# Preprocessing testing data just like validation data
testing_data = pd.read_csv("testing_data.csv")
testing_points = np.zeros([1000,anime_ids.size])
counter = 0

testing_data = testing_data.append(pd.DataFrame([[0,0,11]], columns = ['user_id','anime_id','rating']),ignore_index = True)

for i in range(1000):
    if testing_data['rating'].iloc[counter] == 11:
        break
    while 1:
        testing_points[i,int(indices[testing_data['anime_id'].iloc[counter]])] = testing_data['rating'].iloc[counter]
        counter += 1
        if testing_data['user_id'].iloc[counter] != testing_data['user_id'].iloc[counter+1]:
            break

masked_testing_points = copy.deepcopy(testing_points)
for i in range(1000):
    a = bernoulli.rvs(p = 0.7, size = anime_ids.size)
    masked_testing_points[i,:] = testing_points[i,:]*a

In [9]:
# Giving recommendations to each testing point and measuring performance just as in case of validation points

# Giving recommendations
recommendations = np.zeros([1000,anime_ids.size])
for i in range(1000):
    recommendations[i,:] = recommend(masked_testing_points[i,:])

# Checking recommender performance
good_recommendations = 0
bad_recommendations = 0
neutral_recommendations = 0
for i in range(1000):
    for j in range(anime_ids.size):
        if recommendations[i,j] == 1 and testing_points[i,j] > 0:
            if testing_points[i,j] >= 8:
                good_recommendations += 1
            elif testing_points[i,j] < 4:
                bad_recommendations += 1
            elif testing_points[i,j] in range(4,8):
                neutral_recommendations += 1

print("recommender performance:", round(100.0*good_recommendations/(good_recommendations+bad_recommendations+neutral_recommendations)))

# Checking random performance
good = 0
bad = 0
neutral = 0
for i in range(1000):
    for j in range(anime_ids.size):
        if testing_points[i,j] > 0 and masked_testing_points[i,j] == 0:
            if testing_points[i,j] >= 8:
                good += 1
            elif testing_points[i,j] < 4:
                bad += 1
            elif testing_points[i,j] in range(4,8):
                neutral += 1

print("random performance:", round(100.0*good/(good+bad+neutral)))

# Checking benchmark performance
benchmark_good = 0
benchmark_bad = 0
benchmark_neutral = 0
for i in range(1000):
    for j in range(anime_ids.size):
        if testing_points[i,j] > 0 and masked_testing_points[i,j] == 0 and benchmark[j] == 1:
            if testing_points[i,j] >= 8:
                benchmark_good += 1
            elif testing_points[i,j] < 4:
                benchmark_bad += 1
            elif testing_points[i,j] in range(4,8):
                benchmark_neutral += 1

print("benchmark performance:", round(100.0*benchmark_good/(benchmark_good+benchmark_bad+benchmark_neutral)))

# Checking how many users out of 1000, our recommender gave recommendations to
usefulness = np.count_nonzero(np.sum(recommendations, axis = 1))
print("usefulness:", round(0.1*usefulness))


recommender performance: 75
random performance: 69
benchmark performance: 80
usefulness: 96


In [10]:
# Printing recommendations for a user to practically see the recommendations given to him
recom = recommend(testing_points[10,:])
for i in range(anime_ids.size):
    if recom[i] == 1:
        print(np.where(indices == i))

(array([6]),)
(array([20]),)
(array([71]),)
(array([164]),)
(array([199]),)
(array([245]),)
(array([269]),)
(array([431]),)
(array([523]),)
(array([1535]),)
(array([1575]),)
(array([1818]),)
(array([2025]),)
(array([2476]),)
(array([2904]),)
(array([6045]),)
(array([6213]),)
(array([9041]),)
(array([9756]),)
(array([11111]),)
(array([11499]),)
(array([11771]),)
(array([11887]),)
(array([13759]),)
(array([14813]),)
(array([15225]),)
(array([15451]),)
(array([16894]),)
(array([17895]),)
(array([18115]),)
(array([18679]),)
(array([18897]),)
(array([19815]),)
(array([23283]),)
(array([23289]),)
(array([25013]),)
(array([26243]),)
(array([28223]),)
(array([28891]),)
(array([32935]),)
