In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the anime metadata table; the path is relative to the notebook's working directory.
train_data_anime = pd.read_csv('./dataset/anime.csv')

In [3]:
# Drop the descriptive columns that this notebook does not turn into features.
train_data_anime = train_data_anime.drop(
    columns=["Name", "English name", "Date of premiere", "When aired", "Duration"]
)

In [4]:
# Not the last expression of the cell, so this result is computed and discarded.
train_data_anime['Avg score'].value_counts()
# Substitute '5.00' for the 'Unknown' marker and parse every entry as a float in one pass.
train_data_anime['Avg score'] = train_data_anime['Avg score'].apply(
    lambda raw: float('5.00' if raw == 'Unknown' else raw)
)

In [5]:
def ListToVec(names_list, data):
    """Multi-hot encode a comma-separated string against ``names_list``.

    Every space is removed from ``data`` before it is split on commas, so
    ``names_list`` must hold the space-free spelling of each name.

    Args:
        names_list: list of known names; a name's position is its slot in the vector.
        data: comma-separated string of names.

    Returns:
        A list of ``len(names_list)`` ints with 1 at the slot of every name in ``data``.

    Raises:
        ValueError: if ``data`` contains a name that is not in ``names_list``.
    """
    vector = [0] * len(names_list)
    for token in data.replace(' ', '').split(','):
        vector[names_list.index(token)] = 1
    return vector

def PreprocessList(data, column_name):
    """Replace a comma-separated string column with multi-hot vectors, in place.

    The vocabulary is collected from the module-level ``train_data_anime``
    frame (not from ``data``), after removing every space from each entry.

    Args:
        data: DataFrame whose ``column_name`` column is overwritten.
        column_name: name of the column holding comma-separated strings.

    Raises:
        ValueError: (from ``ListToVec``) if ``data`` holds a name that does not
            occur in ``train_data_anime[column_name]``.
    """
    values = set()
    for row in train_data_anime[column_name]:
        values.update(row.replace(' ', '').split(','))

    # Sort the vocabulary: iteration order of a set of strings depends on the
    # interpreter's hash seed, so an unsorted list would assign different vector
    # slots on every kernel restart.
    values_list = sorted(values)
    data[column_name] = data[column_name].apply(lambda x: ListToVec(values_list, x))

def PreprocessColumn(data, column_name):
    """Replace a categorical column with integer codes, in place.

    The set of categories is collected from the module-level
    ``train_data_anime`` frame (not from ``data``).

    Args:
        data: DataFrame whose ``column_name`` column is overwritten.
        column_name: name of the categorical column.

    Raises:
        KeyError: if ``data`` holds a value that does not occur in
            ``train_data_anime[column_name]``.
    """
    values = set(train_data_anime[column_name])
    # Sort (by string form, so mixed types still compare) to make the codes
    # reproducible: plain set order changes with the interpreter's hash seed.
    values_list = sorted(values, key=str)
    # A dict gives O(1) lookups instead of a linear list.index() per row.
    code_of = {value: code for code, value in enumerate(values_list)}
    data[column_name] = data[column_name].apply(lambda x: code_of[x])

In [6]:
# Single-valued categorical columns become integer codes.
for column in ('Type', 'Age limit', 'Based on'):
    PreprocessColumn(train_data_anime, column)

# Comma-separated multi-valued columns become multi-hot vectors.
for column in ('Genres', 'Producers', 'Licensors', 'Studios'):
    PreprocessList(train_data_anime, column)

In [7]:
# Collapse the ten per-score vote columns into one total, 'Score count'.
train_data_anime['Score count'] = float(0)

for score in range(1, 11):
    column = 'Score-' + str(score)
    # 'Unknown' counts as zero votes; everything else is parsed as a float.
    votes = train_data_anime[column].apply(lambda x: float('0.0' if x == 'Unknown' else x))
    train_data_anime['Score count'] += votes
    train_data_anime = train_data_anime.drop(column, axis=1)

In [8]:
# Substitute '36' for the 'Unknown' marker and parse every entry as an int in one pass.
train_data_anime['Num episodes'] = train_data_anime['Num episodes'].apply(
    lambda raw: int('36' if raw == 'Unknown' else raw)
)

In [9]:
# Names of the columns that are still stored with dtype 'object' at this point.
categorical_columns = list(
    filter(lambda name: train_data_anime[name].dtype == 'object', train_data_anime.columns)
)

In [10]:
def NormalizeMinMax(data, name):
    """Min-max scale ``data[name]`` to the range [0, 1], in place.

    Args:
        data: DataFrame that owns the column; the column is overwritten.
        name: name of a numeric column.

    A column whose values are all equal is mapped to 0.0 instead of the NaN
    that the plain formula would produce through 0 / 0.
    """
    column = data[name]
    low = column.min()
    span = column.max() - low
    if span == 0:
        # Constant column: keep the float dtype the division would have produced.
        data[name] = (column - low).astype(float)
    else:
        data[name] = (column - low) / span
# Scale every numeric feature column to [0, 1]. Each column is listed once:
# the original cell normalised 'Finished' twice, and the second pass was a no-op.
for column in (
    'Avg score',
    'Group members',
    'Num episodes',
    'In list',
    'In favourites',
    'Watching',
    'Finished',
    'On hold',
    'Dropped',
    'Score count',
):
    NormalizeMinMax(train_data_anime, column)

In [11]:
# Load the per-user rating records (path relative to the working directory)
# and preview the first rows.
train_data_users = pd.read_csv('./dataset/user_ratings.csv')
train_data_users.head()

Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes
0,0,121,8,1,0
1,0,12815,10,1,3
2,0,3588,9,1,13
3,0,392,8,3,0
4,1,1575,8,2,25


In [12]:
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch
import os
import skorch
import copy
from skorch import NeuralNetRegressor
from skorch import NeuralNet
from functools import reduce
# Use the first GPU when one is available; otherwise fall back to the CPU so
# that every later `.to(device)` call still works on a machine without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [13]:
class EncodeModel(nn.Module):
    """Fully connected autoencoder: input -> hidden -> code -> hidden -> input.

    ``encode`` ends in a ReLU, so codes are non-negative; ``decode`` ends in a
    Sigmoid, so reconstructions lie in [0, 1].
    """

    def __init__(self, input_size, hidden_size, code_size):
        """Build the two halves.

        Args:
            input_size: width of the vectors to reconstruct.
            hidden_size: width of the intermediate layer in both halves.
            code_size: width of the compressed representation.
        """
        super().__init__()
        self.input_size, self.hidden_size, self.code_size = input_size, hidden_size, code_size

        # Compressing half.
        encoder_layers = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, code_size),
            nn.ReLU(),
        ]
        self.encode = nn.Sequential(*encoder_layers)

        # Reconstructing half, mirroring the widths of the encoder.
        decoder_layers = [
            nn.Linear(code_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, input_size),
            nn.Sigmoid(),
        ]
        self.decode = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        return self.decode(self.encode(x))

    def encodeData(self, x):
        """Return only the compressed code of ``x``."""
        code = self.encode(x)
        return code

def encode_cloumn(data, name, epochs = 10, predcompr = 4, compress = 6):
    """Replace the vector column ``data[name]`` with autoencoder codes, in place.

    An ``EncodeModel`` is trained with skorch to reconstruct the column's own
    vectors (input and target are the same tensor); afterwards each row is
    replaced by the output of the encoder half.

    Args:
        data: DataFrame whose ``name`` column holds equal-length numeric lists.
        name: column to compress.
        epochs: number of training epochs.
        predcompr: the hidden layer width is ``input_size // predcompr``.
        compress: the code width is ``input_size // compress``.
    """
    # .iloc[0] is positional, so this also works when index label 0 is absent.
    input_size = len(data[name].iloc[0])
    encoder = NeuralNet(
        EncodeModel,
        module__input_size = input_size,
        # Never build a zero-width layer for short input vectors.
        module__hidden_size = max(1, input_size // predcompr),
        module__code_size = max(1, input_size // compress),
        optimizer=torch.optim.Adam,
        max_epochs=epochs,
        lr=0.001,
        criterion=nn.MSELoss,
        # Shuffle training data on each epoch
        iterator_train__shuffle=True,
        batch_size=32,
        device=device
    )
    data_tensor = torch.tensor(data[name].tolist(), dtype=torch.float).to(device)
    encoder.fit(data_tensor, data_tensor)

    # Inference only: skip autograd bookkeeping for the full-dataset pass.
    encoder.module_.eval()
    with torch.no_grad():
        res = encoder.module_.encodeData(data_tensor)
    # One 1-D numpy array per row, as before.
    data[name] = list(res.cpu().numpy())

In [14]:
# Compress the multi-hot columns; each column gets its own epoch count and compression ratios.
encode_cloumn(train_data_anime, 'Genres', epochs=40)
encode_cloumn(train_data_anime, 'Producers', epochs=11, predcompr=10, compress=20)
encode_cloumn(train_data_anime, 'Studios', epochs=7, predcompr=10, compress=20)

  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1        [36m0.1236[0m        [32m0.0927[0m  1.0402
      2        [36m0.0582[0m        [32m0.0834[0m  0.8596
      3        [36m0.0508[0m        [32m0.0764[0m  0.8480
      4        [36m0.0426[0m        [32m0.0673[0m  0.8432
      5        [36m0.0387[0m        [32m0.0648[0m  0.8720
      6        [36m0.0357[0m        [32m0.0595[0m  0.8622
      7        [36m0.0324[0m        [32m0.0561[0m  0.8567
      8        [36m0.0311[0m        [32m0.0548[0m  0.8489
      9        [36m0.0303[0m        [32m0.0543[0m  0.8517
     10        [36m0.0298[0m        [32m0.0541[0m  0.8549
     11        [36m0.0294[0m        [32m0.0539[0m  0.8512
     12        [36m0.0290[0m        [32m0.0531[0m  0.8438
     13        [36m0.0287[0m        [32m0.0528[0m  0.8485
     14        [36m0.0283[0m        [32m0.0521[0m  0.8499
     15        [36m0.0279[0m        [32m0

In [15]:
# Remove, in one call, the columns that are not fed to the rating model.
train_data_anime = train_data_anime.drop(
    columns=[
        'Type',
        'Based on',
        'Age limit',
        'In list',
        'Group members',
        'Watching',
        'On hold',
        'Planning to watch',
        'Num episodes',
        'Licensors',
    ]
)

In [16]:
# Preview the remaining feature columns after encoding and dropping.
train_data_anime.head()

Unnamed: 0,Id,Avg score,Genres,Producers,Studios,In favourites,Finished,Dropped,Score count
0,1,0.944142,"[0.78121674, 7.308255, 10.646358, 0.49348742, ...","[0.6279943, 0.7098011, 0.26429427, 0.6122097, ...","[0.0, 2.50786, 0.77700853, 4.7707176, 0.0, 0.0...",0.336956,0.329041,0.152699,0.351294
1,5,0.891008,"[1.8854098, 6.075355, 10.155744, 0.0, 0.0, 8.9...","[0.78828627, 0.83088404, 0.42312813, 0.7712970...","[0.0, 2.4009078, 0.50481653, 4.6601853, 0.0, 0...",0.006383,0.095452,0.004407,0.087781
2,6,0.870572,"[0.082239255, 5.939074, 12.262469, 0.0, 0.0, 9...","[0.45875385, 0.72369117, 0.42661017, 0.5238245...","[0.0, 3.7779198, 0.8860091, 1.2290316, 0.0, 0....",0.070381,0.157378,0.079704,0.156647
3,7,0.73842,"[1.1217325, 0.0, 6.7350698, 1.996226, 0.0, 3.0...","[1.1320529, 1.4605576, 1.1861373, 1.3307332, 1...","[0.0, 2.50786, 0.77700853, 4.7707176, 0.0, 0.0...",0.003192,0.021152,0.030782,0.021402
4,8,0.69891,"[3.6080043, 5.4485984, 2.9694827, 3.853971, 0....","[0.8291424, 1.2370769, 1.093889, 1.1105534, 0....","[0.0, 0.0, 0.28164282, 6.045031, 0.0, 0.0, 2.8...",9.8e-05,0.003351,0.006342,0.003242


In [17]:
# Map each anime to one flat feature list. Every itertuples() row is flattened
# (numpy-array cells are expanded element by element); the flattened element at
# position 1 becomes the dictionary key and elements from position 3 onward
# become the features, so positions 0 and 2 are left out.
data_anime = {}
for record in train_data_anime.itertuples():
    flat = []
    for field in record:
        if type(field) is np.ndarray:
            flat.extend(field)
        else:
            flat.append(field)

    data_anime[flat[1]] = flat[3:]

In [18]:
# One list per user id (the list position is the user id). Each entry is a
# [row[2], row[3]] pair -- presumably [anime_id, rating], going by the column
# order shown in the preview above. Rows whose row[3] is 0 are skipped.
user_rating_list = [[] for _ in range(max(train_data_users['user_id']) + 1)]
for record in train_data_users.itertuples():
    owner, item, score = record[1], record[2], record[3]
    if score == 0:
        continue
    user_rating_list[owner].append([item, score])

In [19]:
# Number of leading entries of user_rating_list used to build the training set below.
user_count = 70000

In [20]:
# Build the training triples (user index, anime feature vector, target).
#
# Targets are each user's ratings min-max scaled to [0, 1] with that user's own
# minimum and maximum. A user whose ratings are all equal has no spread to
# scale by; the original code stored the raw rating (e.g. 8) for them, which is
# outside the range the Sigmoid-terminated model can output and distorts the
# MSE loss. Such ratings are now placed on the global scale instead.
# NOTE(review): assumes ratings run from 1 to 10, consistent with the
# 'Score-1'..'Score-10' columns and the `9 * x + 1` mapping used when writing
# the submission -- confirm.
targets = []
user_input = list()
anime_input = list()

for i, user in enumerate(user_rating_list[:user_count]):
    if not user:
        continue

    ratings = [pair[1] for pair in user]
    min_value = min(ratings)
    spread = max(ratings) - min_value

    for pair in user:
        if spread != 0:
            targets.append((pair[1] - min_value) / spread)
        else:
            targets.append(min(1.0, max(0.0, (pair[1] - 1) / 9)))

        # The user's position in user_rating_list doubles as the embedding index.
        user_input.append(i)
        anime_input.append(data_anime[pair[0]])


In [21]:
# Move the training data to the compute device. User indices and targets are
# reshaped into single-column 2-D tensors; the anime features keep one row per sample.
tensor_user_input = torch.tensor(user_input).reshape(-1, 1).to(device)
tensor_anime_input = torch.tensor(anime_input).to(device)
tensor_output = torch.tensor(targets).reshape(-1,1).to(device)

# Display the three shapes as a sanity check (the first dimensions should match).
tensor_user_input.shape, tensor_anime_input.shape, tensor_output.shape

(torch.Size([5510475, 1]),
 torch.Size([5510475, 101]),
 torch.Size([5510475, 1]))

In [22]:
# Width of the learned per-user embedding vector; read by DiffModel.__init__.
embedding_size = 50

In [23]:
class DiffModel(nn.Module):
    """Rating predictor: a user embedding concatenated with anime features, fed to a small MLP.

    The network ends in a Sigmoid, so every prediction lies in [0, 1].
    """

    def __init__(self, vocabulary_size=1100000, user_embedding_len=None, anime_embedding_len=101):
        """Build the embedding table and the MLP head.

        All arguments are optional, so ``DiffModel()`` builds the same network
        as before.

        Args:
            vocabulary_size: number of rows in the user embedding table; every
                user index passed to ``forward`` must be smaller than this.
            user_embedding_len: width of a user embedding. ``None`` reads the
                module-level ``embedding_size``.
            anime_embedding_len: width of the anime feature vectors.
        """
        super().__init__()
        if user_embedding_len is None:
            user_embedding_len = embedding_size

        self.embedding_vocabulary_size = vocabulary_size
        self.user_embedding_len = user_embedding_len
        self.anime_embedding_len = anime_embedding_len
        self.len = self.user_embedding_len + self.anime_embedding_len

        self.sequen = nn.Sequential(
            nn.Linear(self.len, self.len // 10),
            nn.PReLU(),
            nn.Linear(self.len // 10, 1),
            nn.Sigmoid()
        )

        self.embedding = nn.Embedding(self.embedding_vocabulary_size, self.user_embedding_len)

    def forward(self, user_id, anime_embedding):
        """Predict one score per row.

        Args:
            user_id: integer tensor of user indices; the embedding lookup is
                reshaped to (-1, user_embedding_len), so one index per row is expected.
            anime_embedding: float tensor with ``anime_embedding_len`` columns.

        Returns:
            Tensor with one column and one row per sample, values in [0, 1].
        """
        user_embedding = self.embedding(user_id).reshape(-1, self.user_embedding_len)
        # Concatenate along the feature axis to match the input width of `sequen`.
        return self.sequen(torch.cat((user_embedding, anime_embedding), 1))

In [24]:
# skorch wrapper around DiffModel: Adam on mean-squared error, mini-batches of 32
# reshuffled every epoch, six epochs, trained on `device`.
diffnet = NeuralNet(
    DiffModel,
    criterion=nn.MSELoss,
    optimizer=torch.optim.Adam,
    lr=0.01,
    max_epochs=6,
    batch_size=32,
    iterator_train__shuffle=True,
    device=device,
)

In [25]:
# The dictionary keys must match the argument names of DiffModel.forward.
X = dict(user_id=tensor_user_input, anime_embedding=tensor_anime_input)

diffnet.fit(X, tensor_output)

<class 'skorch.net.NeuralNet'>[initialized](
  module_=DiffModel(
    (sequen): Sequential(
      (0): Linear(in_features=151, out_features=15, bias=True)
      (1): PReLU(num_parameters=1)
      (2): Linear(in_features=15, out_features=1, bias=True)
      (3): Sigmoid()
    )
    (embedding): Embedding(1100000, 50)
  ),
)

In [26]:
# Load the submission template; its 'Id' column is parsed in the next cell.
submission = pd.read_csv('./dataset/sample_submission.csv')

In [27]:
# Each 'Id' holds two space-separated integers; calcscore later receives them
# as (id_user, id_anime).
subm = [
    (int(parts[0]), int(parts[1]))
    for parts in (el.split(' ') for el in submission['Id'])
]

In [28]:
def calcscore(id_user, id_anime, i):
    """Predict the rating ``id_user`` would give ``id_anime``.

    Args:
        id_user: position of the user in ``user_rating_list``.
        id_anime: value of the anime's 'Id' column.
        i: unused; kept so existing calls keep working.

    Returns:
        * For a user with at most one recorded rating: the anime's normalised
          'Avg score' as a plain ``float`` (previously a one-element pandas
          Series, which forced an implicit Series-to-int conversion later on).
        * Otherwise: a numpy array holding the model output rescaled to the
          user's own [min, max] rating range.

    Raises:
        KeyError: if ``id_anime`` is not present in ``train_data_anime`` or
            ``data_anime``.

    NOTE(review): only the first ``user_count`` users were seen during training,
    so embeddings of later user ids are presumably still at their random
    initial values -- confirm whether such users should use the fallback too.
    """
    user = user_rating_list[id_user]
    if len(user) <= 1:
        # Too little history to rescale a prediction: fall back to the global average.
        scores = train_data_anime.loc[train_data_anime['Id'] == id_anime, 'Avg score']
        if scores.empty:
            raise KeyError('anime id {} not found in train_data_anime'.format(id_anime))
        return float(scores.iloc[0])

    ratings = [pair[1] for pair in user]
    min_value = min(ratings)
    max_value = max(ratings)

    anime_input = torch.tensor(data_anime[id_anime]).reshape(1, -1).to(device)
    user_input = torch.tensor([id_user]).reshape(1, -1).to(device)

    pred = diffnet.predict({'user_id': user_input, 'anime_embedding': anime_input})

    # Undo the per-user min-max scaling that was applied to the training targets.
    return min_value + pred * (max_value - min_value)

# Score every (user, anime) pair of the submission, printing the running count
# after each block of 10000 pairs.
result = []
progr = 0
for id_user, id_anime in subm:
    result.append(calcscore(id_user, id_anime, progr))
    progr += 1
    if not progr % 10000:
        print(progr)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000


In [29]:
# Turn raw calcscore results into integer ratings.
#  * numpy arrays are model predictions already on the user's rating scale;
#  * anything else is a normalised average score in [0, 1], mapped back with
#    9 * x + 1.
# The scalar is extracted explicitly, so this works whether the fallback value
# arrives as a float or as a one-element pandas Series (calling int() directly
# on a Series is deprecated in recent pandas).
proceed = []
for el in result:
    if isinstance(el, np.ndarray):
        proceed.append(int(round(float(el.take(0)))))
    else:
        average = float(np.asarray(el, dtype=float).reshape(-1)[0])
        proceed.append(int(round(9 * average + 1)))

In [30]:
# Number of rows in the submission template.
len(submission)

876529

In [31]:
# Write the predictions as 'user anime,rating' lines. The `with` block closes
# the file even if a write fails, and the loop variable no longer shadows the
# builtin `id`.
with open("submission.csv", "w") as file:
    file.write('Id,rating\n')
    for pair_id, rait in zip(subm, proceed):
        file.write('{} {},{}\n'.format(pair_id[0], pair_id[1], rait))