In [None]:
import numpy as np
import pandas as pd
from more_itertools import chunked
from pmf import ProbabilisticMatrixFactorization

In [None]:
# Parse the four training files into an in-memory ratings table.
#
# Line handling, as implemented below:
#   * a line with no comma is treated as a movie header; its surrounding ':'
#     and newline characters are stripped to obtain the movie id
#   * a line with exactly three comma-separated fields is treated as a rating
#     for the most recent movie header: field 0 is the user id, field 1 is the
#     integer rating, field 2 is ignored
#   * any other line is ignored
file_name = '../archive/combined_data'
movies = []          # movie ids, in the order their header lines were read
user_ratings = {}    # user id (str) -> {movie id (str): rating (int)}
current_movie = None

for i in range(1, 5):
    with open(f'{file_name}_{i}.txt', 'r') as current_file:
        print(f'Processing combined_data_{i}.txt...')
        j = 0  # stays 0 for an empty file so the summary line below still works
        # Iterate over the file object instead of readlines() so lines are
        # streamed rather than the whole file being held in a list first.
        for j, line in enumerate(current_file, start=1):
            if j % 5_000_000 == 0:
                print(f'\t{j} lines processed...')
            if not line.strip():
                # A blank line would otherwise split into a single field and be
                # registered as a movie with an empty id.
                continue
            elems = line.split(',')
            if len(elems) == 1:
                movie = elems[0].strip(':\n')
                movies.append(movie)
                current_movie = movie
            elif len(elems) == 3:
                user_id, user_rating = elems[0], int(elems[1])
                # setdefault creates the per-user dict on first sight and
                # returns the existing one afterwards.
                user_ratings.setdefault(user_id, {})[current_movie] = user_rating
        print(f'\t{j} total lines processed.')

In [None]:
# Number of distinct users seen in the training files. len() on the dict
# counts its keys directly, without building a temporary list of every key.
N = len(user_ratings)
print(f'Number of total users: {N}')

In [None]:
# Parse the probe file into {movie id: [entries listed under that movie]}.
#
# A line ending in ':' opens a new movie section; every other non-blank line
# is recorded under the most recently opened section (this notebook treats
# those entries as user ids).
test_file_name = '../archive/probe.txt'
test_movies = {}
current_movie = None

with open(test_file_name, 'r') as test_file:
    # Stream the file line by line instead of loading it all with readlines().
    for line_number, line in enumerate(test_file, start=1):
        # Strip surrounding whitespace so that entries carry no trailing
        # newline (they must compare equal to the newline-free user ids used
        # as keys of user_ratings) and so that a final line without '\n' or a
        # CRLF-terminated line is still recognised.
        entry = line.strip()
        if not entry:
            continue
        if entry.endswith(':'):
            current_movie = entry.strip(':')
            test_movies[current_movie] = []
        elif current_movie is None:
            raise ValueError(
                f'{test_file_name}: line {line_number} appears before any movie header'
            )
        else:
            test_movies[current_movie].append(entry)

# Every distinct entry that appears under any movie in the probe file.
test_users = {user for users in test_movies.values() for user in users}

In [None]:
# For each batch of users, build a dense (users in batch) x (movies) rating
# matrix -- 0 marks "no rating" -- and fit a PMF model on it.
#
# NOTE(review): a new model is created for every batch and only the last one
# remains bound to `pmf` after the loop -- confirm this is intended.
# NOTE(review): the matrix is dense, so its size is len(chunk) * len(movies)
# integers per batch; lower batch_size if memory is tight.
batch_size = 100_000

# Column of each movie id in the matrix, following the order of `movies`.
# Assumes movie ids in `movies` are unique (one header line per movie).
movie_column = {movie: column for column, movie in enumerate(movies)}

j = 1
for chunk in chunked(user_ratings.keys(), batch_size):
    print(f'Processing batch number {j} -> {len(chunk)} users')
    # Preallocate zeros and write only the ratings that exist, instead of
    # testing every movie for every user and growing nested Python lists.
    # dtype=int matches what np.array() infers from a list of Python ints.
    data = np.zeros((len(chunk), len(movies)), dtype=int)
    for i, key in enumerate(chunk):
        for movie, rating in user_ratings[key].items():
            column = movie_column.get(movie)
            # Ratings keyed by something that is not in `movies` have no
            # column and are left out.
            if column is not None:
                data[i, column] = rating
        if (i+1) % 10_000 == 0:
            print(f'\t{i+1} users processed...')
    pmf = ProbabilisticMatrixFactorization(D=10, sigma=0.1, sigma_u=0.1, sigma_v=0.1, max_epochs=10)
    print('Fitting new values with PMF model...')
    pmf.fit(data)
    print(f'Batch {j} processed!')
    j += 1