In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm 

# Directory holding beta.tsv, theta.tsv and test_users.tsv. File names are
# appended by plain string concatenation, so the trailing slash is required.
path = 'eva_netflix_full/'

def load_factors(file_path):
    """Read a tab-separated factor file into an ``{id: vector}`` dict.

    Every non-blank line is stripped of surrounding whitespace and split on
    tabs. The second field (index 1) is parsed as the integer id, and every
    field from the third onward as one float component of that id's vector.
    The first field is ignored.

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        dict mapping each int id to a 1-D float ``np.ndarray``, in order of
        first appearance. If an id occurs on several lines, the last
        occurrence wins.

    Raises:
        IndexError: a non-blank line has fewer than two tab-separated fields.
        ValueError: an id or a component cannot be parsed as a number.
    """
    factors_by_id = {}
    with open(file_path, 'r') as f:
        for line in f:
            stripped = line.strip()
            # Blank / whitespace-only lines (e.g. a stray empty line at the
            # end of the file) carry no record; skip them instead of failing
            # on the missing id field.
            if not stripped:
                continue

            parts = stripped.split('\t')
            obj_id = int(parts[1])
            factors_by_id[obj_id] = np.array([float(x) for x in parts[2:]])
    return factors_by_id

# Factor tables, each mapping an id to its factor vector:
# theta is indexed by user id, beta by item id.
theta = load_factors(path + 'theta.tsv')
beta = load_factors(path + 'beta.tsv')

# Ids of the users to score: first column of the headerless test-user file.
test_users_df = pd.read_csv(path + 'test_users.tsv', sep='\t', header=None)
test_user_ids = test_users_df[0].tolist()

# Score only test users that also have a factor vector. The two id sets are
# expected to coincide; intersecting them is a safeguard that can also
# shorten the run.
test_users = theta.keys() & set(test_user_ids)

# Every item that has a factor vector is a ranking candidate.
all_items = [*beta]

# Accumulator filled by the prediction cell.
predictions = []


In [2]:
# Number of users that will be scored (ids present in both theta and the test-user file).
len(test_users)

139785

In [3]:
# Score every (test user, item) pair in memory, then dump the table to
# ranking.tsv as headerless "user_id<TAB>item_id<TAB>prediction" rows.
#
# NOTE: one dict is kept per pair, so memory grows with
# len(test_users) * len(all_items). On a large user set this may not fit in
# RAM; writing rows to the file as they are computed avoids that.

# Start from an empty list on every run. `predictions` is also created in an
# earlier cell, so without this reset an interrupted or repeated run of this
# cell would leave stale rows behind and duplicate them in the output.
predictions = []

for user_id in tqdm(test_users, desc="Processing Users"):
    user_factor = theta[user_id]

    # Predicted score = dot product of the user and item factor vectors.
    predictions.extend(
        {
            'user_id': user_id,
            'item_id': item_id,
            'prediction': np.dot(user_factor, beta[item_id]),
        }
        for item_id in all_items
    )

# Pin the column order explicitly so the file layout is fixed even when
# there is nothing to score.
df = pd.DataFrame(predictions, columns=['user_id', 'item_id', 'prediction'])
df.to_csv('ranking.tsv', sep='\t', index=False, header=False)

print(f"Generated {len(predictions)} predictions!")

Processing Users:   6%|▌         | 8597/139785 [00:35<08:58, 243.44it/s]


KeyboardInterrupt: 

In [None]:
# Streaming variant for large outputs: rows go straight to the open file
# handle as they are computed, so no per-pair result is kept in memory.
with open('ranking.tsv', 'w') as out_file:
    for user_id in tqdm(test_users, desc="Processing Users"):
        user_factor = theta[user_id]

        # One "user<TAB>item<TAB>score" line per item, in all_items order;
        # the score is the dot product of the user and item factor vectors.
        out_file.writelines(
            f"{user_id}\t{item_id}\t{np.dot(user_factor, beta[item_id])}\n"
            for item_id in all_items
        )

# Every user is paired with every item, so the row count is the product.
total_predictions = len(all_items) * len(test_users)
print(f"Generated {total_predictions} predictions!")