# P3Alpha

This implementation is too slow.

### Import libraries

In [148]:
import pandas as pd
import numpy as np
import os
import cython

In [149]:
%load_ext Cython

In [145]:
def pretty_print_progress(current, total, prepend):
    if current == total:
        print(" " * 100, end="\r")
        print(prepend, "finished!")
    elif current % 100 == 0:
        print("%s %8s of %8s" % (prepend, current, total), end="\r")

### Import dataset

In [121]:
base_path = "data"
interactions_df_path = os.path.join(base_path, "interactions_and_impressions.csv")
items_length_df_path = os.path.join(base_path, "data_ICM_length.csv")
items_type_df_path = os.path.join(base_path, "data_ICM_type.csv")
users_df_path = os.path.join(base_path, "data_target_users_test.csv")

In [122]:
dtype={0:int, 1:int, 2:str, 3:int}
interactions_df = pd.read_csv(filepath_or_buffer=interactions_df_path, dtype=dtype)
items_length_df = pd.read_csv(filepath_or_buffer=items_length_df_path)
items_types_df = pd.read_csv(filepath_or_buffer=items_type_df_path)
users_df = pd.read_csv(filepath_or_buffer=users_df_path)

### Calculate implicit ratings

In [123]:
implicit_ratings_df = pd.DataFrame()
implicit_ratings_df = interactions_df.groupby(['user_id', 'item_id'], as_index=False).max(['data'])
implicit_ratings_df.head()

Unnamed: 0,user_id,item_id,data
0,0,11,1
1,0,21,1
2,0,22,1
3,0,24,1
4,0,44,1


In [124]:
df = implicit_ratings_df
df[df['data'] == 0].head()

Unnamed: 0,user_id,item_id,data
6,0,124,0
11,0,808,0
25,0,5068,0
32,0,7603,0
34,0,8540,0


In [125]:
# remove only viewed elements
df = implicit_ratings_df
implicit_ratings_df = df[df['data'] == 1].reset_index(drop=True)
implicit_ratings_df.head(10)

Unnamed: 0,user_id,item_id,data
0,0,11,1
1,0,21,1
2,0,22,1
3,0,24,1
4,0,44,1
5,0,54,1
6,0,159,1
7,0,239,1
8,0,575,1
9,0,751,1


### Factorize dataframe

In [126]:
users_ids = implicit_ratings_df["user_id"].sort_values().unique()
items_ids = implicit_ratings_df["item_id"].sort_values().unique()

In [127]:
# users to recommend
num_users = users_df['user_id'].shape[0]
# users of whom we have interactions
num_total_users = users_ids.shape[0]
num_items = items_ids.shape[0]
print("Found {} users and {} items".format(num_users, num_items))
print("There are {} users with interactions and {} to recommend".format(num_total_users, num_users))

Found 41116 users and 24507 items
There are 41607 users with interactions and 41116 to recommend


In [128]:
items_mapped_ids, items_original_ids = pd.factorize(items_ids)

item_mapped_id_to_original_id = pd.Series(items_original_ids, index=items_mapped_ids)
item_original_id_to_mapped_id = pd.Series(items_mapped_ids, index=items_original_ids)

In [129]:
users_mapped_ids, users_original_ids = pd.factorize(users_ids)

user_mapped_id_to_original_id = pd.Series(users_original_ids, index=users_mapped_ids)
user_original_id_to_mapped_id = pd.Series(users_mapped_ids, index=users_original_ids)

### Find URM

In [130]:
URM = np.zeros((num_total_users, num_items), dtype=np.int8)

for user_id in users_mapped_ids:
    df = implicit_ratings_df
    user_items = df[df['user_id'] == user_id]['item_id']
    for item_id in user_items:
        item_id = item_original_id_to_mapped_id[item_id]
        URM[user_id, item_id] = 1

### Calculate probabilities

In [131]:
shrink = 1

In [132]:
# TODO: try removing one to the sum
rows_prob = np.zeros(num_total_users, dtype=np.float16)
for i in range(num_total_users):
    rows_prob[i] = URM[i, :].sum() / (num_items + shrink)

In [133]:
print(rows_prob[:30])

[0.002367  0.0003264 0.002775  0.004406  0.000979  0.0003264 0.0005302
 0.0010605 0.0002856 0.0002449 0.000408  0.001877  0.001265  0.000204
 0.0002856 0.0005302 0.0001632 0.0003672 0.0002856 0.0009384 0.0002449
 0.0002449 0.0003672 0.0003672 0.000408  0.001796  0.0005713 0.0011425
 0.0009384 0.0002856]


In [134]:
cols_prob = np.zeros(num_items, dtype=np.float16)
for i in range(num_items):
    cols_prob[i] = URM[:, i].sum() / (num_total_users + shrink)

In [135]:
print(cols_prob[:30])

[0.000529  0.001274  0.0006247 0.000529  0.000793  0.00678   0.006226
 0.002523  0.0004807 0.001009  0.000577  0.000913  0.0004325 0.0009375
 0.001779  0.0006247 0.0003605 0.0004325 0.0001682 0.008316  0.04745
 0.1128    0.05902   0.0429    0.0168    0.03647   0.006298  0.016
 0.02019   0.0403   ]


### Calculate custom graph

In [136]:
user_nodes = {}

for user_mapped_id in range(num_total_users):
    # map counter with original id
    user_original_id = user_mapped_id_to_original_id[user_mapped_id]
    
    # extract user items
    df = implicit_ratings_df
    user_items = df[df['user_id'] == user_original_id]['item_id']
    
    # prepare new node
    user_nodes[user_mapped_id] = []
    # save connections for this node
    for item_original_id in user_items:
        item_mapped_id = item_original_id_to_mapped_id[item_original_id]
        user_nodes[user_mapped_id].append(item_mapped_id)

In [137]:
item_nodes = {}

for item_mapped_id in range(num_items):
    # map counter with original id
    item_original_id = item_mapped_id_to_original_id[item_mapped_id]
    
    # extract user items
    df = implicit_ratings_df
    item_users = df[df['item_id'] == item_original_id]['user_id']
    
    # prepare new node
    item_nodes[item_mapped_id] = []
    # save connections for this node
    for user_original_id in item_users:
        user_mapped_id = user_original_id_to_mapped_id[user_original_id]
        item_nodes[item_mapped_id].append(user_mapped_id)

In [138]:
print("{} nodes of {} users".format(len(user_nodes), num_users))
print("{} nodes of {} items".format(len(item_nodes), num_items))

41607 nodes of 41116 users
24507 nodes of 24507 items


### Calculate recommendations

In [158]:
%%cython
def calculate_jumps():
    
    global user_nodes
    global item_nodes
    global num_users
    global rows_prob
    global cols_prob

    users_jumps = []

    for user1_id in range(num_users):
        
        user1_items = user_nodes[user1_id]
        
        user_jumps = {}
        
        p1 = rows_prob[user1_id]
        
        for item1_id in user_nodes[user1_id]:
            
            p2 = cols_prob[item1_id]
            
            for user2_id in item_nodes[item1_id]:
                
                p3 = rows_prob[user2_id]
                
                for item2_id in user_nodes[user2_id]:
                    
                    if item2_id not in user_jumps.keys():
                        user_jumps[item2_id] = []
            
                    user_jumps[item2_id].append(user_jumps[item2_id], p1 * p2 * p3)
                    
        users_jumps.append(user_jumps)
        
    return user_jumps

In [152]:
calculate_jumps(user_nodes, item_nodes, num_users)

NameError: name 'calculate_jumps' is not defined