# 1. Imports

In [1]:
import pandas as pd
import numpy as np
import os
from os import path
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import time
import copy

In [2]:
# Dataset to preprocess. Can be ML1M, ML1M_demographic, Yahoo, Pinterest.
data_name = "ML1M"

# Input files are read from, and outputs written to,
# <parent of the working directory>/processed_data/<data_name>.
DP_DIR = Path("processed_data") / data_name
export_dir = Path.cwd()
files_path = export_dir.parent / DP_DIR

# Minimum interaction counts enforced by the filtering step.
min_num_of_users_per_item = 2
min_num_of_items_per_user = 2

In [5]:
# Load the raw interaction table (and, for the ML1M variants, the user and movie tables).
#
# `users` and `movies` exist only for the ML1M variants. They are reset first so that a
# frame left in the kernel by an earlier run cannot be reused for a different dataset.
users = None
movies = None

# Load ML1M data
if data_name in ("ML1M", "ML1M_demographic"):
    # The .dat files are "::"-delimited and are read without a header row; a
    # multi-character separator requires the python parsing engine.
    data = pd.read_csv(Path(files_path, "ratings.dat"), sep="::", engine="python",
                       names=["user_id_original", "item_id_original", "rating", "timestamp"])

    users = pd.read_csv(Path(files_path, "users.dat"), engine="python",
                        sep="::", names=["user_id_original", "gender", "age", "occupation", "zipcode"],
                        encoding="ISO-8859-1")

    movies = pd.read_csv(Path(files_path, "movies.dat"), engine="python",
                         sep="::", names=["item_id_original", "title", "genres"],
                         encoding="ISO-8859-1")
# Load Yahoo data
elif data_name == "Yahoo":
    data = pd.read_csv(Path(files_path, "Yahoo_ratings.csv"),
                       names=["user_id_original", "item_id_original", "rating"])

# Load Pinterest data
elif data_name == "Pinterest":
    data = pd.read_csv(Path(files_path, "pinterest_data.csv"),
                       names=["user_id_original", "item_id_original", "rating"])

else:
    # Fail here rather than with a NameError on `data` in a later cell.
    raise ValueError(
        f"Unknown data_name {data_name!r}; expected one of "
        "'ML1M', 'ML1M_demographic', 'Yahoo', 'Pinterest'"
    )

In [6]:
# Preview the movie table as loaded (only available for the ML1M variants).
movies

Unnamed: 0,item_id_original,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


# 2. Data processing

In [7]:
# Turn explicit ratings into binary feedback: 1 = positive interaction, 0 = anything else.
#   * Yahoo: the value 255 is treated as 0, and only ratings above 70 (out of 100) are positive.
#   * ML1M / ML1M_demographic: only ratings above 3.5 are positive.
# Any other dataset is left untouched.

if data_name == 'Yahoo':
    yahoo_rating = data["rating"]
    liked = (yahoo_rating != 255) & (yahoo_rating > 70)
    data["rating"] = liked.astype("int64")
elif data_name in ('ML1M', "ML1M_demographic"):
    data["rating"] = (data["rating"] > 3.5).astype("int64")

In [8]:
# Keep only the positive interactions; from here on every row means "user interacted with item".
data = data.loc[data["rating"] == 1]

### recursively delete users and items with too few interactions

In [9]:
# Alternate a user filter and an item filter until the item filter removes nothing.
# n1 / n2 hold the number of rows after the user pass and after the item pass; they
# start out different so that the loop body runs at least once.
n1, n2 = 1, 2
is_demographic = data_name == "ML1M_demographic"

while n1 != n2:
    # --- users: keep those with at least min_num_of_items_per_user distinct items
    user_counts = (
        data.groupby(['user_id_original'])['item_id_original']
        .nunique()
        .reset_index(name='item_count')
    )
    has_enough_items = user_counts['item_count'] >= min_num_of_items_per_user
    filtered_users = user_counts.loc[has_enough_items, 'user_id_original']

    keep_rows = data['user_id_original'].isin(filtered_users)
    data = data[keep_rows].reset_index(drop=True)
    if is_demographic:
        # keep the user table in sync with the surviving users
        users = users[users['user_id_original'].isin(filtered_users)].reset_index(drop=True)
    n1 = len(data)

    # --- items: keep those with at least min_num_of_users_per_item distinct users
    item_counts = (
        data.groupby(['item_id_original'])['user_id_original']
        .nunique()
        .reset_index(name='user_count')
    )
    has_enough_users = item_counts['user_count'] >= min_num_of_users_per_item
    filtered_items = item_counts.loc[has_enough_users, 'item_id_original']

    keep_rows = data['item_id_original'].isin(filtered_items)
    data = data[keep_rows].reset_index(drop=True)
    if is_demographic:
        # keep the movie table in sync with the surviving items
        movies = movies[movies['item_id_original'].isin(filtered_items)].reset_index(drop=True)
    n2 = len(data)

In [10]:
# Inspect the positive interactions that survived the filtering loop.
data

Unnamed: 0,user_id_original,item_id_original,rating,timestamp
0,1,1193,1,978300760
1,1,3408,1,978300275
2,1,2355,1,978824291
3,1,1287,1,978302039
4,1,2804,1,978300719
...,...,...,...,...
575123,6040,1089,1,956704996
575124,6040,1094,1,956704887
575125,6040,562,1,956704746
575126,6040,1096,1,956715648


In [11]:
# Replace the original user / item ids with label-encoded integer ids.
user_encoder = LabelEncoder().fit(data.user_id_original)
item_encoder = LabelEncoder()

if data_name == "ML1M_demographic":
    # The item encoder is fitted on the movie table, and both side tables
    # receive the encoded ids as well.
    item_encoder.fit(movies.item_id_original)
    movies["item_id"] = item_encoder.transform(movies.item_id_original)
    users['user_id'] = user_encoder.transform(users.user_id_original)
else:
    item_encoder.fit(data.item_id_original)

data["user_id"] = user_encoder.transform(data.user_id_original)
data["item_id"] = item_encoder.transform(data.item_id_original)

# Number of distinct users and items present in the interaction table
num_users = data.user_id.nunique()
num_items = data.item_id.nunique()

In [12]:
# Preview the movie table again; it gains an item_id column only in the ML1M_demographic run.
movies

Unnamed: 0,item_id_original,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [13]:
# Report how many distinct items and users remain after filtering.
print('num_items = ', num_items, ' num_users = ', num_users)

num_items =  3381  num_users =  6037


## prepare demographics data

In [14]:
# Build the one-hot demographic feature table (ML1M_demographic only).
# Resulting columns, in order: male, female, zipcode_*, age>=_*, occupation_*.
if data_name=="ML1M_demographic":
    data = data[['user_id', 'item_id']]

    # Work on an independent copy so the column assignments below never write
    # into a slice of the original frame (avoids SettingWithCopyWarning).
    users = users[['user_id', 'gender','age','occupation', 'zipcode']].copy()

    # The demographic rows are later joined to the per-user item matrix by row
    # index, and that matrix is ordered by ascending user_id. Sort explicitly
    # instead of relying on the order of the input file.
    users = users.sort_values('user_id').reset_index(drop=True)

    # create Male and Female columns
    users['male'] = (users.gender == 'M').astype('int64')
    users['female'] = (users.gender == 'F').astype('int64')

    # create columns for the first zipcode digit
    users['zipcode'] = users.zipcode.astype(str).str[0].astype('int64')
    zipcode_df = pd.get_dummies(users.zipcode, prefix='zipcode', dtype='int')
    users = pd.concat([users, zipcode_df], axis=1).drop(columns=['zipcode'])

    # create age columns (one indicator per distinct value of the age column)
    age_df = pd.get_dummies(users.age, prefix='age>=', dtype='int')
    users = pd.concat([users, age_df], axis=1).drop(columns=['age'])

    # create occupation columns
    occupation_df = pd.get_dummies(users.occupation, prefix='occupation', dtype='int')
    users = pd.concat([users, occupation_df], axis=1).drop(columns=['occupation'])

    # Only the indicator columns remain; the user is identified by the row position.
    users = users.drop(columns=['gender', 'user_id'])


## Transform the data to a one-hot encoded representation

In [15]:
# Collapse the interaction table to one row per user, holding the list of that user's item ids.
user_group = data[["user_id", "item_id"]].groupby(data.user_id)
item_lists = user_group.item_id.apply(list)

users_data = pd.DataFrame(
    {
        "user_id": [user for user in user_group.groups],
        "item_ids": item_lists.tolist(),
    }
)


In [16]:
# Multi-hot encode the item lists: one row per user, one 0/1 column per item id.
mlb = MultiLabelBinarizer()
interaction_matrix = mlb.fit_transform(users_data["item_ids"])
user_one_hot = pd.DataFrame(
    interaction_matrix,
    columns=mlb.classes_,
    index=users_data.index,
)

In [17]:
if data_name == "ML1M_demographic":
    # Append the demographic indicator columns to the right of the item columns
    # (rows are matched on the index).
    user_one_hot = pd.concat([user_one_hot, users], axis="columns")

# user_id is added as the last column so the split cell can separate it with iloc[:, -1].
user_one_hot["user_id"] = users_data["user_id"]

In [18]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(user_one_hot.iloc[:,:-1], user_one_hot.iloc[:,-1], test_size=0.2, random_state=42)

In [19]:
# Renumber the training rows 0..len(X_train)-1, discarding the shuffled original index.
X_train = X_train.reset_index(drop=True)

In [20]:
# Test rows continue the numbering right after the training rows: len(X_train) .. num_users-1.
first_test_row = len(X_train)
X_test.index = np.arange(first_test_row, num_users)

In [None]:
# Write the held-out users to <files_path>/test_data_<data_name>.csv (index included).
test_csv_path = files_path / f'test_data_{data_name}.csv'
X_test.to_csv(test_csv_path)

In [None]:
# Write the training users to <files_path>/train_data_<data_name>.csv (index included).
train_csv_path = files_path / f'train_data_{data_name}.csv'
X_train.to_csv(train_csv_path)

In [None]:
# The movie table is only loaded for the ML1M variants, so there is nothing to
# export (and no `movies` frame to call) for the other datasets.
if data_name in ("ML1M", "ML1M_demographic"):
    movies.to_csv(Path(files_path, 'movies.csv'))

# 3. Create dictionaries for baselines

In [21]:
# Number of feature columns in the training frame (the item columns, plus the
# demographic columns in the ML1M_demographic run).
num_features = len(X_train.columns)

### Jaccard dictionary

In [22]:
# NumPy version of the training frame used by the Jaccard computation: one row per
# training user, one column per feature, i.e. shape (|U_train|, num_features).
data_array = X_train.to_numpy() # np array of one-hot rows

In [23]:
def _jaccard_similarity_dict(binary_matrix):
    """Return {(i, j): Jaccard similarity of columns i and j} for every i <= j.

    binary_matrix must contain only 0/1 values (rows = users, columns =
    features). For such data the intersection size of two columns is their dot
    product and the union size is |i| + |j| - |i AND j|, so a single matrix
    product replaces the per-pair Python loop over all column pairs.

    Pairs whose union is empty get similarity 0. Values are float32 scalars and
    the keys are produced in the same order as two nested range loops.
    """
    # float64 represents the co-occurrence counts exactly.
    dense = np.asarray(binary_matrix, dtype=np.float64)
    intersection = dense.T @ dense
    column_totals = dense.sum(axis=0)
    union = column_totals[:, None] + column_totals[None, :] - intersection

    # Upper triangle including the diagonal: (0,0), (0,1), ..., (1,1), (1,2), ...
    rows, cols = np.triu_indices(dense.shape[1])
    pair_intersection = intersection[rows, cols]
    pair_union = union[rows, cols]

    # Divide only where the union is non-empty; the remaining entries stay 0.
    similarity = np.divide(
        pair_intersection,
        pair_union,
        out=np.zeros_like(pair_intersection),
        where=pair_union > 0,
    ).astype('float32')

    return dict(zip(zip(rows.tolist(), cols.tolist()), similarity))


jaccard_dict = _jaccard_similarity_dict(data_array)

KeyboardInterrupt: 

In [None]:
# Pickle the Jaccard dictionary to <files_path>/jaccard_based_sim_<data_name>.pkl.
file_path = files_path / f'jaccard_based_sim_{data_name}.pkl'

with file_path.open('wb') as out_file:
    pickle.dump(jaccard_dict, out_file)

### Cosine dictionary

In [None]:
# Cosine similarity between every pair of feature columns (X_train is transposed so that
# each feature becomes a row); stored as float32.
feature_vectors = X_train.T
cosine_items = cosine_similarity(feature_vectors).astype(np.float32)

In [None]:
# Copy the upper triangle (diagonal included) of the similarity matrix into a
# dictionary keyed by the (row, column) pair.
n_rows, n_cols = cosine_items.shape
cosine_items_dict = {
    (i, j): cosine_items[i][j]
    for i in range(n_rows)
    for j in range(i, n_cols)
}

In [None]:
# Pickle the cosine dictionary to <files_path>/cosine_based_sim_<data_name>.pkl.
file_path = files_path / f'cosine_based_sim_{data_name}.pkl'

with file_path.open('wb') as out_file:
    pickle.dump(cosine_items_dict, out_file)

### Popularity dictionary

In [None]:
# Per-column totals over the training users, scaled by the largest total so that
# every value lies in [0, 1]. pop_array stays a Series labelled by column.
train_totals = X_train.sum(axis=0)
pop_array = train_totals / train_totals.max()
# NOTE(review): in the ML1M_demographic run the maximum is taken over the
# demographic columns as well as the item columns — confirm this is intended.

# The item columns come first, so positions 0..num_items-1 are the items. .iloc
# makes the lookup explicitly positional instead of depending on how Series[int]
# resolves integer keys against the column labels.
pop_dict = {i: pop_array.iloc[i] for i in range(num_items)}
    

In [None]:
# Pickle the popularity dictionary to <files_path>/pop_dict_<data_name>.pkl.
file_path = files_path / f'pop_dict_{data_name}.pkl'

with file_path.open('wb') as out_file:
    pickle.dump(pop_dict, out_file)

### TF-IDF

In [None]:
# Rebind data_array to all users: the training rows followed by the test rows,
# one column per feature, i.e. shape (|U|, num_features).
data_array = pd.concat([X_train, X_test], axis=0).to_numpy() # np array of one-hot rows

In [None]:
# Compute the counts from the same frame, in the same row order, as data_array
# (training users first, then test users). user_one_hot keeps the users in their
# original order, so counts taken from it would be attached to the wrong rows when
# looked up by data_array row number.
all_users = pd.concat([X_train, X_test], axis=0)

w_count = all_users.sum(axis=1) # number of non-zero features in each user's row, shape = |U|

n_appearance = all_users.sum(axis=0) # number of users having each feature, shape = num_features

In [None]:
# Relabel the per-feature counts by column position (0..num_features-1) so they can
# be looked up with the integer feature index used in the TF-IDF loop.
n_appearance.index = np.arange(num_features)

In [None]:
# TF-IDF weight for every (user, feature) pair whose entry in data_array is 1:
#   tf  = 1 / (number of features in the user's row)
#   idf = log10(num_users / number of users having the feature)
# NOTE(review): w_count[u] must describe the same user as row u of data_array —
# confirm that both use the same row order.
tf_idf_dict = defaultdict(dict)
for u in range(num_users):
    # Visit only the features present in this row instead of testing every column.
    present = np.flatnonzero(data_array[u, :num_features] == 1).tolist()
    if not present:
        continue  # no entry is created for a user without features

    tf = 1/w_count[u]  # identical for all of this user's features
    for i in present:
        idf = np.log10(num_users/n_appearance[i])
        tf_idf_dict[u][i] = tf*idf

In [None]:
# Pickle the TF-IDF dictionary to <files_path>/tf_idf_dict_<data_name>.pkl.
file_path = files_path / f'tf_idf_dict_{data_name}.pkl'

with file_path.open('wb') as out_file:
    pickle.dump(tf_idf_dict, out_file)

### Create static_test_data

In [None]:
# Build a fixed evaluation set: for every test user hold out one positive item
# (set to 0 in that user's row, recorded in 'pos') and draw one negative item
# (recorded in 'neg') with probability proportional to item popularity.
np.random.seed(42)
static_test_data = X_test.copy()
matrix = np.array(static_test_data)[:,:num_items] # keep only items columns, remove demographic features columns
# Plain array of the popularity scores so that column positions index it directly.
popularity = pop_array.to_numpy()
zero_indices = []
one_indices = []

# row_pos is the positional index of the user inside static_test_data. It is needed
# for the .iloc write below: `row` holds the user's 0/1 values and must not be used
# as a row indexer.
for row_pos, row in enumerate(matrix):
    zero_idx = np.where(row == 0)[0]
    one_idx = np.where(row == 1)[0]
    probs = popularity[zero_idx]
    probs = probs / np.sum(probs)

    sampled_zero = np.random.choice(zero_idx, p=probs) # sample negative interactions according to items popularity
    zero_indices.append(sampled_zero)

    sampled_one = np.random.choice(one_idx) # sample positive interactions from user's history
    static_test_data.iloc[row_pos, sampled_one] = 0 # hide the held-out positive in this user's row only
    one_indices.append(sampled_one)

static_test_data['pos'] = one_indices
static_test_data['neg'] = zero_indices

In [None]:
# Write the fixed evaluation set to <files_path>/static_test_data_<data_name>.csv.
static_csv_path = files_path / f'static_test_data_{data_name}.csv'
static_test_data.to_csv(static_csv_path)