In [3]:
import random
import numpy as np
import torch
import pandas as pd
from MetaMF import *
from collections import defaultdict
import matplotlib.pyplot as plt
from datetime import datetime as dt
import os.path

# Initialize random seeds and select GPU

In [2]:
# Seed every random number generator used in this notebook with the same value
# so that sampling, shuffling and weight initialisation are repeatable.
SEED = 1
random.seed(SEED)  # Python's built-in generator
np.random.seed(SEED)  # NumPy's global generator
torch.manual_seed(SEED)  # set random seed for cpu
torch.cuda.manual_seed(SEED)  # set random seed for current gpu
torch.cuda.manual_seed_all(SEED)  # set random seed for all gpus

In [3]:
# Use the first GPU whenever CUDA is usable; otherwise everything runs on the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    torch.cuda.set_device(0)

print("CUDA available? " + str(use_cuda))
if use_cuda:
    print("Current device: %d" % torch.cuda.current_device())

CUDA available? False


# Utility Functions
## Read Dataset

In [5]:
def read_dataset(path):
    """Load the train, validation and test rating files sharing the prefix `path`.

    The files `<path>.train.rating`, `<path>.valid.rating` and
    `<path>.test.rating` are read as header-less, comma-separated tables.

    Returns:
        A `(trainset, valset, testset)` tuple; each element is a list holding
        one tuple per row of the corresponding file.
    """
    def _load(suffix):
        # One tuple per CSV row, columns kept in file order.
        frame = pd.read_csv(path + suffix, sep=',', header=None)
        return frame.to_records(index=False).tolist()

    return _load(".train.rating"), _load(".valid.rating"), _load(".test.rating")

def read_usergroups(path):
    """Load the user ids of the low, medium and high user groups.

    Reads the header-less files `<path>_low.userlist`, `<path>_med.userlist`
    and `<path>_high.userlist`.

    Returns:
        A `(low_users, med_users, high_users)` tuple of plain Python lists.
        A single-column file yields a flat list of its values.
    """
    def _read_ids(suffix):
        frame = pd.read_csv(path + suffix, header=None)
        # `read_csv(..., squeeze=True)` was removed in pandas 2.0; squeezing the
        # column axis afterwards gives the same result on old and new versions.
        return frame.squeeze("columns").values.tolist()

    low_users = _read_ids("_low.userlist")
    med_users = _read_ids("_med.userlist")
    high_users = _read_ids("_high.userlist")

    return low_users, med_users, high_users

def read_useranditemlist(path):
    """Load the list of all user ids and the list of all item ids.

    Reads the header-less files `<path>.userlist` and `<path>.itemlist`.

    Returns:
        A `(userlist, itemlist)` tuple of plain Python lists. A single-column
        file yields a flat list of its values.
    """
    def _read_ids(suffix):
        frame = pd.read_csv(path + suffix, header=None)
        # `read_csv(..., squeeze=True)` was removed in pandas 2.0; squeezing the
        # column axis afterwards gives the same result on old and new versions.
        return frame.squeeze("columns").values.tolist()

    userlist = _read_ids(".userlist")
    itemlist = _read_ids(".itemlist")

    return userlist, itemlist

## Helpers for Model Training

In [6]:
def batchtoinput(batch, use_cuda):
    """Convert a batch of rating records into four tensors.

    Args:
        batch: iterable of records indexed as (user, item, rating, timestamp).
        use_cuda: when true, every tensor is moved to the current CUDA device.

    Returns:
        `(users, items, ratings, timestamps)`; users, items and timestamps are
        int64 tensors, ratings is a float32 tensor.
    """
    # Transpose the records into one list per field in a single pass.
    columns = ([], [], [], [])
    for example in batch:
        for position, column in enumerate(columns):
            column.append(example[position])

    dtypes = (torch.int64, torch.int64, torch.float32, torch.int64)
    tensors = [torch.tensor(values, dtype=dtype) for values, dtype in zip(columns, dtypes)]
    if use_cuda:
        tensors = [tensor.cuda() for tensor in tensors]

    users, items, ratings, timestamps = tensors
    return users, items, ratings, timestamps

def getbatches(traindata, batch_size, use_cuda, shuffle):
    """Yield `traindata` as consecutive mini-batches of tensors.

    Args:
        traindata: list of rating records; it is copied, never modified.
        batch_size: number of records per batch (the last batch may be smaller).
        use_cuda: forwarded to `batchtoinput`.
        shuffle: when true, the copy is shuffled with `random.shuffle` first.

    Yields:
        The `batchtoinput` result for each batch.
    """
    examples = traindata.copy()
    if shuffle:
        random.shuffle(examples)

    n_batches = int(np.ceil(len(examples) / batch_size))
    offsets = (index * batch_size for index in range(0, n_batches))
    for offset in offsets:
        yield batchtoinput(examples[offset:offset + batch_size], use_cuda)
        
def weights_init(m):
    """Initialise modules whose class name contains 'Linear'.

    Weights get Xavier-normal values and biases are set to zero. The function
    is meant to be passed to `Module.apply`, so it is called for every
    submodule and silently ignores the ones that are not linear layers.

    Args:
        m: the module to initialise in place.
    """
    if 'Linear' not in m.__class__.__name__:
        return

    # `torch.nn.init` is referenced explicitly so that this helper does not
    # depend on a bare `nn` name being present in the namespace.
    weight = getattr(m, 'weight', None)
    if weight is not None:
        torch.nn.init.xavier_normal_(weight.data)

    # Layers created with bias=False expose `bias` as None; skip them instead
    # of failing on `None.data`.
    bias = getattr(m, 'bias', None)
    if bias is not None:
        torch.nn.init.constant_(bias.data, 0)
        
def get_eval(ratlist, predlist):
    """Compute the mean absolute error and the mean squared error.

    Args:
        ratlist: array of ground-truth ratings.
        predlist: array of predicted ratings, aligned with `ratlist`.

    Returns:
        A `(mae, mse)` tuple.
    """
    residuals = ratlist - predlist
    mean_absolute_error = np.mean(np.abs(residuals))
    mean_squared_error = np.mean(np.square(residuals))
    return mean_absolute_error, mean_squared_error

# Other Functions
## Random Sampling Procedure
Here, we implement a sampling procedure to simulate a privacy budget $\beta$. In detail, we randomly select a fraction of $\beta$ of each user's rating data to be shared with the model. Thus, a user holds back a fraction of $1-\beta$ of her data and provides only a fraction of $\beta$ of her data for model training.

In [7]:
def sampling_procedure(dataset, beta):
    """Randomly keep a fraction `beta` of every user's ratings.

    Args:
        dataset: sequence of rating records, either (user, item, rating) or
            (user, item, rating, timestamp).
        beta: privacy budget; each user keeps ceil(beta * n) of her n ratings.

    Returns:
        A list of tuples with the same fields as the input records, grouped by
        user id in ascending order.

    Raises:
        ValueError: if the records have fewer than three or more than four fields.
    """
    dataframe = pd.DataFrame(dataset)
    if dataframe.empty:
        return []

    # Name the columns by position so that records with a trailing timestamp
    # (which `batchtoinput` requires) are accepted and keep that field.
    field_names = ["user_id", "item_id", "rating", "timestamp"]
    n_fields = dataframe.shape[1]
    if not 3 <= n_fields <= len(field_names):
        raise ValueError("expected records with 3 or 4 fields, got %d" % n_fields)
    dataframe.columns = field_names[:n_fields]

    n_samples = np.ceil(dataframe.groupby("user_id").size() * (beta)).astype(int)
    new_dataset = []
    for uid, group in dataframe.groupby("user_id"):
        # Sampling draws from NumPy's global generator, i.e. it is governed by
        # np.random.seed.
        new_dataset.extend(group.sample(n=n_samples.loc[uid]).to_records(index=False).tolist())
    return new_dataset

## Sampling Procedure by time
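This sampling procedure is meant to take the timestamp of a rating into account when creating training subsets. As implemented, it keeps the first $\lfloor \beta \cdot n_u \rfloor$ of the $n_u$ ratings of each user $u$ in the order in which they appear in the input, so the input has to be ordered by time already for the split to be temporal. Note that this cell redefines `sampling_procedure` and thereby replaces the random sampling procedure defined above.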

In [82]:
def sampling_procedure(dataset, beta, save_path=r'C:/Users/fleur/Thesis B3/RQ0/RobustnessOfMetaMF-master/RobustnessOfMetaMF-master/ThesisData/ml-1m/ml-1m_time/ml-1m_shown09.csv'):
    """Keep the first floor(beta * n) ratings of every user.

    Args:
        dataset: rating records with the fields user_id, item_id, rating and
            timestamp (a DataFrame with these columns or a sequence of tuples).
        beta: privacy budget, i.e. the fraction of each user's ratings to keep.
        save_path: CSV file the kept ratings are written to. The default is
            the location this function has always written to; pass None to
            skip writing.

    Returns:
        A DataFrame with the kept ratings. Users appear in order of first
        occurrence and rows keep their input order within each user.

    NOTE(review): rows are taken in input order; nothing here sorts by
    timestamp, so the selection is only temporal if the input is already
    ordered by time. Confirm which end of the timeline should be kept.
    """
    dataframe = pd.DataFrame(dataset, columns=["user_id", "item_id", "rating", "timestamp"])

    kept_rows = []
    # sort=False yields users in order of first appearance; one pass over the
    # groups replaces a full-frame scan per user.
    for _, user_rows in dataframe.groupby("user_id", sort=False):
        # Plain int truncation; the quota never becomes a float that `iloc`
        # would reject.
        n_keep = int(len(user_rows) * beta)
        if n_keep <= 0:
            continue
        kept_rows.append(user_rows.iloc[:n_keep])

    # The rows deliberately pass through a NumPy array: mixed int/float columns
    # are upcast to a common dtype exactly as before, which the dtype-correction
    # cells further down in this notebook expect.
    values = pd.concat(kept_rows).to_numpy() if kept_rows else []
    new_dataset = pd.DataFrame(values, columns=dataframe.columns)

    if save_path is not None:
        new_dataset.to_csv(save_path, index=False)
    return new_dataset

If the temporal sampling procedure is used, the following cells identify the data that is to be augmented, i.e., the ratings that are not part of the sampled training subset:

In [83]:
# Load the MovieLens 1M training ratings and sample them with beta = 0.9.
TRAIN_COLUMNS = ["user_id", "item_id", "rating", "timestamp"]
train = pd.read_csv(
    'C:/Users/fleur/Thesis B3/RQ0/RobustnessOfMetaMF-master/RobustnessOfMetaMF-master/ThesisData/ml-1m/ml-1m_time/ml-1m.train.rating',
    names=TRAIN_COLUMNS,
)
RecData = sampling_procedure(train, beta=0.9)

   user_id  item_id  rating  timestamp
0        0        0     5.0  978300760
1        0        1     3.0  978302109
2        0        2     3.0  978301968
3        0        3     4.0  978300275
4        0        5     3.0  978302268
   user_id  item_id  rating    timestamp
0      0.0      0.0     5.0  978300760.0
1      0.0      1.0     3.0  978302109.0
2      0.0      2.0     3.0  978301968.0
3      0.0      3.0     4.0  978300275.0
4      0.0      5.0     3.0  978302268.0


In [76]:
# Some datatypes have to be corrected: every float64 column of `train` and of
# `RecData` is cast to int64. Both frames are modified in place.
# NOTE(review): this assumes all float columns hold whole numbers without
# missing values — confirm before using it on ratings with half-star steps.

# Float columns of the full training set.
ff = train.select_dtypes(include = ['float64'])
for col in ff.columns.values:
    train[col] = train[col].astype('int64')
print(train)

# Float columns of the sampled subset.
cc = RecData.select_dtypes(include = ['float64'])
for col in cc.columns.values:
    RecData[col] = RecData[col].astype('int64')
print(RecData)

        user_id  item_id  rating  timestamp
0             0        0       5  978300760
1             0        1       3  978302109
2             0        2       3  978301968
3             0        3       4  978300275
4             0        5       3  978302268
...         ...      ...     ...        ...
810278     6039      772       1  956716541
810279     6039     1106       5  956704887
810280     6039      365       5  956704746
810281     6039      152       4  956715648
810282     6039       26       4  956715569

[810283 rows x 4 columns]
        user_id  item_id  rating  timestamp
0             0        0       5  978300760
1             0        1       3  978302109
2             0        2       3  978301968
3             0        3       4  978300275
4             0        5       3  978302268
...         ...      ...     ...        ...
726525     6039      604       1  956716407
726526     6039     1049       2  956715942
726527     6039     2425       3  956716157
72652

In [84]:
# Identify the hidden data as all train data minus the rows used in RecData.
# A row counts as "used" when all four fields match a row of RecData.

cols = ['user_id', 'item_id', 'rating', 'timestamp']
# Index both frames by the complete record so rows can be compared as tuples.
df2 = train.set_index(cols)
RecData2 = RecData.set_index(cols)
# Keep the rows of `train` whose record does not occur in RecData.
hidden_data = train[~df2.index.isin(RecData2.index)]

# Persist the hidden rows (without the index column).
hidden_data.to_csv('C:/Users/fleur/Thesis B3/RQ0/RobustnessOfMetaMF-master/RobustnessOfMetaMF-master/ThesisData/ml-1m/ml-1m_time/ml-1m_hidden10p.csv', index = False)

In [6]:
# Re-read the sampled ("shown") ratings and store them again with integer columns.
shown = pd.read_csv('C:/Users/fleur/Thesis B3/RQ0/RobustnessOfMetaMF-master/RobustnessOfMetaMF-master/ThesisData/ml-1m/ml-1m_time/ml-1m_shown09.csv')

# Cast the float64 columns on a copy of the *whole* frame. Selecting the float
# columns first and writing only that selection would silently drop every
# column that was already stored with another dtype.
float_columns = shown.select_dtypes(include = ['float64']).columns
reshape = shown.astype({col: 'int64' for col in float_columns})

reshape.to_csv('C:/Users/fleur/Thesis B3/RQ0/RobustnessOfMetaMF-master/RobustnessOfMetaMF-master/ThesisData/ml-1m/ml-1m_time/ml-1m_shown09p.csv', index = False)

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/fleur/Thesis B3/RQ0/RobustnessOfMetaMF-master/RobustnessOfMetaMF-master/ThesisData/ml-1m/ml-1m_time/ml-1m_shown09.csv'

In [17]:
# Then merge and sort both the augmented and original dataset back into one full training set

# Ratings that were shown to the model; the timestamp column is dropped
# before merging.
dataS = pd.read_csv("C:/Users/fleur/Thesis B3/RQ0/RobustnessOfMetaMF-master/RobustnessOfMetaMF-master/ThesisData/ml-1m/ml-1m_time/ml-1m_shown60p.csv")
df_shown = pd.DataFrame(dataS)
del df_shown["timestamp"]

# Augmented replacement for the hidden ratings.
# NOTE(review): presumably this file has the columns user_id, item_id and
# rating, matching df_shown after the timestamp was removed — confirm.
dataH = pd.read_csv("C:/Users/fleur/Thesis B3/RQ0/RobustnessOfMetaMF-master/RobustnessOfMetaMF-master/ThesisData/ml-1m/ml-1m_time/augmented/sd40p.csv")
df_hidden = pd.DataFrame(dataH)

# Stack both parts and order the rows by user and item before writing the
# new training file (without the index column).
total = [df_shown, df_hidden]
final = pd.concat(total)
new_train_set = final.sort_values(by=['user_id', 'item_id'], ascending=True)
new_train_set.to_csv("C:/Users/fleur/Thesis B3/RQ0/RobustnessOfMetaMF-master/RobustnessOfMetaMF-master/ThesisData/ml-1m/ml-1m_time/augmented/ml-1m06p.train.rating", index=False)

## Run Experiments
This function trains and tests MetaMF or NoMetaMF under ten different privacy budgets (i.e., $\beta \in \{1.0, 0.9, \dots, 0.1\}$). To obtain NoMetaMF, meta learning of the rating prediction model's parameters can be disabled. Furthermore, we evaluate the model's accuracy in terms of the mean squared error and the mean absolute error both on all users in the dataset and on each of our three user groups (i.e., $Low, Med, High$).

In [7]:
def run(path, traindata, valdata, testdata, userlist, itemlist, low, med, high, hyperparameters, betas=None, disable_meta_learning=False, save=False):
    """Train and evaluate one model per privacy budget beta.

    For every beta, the training data is reduced with `sampling_procedure`,
    a fresh `MetaMF` network is trained for `n_epochs` epochs, validated after
    each epoch, and finally evaluated on the test data for all users as well
    as for the low, med and high user groups.

    Args:
        path: directory for `results.csv`, the saved models and the plots.
        traindata, valdata, testdata: lists of rating records.
        userlist, itemlist: all user ids / item ids; only their lengths are used.
        low, med, high: collections of user ids forming the three user groups.
        hyperparameters: dict with the keys "lr", "lambda", "batch_size" and
            "n_epochs".
        betas: privacy budgets to evaluate; defaults to 1.0, 0.9, ..., 0.1.
        disable_meta_learning: when true, `net.disable_meta_learning()` is
            called before training (NoMetaMF).
        save: when true, the model, the convergence plot and the results
            table are written to `path`.

    Returns:
        None. Results are printed and, if `save` is set, written to disk.
    """
    # default choice of privacy budget beta
    if betas is None:
        betas = [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]

    if save:
        # Create the output directory up front so that training time is not
        # lost to a failing write at the very end.
        os.makedirs(path, exist_ok=True)

    if os.path.exists(path + "/results.csv"):
        results_df = pd.read_csv(path + "/results.csv")
    else:
        results_df = pd.DataFrame()

    # Group membership is tested once per test rating; sets make each lookup
    # constant-time instead of a scan over the id list.
    low_users, med_users, high_users = set(low), set(med), set(high)

    for beta in betas:
        results_dict = {"beta": beta}
        model_name = "beta_" + str(int(beta*100)) + "p"
        print("==========================")
        print(model_name)
        print("==========================")
        starttime = dt.now()

        # sample a fraction of beta of each user's data to simulate privacy budget beta
        R_train_beta = sampling_procedure(traindata, beta)

        train_loss, validation_loss = [], []
        net = MetaMF(len(userlist), len(itemlist))

        # disable meta learning for NoMetaMF
        if disable_meta_learning:
            net.disable_meta_learning()

        # initialize parameters of neural network
        net.apply(weights_init)
        if use_cuda:
            net.cuda()

        # model training
        optimizer = optim.Adam(net.parameters(), lr=hyperparameters["lr"], weight_decay=hyperparameters["lambda"])
        batch_size = hyperparameters["batch_size"]
        n_epochs = hyperparameters["n_epochs"]

        for epoch in range(n_epochs):
            net.train()
            error = 0
            num = 0
            for k, (users, items, ratings, timestamps) in enumerate(getbatches(R_train_beta, batch_size, use_cuda, True)):
                optimizer.zero_grad()
                pred = net(users, items)

                loss = net.loss(pred, ratings)
                loss.backward()
                nn.utils.clip_grad_norm_(net.parameters(), 5)
                optimizer.step()
                # weight the batch loss by the batch size to get a per-example mean
                error += loss.detach().cpu().numpy()*len(users)
                num += len(users)
            train_loss.append(error/num)

            # evaluate validation error; no gradients are needed for inference
            net.eval()
            groundtruth, estimation = [], []
            with torch.no_grad():
                for users, items, ratings, timestamps in getbatches(valdata, batch_size, use_cuda, False):
                    predictions = net(users, items)
                    estimation.extend(predictions.tolist())
                    groundtruth.extend(ratings.tolist())
            mae, mse = get_eval(np.array(groundtruth), np.array(estimation))
            validation_loss.append(mse)

            print('Epoch {}/{} - Training Loss: {:.3f}, Validation Loss: {:.3f}, Time Elapsed: {}'.format(epoch+1, n_epochs, error/num, mse, dt.now()-starttime))

            # after the last epoch: persist the model and record the final losses
            if epoch+1 == n_epochs:
                if save:
                    torch.save(net, path + "/" + model_name + '.model')
                    print("Saved Model to " + path)

                results_dict["train_mse_all"] = error / num
                results_dict["val_mse_all"] = mse

        # plot training and validation error to observe convergence
        net.eval()
        plt.figure()
        plt.plot(range(n_epochs), train_loss, label="Train")
        plt.plot(range(n_epochs), validation_loss, label="Val")
        plt.legend()
        plt.ylabel("MSE")
        plt.xlabel("Epoch")
        plt.tight_layout()

        # evaluate test error on both, all users in the dataset and on our three user groups
        groundtruth, estimation = [], []
        group_groundtruth = defaultdict(list)
        group_estimation = defaultdict(list)
        with torch.no_grad():
            for users, items, ratings, timestamps in getbatches(testdata, batch_size, use_cuda, False):
                predictions = net(users, items)
                estimation.extend(predictions.tolist())
                groundtruth.extend(ratings.tolist())

                for uid, iid, r, p in zip(users.cpu().numpy(), items.cpu().numpy(), ratings.cpu().numpy(), predictions.detach().cpu().numpy()):
                    # a user is counted for the first group that contains her
                    if uid in low_users:
                        group_groundtruth["low"].append(r)
                        group_estimation["low"].append(p)
                    elif uid in med_users:
                        group_groundtruth["med"].append(r)
                        group_estimation["med"].append(p)
                    elif uid in high_users:
                        group_groundtruth["high"].append(r)
                        group_estimation["high"].append(p)

        test_mae, test_mse = get_eval(np.array(groundtruth), np.array(estimation))
        low_mae, low_mse = get_eval(np.array(group_groundtruth["low"]), np.array(group_estimation["low"]))
        med_mae, med_mse = get_eval(np.array(group_groundtruth["med"]), np.array(group_estimation["med"]))
        high_mae, high_mse = get_eval(np.array(group_groundtruth["high"]), np.array(group_estimation["high"]))

        results_dict["test_mse_all"] = test_mse
        results_dict["test_mae_all"] = test_mae
        results_dict["test_mse_low"] = low_mse
        results_dict["test_mae_low"] = low_mae
        results_dict["test_mse_med"] = med_mse
        results_dict["test_mae_med"] = med_mae
        results_dict["test_mse_high"] = high_mse
        results_dict["test_mae_high"] = high_mae

        print(results_dict)
        if save:
            plt.savefig(path + "/" + model_name + ".png", dpi=300)
            # DataFrame.append was removed in pandas 2.0; concatenate instead.
            row_df = pd.DataFrame([results_dict])
            if results_df.empty:
                results_df = row_df
            else:
                results_df = pd.concat([results_df, row_df], ignore_index=True)
            results_df.to_csv(path + "/results.csv", index=False)
            print("Saved Results to " + path)

### MovieLens 1M run

In [8]:
# MovieLens 1M: load the splits, the id lists and the user groups, then run
# a single short experiment with beta = 0.1.
ml1m_prefix = "ThesisData/ml-1m/ml-1m/ml-1m"
train, val, test = read_dataset(ml1m_prefix)
users, items = read_useranditemlist(ml1m_prefix)

low, med, high = read_usergroups("data/User Groups/time/ml1m")
ml1m_hyperparameters = {"lr": 0.0001, "lambda": 0.001, "batch_size": 64, "n_epochs": 1}
run(
    "experiments/timemeta/ml1m",
    train, val, test, users, items, low, med, high,
    hyperparameters=ml1m_hyperparameters,
    betas=[0.1],
    disable_meta_learning=False,
    save=True,
)

beta_10p


KeyboardInterrupt: 

### MovieLens Latest Small run

In [8]:
# MovieLens Latest Small: load the splits, the id lists and the user groups,
# then train for 100 epochs under every default privacy budget.
ml_latest_small_prefix = "ThesisData/ml-latest-small/ml-latest-small/ml-latest-small"
train, val, test = read_dataset(ml_latest_small_prefix)
users, items = read_useranditemlist(ml_latest_small_prefix)

low, med, high = read_usergroups("data/User Groups/mllatestsmall")
ml_latest_small_hyperparameters = {"lr": 0.0001, "lambda": 0.001, "batch_size": 64, "n_epochs": 100}
run(
    "experiments/meta/ml-latest-small",
    train, val, test, users, items, low, med, high,
    hyperparameters=ml_latest_small_hyperparameters,
    disable_meta_learning=False,
    save=True,
)

beta_100p


RuntimeError: Invalid index in scatter at C:\Users\builder\AppData\Local\Temp\pip-req-build-e5c8dddg\aten\src\TH/generic/THTensorEvenMoreMath.cpp:151

### MovieLens 100K run

In [10]:
# MovieLens 100K: load the splits, the id lists and the user groups, then run
# one epoch under every default privacy budget.
ml100k_prefix = "ThesisData/ml-100k/ml-100k/ml-100k"
train, val, test = read_dataset(ml100k_prefix)
users, items = read_useranditemlist(ml100k_prefix)

low, med, high = read_usergroups("data/User Groups/ml100k")
ml100k_hyperparameters = {"lr": 0.0001, "lambda": 0.001, "batch_size": 64, "n_epochs": 1}
run(
    "experiments/timemeta/ml100k",
    train, val, test, users, items, low, med, high,
    hyperparameters=ml100k_hyperparameters,
    disable_meta_learning=False,
    save=True,
)

beta_100p


ValueError: No axis named timestamp for object type DataFrame