In [1]:
import pandas as pd
import numpy as np
from Dataset import MovieLensDataset

In [2]:
INPUT_PATH = 'Data/u.data'

OUTPUT_PATH_TRAIN_EXP = 'Data/movielens.train_explicit_ds'
OUTPUT_PATH_TEST_EXP = 'Data/movielens.test_explicit_ds'
OUTPUT_PATH_TRAIN_IMP = 'Data/movielens.train_implicit_ds'
OUTPUT_PATH_TEST_IMP = 'Data/movielens.test_implicit_ds'
OUTPUT_PATH_TEST_USER_DATA = 'Data/test_users'
OUTPUT_PATH_TEST_ITEM_DATA = 'Data/test_items'
USER_FIELD = 'userID'

In [3]:
def get_train_test_df(transactions):
    print("Size of the entire dataset:{}".format(transactions.shape))
    transactions.sort_values(by = ['timestamp'], inplace = True)
    last_transaction_mask = transactions.duplicated(subset = {USER_FIELD}, keep = "last")
    
    train_df = transactions[last_transaction_mask]
    test_df = transactions[~last_transaction_mask]
    
    train_df.sort_values(by=["userID", 'timestamp'], inplace = True)
    test_df.sort_values(by=["userID", 'timestamp'], inplace = True)
    return train_df, test_df

In [4]:
def report_stats(transactions, train_df, test_df):
    whole_size = transactions.shape[0]*1.0
    train_size = train_df.shape[0]
    test_size = test_df.shape[0]
    print("Total No. of Records = {}".format(whole_size))
    print("Train size = {}, Test size = {}".format(train_size, test_size))
    print("Train % = {}, Test % ={}".format(train_size/whole_size, test_size/whole_size))

In [5]:
def save_to_csv(df,path, sep = '\t'):
    df.to_csv(path, header = False, index = False, sep = sep)

In [6]:
def split():
    transactions = pd.read_csv(INPUT_PATH, sep="\t", names = ['userID', 'movieID', 'rating', 'timestamp'], engine = 'python')
    
    train_df, test_df = get_train_test_df(transactions)
    report_stats(transactions, train_df, test_df)
    return train_df, test_df

In [7]:
def save_train_test_split():
    train_df, test_df = split()
    save_to_csv(train_df, OUTPUT_PATH_TRAIN_EXP)
    save_to_csv(test_df, OUTPUT_PATH_TEST_EXP)

    train_df['rating'] = 1
    test_df['rating'] = 1
    save_to_csv(train_df, OUTPUT_PATH_TRAIN_IMP)
    save_to_csv(test_df, OUTPUT_PATH_TEST_IMP)

In [8]:
def load_rating_file_as_list(filename):
    ratingList = []
    with open(filename, "r") as f:
        line = f.readline()
        while line != None and line != "":
            arr = line.split("\t")
            user, item = int(arr[0]), int(arr[1])
            ratingList.append([user, item])
            line = f.readline()
    return ratingList

In [9]:
def create_negative_file(test_ratings, train_ratings, num_items, num_samples=100):
    negativeList = []
    for user_item_pair in test_ratings:
        user = user_item_pair[0]
        item = user_item_pair[1]
        negatives = []
        for t in range(num_samples):
            j = np.random.randint(1, num_items)
            while [user, j] in train_ratings or j == item:
                j = np.random.randint(1, num_items)
            negatives.append(j)
        negativeList.append(negatives)
    return negativeList

In [10]:
def save_test_data():
    num_negatives_test = 100
    train_ratings = load_rating_file_as_list(OUTPUT_PATH_TRAIN_IMP)
    num_items = max([rating[1] for rating in train_ratings])

    test_ratings = load_rating_file_as_list(OUTPUT_PATH_TEST_IMP)
    test_negatives = create_negative_file(test_ratings, train_ratings, num_items, num_negatives_test)
    
    test_items = [negatives + [rating[1]] for rating, negatives in zip(test_ratings, test_negatives)]
    test_users = [np.full(len(items), rating[0], dtype='int32') for rating, items in zip(test_ratings, test_items)]
    test_items = np.array(test_items)
    test_users = np.array(test_users)
    np.save(OUTPUT_PATH_TEST_ITEM_DATA, test_items)
    np.save(OUTPUT_PATH_TEST_USER_DATA, test_users)

In [11]:
save_train_test_split()
save_test_data()

Size of the entire dataset:(100000, 4)
Total No. of Records = 100000.0
Train size = 99057, Test size = 943
Train % = 0.99057, Test % =0.00943


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
