In [502]:

import pandas as pd
import json
import random
import os
from collections import OrderedDict
import argparse 
#silence warnings
import warnings
warnings.filterwarnings("ignore")



parser = argparse.ArgumentParser(description='Generate train, validation, and test splits for MovieLens dataset.')
parser.add_argument('--data_name', type=str, choices=['ml-1m', 'ml-100k','books','goodbooks','netflix'], default='ml-1m',
                        help='Name of the MovieLens dataset (ml-1m or ml-100k). Default is ml-1m.')
parser.add_argument('--timestamp', action='store_true')


args = parser.parse_args()
data_name = args.data_name

def save_sorted_json(data, filename):
    sorted_data = OrderedDict(sorted(data.items(), key=lambda t: t[0]))
    with open(filename, 'w') as f:
        json.dump(sorted_data, f, indent=4)


def extract_unique_movies(user_data):
    movies = set()
    for user_movies in user_data.values():
        for genre_movies in user_movies.values():
            if isinstance(genre_movies, list):  # normal train, valid, test set
                for movie in genre_movies:
                    movies.add(movie['title'])
            else:  # leave-one-out valid and test set
                movies.add(genre_movies['title'])
    return movies


def extract_user_genres(data):
    """Extract genres a user has interacted with."""
    user_genres = {}

    for user_id, genres in data.items():
        user_genres[user_id] = list(genres.keys())

    return user_genres

def load_movie_titles_from_dat(file_path):
    with open(file_path, 'r') as f:
        lines = f.read().decode('latin-1')
    movie_titles = set()
    for line in lines:
        tokens = line.split("::")
        title = tokens[1]
        movie_titles.add(title)
    return movie_titles

def filter_movie_titles_by_valid_set(movies, valid_item_names):
    return [movie for movie in movies if movie['title'] in valid_item_names]

def sample_random(user_movies):
    validation_movie = user_movies.sample(n=1)

    user_movies = user_movies.drop(validation_movie.index)
    

    # Randomly choose one movie for test and remove it from user's ratings
    test_movie = user_movies.sample(n=1)
    user_movies = user_movies.drop(test_movie.index)
    return validation_movie,test_movie,user_movies

def sample_most_recent(user_movies):
    # Sort the DataFrame by the 'timestamp' column in descending order
    user_movies = user_movies.sort_values(by='timestamp', ascending=False)

    # Sample the two most recent movies
    most_recent_movies = user_movies.head(2)

    # Drop the two most recent movies from the user's ratings
    user_movies = user_movies.drop(most_recent_movies.index)

    # Split the two most recent movies into validation_movie and test_movie
    validation_movie = most_recent_movies.head(1)
    test_movie = most_recent_movies.tail(1)

    return validation_movie, test_movie, user_movies

    

def split_and_filter_ratings(user_movies, rating_threshold=4):
    # Sort the DataFrame by the 'timestamp' column in descending order and select the first 52 movies
    user_movies = user_movies.sort_values(by='timestamp', ascending=False) if args.timestamp else user_movies
    if user_movies.shape[0] < 52:
        #take the last two movies that are rated higher than the threshold
        validation_test_movies = user_movies[user_movies['rating'] >= rating_threshold].tail(2)
        validation_set = validation_test_movies.head(1)
        test_set = validation_test_movies.tail(1)
        #use the rest for train and prompt set
        training_set = prompt_movies= user_movies.drop(validation_test_movies.index)

    else:
        candidate_recent_movies = user_movies.head(52)
        validation_test_movies = candidate_recent_movies[candidate_recent_movies['rating'] >= rating_threshold].tail(2)
        
        # Select the first two movies from the high-rated movies as validation and test sets

        # If there are no high-rated movies or validation_candidatess_recent_movies is empty, set validation and test sets to None

        # Split the selected movies into validation and test sets
        validation_set = validation_test_movies.head(1)
        test_set = validation_test_movies.tail(1)
        training_set = user_movies.drop(validation_test_movies.index)
        prompt_movies = training_set.head(50)
    if validation_test_movies.shape[0] < 2:
        return None,None,None,None
    return prompt_movies, validation_set, test_set,training_set

def generate_train_val_test_splits(ratings, k):
    # Count the number of ratings for each user
    user_counts = ratings['userId'].value_counts()

    # Filter out users with fewer than k ratings
    eligible_users = user_counts[user_counts > k].index
  

    # Initialize empty lists for training, validation, and test data
    train_data = []
    val_data = []
    test_data = []
    prompt_set =[]

    # Iterate through eligible users
    non_users = []
    for user_id in eligible_users:
        # Extract user's ratings
        # user_movies = user_ratings.merge(movie_metadata, left_on='itemId', right_on='movielens_id')
        user_movies = ratings[ratings['userId'] == user_id]
        # user_movies = user_movies.dropna(subset=['summary'])  # Remove movies with 'NaN' summary
        # user_movies = user_movies[user_movies['title'].isin(valid_item_names)]
        movie_set = set(user_movies['itemId'])


        # Randomly choose one movie for validation and remove it from user's ratings
        prompt_movies,validation_movie, test_movie,training_set = split_and_filter_ratings(user_movies)
        if prompt_movies is None:
            non_users.append(user_id)
            continue

        # Append user's training data
        prompt_set.append(prompt_movies)
        train_data.append(training_set)

        # Append user's validation data
        val_data.append(validation_movie)

        # Append user's test data
        test_data.append(test_movie)
        
        
 
    # Concatenate the dataframes to get the final splits
    train_data = pd.concat(train_data)
    val_data = pd.concat(val_data)
    test_data = pd.concat(test_data)
    promp_set = pd.concat(prompt_set)

    return train_data, val_data, test_data,promp_set,non_users


In [527]:
import pandas as pd
import json
import random
import os
from collections import OrderedDict
import argparse 
#silence warnings
import warnings
warnings.filterwarnings("ignore")



parser = argparse.ArgumentParser(description='Generate train, validation, and test splits for MovieLens dataset.')
parser.add_argument('--data_name', type=str, choices=['ml-1m', 'ml-100k'], default='ml-1m',
                        help='Name of the MovieLens dataset (ml-1m or ml-100k). Default is ml-1m.')
parser.add_argument('--timestamp', action='store_true')


args = parser.parse_args([])
data_name = args.data_name

def save_sorted_json(data, filename):
    sorted_data = OrderedDict(sorted(data.items(), key=lambda t: t[0]))
    with open(filename, 'w') as f:
        json.dump(sorted_data, f, indent=4)


def extract_unique_movies(user_data):
    movies = set()
    for user_movies in user_data.values():
        for genre_movies in user_movies.values():
            if isinstance(genre_movies, list):  # normal train, valid, test set
                for movie in genre_movies:
                    movies.add(movie['title'])
            else:  # leave-one-out valid and test set
                movies.add(genre_movies['title'])
    return movies


def extract_user_genres(data):
    """Extract genres a user has interacted with."""
    user_genres = {}

    for user_id, genres in data.items():
        user_genres[user_id] = list(genres.keys())

    return user_genres

def load_movie_titles_from_dat(file_path):
    with open(file_path, 'r') as f:
        lines = f.read().decode('latin-1')
    movie_titles = set()
    for line in lines:
        tokens = line.split("::")
        title = tokens[1]
        movie_titles.add(title)
    return movie_titles

def filter_movie_titles_by_valid_set(movies, valid_movie_titles):
    return [movie for movie in movies if movie['title'] in valid_movie_titles]

def sample_random(user_movies):
    validation_movie = user_movies.sample(n=1)

    user_movies = user_movies.drop(validation_movie.index)
    

    # Randomly choose one movie for test and remove it from user's ratings
    test_movie = user_movies.sample(n=1)
    user_movies = user_movies.drop(test_movie.index)
    return validation_movie,test_movie,user_movies

def sample_most_recent(user_movies):
    # Sort the DataFrame by the 'timestamp' column in descending order
    user_movies = user_movies.sort_values(by='timestamp', ascending=False)

    # Sample the two most recent movies
    most_recent_movies = user_movies.head(2)

    # Drop the two most recent movies from the user's ratings
    user_movies = user_movies.drop(most_recent_movies.index)

    # Split the two most recent movies into validation_movie and test_movie
    validation_movie = most_recent_movies.head(1)
    test_movie = most_recent_movies.tail(1)

    return validation_movie, test_movie, user_movies

    

def split_and_filter_ratings(user_movies, rating_threshold=4):
    # Sort the DataFrame by the 'timestamp' column in descending order and select the first 52 movies
    user_movies = user_movies.sort_values(by='timestamp', ascending=False)
    if user_movies.shape[0] < 52:
        #take the last two movies that are rated higher than the threshold

        validation_test_movies = user_movies[user_movies['rating'] >= rating_threshold].tail(2)
        # validation_test_movies = user_movies[user_movies['rating'] >= rating_threshold].head(2)
        # validation_test_movies = user_movies.tail(2)
      
        validation_set = validation_test_movies.head(1)
        test_set = validation_test_movies.tail(1)
        #use the rest for train and prompt set
        training_set = prompt_movies= user_movies.drop(validation_test_movies.index)
        # raise Exception


    else:
        candidate_recent_movies = user_movies.head(52)
        validation_test_movies = candidate_recent_movies[candidate_recent_movies['rating'] >= rating_threshold].tail(2)
        # if user_movies.userId.unique()[0] == 5:

        #     display(user_movies)
            
            # raise Exception
        # validation_test_movies = candidate_recent_movies[candidate_recent_movies['rating'] >= rating_threshold].tail(2)
        
        # Select the first two movies from the high-rated movies as validation and test sets

        # If there are no high-rated movies or validation_candidatess_recent_movies is empty, set validation and test sets to None

        # Split the selected movies into validation and test sets
        validation_set = validation_test_movies.head(1)
        test_set = validation_test_movies.tail(1)
        training_set = user_movies.drop(validation_test_movies.index)
        prompt_movies = training_set.head(50)
    if validation_test_movies.shape[0] < 2:
        return None,None,None,None
    return prompt_movies, validation_set, test_set,training_set

def generate_train_val_test_splits(ratings, k, movie_metadata):
    # Count the number of ratings for each user
    user_counts = ratings['userId'].value_counts()

    # Filter out users with fewer than k ratings
    eligible_users = user_counts[user_counts > k].index
  

    # Initialize empty lists for training, validation, and test data
    train_data = []
    val_data = []
    test_data = []
    prompt_set =[]

    # Iterate through eligible users
    non_users = []
    for user_id in eligible_users:
            
        # Extract user's ratings
        user_ratings = ratings[ratings['userId'] == user_id]

        user_movies = user_ratings.merge(movie_metadata, left_on='itemId', right_on='movielens_id')

        # user_movies = user_movies.dropna(subset=['summary'])  # Remove movies with 'NaN' summary

        movie_set = set(user_movies['itemId'])


        # Randomly choose one movie for validation and remove it from user's ratings
        prompt_movies,validation_movie, test_movie,training_set  =split_and_filter_ratings(user_movies)
        if prompt_movies is None:
            non_users.append(user_id)
            continue
  
            # raise Exception

        # Append user's training data
        prompt_set.append(prompt_movies)
        train_data.append(training_set)

        # Append user's validation data
        val_data.append(validation_movie)

        # Append user's test data
        test_data.append(test_movie)
        
        
 
    # Concatenate the dataframes to get the final splits
    train_data = pd.concat(train_data)
    val_data = pd.concat(val_data)
    test_data = pd.concat(test_data)
    promp_set = pd.concat(prompt_set)

    return train_data, val_data, test_data,promp_set,non_users


In [549]:
def save_sorted_json(data, filename):
    sorted_data = OrderedDict(sorted(data.items(), key=lambda t: t[0]))
    with open(filename, 'w') as f:
        json.dump(sorted_data, f, indent=4)


def extract_unique_movies(user_data):
    movies = set()
    for user_movies in user_data.values():
        for genre_movies in user_movies.values():
            if isinstance(genre_movies, list):  # normal train, valid, test set
                for movie in genre_movies:
                    movies.add(movie['title'])
            else:  # leave-one-out valid and test set
                movies.add(genre_movies['title'])
    return movies


def extract_user_genres(data):
    """Extract genres a user has interacted with."""
    user_genres = {}

    for user_id, genres in data.items():
        user_genres[user_id] = list(genres.keys())

    return user_genres

def load_movie_titles_from_dat(file_path):
    with open(file_path, 'r') as f:
        lines = f.read().decode('latin-1')
    movie_titles = set()
    for line in lines:
        tokens = line.split("::")
        title = tokens[1]
        movie_titles.add(title)
    return movie_titles

def filter_movie_titles_by_valid_set(movies, valid_item_names):
    return [movie for movie in movies if movie['title'] in valid_item_names]

def sample_random(user_movies):
    validation_movie = user_movies.sample(n=1)

    user_movies = user_movies.drop(validation_movie.index)
    

    # Randomly choose one movie for test and remove it from user's ratings
    test_movie = user_movies.sample(n=1)
    user_movies = user_movies.drop(test_movie.index)
    return validation_movie,test_movie,user_movies

def sample_most_recent(user_movies):
    # Sort the DataFrame by the 'timestamp' column in descending order
    user_movies = user_movies.sort_values(by='timestamp', ascending=False)

    # Sample the two most recent movies
    most_recent_movies = user_movies.head(2)

    # Drop the two most recent movies from the user's ratings
    user_movies = user_movies.drop(most_recent_movies.index)

    # Split the two most recent movies into validation_movie and test_movie
    validation_movie = most_recent_movies.head(1)
    test_movie = most_recent_movies.tail(1)

    return validation_movie, test_movie, user_movies

    

def split_and_filter_ratings(user_movies, rating_threshold=4):
    # Sort the DataFrame by the 'timestamp' column in descending order and select the first 52 movies
    user_movies = user_movies.sort_values(by='timestamp', ascending=False) 
    if user_movies.shape[0] < 52:
        #take the last two movies that are rated higher than the threshold
        validation_test_movies = user_movies[user_movies['rating'] >= rating_threshold].tail(2)
        validation_set = validation_test_movies.head(1)
        test_set = validation_test_movies.tail(1)
        #use the rest for train and prompt set
        training_set = prompt_movies= user_movies.drop(validation_test_movies.index)

    else:
        candidate_recent_movies = user_movies.head(52)
        validation_test_movies = candidate_recent_movies[candidate_recent_movies['rating'] >= rating_threshold].tail(2)
        
        # Select the first two movies from the high-rated movies as validation and test sets

        # If there are no high-rated movies or validation_candidatess_recent_movies is empty, set validation and test sets to None

        # Split the selected movies into validation and test sets
        validation_set = validation_test_movies.head(1)
        test_set = validation_test_movies.tail(1)
        training_set = user_movies.drop(validation_test_movies.index)
        prompt_movies = training_set.head(50)
    if validation_test_movies.shape[0] < 2:
        return None,None,None,None
    return prompt_movies, validation_set, test_set,training_set

def generate_train_val_test_splits(ratings, k):
    # Count the number of ratings for each user
    user_counts = ratings['userId'].value_counts()

    # Filter out users with fewer than k ratings
    eligible_users = user_counts[user_counts > k].index
  

    # Initialize empty lists for training, validation, and test data
    train_data = []
    val_data = []
    test_data = []
    prompt_set =[]

    # Iterate through eligible users
    non_users = []
    for user_id in eligible_users:
        # Extract user's ratings
        # user_movies = user_ratings.merge(movie_metadata, left_on='itemId', right_on='movielens_id')
        user_movies = ratings[ratings['userId'] == user_id]
        # user_movies = user_movies.dropna(subset=['summary'])  # Remove movies with 'NaN' summary
        # user_movies = user_movies[user_movies['title'].isin(valid_item_names)]
        movie_set = set(user_movies['itemId'])


        # Randomly choose one movie for validation and remove it from user's ratings
        prompt_movies,validation_movie, test_movie,training_set = split_and_filter_ratings(user_movies)
        if prompt_movies is None:
            non_users.append(user_id)
            continue

        # Append user's training data
        prompt_set.append(prompt_movies)
        train_data.append(training_set)

        # Append user's validation data
        val_data.append(validation_movie)

        # Append user's test data
        test_data.append(test_movie)
        
        
 
    # Concatenate the dataframes to get the final splits
    train_data = pd.concat(train_data)
    val_data = pd.concat(val_data)
    test_data = pd.concat(test_data)
    promp_set = pd.concat(prompt_set)

    return train_data, val_data, test_data,promp_set,non_users

k = 20  # threshold for history length

print(f"{data_name=}")
if data_name == 'ml-1m':
    k = 10  # threshold for history length

    valid_item_names = pd.read_csv(f'../data/ml-1m/movies.dat',encoding='ISO-8859-1',sep='::',header=None).iloc[:,1].tolist()
    ratings_file = '../data/ml-1m/ratings.dat'
    separator = "::"
    header = None
    rating_columns = ['userId', 'itemId', 'rating', 'timestamp']
    movie_metadata_file = '../data/ml-1m/movies.dat'
    ratings = pd.read_csv(ratings_file, sep=separator, header=header, names=rating_columns)
    
    # Load movie metadata
    movie_metadata = pd.read_csv(movie_metadata_file,encoding='ISO-8859-1',sep='::',header=None)
    movie_metadata.columns = ['movielens_id','title','genre']


    ratings = ratings.merge(movie_metadata, left_on='itemId', right_on='movielens_id', how = 'left')
    # display(ratings)
    
    train_data, val_data, test_data,promp_set,non_users = generate_train_val_test_splits(ratings, k)

elif data_name == 'books':
    ratings_file = '../data/books/ratings.csv'
    ratings = pd.read_csv(ratings_file)
    ratings.rename(columns={'book_id':'itemId','review/time':'timestamp','Title':'title','review/score':'rating','User_id':'userId','categories':'genres'}, inplace=True)

    valid_item_names = ratings.title.unique().tolist()
    #rename the bookId column to itemId
    train_data, val_data, test_data,promp_set,non_users = generate_train_val_test_splits(ratings, k)
elif data_name == 'goodbooks':
    ratings_file = '../data/goodbooks/ratings.csv'
    ratings = pd.read_csv(ratings_file)
    #remove cold start items
    ratings = ratings[ratings['book_id'].isin(ratings['book_id'].value_counts()[ratings['book_id'].value_counts()>10].index)]
    
    ratings.rename(columns={'book_id':'itemId','Title':'title','user_id':'userId'}, inplace=True)
    valid_item_names = ratings.title.unique().tolist()
    #rename the bookId column to itemId
    train_data, val_data, test_data,promp_set,non_users = generate_train_val_test_splits(ratings, k)
elif data_name == 'netflix':
    ratings_file = '../data/netflix/ratings.csv'
    ratings = pd.read_csv(ratings_file)
    ratings.rename(columns={'MovieID':'itemId','Title':'title','CustomerID':'userId','Name':'title','Rating':'rating','Date':'timestamp'}, inplace=True)
    print(f"{ratings.columns=}")
    valid_item_names = ratings.title.unique().tolist()
    #rename the bookId column to itemId
    train_data, val_data, test_data,promp_set,non_users = generate_train_val_test_splits(ratings, k)

    
valid_item_names = list(valid_item_names)
display(train_data)
train_data = train_data[train_data['title'].isin(valid_item_names)]


val_data = val_data[val_data['title'].isin(valid_item_names)]
test_data = test_data[test_data['title'].isin(valid_item_names)]

# Check for overlapping user-movie pairs again after filtering
train_user_movie_pairs = set(zip(train_data['userId'], train_data['itemId']))
val_user_movie_pairs = set(zip(val_data['userId'], val_data['itemId']))
test_user_movie_pairs = set(zip(test_data['userId'], test_data['itemId']))

overlap_train_val = train_user_movie_pairs.intersection(val_user_movie_pairs)
overlap_train_test = train_user_movie_pairs.intersection(test_user_movie_pairs)
overlap_val_test = val_user_movie_pairs.intersection(test_user_movie_pairs)
num_users_train = len(set(train_data['userId']))
num_users_val = len(set(val_data['userId']))
num_users_test = len(set(test_data['userId']))



### Error checking 
assert not overlap_train_val, f"Overlap between train and validation sets in rows:\n{train_data[train_data[['userId', 'itemId']].apply(tuple, axis=1).isin(overlap_train_val)]}"
assert not overlap_train_test, f"Overlap between train and test sets in rows:\n{train_data[train_data[['userId', 'itemId']].apply(tuple, axis=1).isin(overlap_train_test)]}"
assert not overlap_val_test, f"Overlap between validation and test sets in rows:\n{val_data[val_data[['userId', 'itemId']].apply(tuple, axis=1).isin(overlap_val_test)]}"

print("No overlap found after filtering by valid_item_names.")
assert num_users_test == num_users_train and num_users_test == num_users_val, f'{num_users_val=} {num_users_train=} {num_users_test=}'
print( 'The total number of users is ', len(set(ratings['userId'])))
print(f"All sets have the same number of users used = {num_users_train}" , f"The number of non users is {len(non_users)}")

#randomly remove 500 users from the training validation and test split and make a joint set of these called strong generalization set 
random.seed(42)
user_set = set(train_data['userId']) | set(val_data['userId']) | set(test_data['userId'])
user_set = random.sample(list(user_set) ,500)
strong_generalization_set = pd.concat([train_data[train_data['userId'].isin(user_set)],val_data[val_data['userId'].isin(user_set)],test_data[test_data['userId'].isin(user_set)]])

train_data = train_data[~train_data['userId'].isin(user_set)]
val_data = val_data[~val_data['userId'].isin(user_set)]
test_data = test_data[~test_data['userId'].isin(user_set)]


In [550]:
k = 20  # threshold for history length

print(f"{data_name=}")
if data_name == 'ml-1m':
    k = 10  # threshold for history length

    valid_item_names = pd.read_csv(f'../data/ml-1m/movies.dat',encoding='ISO-8859-1',sep='::',header=None).iloc[:,1].tolist()
    ratings_file = '../data/ml-1m/ratings.dat'
    separator = "::"
    header = None
    rating_columns = ['userId', 'itemId', 'rating', 'timestamp']
    movie_metadata_file = '../data/ml-1m/movies.dat'
    ratings = pd.read_csv(ratings_file, sep=separator, header=header, names=rating_columns)
    
    # Load movie metadata
    movie_metadata = pd.read_csv(movie_metadata_file,encoding='ISO-8859-1',sep='::',header=None)
    movie_metadata.columns = ['movielens_id','title','genre']


    ratings = ratings.merge(movie_metadata, left_on='itemId', right_on='movielens_id', how = 'left')
    # display(ratings)
    
    train_data, val_data, test_data,promp_set,non_users = generate_train_val_test_splits(ratings, k)

elif data_name == 'books':
    ratings_file = '../data/books/ratings.csv'
    ratings = pd.read_csv(ratings_file)
    ratings.rename(columns={'book_id':'itemId','review/time':'timestamp','Title':'title','review/score':'rating','User_id':'userId','categories':'genres'}, inplace=True)

    valid_item_names = ratings.title.unique().tolist()
    #rename the bookId column to itemId
    train_data, val_data, test_data,promp_set,non_users = generate_train_val_test_splits(ratings, k)
elif data_name == 'goodbooks':
    ratings_file = '../data/goodbooks/ratings.csv'
    ratings = pd.read_csv(ratings_file)
    #remove cold start items
    ratings = ratings[ratings['book_id'].isin(ratings['book_id'].value_counts()[ratings['book_id'].value_counts()>10].index)]
    
    ratings.rename(columns={'book_id':'itemId','Title':'title','user_id':'userId'}, inplace=True)
    valid_item_names = ratings.title.unique().tolist()
    #rename the bookId column to itemId
    train_data, val_data, test_data,promp_set,non_users = generate_train_val_test_splits(ratings, k)
elif data_name == 'netflix':
    ratings_file = '../data/netflix/ratings.csv'
    ratings = pd.read_csv(ratings_file)
    ratings.rename(columns={'MovieID':'itemId','Title':'title','CustomerID':'userId','Name':'title','Rating':'rating','Date':'timestamp'}, inplace=True)
    print(f"{ratings.columns=}")
    valid_item_names = ratings.title.unique().tolist()
    #rename the bookId column to itemId
    train_data, val_data, test_data,promp_set,non_users = generate_train_val_test_splits(ratings, k)

    
valid_item_names = list(valid_item_names)
display(train_data)
train_data = train_data[train_data['title'].isin(valid_item_names)]


val_data = val_data[val_data['title'].isin(valid_item_names)]
test_data = test_data[test_data['title'].isin(valid_item_names)]

# Check for overlapping user-movie pairs again after filtering
train_user_movie_pairs = set(zip(train_data['userId'], train_data['itemId']))
val_user_movie_pairs = set(zip(val_data['userId'], val_data['itemId']))
test_user_movie_pairs = set(zip(test_data['userId'], test_data['itemId']))

overlap_train_val = train_user_movie_pairs.intersection(val_user_movie_pairs)
overlap_train_test = train_user_movie_pairs.intersection(test_user_movie_pairs)
overlap_val_test = val_user_movie_pairs.intersection(test_user_movie_pairs)
num_users_train = len(set(train_data['userId']))
num_users_val = len(set(val_data['userId']))
num_users_test = len(set(test_data['userId']))



### Error checking 
assert not overlap_train_val, f"Overlap between train and validation sets in rows:\n{train_data[train_data[['userId', 'itemId']].apply(tuple, axis=1).isin(overlap_train_val)]}"
assert not overlap_train_test, f"Overlap between train and test sets in rows:\n{train_data[train_data[['userId', 'itemId']].apply(tuple, axis=1).isin(overlap_train_test)]}"
assert not overlap_val_test, f"Overlap between validation and test sets in rows:\n{val_data[val_data[['userId', 'itemId']].apply(tuple, axis=1).isin(overlap_val_test)]}"

print("No overlap found after filtering by valid_item_names.")
assert num_users_test == num_users_train and num_users_test == num_users_val, f'{num_users_val=} {num_users_train=} {num_users_test=}'
print( 'The total number of users is ', len(set(ratings['userId'])))
print(f"All sets have the same number of users used = {num_users_train}" , f"The number of non users is {len(non_users)}")

#randomly remove 500 users from the training validation and test split and make a joint set of these called strong generalization set 
random.seed(42)
user_set = set(train_data['userId']) | set(val_data['userId']) | set(test_data['userId'])
user_set = random.sample(list(user_set) ,500)
strong_generalization_set = pd.concat([train_data[train_data['userId'].isin(user_set)],val_data[val_data['userId'].isin(user_set)],test_data[test_data['userId'].isin(user_set)]])

train_data = train_data[~train_data['userId'].isin(user_set)]
val_data = val_data[~val_data['userId'].isin(user_set)]
test_data = test_data[~test_data['userId'].isin(user_set)]


data_name='ml-1m'


Unnamed: 0,userId,itemId,rating,timestamp,movielens_id,title,genre
696446,4169,1548,3,1024176337,1548,"War at Home, The (1996)",Drama
695945,4169,1804,2,1024175783,1804,"Newton Boys, The (1998)",Crime|Drama
697358,4169,494,4,1024175760,494,Executive Decision (1996),Action|Thriller
695702,4169,1413,3,1024175031,1413,"Whole Wide World, The (1996)",Drama
697882,4169,3754,2,1024174347,3754,"Adventures of Rocky and Bullwinkle, The (2000)",Animation|Children's|Comedy
...,...,...,...,...,...,...,...
491284,3021,913,3,970506679,913,"Maltese Falcon, The (1941)",Film-Noir|Mystery
491299,3021,32,2,970497581,32,Twelve Monkeys (1995),Drama|Sci-Fi
491288,3021,3246,3,970497509,3246,Malcolm X (1992),Drama
491291,3021,1127,2,970497509,1127,"Abyss, The (1989)",Action|Adventure|Sci-Fi|Thriller


No overlap found after filtering by valid_item_names.
The total number of users is  6040
All sets have the same number of users used = 6014 The number of non users is 26


In [551]:
# full_data_og = pd.concat([strong_og,train_data_og])
full_data_og = pd.concat([train_data_og,strong_og])[['userId','movieId']]
full_data = pd.concat([train_data,strong_generalization_set])[['userId','itemId']].rename(columns={'itemId':'movieId'})

#check if all the rows match by do it user wise then sort by movie id and make sure the rows match
full_data_og.sort_values('userId',inplace=True)
full_data.sort_values('userId',inplace=True)
for user in full_data.userId.unique():
    s = set(full_data[full_data.userId == user].movieId)
    s2 = set(full_data_og[full_data_og.userId == user].movieId)
    if len(s-s2) > 0 or len(s2-s) > 0:
        print(user)
        print(s-s2)
        print(s2-s)
        break

In [459]:
movie_metadata = pd.read_csv('../data/merged_asin_movielens_summary.csv')


In [470]:

movie_metadata = pd.read_csv(movie_metadata_file,encoding='ISO-8859-1',sep='::',header=None)
movie_metadata.columns = ['movielens_id','title','genre']


In [461]:
movie_metadata[movie_metadata.movielens_id == 1788]

Unnamed: 0,asin,imdb_id,movielens_id,name,imdb_url,title,genres,summary


In [479]:
train_data = train_data[['userId',	'movieId'	,'rating',	'timestamp',	'title',	'genre']].sort_values('userId')
train_data.groupby('userId').count()
full_data = pd.concat([strong_generalization_set,train_data])

In [None]:
train_data = train_data[['userId',	'movieId'	,'rating',	'timestamp',	'title',	'genres']].sort_values('userId')
print(f"{len(train_data[train_data.userId == 1])=}")
train_data[(train_data.userId == 1) & (~train_data.movieId.isin(movies_missing))]

len(train_data[train_data.userId == 1])=49


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
21,1,1270,5,978300055,Back to the Future (1985),Adventure|Comedy|Sci-Fi


In [480]:
valid_done = pd.read_csv('/home/mila/e/emiliano.penaloza/LLM4REC/data_preprocessed/ml-1m/validation_leave_one_out_timestamped.csv')[['userId',	'movieId'	,'rating',	'timestamp',	'title',	'genres']].sort_values('userId')

In [483]:
valid_done

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
3781,1,1270,5,978300055,Back to the Future (1985),Comedy|Sci-Fi
2170,2,589,4,978299773,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
3886,3,1968,4,978297068,"Breakfast Club, The (1985)",Comedy|Drama
5353,4,3468,5,978294008,"Hustler, The (1961)",Drama
3293,6,1088,5,978236670,Dirty Dancing (1987),Musical|Romance
...,...,...,...,...,...,...
964,6035,1689,5,956713330,"Man Who Knew Too Little, The (1997)",Comedy|Mystery
1435,6037,1136,4,956719240,Monty Python and the Holy Grail (1974),Comedy
5439,6038,2146,4,956706909,St. Elmo's Fire (1985),Drama|Romance
2277,6039,918,5,956705907,Meet Me in St. Louis (1944),Musical


In [565]:
test_done = pd.read_csv('/home/mila/e/emiliano.penaloza/LLM4REC/data_preprocessed/ml-1m/test_leave_one_out_.csv')[['userId',	'movieId'	,'rating',	'timestamp',	'title',	'genres']].sort_values('userId')

KeyError: "['movieId', 'genres'] not in index"

In [484]:
val = val_data[['userId',	'movieId'	,'rating',	'timestamp',	'title',	'genre']].sort_values('userId')

In [596]:
train_data_og = pd.read_csv(f'../data_preprocessed/netflix copy/train_leave_one_out_.csv')
strong_og = pd.read_csv('/home/mila/e/emiliano.penaloza/LLM4REC/data_preprocessed/netflix copy/strong_generalization_set_.csv')




In [597]:
train_data = pd.read_csv(f'../data_preprocessed/netflix/train_leave_one_out_.csv')
strong_generalization_set = pd.read_csv('/home/mila/e/emiliano.penaloza/LLM4REC/data_preprocessed/netflix/strong_generalization_set_.csv')

full_data_og = pd.concat([train_data_og,strong_og])[['userId','movieId']]
print(f"{len(full_data_og)=}")
full_data = pd.concat([train_data,strong_generalization_set])[['userId','itemId']].rename(columns={'itemId':'movieId'})
print(f"{len(full_data)=}")



len(full_data_og)=2979548
len(full_data)=2979548


In [598]:
full_data_og.groupby('userId').count()

Unnamed: 0_level_0,movieId
userId,Unnamed: 1_level_1
383,319
592,176
602,332
609,212
735,352
...,...
2649020,125
2649285,730
2649288,171
2649375,127


In [599]:
full_data.groupby('userId').count()

Unnamed: 0_level_0,movieId
userId,Unnamed: 1_level_1
383,319
592,176
602,332
609,212
735,352
...,...
2649020,125
2649285,730
2649288,171
2649375,127


In [562]:
# full_data_og = pd.concat([strong_og,train_data_og])
full_data_og = pd.concat([train_data_og,strong_og])[['userId','movieId']]
full_data = pd.concat([train_data,strong_generalization_set])[['userId','itemId']].rename(columns={'itemId':'movieId'})

#check if all the rows match by do it user wise then sort by movie id and make sure the rows match
full_data_og.sort_values('userId',inplace=True)
full_data.sort_values('userId',inplace=True)
for user in full_data.userId.unique():
    s = set(full_data[full_data.userId == user].movieId)
    s2 = set(full_data_og[full_data_og.userId == user].movieId)
    if len(s-s2) > 0 or len(s2-s) > 0:
        print(user)
        print(s-s2)
        print(s2-s)
        break

383
{12074, 12875}
{2452, 2372}


In [178]:
subset_gen = strong_generalization_set[['userId',	'movieId'	,'rating',	'timestamp',	'title',	'genres']].sort_values('userId')
subset_gen.groupby('userId').count().min()

movieId      20
rating       20
timestamp    20
title        20
genres       20
dtype: int64

In [153]:
strong_og.sort_values('userId',inplace=True)
strong_og.groupby('userId').count().min()

movieId      20
rating       20
timestamp    20
title        20
genres       20
dtype: int64

In [142]:
strong_og

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
47839,5,47,3,978245334,Seven (Se7en) (1995),Crime|Thriller
47930,5,913,5,978242740,"Maltese Falcon, The (1941)",Film-Noir|Mystery
47929,5,2723,4,978242788,Mystery Men (1999),Action|Adventure|Comedy
47928,5,2734,2,978242788,"Mosquito Coast, The (1986)",Drama
47927,5,968,3,978242847,Night of the Living Dead (1968),Horror|Sci-Fi
...,...,...,...,...,...,...
57062,6021,471,3,956756310,"Hudsucker Proxy, The (1994)",Comedy|Romance
75387,6021,364,4,956757147,"Lion King, The (1994)",Animation|Children's|Musical
57075,6021,151,4,956756124,Rob Roy (1995),Drama|Romance|War
56969,6021,1573,4,956757546,Face/Off (1997),Action|Sci-Fi|Thriller


In [508]:
full_data

Unnamed: 0,userId,movieId
32,1,1566
25,1,48
34,1,1907
30,1,2294
4,1,2355
...,...,...
92,6040,1198
51,6040,3424
179,6040,1231
83,6040,1173


In [512]:
pd.concat([train_data,strong_generalization_set])

Unnamed: 0,userId,itemId,rating,timestamp,movielens_id,title,genre
695642,4169,3789,5,965333672,3789,"Pawnbroker, The (1965)",Drama
695643,4169,571,4,973310265,571,"Wedding Gift, The (1994)",Drama
695644,4169,574,3,975805232,574,Spanking the Monkey (1994),Comedy|Drama
695645,4169,575,3,976589949,575,"Little Rascals, The (1994)",Children's|Comedy
695646,4169,577,3,988324145,577,Andre (1994),Adventure|Children's
...,...,...,...,...,...,...,...
730119,4365,527,4,965183430,527,Schindler's List (1993),Drama|War
830660,4991,1082,4,962591895,1082,"Candidate, The (1972)",Drama
762002,4525,1090,5,964808458,1090,Platoon (1986),Drama|War
21845,160,3916,5,1016380849,3916,Remember the Titans (2000),Drama


1
{1721, 1270}
{608, 3114}


In [473]:
s_og_movies = set(strong_og[strong_og.userId == 5].movieId)

In [474]:
s_new = set(strong_generalization_set[strong_generalization_set.userId == 5].movieId)

In [420]:
pd.set_option('display.max_rows', 500)

ratings[ratings.userId == 5].sort_values('timestamp', ascending=False)


Unnamed: 0,userId,movieId,rating,timestamp
258,5,288,2,978246585
324,5,1485,3,978246576
372,5,3105,2,978246576
418,5,1884,3,978246576
389,5,2029,4,978246555
296,5,229,3,978246528
391,5,551,4,978246504
441,5,353,2,978246504
429,5,1527,3,978246479
304,5,52,2,978246479


In [440]:
strong_generalization_set[(strong_generalization_set.userId ==5) & (strong_generalization_set.movieId ==1788)][['movieId','rating']]

Unnamed: 0,movieId,rating


In [442]:
strong_og[(strong_og.userId ==5) & (strong_og.movieId ==1788)]


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
47874,5,1788,3,978244603,Men With Guns (1997),Action|Drama


In [475]:
s_og_movies - s_new

set()

In [430]:
s_og = set(full_data_og.userId.unique())
print(f"{len(s_og)=}")

len(s_og)=500


In [431]:
#SUBSET STRONG_GENREATLIZATION FOR NON NAN ROWS
s = set(full_data[full_data['title'].notna()].userId.unique())
print(f"{len(s)=}")


len(s)=500


In [408]:

print(f"{len((s_og - s))=}")
(s_og - s), (s - s_og)

len((s_og - s))=0


(set(), set())

In [612]:
full_og = pd.read_csv('/home/mila/e/emiliano.penaloza/LLM4REC/data_preprocessed/ml-1m copy/dataset_full.csv')
full = pd.read_csv('/home/mila/e/emiliano.penaloza/LLM4REC/data_preprocessed/ml-1m/dataset_full.csv')

In [613]:
full_og

Unnamed: 0,uid,sid,rating
0,0,801.0,5
1,0,485.0,3
2,0,617.0,3
3,0,2398.0,4
4,0,1628.0,5
...,...,...,...
952041,5997,83.0,4
952042,5997,103.0,4
952043,5997,710.0,5
952044,5997,1359.0,5


In [614]:
full

Unnamed: 0,uid,sid,rating
0,0,801.0,5
1,0,485.0,3
2,0,617.0,3
3,0,2398.0,4
4,0,1628.0,5
...,...,...,...
952041,5997,83.0,4
952042,5997,103.0,4
952043,5997,710.0,5
952044,5997,1359.0,5
