In [42]:
import pandas as pd
import json
import random
import os
from collections import OrderedDict
import  debugpy
#import display 
from IPython.display import display
import argparse 
#silence warnings
import warnings
warnings.filterwarnings("ignore")
parser = argparse.ArgumentParser(description='Generate train, validation, and test splits for MovieLens dataset.')
parser.add_argument('--data_name', type=str, choices=['ml-1m', 'ml-100k'], default='ml-1m',
                        help='Name of the MovieLens dataset (ml-1m or ml-100k). Default is ml-1m.')
parser.add_argument('--timestamp', action='store_true')
args = parser.parse_args([])
args.timestamp = True
data_name = args.data_name

def save_sorted_json(data, filename):
    sorted_data = OrderedDict(sorted(data.items(), key=lambda t: t[0]))
    with open(filename, 'w') as f:
        json.dump(sorted_data, f, indent=4)


def extract_unique_movies(user_data):
    movies = set()
    for user_movies in user_data.values():
        for genre_movies in user_movies.values():
            if isinstance(genre_movies, list):  # normal train, valid, test set
                for movie in genre_movies:
                    movies.add(movie['title'])
            else:  # leave-one-out valid and test set
                movies.add(genre_movies['title'])
    return movies


def extract_user_genres(data):
    """Extract genres a user has interacted with."""
    user_genres = {}

    for user_id, genres in data.items():
        user_genres[user_id] = list(genres.keys())

    return user_genres

def load_movie_titles_from_dat(file_path):
    with open(file_path, 'r') as f:
        lines = f.read().decode('latin-1')
    movie_titles = set()
    for line in lines:
        tokens = line.split("::")
        title = tokens[1]
        movie_titles.add(title)
    return movie_titles

def filter_movie_titles_by_valid_set(movies, valid_movie_titles):
    return [movie for movie in movies if movie['title'] in valid_movie_titles]

def sample_random(user_movies):
    validation_movie = user_movies.sample(n=1)

    user_movies = user_movies.drop(validation_movie.index)
    

    # Randomly choose one movie for test and remove it from user's ratings
    test_movie = user_movies.sample(n=1)
    user_movies = user_movies.drop(test_movie.index)
    return validation_movie,test_movie,user_movies

def sample_most_recent(user_movies):
    # Sort the DataFrame by the 'timestamp' column in descending order
    user_movies = user_movies.sort_values(by='timestamp', ascending=False)

    # Sample the two most recent movies
    most_recent_movies = user_movies.head(2)

    # Drop the two most recent movies from the user's ratings
    user_movies = user_movies.drop(most_recent_movies.index)

    # Split the two most recent movies into validation_movie and test_movie
    validation_movie = most_recent_movies.head(1)
    test_movie = most_recent_movies.tail(1)

    return validation_movie, test_movie, user_movies

def split_and_filter_ratings(user_movies, rating_threshold=4):
    # Sort the DataFrame by the 'timestamp' column in descending order and select the first 52 movies
    user_movies = user_movies.sort_values(by='timestamp', ascending=False)
    if user_movies.shape[0] < 52:
        #take the last two movies that are rated higher than the threshold
        validation_test_movies = user_movies[user_movies['rating'] >= rating_threshold].tail(2)
        validation_set = validation_test_movies.head(1)
        test_set = validation_test_movies.tail(1)
        #use the rest for train and prompt set
        training_set = prompt_movies= user_movies.drop(validation_test_movies.index)

    else:
        candidate_recent_movies = user_movies.head(52)
        validation_test_movies = candidate_recent_movies[candidate_recent_movies['rating'] >= rating_threshold].tail(2)
        
        # Select the first two movies from the high-rated movies as validation and test sets

        # If there are no high-rated movies or validation_candidatess_recent_movies is empty, set validation and test sets to None

        # Split the selected movies into validation and test sets
        validation_set = validation_test_movies.head(1)
        test_set = validation_test_movies.tail(1)
        training_set = user_movies.drop(validation_test_movies.index)
        prompt_movies = training_set.head(50)
    if validation_test_movies.shape[0] < 2:
        return None,None,None,None
    return prompt_movies, validation_set, test_set,training_set

def generate_train_val_test_splits(ratings, k, movie_metadata):
    # Count the number of ratings for each user
    user_counts = ratings['userId'].value_counts()

    # Filter out users with fewer than k ratings
    eligible_users = user_counts[user_counts > k].index
  

    # Initialize empty lists for training, validation, and test data
    train_data = []
    val_data = []
    test_data = []
    prompt_set =[]

    # Iterate through eligible users
    non_users = []
    for user_id in eligible_users:
        # Extract user's ratings
        user_movies = ratings[ratings['userId'] == user_id]
        user_movies = user_movies[user_movies['title'].isin(valid_movie_titles)]
     

        # Randomly choose one movie for validation and remove it from user's ratings
        prompt_movies,validation_movie, test_movie,training_set  =split_and_filter_ratings(user_movies)
        if prompt_movies is None:
            non_users.append(user_id)
            continue

        # Append user's training data
        prompt_set.append(prompt_movies)
        train_data.append(training_set)

        # Append user's validation data
        val_data.append(validation_movie)

        # Append user's test data
        test_data.append(test_movie)
        
        
 
    # Concatenate the dataframes to get the final splits
    train_data = pd.concat(train_data)
    val_data = pd.concat(val_data)
    test_data = pd.concat(test_data)
    promp_set = pd.concat(prompt_set)

    return train_data, val_data, test_data,promp_set,non_users



if __name__ == "__main__":
        

    movie_titles = pd.read_csv(f'../data/ml-1m/movies.dat',encoding='ISO-8859-1',sep='::',header=None  )
    movie_titles.columns = [ 'id', 'title','genres']

    valid_movie_titles = pd.read_csv(f'../data/ml-1m/movies.dat',encoding='ISO-8859-1',sep='::',header=None).iloc[:,1].tolist()
    valid_movie_ids = pd.read_csv(f'../data/ml-1m/movies.dat',encoding='ISO-8859-1',sep='::',header=None).iloc[:,0].tolist()


    if data_name == 'ml-1m':
        ratings_file = '../data/ml-1m/ratings.dat'
        separator = "::"
        header = None
        rating_columns = ['userId', 'movieId', 'rating', 'timestamp']
        movie_metadata_file = '../data/ml-1m/movies.dat'
    elif data_name == 'ml-100k':
        ratings_file = '../data/ml-100k/u.data'
        separator = "\t"
        header = None
        rating_columns = ['userId', 'movieId', 'rating', 'timestamp']
        movie_metadata_file = '../data/ml-100k/u.item'


    ratings = pd.read_csv(ratings_file, sep=separator, header=header, encoding='ISO-8859-1')
    #merge ratings right on the movie_titles 
    ratings.columns = ['userId', 'movieId', 'rating', 'timestamp']
    ratings = ratings.merge(movie_titles, left_on='movieId', right_on='id', how='left').drop(columns=['id'])


    movies_set = set(ratings['movieId'])

    # Load movie metadata
    movie_metadata = pd.read_csv('../data/merged_asin_movielens_summary.csv')
    metadata_movies = set(movie_metadata['movielens_id'])

    diff = set(valid_movie_ids) - set(metadata_movies)

    k = 10  # threshold for history length

    train_data, val_data, test_data,promp_set,non_users = generate_train_val_test_splits(ratings, k, movie_metadata)

    valid_movie_titles = list(valid_movie_titles)
    train_data = train_data[train_data['title'].isin(valid_movie_titles)]


    val_data = val_data[val_data['title'].isin(valid_movie_titles)]
    test_data = test_data[test_data['title'].isin(valid_movie_titles)]

    # Check for overlapping user-movie pairs again after filtering
    train_user_movie_pairs = set(zip(train_data['userId'], train_data['movieId']))
    val_user_movie_pairs = set(zip(val_data['userId'], val_data['movieId']))
    test_user_movie_pairs = set(zip(test_data['userId'], test_data['movieId']))

    overlap_train_val = train_user_movie_pairs.intersection(val_user_movie_pairs)
    overlap_train_test = train_user_movie_pairs.intersection(test_user_movie_pairs)
    overlap_val_test = val_user_movie_pairs.intersection(test_user_movie_pairs)
    num_users_train = len(set(train_data['userId']))
    num_users_val = len(set(val_data['userId']))
    num_users_test = len(set(test_data['userId']))


    
    ### Error checking 
    assert not overlap_train_val, f"Overlap between train and validation sets in rows:\n{train_data[train_data[['userId', 'movieId']].apply(tuple, axis=1).isin(overlap_train_val)]}"
    assert not overlap_train_test, f"Overlap between train and test sets in rows:\n{train_data[train_data[['userId', 'movieId']].apply(tuple, axis=1).isin(overlap_train_test)]}"
    assert not overlap_val_test, f"Overlap between validation and test sets in rows:\n{val_data[val_data[['userId', 'movieId']].apply(tuple, axis=1).isin(overlap_val_test)]}"

    print("No overlap found after filtering by valid_movie_titles.")
    assert num_users_test == num_users_train and num_users_test == num_users_val, f'{num_users_val=} {num_users_train=} {num_users_test=}'
    print( 'The total number of users is ', len(set(ratings['userId'])))
    print(f"All sets have the same number of users used = {num_users_train}" , f"The number of non users is {len(non_users)}")
    
    #randomly remove 500 users from the training validation and test split and make a joint set of these called strong generalization set 
    random.seed(42)
    user_set = set(train_data['userId']) | set(val_data['userId']) | set(test_data['userId'])
    user_set = random.sample(list(user_set) ,500)
    strong_generalization_set = pd.concat([train_data[train_data['userId'].isin(user_set)],val_data[val_data['userId'].isin(user_set)],test_data[test_data['userId'].isin(user_set)]])


    train_data = train_data[~train_data['userId'].isin(user_set)]
    val_data = val_data[~val_data['userId'].isin(user_set)]
    test_data = test_data[~test_data['userId'].isin(user_set)]

    
    c = 0 
    for user_id in ratings.userId.unique():
        if user_id in train_data.userId.unique():
            #compare set of movie ids seen 
            train_movies = set(train_data[train_data.userId == user_id].movieId)
            ratings_movies = set(ratings[ratings.userId == user_id].movieId)


            diff = ratings_movies - train_movies
            if len(diff) > 2:
                c+=1 

                #print difference 
                print(diff)
                print('user_id')



    
    # #save data
    train_data.to_csv(f'../data_preprocessed/{data_name}/train_leave_one_out_{("timestamped" if args.timestamp else "")}.csv', index=False)
    val_data.to_csv(f'../data_preprocessed/{data_name}/validation_leave_one_out_{"timestamped" if args.timestamp else ""}.csv',index=False)
    test_data.to_csv(f'../data_preprocessed/{data_name}/test_leave_one_out_{"timestamped" if args.timestamp else ""}.csv', index=False)
    promp_set.to_csv(f'../data_preprocessed/{data_name}/prompt_set_{"timestamped" if args.timestamp else ""}.csv', index=False)
    strong_generalization_set.to_csv(f'../data_preprocessed/{data_name}/strong_generalization_set_{"timestamped" if args.timestamp else ""}.csv', index=False)
    movie_set = set(train_data['movieId']) 
    max_movie_id = max(movie_set)



    # Print a message






No overlap found after filtering by valid_movie_titles.
The total number of users is  6040
All sets have the same number of users used = 6014 The number of non users is 26


In [41]:
pd.concat([train_data[train_data['userId'].isin(user_set)],val_data[val_data['userId'].isin(user_set)],test_data[test_data['userId'].isin(user_set)]])



Unnamed: 0,userId,movieId,rating,timestamp,title,genres
137720,889,3047,5,975364518,Experience Preferred... But Not Essential (1982),Drama
138794,889,3713,4,975364518,"Long Walk Home, The (1990)",Drama
138292,889,1328,1,975364486,"Amityville Curse, The (1990)",Horror
139033,889,2974,2,975364486,Bats (1999),Horror|Thriller
138285,889,1322,1,975364486,Amityville 1992: It's About Time (1992),Horror
...,...,...,...,...,...,...
978916,5904,356,4,957380525,Forrest Gump (1994),Comedy|Romance|War
625963,3787,1230,4,966019703,Annie Hall (1977),Comedy|Romance
87546,572,2006,5,975996296,"Mask of Zorro, The (1998)",Action|Adventure|Romance
878664,5309,50,4,960928290,"Usual Suspects, The (1995)",Crime|Thriller


In [34]:
test_data[test_data['userId'].isin(user_set)]


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
138094,889,2826,4,975364312,"13th Warrior, The (1999)",Action|Horror|Thriller
961221,5795,2176,5,958152062,Rope (1948),Thriller
764742,4543,3783,4,1005626915,Croupier (1998),Crime|Drama
744811,4448,3499,5,991244509,Misery (1990),Horror
72478,482,1245,4,1002083722,Miller's Crossing (1990),Drama
...,...,...,...,...,...,...
978916,5904,356,4,957380525,Forrest Gump (1994),Comedy|Romance|War
625963,3787,1230,4,966019703,Annie Hall (1977),Comedy|Romance
87546,572,2006,5,975996296,"Mask of Zorro, The (1998)",Action|Adventure|Romance
878664,5309,50,4,960928290,"Usual Suspects, The (1995)",Crime|Thriller


In [19]:
strong_generalization_set

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,0
697098,,,,,,,False
278714,,,,,,,False
714062,,,,,,,False
328755,,,,,,,False
191014,,,,,,,False
...,...,...,...,...,...,...,...
948139,,,,,,,False
553433,,,,,,,False
274854,,,,,,,False
739640,,,,,,,False


In [28]:
movies_ids = pd.read_csv(f'../data/ml-1m/movies.dat',encoding='ISO-8859-1',sep='::',header=None)
print(f"{movies_ids=}")

movies_ids=         0                                   1                             2
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
...    ...                                 ...                           ...
3878  3948             Meet the Parents (2000)                        Comedy
3879  3949          Requiem for a Dream (2000)                         Drama
3880  3950                    Tigerland (2000)                         Drama
3881  3951             Two Family House (2000)                         Drama
3882  3952               Contender, The (2000)                Drama|Thriller

[3883 rows x 3 columns]


In [27]:
movie_metadata

Unnamed: 0,asin,imdb_id,movielens_id,name,imdb_url,title,genres,summary
0,6304089767,114709.0,1.0,Toy Story,https://www.imdb.com/title/tt0114709/,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story is an American media franchise owned...
1,767814827,113497.0,2.0,Jumanji,https://www.imdb.com/title/tt0113497/,Jumanji (1995),Adventure|Children|Fantasy,Jumanji is a 1995 American fantasy adventure f...
2,790727382,113228.0,3.0,Grumpier Old Men,https://www.imdb.com/title/tt0113228/,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men is a 1995 American romantic c...
3,6304016859,114885.0,4.0,Waiting to Exhale,https://www.imdb.com/title/tt0114885/,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale is a 1995 American romance f...
4,6304039174,113041.0,5.0,Father of the Bride Part II,https://www.imdb.com/title/tt0113041/,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II is a 1995 American...
...,...,...,...,...,...,...,...,...
20220,B000S6LP50,1114698.0,131130.0,Tom and Jerry: A Nutcracker Tale,https://www.imdb.com/title/tt1114698/,Tom and Jerry: A Nutcracker Tale (2007),Animation|Comedy,Tom and Jerry: A Nutcracker Tale is a 2007 Ame...
20221,B00005Y1O8,258827.0,131136.0,"Mädchen, Mädchen",https://www.imdb.com/title/tt0258827/,Girls on Top (2001),Comedy,
20222,B002SXKR82,780568.0,131138.0,My Führer – The Really Truest Truth about Adol...,https://www.imdb.com/title/tt0780568/,My Führer (2007),Comedy|Drama|War,My Führer – The Really Truest Truth about Adol...
20223,6302148774,59171.0,131152.0,The Fat Spy,https://www.imdb.com/title/tt0059171/,The Fat Spy (1966),Comedy,The Fat Spy is a 1966 Z movie that attempts to...


In [1]:
import pandas as pd 

In [3]:
df = pd.read_csv('/home/mila/e/emiliano.penaloza/LLM4REC/data_preprocessed/ml-1m/strong_generalization_set_timestamped.csv')

In [5]:
df.groupby('userId').count().min()

movieId      20
rating       20
timestamp    20
title        20
genres       20
dtype: int64

In [10]:
import json
with open('/home/mila/e/emiliano.penaloza/LLM4REC/saved_user_summary/ml-1m/user_summary_gpt4_new.json') as f: 
    data = json.load(f)
    data = {int(k):v for k,v in data.items()}

In [12]:
item_keys = list(data[0].keys())

In [26]:
with open('/home/mila/e/emiliano.penaloza/LLM4REC/saved_user_summary/ml-1m/user_summary_gpt4_.json') as f: 
    prompts = json.load(f)
    prompts = {int(float(k)):v for k,v in prompts.items()}

In [45]:
import nltk
import numpy as np
from tqdm import tqdm
#flatten prompts inta a list 
prompts_list = [v for k,v in prompts.items()]

#sample 25% of the data 
prompts_list = np.random.choice(prompts_list, int(len(prompts_list)*0.25))
#we are going to compute minimal edits between a pair of words to make them equal, as well as the BLUE score between them 
# only calculate it once for each pair of sentences in the prompt set 
score_array = np.zeros((len(prompts_list),len(prompts_list)))
edit_array = np.zeros((len(prompts_list),len(prompts_list)))
for i in tqdm(range(len(prompts_list))):
    for j in tqdm(range(i+1,len(prompts_list))):
        if score_array[i,j] != 0 or score_array[j,i] != 0:
            continue
        #compute blue score 
        blue_score = nltk.translate.bleu_score.sentence_bleu([prompts_list[i].split()], prompts_list[j].split())
        #compute minimal edits 
        minimal_edits = nltk.edit_distance(prompts_list[i].split(), prompts_list[j].split())
        score_array[i,j] = blue_score
        edit_array[i,j] = minimal_edits
        

  0%|          | 0/1504 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
100%|██████████| 1503/1503 [00:39<00:00, 38.12it/s]
100%|██████████| 1502/1502 [00:36<00:00, 41.25it/s]
100%|██████████| 1501/1501 [00:44<00:00, 33.86it/s]
100%|██████████| 1500/1500 [00:36<00:00, 41.34it/s]
100%|██████████| 1499/1499 [00:45<00:00, 32.95it/s]
100%|██████████| 1498/1498 [00:42<00:00, 35.21it/s]
100%|██████████| 1497/1497 [00:59<00:00, 25.14it/s]
100%|██████████| 1496/1496 [00:45<00:00, 32.89it/s]
100%|██████████| 1495/1495 [00:41<00:00, 36.36it/s]
100%|██████████| 1494/1494 [00:31<00:00, 47.26it/s]
100%|██████████| 1493/1493 [00

In [50]:
edit_array.mean()

86.21251927484155

In [51]:
score_array.mean()

0.020882050469318833

In [46]:
#save the scores and edit arrays 
np.save('/home/mila/e/emiliano.penaloza/LLM4REC/results/edit_array.npy', edit_array)
np.save('/home/mila/e/emiliano.penaloza/LLM4REC/results/score_array.npy', score_array)