In [None]:
import pandas as pd
import numpy as np
import re

# Load the training interactions (one row per user/item rating).
train = pd.read_csv('ML-100K_train_original.csv')

# Popularity of each item = number of non-null timestamps recorded for it.
interaction_counts = train.groupby('item_id:token')['timestamp:float'].count().reset_index()
train = pd.merge(train, interaction_counts, on='item_id:token', how='left')
# Both frames carry a 'timestamp:float' column, so the merge suffixes them with
# _x (original timestamp) and _y (the per-item count); restore meaningful names.
train = train.rename(columns={
    'timestamp:float_x': 'timestamp:float',
    'timestamp:float_y': 'interaction_count',
})

# Text representations that are later pasted into the LLM prompts.
train['title_genre'] = '<' + train['movie_title:token_seq'] + ' (genre: ' + train['class:token_seq'] + ')>'
train['<movie_title:token_seq>'] = '<' + train['movie_title:token_seq'] + '>'

# Within each user, list the most frequently interacted-with items first.
train = train.sort_values(by=['user_id:token', 'interaction_count'], ascending=[True, False])

# Arrays are formatted straight into prompts, so never wrap them across lines.
np.set_printoptions(linewidth=np.inf)
np.random.seed(2024)

user_ids = np.unique(train['user_id:token'].values)
user_dict = dict()
rating_count = dict()

# user_dict[user_id] = [[titles rated 5], [titles rated 4], [titles rated 3],
#                       [titles rated 2], [titles rated 1], mean rating]
# Each title bucket is a one-element list wrapping a numpy array, which is the
# layout the generation cell indexes into ([bucket][0]).
# A single groupby pass replaces one full boolean-mask scan of `train` per user;
# groups are visited in ascending user-id order and keep the row order set by the
# sort above. (groupby skips rows whose user id is missing.)
for user_id, df_user in train.groupby('user_id:token'):
    user_ratings = df_user['rating:float']
    values = [
        [df_user.loc[user_ratings == rating, 'title_genre'].values]
        for rating in (5, 4, 3, 2, 1)
    ]
    mean_rating = np.mean(user_ratings.values)
    values.append(mean_rating)
    user_dict[user_id] = values


train.head()

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import torch

# Number GPUs by PCI bus id so the index below refers to a fixed physical card.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Default to GPU 4 (the original setting) but respect a value that is already
# set in the environment, e.g. by a job scheduler.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "4")

# Single source of truth for the checkpoint used by both tokenizer and model.
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
# Cache directory for the downloaded weights; override with LLM_CACHE_DIR when
# running on a machine where the original absolute path does not exist.
llm_dir = os.environ.get("LLM_CACHE_DIR", '/home/chwchong/_WWW25/LLM/')
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=llm_dir)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, cache_dir=llm_dir)

# Run on the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
# Few-shot prompt template, filled in with str.format().
# Placeholders:
#   {positive_movies_5}  - the user's positively rated list shown as context
#   {candidate_example}  - candidate list of the worked example
#   {only_title_4}       - the answer shown for the worked example
#   {negative_movies_32} - candidate list the model is actually asked about
# Layout: context list, dashed separator, worked example (candidates + answer),
# dashed separator, real query ending on "Output (Answer):" for the model to
# complete. The template therefore contains two dashed separator lines and two
# "Candidate List:" headings.
prompt1 = """
This dataset is from the MovieLens-100K dataset.

User's Positively Rated Movie List:
The following list includes movie titles and genres that the user has rated positively:
{positive_movies_5}

---------------------------------------------

Based on the User's Positively Rated Movie List, analyze the user's preferences and patterns.
You will be provided with Candidate List, which includes movie titles and genres that the user has rated both positively and negatively.
Your task is to strictly select movies and provide only the movie titles from Candidate List that the user is most likely to have rated positively.

Candidate List:
{candidate_example}

Output (Answer):
{only_title_4}

---------------------------------------------

Based on the User's Positively Rated Movie List, analyze the user's preferences and patterns.
You will be provided with Candidate List, which includes movie titles and genres that the user has rated both positively and negatively.
Your task is to strictly select movies and provide only the movie titles from Candidate List that the user is most likely to have rated positively.

Candidate List:
{negative_movies_32}

Output (Answer):
"""



def ask_llama1(question, tokenizer, model, device, stop_token="---------------------------------------------", max_occurrences=2, max_total_tokens=5000):
    """Greedy-decode a completion for ``question`` and cut it at the stop marker.

    The generation budget is derived from the prompt itself: the text from the
    second "Candidate List:" heading to the end of ``question`` is tokenized and
    its token count is added to the prompt's token count to form ``max_length``.

    Args:
        question: Fully formatted prompt; must contain "Candidate List:" twice.
        tokenizer: Callable returning a mapping with 'input_ids' and
            'attention_mask' (supporting ``.to(device)``); also used to decode.
        model: Object exposing ``generate``.
        device: Device the encoded inputs are moved to.
        stop_token: Marker searched for in each line of the decoded text.
        max_occurrences: Number of lines containing ``stop_token`` that are
            kept; the output is cut right before the next such line.
        max_total_tokens: Requests whose ``max_length`` would exceed this value
            are skipped instead of generated.

    Returns:
        The decoded text (prompt plus completion) up to the cut, stripped of
        surrounding whitespace, or ``None`` when the request is skipped because
        it is too long or because the second heading is missing.
    """
    header = "Candidate List:"
    first_header = question.find(header)
    second_header = question.find(header, first_header + 1) if first_header != -1 else -1
    if second_header == -1:
        # Without the second heading there is nothing to size the budget from.
        # Previously this path fell through and crashed on an unbound variable;
        # report it and let the caller treat the request as skipped.
        print('"Second occurrence of "Candidate List:" cannot be found.')
        return None

    inputs = tokenizer(question, return_tensors="pt").to(device)
    input_ids = inputs['input_ids']
    inputs_sub = tokenizer(question[second_header:], return_tensors="pt").to(device)

    prompt_tokens = input_ids.shape[1]
    length = prompt_tokens + inputs_sub['input_ids'].shape[1]
    if length > max_total_tokens:
        print(f"Skipping user due to input length: {prompt_tokens}", end=", ")
        return None
    print(f"input length: {prompt_tokens}", end=", ")

    # Fall back to the EOS id when the tokenizer defines no padding token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        input_ids=input_ids.to(device),
        attention_mask=inputs['attention_mask'].to(device),
        max_length=length,
        pad_token_id=pad_token_id,
        num_beams=1,
        do_sample=False,
        temperature=1,
        top_p=1
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # The decoded text starts with the prompt, whose own separator lines count
    # towards max_occurrences; everything from the next separator on is dropped.
    kept_lines = []
    seen_stop_lines = 0
    for line in response.split('\n'):
        if stop_token in line:
            seen_stop_lines += 1
            if seen_stop_lines > max_occurrences:
                break
        kept_lines.append(line)
    return '\n'.join(kept_lines).strip()

In [None]:
# Alternative few-shot prompt template, filled in with str.format().
# It uses the same four placeholders as prompt1 ({positive_movies_5},
# {candidate_example}, {only_title_4}, {negative_movies_32}) but repeats the full
# instructions and the positively rated list in both the worked example and the
# real query, with a single dashed separator line between the two halves.
# Note: the headings here are spelled "Candidate list:" (lowercase "l").
prompt2 = """
This dataset is from the MovieLens-100K dataset.
Based on the User's Positively Rated Movie List, analyze the user's preferences and patterns.
You will be provided with Candidate List, which includes movie titles and genres that the user has rated both positively and negatively.
Your task is to strictly select movies and provide only the movie titles from Candidate List that the user is most likely to have rated positively.

User's Positively Rated Movie List:
{positive_movies_5}

Candidate list:
{candidate_example}

Output (Answer):
{only_title_4}

---------------------------------------------

This dataset is from the MovieLens-100K dataset.
Based on the User's Positively Rated Movie List, analyze the user's preferences and patterns.
You will be provided with Candidate List, which includes movie titles and genres that the user has rated both positively and negatively.
Your task is to strictly select movies and provide only the movie titles from Candidate List that the user is most likely to have rated positively.

User's Positively Rated Movie List:
{positive_movies_5}

Candidate list:
{negative_movies_32}

Output (Answer):
"""


def ask_llama2(question, tokenizer, model, device, stop_token="---------------------------------------------", max_occurrences=1, max_total_tokens=5000):
    """Greedy-decode a completion for ``question`` and cut it at the stop marker.

    The generation budget is derived from the prompt itself: the text from the
    second candidate-list heading to the end of ``question`` is tokenized and
    its token count is added to the prompt's token count to form ``max_length``.

    The heading is matched case-insensitively because the template this
    function is used with spells it "Candidate list:"; an exact search for
    "Candidate List:" never matched and the function crashed on an unbound
    variable.

    Args:
        question: Fully formatted prompt; must contain the candidate-list
            heading twice (any capitalisation).
        tokenizer: Callable returning a mapping with 'input_ids' and
            'attention_mask' (supporting ``.to(device)``); also used to decode.
        model: Object exposing ``generate``.
        device: Device the encoded inputs are moved to.
        stop_token: Marker searched for in each line of the decoded text.
        max_occurrences: Number of lines containing ``stop_token`` that are
            kept; the output is cut right before the next such line.
        max_total_tokens: Requests whose ``max_length`` would exceed this value
            are skipped instead of generated.

    Returns:
        The decoded text (prompt plus completion) up to the cut, stripped of
        surrounding whitespace, or ``None`` when the request is skipped because
        it is too long or because the second heading is missing.
    """
    # re.IGNORECASE (rather than question.lower()) keeps match offsets valid
    # for the original string.
    header_starts = [m.start() for m in re.finditer(r"candidate list:", question, flags=re.IGNORECASE)]
    if len(header_starts) < 2:
        # Nothing to size the generation budget from; let the caller skip.
        print('"Second occurrence of "Candidate List:" cannot be found.')
        return None
    second_header = header_starts[1]

    inputs = tokenizer(question, return_tensors="pt").to(device)
    input_ids = inputs['input_ids']
    inputs_sub = tokenizer(question[second_header:], return_tensors="pt").to(device)

    prompt_tokens = input_ids.shape[1]
    length = prompt_tokens + inputs_sub['input_ids'].shape[1]
    if length > max_total_tokens:
        print(f"Skipping user due to input length: {prompt_tokens}", end=", ")
        return None
    print(f"input length: {prompt_tokens}", end=", ")

    # Fall back to the EOS id when the tokenizer defines no padding token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        input_ids=input_ids.to(device),
        attention_mask=inputs['attention_mask'].to(device),
        max_length=length,
        pad_token_id=pad_token_id,
        num_beams=1,
        do_sample=False,
        temperature=1,
        top_p=1
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # The decoded text starts with the prompt, whose own separator line counts
    # towards max_occurrences; everything from the next separator on is dropped.
    kept_lines = []
    seen_stop_lines = 0
    for line in response.split('\n'):
        if stop_token in line:
            seen_stop_lines += 1
            if seen_stop_lines > max_occurrences:
                break
        kept_lines.append(line)
    return '\n'.join(kept_lines).strip()

In [None]:
# skip: user ids excluded up front. Per the original note, these users have so
# many interactions that the model hallucinated on their prompts.
skip = np.array([1, 6, 7, 10, 13, 18, 59, 85, 90, 92, 94, 95, 102, 109, 130, 145, 151, 178, 181, 184, 194, 200, 201, 222, 234, 262, 268, 269, 271, 276, 279, 286, 291, 293, 299, 301, 303, 308, 311, 313, 327, 328, 334, 343, 347, 354, 363, 373, 378, 380, 385, 387, 389, 393, 399, 405, 406, 407, 416, 417, 429, 435, 437, 449, 450, 454, 456, 474, 483, 497, 500, 504, 524, 533, 537, 540, 551, 561, 592, 606, 642, 648, 650, 653, 655, 660, 682, 707, 711, 716, 727, 747, 749, 750, 758, 760, 788, 796, 804, 805, 833, 840, 843, 846, 854, 864, 870, 880, 881, 883, 889, 892, 896, 916, 919, 925])
# Users to process = every key of user_dict that is not listed in `skip`
# (np.setdiff1d returns the remaining ids sorted and without duplicates).
user_ids = np.setdiff1d(np.array(list(user_dict.keys())), skip)
# Cell output: how many users remain, followed by their ids.
len(user_ids), user_ids

In [None]:
# Users for which at least one chunk produced no usable answer (request skipped
# by the ask_* helper, or the answer did not end with ']' even after the retry).
skipped_users = []

with open('llama_distinguish_answer.txt', 'w') as f_cut, open('llama_distinguish_full.txt', 'w') as f_full:
    for user_id in user_ids:
        mean_rating = user_dict[user_id][5]
        positive_movies_5 = user_dict[user_id][0][0]
        positive_movies_4 = user_dict[user_id][1][0]
        negative_movies_3 = user_dict[user_id][2][0]
        negative_movies_2 = user_dict[user_id][3][0]
        negative_movies_1 = user_dict[user_id][4][0]

        print('mean_rating %.2f' % mean_rating, end=" ")
        # The worked example mixes 4-star titles with low-rated ones; the real
        # query contains the titles whose label is undecided for this user:
        # 3-star only for users averaging >= 3, otherwise 3- and 2-star titles.
        if mean_rating >= 3:
            candidate_example = np.array(list(positive_movies_4) + list(negative_movies_2) + list(negative_movies_1))
            # Copy before shuffling: shuffling the array in place would
            # otherwise reorder the data stored in user_dict, making a re-run
            # of this cell start from different inputs.
            candidate_real = np.array(negative_movies_3)
        else:
            candidate_example = np.array(list(positive_movies_4) + list(negative_movies_1))
            candidate_real = np.array(list(negative_movies_3) + list(negative_movies_2))
        np.random.shuffle(candidate_example)
        np.random.shuffle(candidate_real)

        # Answer of the worked example: the 4-star titles, in the shuffled order
        # in which they appear in the example list, with the genre part removed.
        only_title_4 = candidate_example[np.isin(candidate_example, positive_movies_4)]
        if len(only_title_4) > 0:
            # Guarded because np.vectorize cannot infer an output type from an
            # empty input.
            only_title_4 = np.vectorize(lambda item: re.sub(r'\s*\(genre:.*$', '>', item))(only_title_4).astype(object)

        # Ask about at most 5 candidates per prompt. An empty candidate list is
        # still sent once, as a single empty chunk.
        if len(candidate_real) > 5:
            chunked_candidate_real = [candidate_real[i:i+5] for i in range(0, len(candidate_real), 5)]
        else:
            chunked_candidate_real = [candidate_real]

        for i, chunk in enumerate(chunked_candidate_real):
            tag = f"user {user_id}[{i}]"
            prompt = prompt1.format(
                positive_movies_5=positive_movies_5,
                candidate_example=candidate_example,
                only_title_4=only_title_4,
                negative_movies_32=chunk)
            response = ask_llama1(prompt, tokenizer, model, device)

            if response is None:
                print(tag + ' skip')
                skipped_users.append(user_id)
                continue
            # A well-formed answer is a printed list and therefore ends with
            # ']'. endswith() also copes with an empty response, where indexing
            # response[-1] would raise IndexError.
            if not response.endswith(']'):
                print(tag + ' skip (hallucination)')
                # Retry once with the alternative prompt layout.
                prompt = prompt2.format(
                    positive_movies_5=positive_movies_5,
                    candidate_example=candidate_example,
                    only_title_4=only_title_4,
                    negative_movies_32=chunk)
                response = ask_llama2(prompt, tokenizer, model, device)
                if response is None:
                    print(tag + ' skip')
                    skipped_users.append(user_id)
                    continue
                if not response.endswith(']'):
                    print(tag + ' skip (hallucination)')
                    skipped_users.append(user_id)
                    continue

            print(tag + ' complete!')
            user_number_full = f"LLaMA's full recommendation for user {user_id}:"
            f_full.write(user_number_full + '\n')
            f_full.write(response + '\n\n\n\n\n\n\n\n\n')
            user_number_cut = f"LLaMA's cut recommendation for user {user_id}:"
            f_cut.write(user_number_cut + '\n')
            # Drop the echoed prompt so only the generated answer remains. The
            # -1 accounts for the prompt's leading newline, which strip() in the
            # ask_* helper removed from the response.
            f_cut.write(response[len(prompt)-1:] + '\n\n\n\n\n\n')


# Length check instead of truthiness so this also works once skipped_users has
# been turned into a numpy array.
if len(skipped_users) > 0:
    skipped_users = np.unique(np.array(skipped_users))
    print(f"Skipped users due to input length: {skipped_users}")
    print(f"len(skipped_users): {len(skipped_users)}")
# 381m 38.8s  (presumably the wall-clock time of the original run of this cell)