In [None]:
import pandas as pd
import numpy as np

# Load the original training interactions from CSV into `train`.
# Later cells read the 'user_id:token', 'title:token' and 'rating:float' columns.
train = pd.read_csv('Netflix-1M_train_original.csv')
# A bare name as the last expression makes the notebook display the frame.
train

In [None]:
# Snapshot of the rating distribution before any modification:
# `original` is the (distinct_values, counts) tuple returned by np.unique.
ratings_before = train['rating:float'].values
original = np.unique(ratings_before, return_counts=True)
original

In [None]:
import re
from collections import defaultdict

def clean_title(title):
    """Strip the ``(genre: ...)`` annotation from a movie title.

    Removes every span that starts with a whitespace character followed by
    ``(genre:`` and runs (greedily) to the last ``)`` on the same line, then
    trims leading/trailing whitespace from what is left.

    Args:
        title: Raw title string, e.g. ``"Heat (genre: Crime)"``.

    Returns:
        The title without the genre annotation, e.g. ``"Heat"``.
    """
    genre_annotation = re.compile(r'\s\(genre:.*\)')
    return genre_annotation.sub('', title).strip()

def parse_recommendations(file_path):
    """Parse a LLaMA answer file into a ``{user_id: [title, ...]}`` mapping.

    The file is scanned line by line:

    * A line starting with ``"LLaMA's cut recommendation for user"`` selects
      the current user; the id is the first ``user <digits>`` match on it.
    * A line starting with ``[`` contributes every ``<...>`` delimited entry
      to the current user, after passing it through ``clean_title``. Such
      lines are ignored until a user header has been seen.
    * Every other line is ignored.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A plain ``dict`` mapping each integer user id to the list of its
        distinct cleaned titles, in order of first appearance.
    """
    neg_to_pos = defaultdict(list)
    # Decode explicitly as UTF-8 so titles with non-ASCII characters are read
    # the same way on every platform instead of via the locale default.
    with open(file_path, 'r', encoding='utf-8') as file:
        current_user = None
        for line in file:
            line = line.strip()
            if line.startswith("LLaMA's cut recommendation for user"):
                current_user = int(re.search(r'user (\d+)', line).group(1))
            elif line.startswith('[') and current_user is not None:
                movies = re.findall(r'<([^>]+)>', line)
                neg_to_pos[current_user].extend(
                    clean_title(movie) for movie in movies
                )
    # dict.fromkeys drops duplicates but keeps first-seen order, so the result
    # is identical from run to run (set ordering of strings is not).
    return {
        user: list(dict.fromkeys(movies))
        for user, movies in neg_to_pos.items()
    }

# Parse the LLaMA answer file into `neg_to_pos` ({user_id: [titles]}) and
# print one line per user with that user's title list.
file_path = 'llama_distinguish_answer.txt'
neg_to_pos = parse_recommendations(file_path)

for user_id in neg_to_pos:
    movies = neg_to_pos[user_id]
    print(f"user_id {user_id}: {movies}")

In [None]:
# Total number of titles stored in `neg_to_pos`, summed over all users.
user_ids = list(neg_to_pos)
total = sum(len(neg_to_pos[uid]) for uid in user_ids)
total

In [None]:
# Count, summed over every user in `neg_to_pos`, the listed titles that do
# NOT occur among that user's rows in `train`.
users_id = list(neg_to_pos.keys())

# Collect each target user's training titles in one pass over the frame
# instead of re-filtering all of `train` once per user.
target_rows = train[train['user_id:token'].isin(users_id)]
titles_by_user = {
    user_id: set(titles)
    for user_id, titles in target_rows.groupby('user_id:token')['title:token']
}

# A user with no rows in `train` contributes all of their listed titles.
total = sum(
    len(set(neg_to_pos[user_id]) - titles_by_user.get(user_id, set()))
    for user_id in users_id
)
total

In [None]:
# Re-label the ratings of 2 or 3 given by users that appear in `neg_to_pos`:
#   * title is in that user's `neg_to_pos` list -> 4.5
#   * title is not in that list                 -> 0.5
# Rows of other users, and rows with any other rating, are left untouched.
FLIPPED_POSITIVE_RATING = 4.5
FLIPPED_NEGATIVE_RATING = 0.5

is_target_user = train['user_id:token'].isin(neg_to_pos.keys())

# Flatten the per-user lists into one set of (user, title) pairs so every row
# needs a single hash lookup. This replaces a row-wise DataFrame.apply with a
# linear list scan and needs no temporary column on `train`.
neg_to_pos_pairs = {
    (user_id, title)
    for user_id, titles in neg_to_pos.items()
    for title in titles
}
is_neg_to_pos = pd.Series(
    [
        pair in neg_to_pos_pairs
        for pair in zip(train['user_id:token'], train['title:token'])
    ],
    index=train.index,
    dtype=bool,
)

is_rating_2_or_3 = train['rating:float'].isin([2, 3])

# Make sure the column can hold fractional values before writing 4.5 / 0.5:
# if read_csv inferred an integer dtype, assigning a float into it relies on
# implicit upcasting, which recent pandas versions deprecate.
train['rating:float'] = train['rating:float'].astype(float)

train.loc[
    is_target_user & is_neg_to_pos & is_rating_2_or_3, 'rating:float'
] = FLIPPED_POSITIVE_RATING
train.loc[
    is_target_user & ~is_neg_to_pos & is_rating_2_or_3, 'rating:float'
] = FLIPPED_NEGATIVE_RATING

In [None]:
# Rating distribution of the current `train` frame as a (distinct_values,
# counts) tuple, displayed next to the earlier `original` snapshot.
ratings_after = train['rating:float'].values
augment = np.unique(ratings_after, return_counts=True)
(original, augment)

In [None]:
# Share of re-labelled ratings that became positive:
#     count(4.5) / (count(0.5) + count(4.5))
# Counts are selected by rating value rather than by position ([0] and [-2])
# in the np.unique output, so the result stays correct if the set of distinct
# ratings changes (e.g. no 0.5 was written, or no 5 is present).
# NOTE(review): assumes positions [0] and [-2] were meant to be the 0.5 and
# 4.5 buckets written by the re-labelling step - confirm.
rating_values, rating_counts = augment
n_flipped_positive = rating_counts[rating_values == 4.5].sum()
n_flipped_negative = rating_counts[rating_values == 0.5].sum()
n_flipped_positive / (n_flipped_negative + n_flipped_positive)

In [None]:
# Number of interactions whose rating is at least 4 after augmentation.
# A value mask replaces the positional slice [4:], which only selects these
# buckets when the distinct ratings are exactly 0.5, 1, 2, 3, 4, 4.5, 5.
# NOTE(review): assumes the slice was meant as "rating >= 4" - confirm.
rating_values, rating_counts = augment
np.sum(rating_counts[rating_values >= 4])

In [None]:
# Write the current `train` frame to a new CSV file; index=False leaves the
# DataFrame index out of the file.
train.to_csv('Netflix-1M_train_augment.csv', index=False)