<a href="https://colab.research.google.com/github/CuriousGu/llm_zeroshot_calibration/blob/dev/spliting_train_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ML
from sklearn.model_selection import train_test_split

# data processing
import pandas as pd

# system and settings
import numpy as np
from google.colab import drive
import os

In [None]:
# Global seed: seeds NumPy's legacy global random generator for this notebook.
SEED = 42
np.random.seed(seed=SEED)

In [None]:
# Attach Google Drive to the Colab filesystem; the dataset folder is looked up below.
MOUNT_POINT = "/content/drive"
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# finding folder path
def find_folder(root, target):
    """Return the path of the first directory named `target` found under `root`.

    Args:
        root: Directory where the top-down `os.walk` search starts.
        target: Name of the directory to look for.

    Returns:
        The path of the first matching directory, in `os.walk` order.

    Raises:
        ValueError: If no directory named `target` exists under `root`.
    """
    # A distinct loop variable keeps the `root` argument from being shadowed.
    for current_dir, dirs, _ in os.walk(root):
        if target in dirs:
            return os.path.join(current_dir, target)
    # Raise (not return) so that a missing dataset fails here instead of
    # producing a bogus path that only breaks later, at read time.
    raise ValueError("None\nDownload the Dataset or change the target Directory")

# Walk the filesystem from its root looking for the dataset folder,
# then report where it was found.
root, target = '/', 'datasets_imdb'

PATH = find_folder(root=root, target=target)
print("path to dataset's folders --> {}".format(PATH))

path to dataset's folders --> /content/drive/MyDrive/ic_recsys/imdb_recommender/datasets_imdb


# KNN and BprMF

In this case, the data is split at random, with no particular rule: scikit-learn's `train_test_split` is applied directly.

## Train & Test

In [None]:
## Movie, User & Rating
# Load the raw MovieLens 20M ratings and preview the first rows.
rating_csv = f"{PATH}/raw/movielens_20m_datasets/rating.csv"
rating = pd.read_csv(rating_csv)
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [None]:
# Only the first 20% of the rows (plus one) are kept:
# training the model on the full dataset was too
# expensive (RAM problems).
# NOTE: this cell overwrites `rating`, so re-running it shrinks the frame again.

rating_size = int(len(rating) * .2) + 1
rating = rating.head(rating_size)

In [None]:
# Keep only the columns needed to train the model.
# Every other column (including `timestamp`) is dropped from `rating` here.
rating = rating.loc[:, ["userId", "movieId", "rating"]]

In [None]:
# Features are the (user, movie) pair; the target is the explicit rating.
# Columns are picked by name rather than by position, so the split stays correct
# even if `rating` still carries extra columns (e.g. `timestamp`) when this cell runs.
X = rating[["userId", "movieId"]]
y = rating["rating"]

# Reuse the notebook-wide SEED instead of repeating the literal 42.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Re-attach the target so each split is a single (userId, movieId, rating) frame.
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

# Sanity check: the two splits partition the input rows.
assert len(train) + len(test) == len(rating)

In [None]:
# Report split sizes, using '.' as the thousands separator.
for split_name, split_frame in (("test", test), ("train", train)):
    print("{} size --> {:,}".format(split_name, split_frame.shape[0]).replace(',', '.'))

test size --> 800.011
train size --> 3.200.042


## Saving Files

In [None]:
# `to_csv` does not create missing parent directories, so make sure the
# output folder exists before writing.
refined_dir = f"{PATH}/refined"
os.makedirs(refined_dir, exist_ok=True)

# Files are written without index and without a header row.
test.to_csv(f"{refined_dir}/test.csv", index=False, header=False)
train.to_csv(f"{refined_dir}/train.csv", index=False, header=False)

# All interactions (train + test), as currently held in `rating`.
rating.to_csv(f"{refined_dir}/all_interactions.csv", index=False, header=False)

# LLM

To build the LLM input, the data is split based on each user's most recent interactions.

## Train & Test

In [None]:
# Separating the users with the most recent interactions.
#
# The KNN/BprMF section narrows `rating` to (userId, movieId, rating), so on a
# fresh Restart & Run All the `timestamp` column needed here is gone and
# `sort_values` raises KeyError. When that happens, re-read the same leading rows
# of the raw file (timestamp included) instead of relying on out-of-order execution.
if "timestamp" in rating.columns:
    llm_source = rating
else:
    llm_source = pd.read_csv(
        f"{PATH}/raw/movielens_20m_datasets/rating.csv", nrows=len(rating)
    )

# NOTE(review): every user of `llm_source` appears in the sorted frame, so the
# `isin` filter below keeps all rows; no recency-based selection actually
# happens here. Behaviour is left unchanged -- confirm whether a cut-off was intended.
most_recent_users = llm_source.sort_values(by=["timestamp"])
most_recent_users = most_recent_users["userId"].unique()
llm_rating = llm_source[llm_source.userId.isin(most_recent_users)]
llm_rating.head(4)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07


In [None]:
# Bounds on the number of interactions per user, and how many users to keep.
# Lower bound: each user needs at least 20 interactions (10 to train, 10 to evaluate).
# Upper bound: users with too many interactions are left out, because for them
# generic recommendations would be counted as good recommendations.
MIN_INTERACTIONS, MAX_INTERACTIONS, N_USERS = 20, 100, 400

users_interactions_count = llm_rating.groupby("userId").count()
in_range = users_interactions_count.movieId.between(MIN_INTERACTIONS, MAX_INTERACTIONS)
users_interactions_count = users_interactions_count[in_range].reset_index()

# Order by interaction count (largest first) and keep the first N_USERS user ids.
users_interactions = (
    users_interactions_count
    .sort_values(by="movieId", ascending=False)['userId']
    .unique()[:N_USERS]
)
llm_rating = llm_rating[llm_rating.userId.isin(users_interactions)]

## Merging - Movie Name

In [None]:
# Load the movie catalogue and keep only the id -> title mapping.
movies_csv = f"{PATH}/raw/movielens_20m_datasets/movie.csv"
movies_names = pd.read_csv(movies_csv).loc[:, ['movieId', 'title']]
movies_names.head(4)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)


In [None]:
# Attach the movie title to every interaction; the left join keeps all
# rating rows even when a movieId has no entry in `movies_names`.
llm_dataset = llm_rating.merge(movies_names, how='left', on="movieId")
llm_dataset.head(4)

Unnamed: 0,userId,movieId,rating,timestamp,title
0,48,1233,5.0,1997-04-10 08:53:43,"Boot, Das (Boat, The) (1981)"
1,48,380,3.0,1996-12-16 04:43:58,True Lies (1994)
2,48,1250,4.0,1996-12-16 04:43:58,"Bridge on the River Kwai, The (1957)"
3,48,369,4.0,1996-12-16 04:43:57,Mrs. Parker and the Vicious Circle (1994)


In [None]:
# Per user, order interactions from newest to oldest and take the first 10 rows
# as the train set; every remaining row of `llm_dataset` goes to the test set.
# NOTE(review): this puts the 10 most recent interactions in train and the older
# ones in test -- confirm this temporal direction is the intended one.
newest_first = llm_dataset.sort_values(
    by=['userId', 'timestamp'], ascending=[True, False]
)
trainset = newest_first.groupby('userId').head(10)

testset = llm_dataset.drop(index=trainset.index)

print(f"trainset size --> {trainset.shape[0]} rows\ntestset size --> {testset.shape[0]} rows")

trainset size --> 4000 rows
testset size --> 35523 rows


## Saving Files

In [None]:
# `to_csv` does not create missing parent directories, so make sure the
# output folder exists before writing.
refined_llm_dir = f"{PATH}/refined"
os.makedirs(refined_llm_dir, exist_ok=True)

# LLM splits are written without index and without a header row.
testset.to_csv(f"{refined_llm_dir}/test_llm.csv", index=False, header=False)
trainset.to_csv(f"{refined_llm_dir}/train_llm.csv", index=False, header=False)