In [1]:
# ML
from sklearn.model_selection import train_test_split

# data processing
import pandas as pd

# system and settings
import numpy as np
from google.colab import drive
import os

In [2]:
# Single seed value for the notebook's random operations.
SEED = 42
# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(seed=SEED)

In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

Mounted at /content/drive


In [4]:
# finding folder path
def find_folder(root, target):
    """Return the path of the first directory named ``target`` found under ``root``.

    The tree is traversed with ``os.walk`` and the search stops at the first
    visited directory that has ``target`` as an immediate sub-directory.

    Args:
        root: Directory where the search starts.
        target: Name of the folder to look for (a bare name, not a path).

    Returns:
        ``os.path.join(parent, target)`` for the first match.

    Raises:
        ValueError: If no directory named ``target`` exists under ``root``.
    """
    # A distinct loop variable keeps the ``root`` argument from being shadowed.
    for current_dir, subdirs, _ in os.walk(root):
        if target in subdirs:
            return os.path.join(current_dir, target)
    # Raise (instead of returning the exception object) so a missing dataset
    # fails right here rather than yielding a bogus value that later cells
    # would interpolate into file paths.
    raise ValueError("None\nDownload the Dataset or change the target Directory")

# Start at the filesystem root and look the dataset folder up by name.
root, target = '/', 'datasets_imdb'

PATH = find_folder(root=root, target=target)
print("path to dataset's folders --> {}".format(PATH))

path to dataset's folders --> /content/drive/MyDrive/ic_recsys/imdb_recommender/datasets_imdb


# Train & Test

In [5]:
## Movie, User & Rating
# Ratings table read from the raw MovieLens folder inside the dataset directory.
ratings_csv_path = f"{PATH}/raw/movielens_20m_datasets/rating.csv"
rating = pd.read_csv(ratings_csv_path)
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [6]:
# Only the first 20% of the rating rows are kept (not the whole dataset),
# because training the model on all of it was too expensive (RAM PROBLEMS).
# NOTE(review): this is a head slice in file order, not a random sample —
# confirm that is intended.
# Re-running this cell on its own shrinks `rating` again; re-run from the
# loading cell to get back to the full table.

SAMPLE_FRACTION = .2  # share of the rating rows to keep

rating_size = int(rating.shape[0] * SAMPLE_FRACTION) + 1
rating = rating.iloc[:rating_size]

In [7]:
# Keep just the columns needed to train the model; the others are dropped.
rating = rating.loc[:, ["userId", "movieId", "rating"]]

In [9]:
# Features are every column except the last one; the target is the last column.
X = rating.iloc[:, :-1]
y = rating.iloc[:, -1]

# 80/20 random split. Reuse the notebook-wide SEED instead of repeating the
# literal 42, so the split follows the configured seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Re-attach the target to its features so each split is a single table.
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

# Sanity check: the two splits together account for every input row.
assert len(train) + len(test) == len(rating)

In [10]:
# Report the row count of each split, using '.' as the thousands separator.
for label, frame in (("test", test), ("train", train)):
    print(f"{label} size --> {frame.shape[0]:,}".replace(',', '.'))

test size --> 800.011
train size --> 3.200.042


# Saving Files

In [11]:
# Make sure the output folder exists first; to_csv does not create missing
# directories and would fail on a fresh dataset download.
refined_dir = f"{PATH}/refined"
os.makedirs(refined_dir, exist_ok=True)

# Files are written without the index and without a header row, so readers
# of these files must rely on column position.
test.to_csv(f"{refined_dir}/test.csv", index=False, header=False)
train.to_csv(f"{refined_dir}/train.csv", index=False, header=False)

rating.to_csv(f"{refined_dir}/all_interactions.csv", index=False, header=False)