# This notebook synthesizes a collaborative-filtering recommendation dataset with over 7,000 users, using the 4,000+ jurisprudence documents as the items

In [347]:
from utils.data_loaders import load_juris_meta

import numpy as np
import pandas as pd
import time

from concurrent.futures import ThreadPoolExecutor

In [348]:
# Load the metadata table of the labor-related jurisprudence documents through
# the project's loader; the resulting frame is the item catalogue that the
# later cells sample from.
meta_csv_path = './labor_related_docs_meta.csv'
labor_related_docs_meta = load_juris_meta(meta_csv_path)

# Leave the frame as the final expression so the notebook renders it.
labor_related_docs_meta

Unnamed: 0,abs_file_path,answer,year_range,title,file_name,year,month,day,gr_number,division,case_code,id
0,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,1901-1920,"TATSUSABURO YEGAWA, PLAINTIFF AND APPELLEE, VS...",126340044732358.html,1906,11,22,g.r. no. 3388,,6 phil. 750,1
1,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,1901-1920,"T. SUGO AND K. SHIBATA, PLAINTIFFS AND APPELLE...",12634004479921.html,1906,11,22,g.r. no. 3387,,6 phil. 744,2
2,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,1901-1920,"THE UNITED STATES, COMPLAINANT AND APPELLEE, V...",12637977142035130542.html,1904,4,9,g.r. no. 1559,,3 phil. 630,3
3,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,1901-1920,"THE UNITED STATES, COMPLAINANT AND APPELLEE, V...",1263862677733972151.html,1904,4,9,g.r. no. 1585,,3 phil. 631,4
4,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,1901-1920,"THE UNITED STATES, COMPLAINANT AND APPELLEE, V...",12638653111436063074.html,1904,4,9,g.r. no. 1586,,3 phil. 633,5
...,...,...,...,...,...,...,...,...,...,...,...,...
4223,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,2020,RINGO B. DAYOWAN TRANSPORT SERVICES OR RINGO B...,16151712921004948288.html,2020,11,10,g.r. no. 226409,FIRST DIVISION,,4224
4224,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,2020,MINA C. NACILLA AND THE LATE ROBERTO C. JACOBE...,1615183910987321937.html,2020,11,10,g.r. no. 223449,FIRST DIVISION,,4225
4225,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,2020,"ALCID C. BALBARINO (NOW DECEASED), SUBSTITUTED...",1615948352512956378.html,2020,9,21,g.r. no. 201580,THIRD DIVISION,,4226
4226,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,2020,"TEAM PACIFIC CORPORATION, FEDERICO M. FERNANDE...",1615955399598086487.html,2020,7,15,g.r. no. 206789,THIRD DIVISION,,4227


In [349]:
# Inspect the column dtypes of the loaded metadata (rendered as the cell output).
labor_related_docs_meta.dtypes

abs_file_path    object
answer           object
year_range       object
title            object
file_name        object
year              int64
month             int64
day               int64
gr_number        object
division         object
case_code        object
id                int64
dtype: object

In [350]:
# Optional seeding: uncomment the two lines below to seed NumPy's global RNG
# before drawing. A time-based seed is different on every run, so the seed
# value has to be recorded (or swapped for a fixed integer) if the exact same
# draws must be replayed later.
# seed = round(time.time())
# np.random.seed(seed)

# pseudo hyper params
# Inclusive bounds for the random number of docs each synthetic user is given
# to rate, e.g. user 1 gets 40 docs to rate, user 20 gets 10 docs, and so on.
min_num_of_docs_to_rate = 10
max_num_of_docs_to_rate = 80

# The `high` argument of np.random.randint is exclusive, so one is added to the
# upper bound to make max_num_of_docs_to_rate itself a possible draw.
# The bounds come from the variables above instead of repeated literals, so
# tuning the hyper params actually changes the sampled range.
num_of_docs_to_rate = np.random.randint(
    min_num_of_docs_to_rate,
    max_num_of_docs_to_rate + 1,
    1,
    dtype=np.int32,
)[0]
print(f"num of docs to rate {num_of_docs_to_rate}")

num of docs to rate 62


In [351]:
# Optional seeding (see the hyper-parameter cell): uncomment to seed NumPy's
# global RNG before sampling.
# seed = round(time.time())
# np.random.seed(seed)

# Pick the docs this user will rate by sampling row labels from the metadata
# index (0 to 4226 in the frame displayed earlier in this notebook).
# replace=False keeps every sampled doc distinct, so a user is never assigned
# the same document twice; np.random.choice samples WITH replacement by default.
# The size is capped at the number of available docs because sampling without
# replacement cannot return more items than the population holds.
doc_labels = labor_related_docs_meta.index
sample_size = min(num_of_docs_to_rate, len(doc_labels))
rand_idx = np.random.choice(doc_labels, size=sample_size, replace=False)

# rand_idx holds index *labels*, so rows are selected with label-based .loc.
# Positional .iloc would only give the same rows while the index is exactly
# 0..n-1, and would silently pick the wrong rows otherwise.
random_items = labor_related_docs_meta.loc[rand_idx]
print(f"random_items size: {random_items.shape[0]}")
random_items

random_items size: 62


Unnamed: 0,abs_file_path,answer,year_range,title,file_name,year,month,day,gr_number,division,case_code,id
3899,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,2018,"SHERYLL R. CABAÑAS, PETITIONER, VS. ABELARDO G...",1539762105776554381.html,2018,7,2,g.r. no. 225803,SECOND DIVISION,,3900
2609,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,2010,"PICOP RESOURCES, INCORPORATED (PRI), PETITIONE...",1282618146344208083.html,2010,8,9,g.r. no. 160828,SECOND DIVISION,641 phil. 175,2610
3743,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,2017,"JULIO C. ESPERE, PETITIONER, V. NFD INTERNATIO...",1505700359199294961.html,2017,7,26,g.r. no. 212098,SECOND DIVISION,814 phil. 820,3744
1059,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,2003,"EXECUTIVE LABOR ARBITER RICARDO N. OLAIREZ, PE...",1232502210565684101.html,2003,3,10,g.r. no. 148030,FIRST DIVISION,447 phil. 56,1060
1433,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,2005,"ENRIQUE “TOTOY” RIVERA Y DE GUZMAN PETITIONER,...",a45475a11ec72b843d74959b60fd7bd6464c94395da80....,2005,6,30,g.r. no. 138553,THIRD DIVISION,501 phil. 37,1434
...,...,...,...,...,...,...,...,...,...,...,...,...
3960,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,2019,RE: DROPPING FROM THE ROLLS OF LAYDABELL G. PI...,155297357651385499.html,2019,1,7,a.m. no. 18-07-153-rtc,SECOND DIVISION,,3961
1735,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,2006,ST. LOUIS UNIVERSITY LABORATORY HIGH SCHOOL (S...,a45475a11ec72b843d74959b60fd7bd6464b96038798f....,2006,8,28,a.c. no. 6010,EN BANC,531 phil. 213,1736
3639,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,2016,"ALMA COVITA, FOR HER BEHALF AND IN BEHALF OF H...",14842757221094725476.html,2016,12,7,g.r. no. 206600,THIRD DIVISION,802 phil. 598,3640
972,d:/Projects/To Github/LaRJ-Corpus/raw jurispru...,LABOR RELATED,2003,IN RE: PETITION TO DISQUALIFY ATTY. LEONARD DE...,12239653561163782375.html,2003,12,11,a.c. no. 6052,EN BANC,463 phil. 385,973
