# This notebook will aim to synthesize a collaborative filtering recommendation system dataset containing over 7000 users and using the 4000+ jurisprudence docs as items

In [1]:
from utils.data_loaders import load_juris_meta

import numpy as np
import pandas as pd
import time
import math

from concurrent.futures import ThreadPoolExecutor

In [2]:
labor_related_docs_meta = load_juris_meta('../raw labor related jurisprudence cleaning/labor_related_docs_meta.csv')
labor_related_docs_meta

Unnamed: 0,abs_file_path,answer,year_range,title,file_name,year,month,day,gr_number,division,case_code,id
0,d:/Projects/To Github/LaRJ-Corpus/raw labor re...,LABOR RELATED,1901-1920,"TATSUSABURO YEGAWA, PLAINTIFF AND APPELLEE, VS...",126340044732358.html,1906,11,22,g.r. no. 3388,,6 phil. 750,1
1,d:/Projects/To Github/LaRJ-Corpus/raw labor re...,LABOR RELATED,1901-1920,"T. SUGO AND K. SHIBATA, PLAINTIFFS AND APPELLE...",12634004479921.html,1906,11,22,g.r. no. 3387,,6 phil. 744,2
2,d:/Projects/To Github/LaRJ-Corpus/raw labor re...,LABOR RELATED,1901-1920,"THE UNITED STATES, COMPLAINANT AND APPELLEE, V...",12637977142035130542.html,1904,4,9,g.r. no. 1559,,3 phil. 630,3
3,d:/Projects/To Github/LaRJ-Corpus/raw labor re...,LABOR RELATED,1901-1920,"THE UNITED STATES, COMPLAINANT AND APPELLEE, V...",1263862677733972151.html,1904,4,9,g.r. no. 1585,,3 phil. 631,4
4,d:/Projects/To Github/LaRJ-Corpus/raw labor re...,LABOR RELATED,1901-1920,"THE UNITED STATES, COMPLAINANT AND APPELLEE, V...",12638653111436063074.html,1904,4,9,g.r. no. 1586,,3 phil. 633,5
...,...,...,...,...,...,...,...,...,...,...,...,...
4223,d:/Projects/To Github/LaRJ-Corpus/raw labor re...,LABOR RELATED,2020,RINGO B. DAYOWAN TRANSPORT SERVICES OR RINGO B...,16151712921004948288.html,2020,11,10,g.r. no. 226409,FIRST DIVISION,,4224
4224,d:/Projects/To Github/LaRJ-Corpus/raw labor re...,LABOR RELATED,2020,MINA C. NACILLA AND THE LATE ROBERTO C. JACOBE...,1615183910987321937.html,2020,11,10,g.r. no. 223449,FIRST DIVISION,,4225
4225,d:/Projects/To Github/LaRJ-Corpus/raw labor re...,LABOR RELATED,2020,"ALCID C. BALBARINO (NOW DECEASED), SUBSTITUTED...",1615948352512956378.html,2020,9,21,g.r. no. 201580,THIRD DIVISION,,4226
4226,d:/Projects/To Github/LaRJ-Corpus/raw labor re...,LABOR RELATED,2020,"TEAM PACIFIC CORPORATION, FEDERICO M. FERNANDE...",1615955399598086487.html,2020,7,15,g.r. no. 206789,THIRD DIVISION,,4227


In [3]:
num_docs = labor_related_docs_meta.shape[0]
num_users = 7012

In [4]:
labor_related_docs_meta.dtypes

abs_file_path    object
answer           object
year_range       object
title            object
file_name        object
year              int64
month             int64
day               int64
gr_number        object
division         object
case_code        object
id                int64
dtype: object

In [5]:
# use this for in order to replicate same random numbers turn it off
# should synthesis of dataset commence
np.random.seed(0)

# pseudo hyper params
# the range in which the random number of docs are to be generated from for each user
# e.g. user 1 is generated 40 docs to rate, user 20 is generated 10 docs to rate, so on
min_num_of_docs_to_rate = 20
# max_num_of_docs_to_rate = 300

# # high arg of np.random.randint is exclusive that is why we add one to 
# # 80 to ensure numbers generated range from 10 to 80 and not 10 to 79 only
# num_of_docs_to_rate = np.random.randint(min_num_of_docs_to_rate, max_num_of_docs_to_rate + 1, 1, dtype=np.int32)[0]
# print(f"num of docs to rate {num_of_docs_to_rate}")

In [6]:
# # use 0 to 4227 as the range of id's to generate the docs for each user
# rand_idx = np.random.choice(labor_related_docs_meta.index, size=num_of_docs_to_rate)

# random_items = labor_related_docs_meta.iloc[rand_idx]
# print(f"random_items size: {random_items.shape[0]}")
# random_items

In [7]:
labor_related_docs_meta.loc[:, 'id']

0          1
1          2
2          3
3          4
4          5
        ... 
4223    4224
4224    4225
4225    4226
4226    4227
4227    4228
Name: id, Length: 4228, dtype: int64

In [8]:
def helper(user):
    """
    note: this function will occassionaly fluctuate to have a user rate
    synthetically 3000 plus items to add outliers

    args:
        user - the id of the user during concurrent process ranging
        from 1 to n_u
    """
    rand_max_n_docs_gen = [124, 1514, 200, 343, 421, 120, 99, 3204, 1249]
    max_num_of_docs_to_rate = rand_max_n_docs_gen[np.random.randint(0, 9)]

    num_of_docs_to_rate = np.random.randint(min_num_of_docs_to_rate, max_num_of_docs_to_rate + 1, 1, dtype=np.int32)[0]
    
    user_ids = (np.ones(shape=num_of_docs_to_rate, dtype=np.int32) * user).tolist()

    # sample the indeces of the labor related docs strictly without replacement
    # so we only have unique documetn indeces that a user has "interacted" with
    rand_indeces = np.random.choice(labor_related_docs_meta.index, size=num_of_docs_to_rate, replace=False)
    random_items = labor_related_docs_meta.loc[rand_indeces, 'id'].tolist()

    random_ratings = np.random.randint(1, 5 + 1, size=num_of_docs_to_rate, dtype=np.int32).tolist()
    
    return pd.DataFrame({'user_id': user_ids, 'item_id': random_items, 'rating': random_ratings})

In [9]:
with ThreadPoolExecutor() as executor:
    results = list(executor.map(helper, np.arange(num_users) + 1))

In [10]:
results[0]

Unnamed: 0,user_id,item_id,rating
0,1,3459,1
1,1,1022,3
2,1,1186,3
3,1,1099,1
4,1,411,1
...,...,...,...
79,1,2113,3
80,1,2591,5
81,1,1941,2
82,1,3738,3


In [11]:
len(results)

7012

In [12]:
# concatenate all dfs created concurrently
rating_df = pd.concat(results, axis=0)

# shuffle the rows of the rating_df
# rating_df = rating_df.sample(frac=1)

# reset index
rating_df.reset_index(drop=True, inplace=True)

In [13]:
rating_df

Unnamed: 0,user_id,item_id,rating
0,1,3459,1
1,1,1022,3
2,1,1186,3
3,1,1099,1
4,1,411,1
...,...,...,...
2920593,7012,2043,5
2920594,7012,273,4
2920595,7012,3170,4
2920596,7012,2323,2


In [15]:
est_num_rows = round(rating_df.shape[0], -3)
rating_df.to_csv(f'juris_{est_num_rows}_ratings.csv')
np.savetxt(f'juris_{est_num_rows}_ratings.txt', rating_df, fmt='%d')