In [1]:
import numpy as np
# Pretty print
from pprint import pprint
# Datasets load_dataset function
from datasets import load_dataset
# Transformers AutoTokenizer
from transformers import AutoTokenizer
# Instantiate the pretrained 'distilbert-base-uncased' tokenizer as soon as this cell runs.
# NOTE(review): `tokenizer` is not referenced by any cell visible in this notebook — confirm it is still needed.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
# Standard PyTorch DataLoader
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import pandas as pd
import json

In [3]:
# dataset_dict = load_dataset('HUPD/hupd',
#     name='sample',
#     data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
#     icpr_label=None,
#     train_filing_start_date='2016-01-01',
#     train_filing_end_date='2016-01-21',
#     val_filing_start_date='2016-01-22',
#     val_filing_end_date='2016-01-31',
# )

# print('Loading is done!')

In [4]:
# !tar -xzvf 2015.tar.gz

In [5]:
# !tar -xzvf 2016.tar.gz

In [6]:
# !tar -xzvf 2017.tar.gz

In [7]:
# Fields kept from every application record, listed in the column order
# used by the frames that to_dataframe builds.
lst_of_columns = [
    # identifier and outcome label
    'patent_number',
    'decision',
    # free-text sections
    'title',
    'abstract',
    'claims',
    'background',
    'summary',
    'full_description',
    # classification codes
    'main_cpc_label',
    'main_ipcr_label',
    # dates
    'filing_date',
    'patent_issue_date',
    'date_published',
    # examiner
    'examiner_id',
]

In [8]:
def to_dataframe(directory, columns=None):
    """Load every ``*.json`` file found directly in ``directory`` into one DataFrame.

    Each file is parsed with ``json.load``, flattened with
    ``pd.json_normalize`` and restricted to ``columns``; the per-file frames
    are then concatenated with a fresh 0..n-1 index.

    Parameters
    ----------
    directory : str
        Folder to scan (non-recursively) for names ending in ``.json``.
    columns : list of str, optional
        Columns to keep, in this order. When omitted, the notebook-level
        ``lst_of_columns`` is used.

    Returns
    -------
    pandas.DataFrame
        A frame whose columns are exactly ``columns``. A field that is absent
        from a file is filled with NaN for that file's rows. If the directory
        holds no ``.json`` files, an empty frame with those columns is
        returned.

    Raises
    ------
    OSError
        If ``directory`` or one of its files cannot be read.
    json.JSONDecodeError
        If a ``.json`` file does not contain valid JSON.
    """
    if columns is None:
        columns = lst_of_columns
    columns = list(columns)

    frames = []
    # os.listdir returns names in arbitrary order; sorting makes the row
    # order of the result reproducible across runs and machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue
        path = os.path.join(directory, filename)
        # Read as UTF-8 explicitly instead of relying on the locale default.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        df = pd.json_normalize(data, meta=columns, errors='ignore')
        # reindex (not df[columns]) so a record lacking one of the fields
        # yields NaN in that column instead of raising KeyError.
        frames.append(df.reindex(columns=columns))

    if not frames:
        # pd.concat raises ValueError on an empty list; return an empty,
        # correctly-shaped frame instead.
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

In [9]:
# Corpus name; change this when processing a different dataset.
dataset_name = "hupd"
# Root folder that holds one sub-folder of JSON files per year.
data_dir = "/home/users/rz95/"
# Build the 2015 frame from <data_dir>/2015.
df_2015 = to_dataframe(directory=os.path.join(data_dir, "2015"))

In [10]:
# df_2016 = to_dataframe(os.path.join(data_dir, "2016"))

In [11]:
# df_2017 = to_dataframe(os.path.join(data_dir, "2017"))

: 

: 

In [None]:
# Preview the first rows of the freshly loaded 2015 frame.
df_2015.head()

Unnamed: 0,patent_number,decision,title,abstract,claims,background,summary,full_description,main_cpc_label,main_ipcr_label,filing_date,patent_issue_date,date_published,examiner_id
0,,PENDING,APPARATUS AND PROCESSES FOR A MOBILE DEVICE CA...,A mobile device case containing one or more li...,1. A mobile device case comprising: a one or m...,<SOH> BACKGROUND OF THE INVENTION <EOH>Technol...,,CROSS REFERENCE TO RELATED APPLICATION This ap...,H04N52256,H04N5225,20151019,,20160421,91244.0
1,9951638.0,ACCEPTED,SHAPED RIM CAVITY WING SURFACE,A shaped rim cavity wing includes an upper sur...,1. A shaped rim cavity wing comprising: a body...,<SOH> BACKGROUND <EOH>The present invention is...,<SOH> SUMMARY <EOH>To prevent air from the rim...,BACKGROUND The present invention is related to...,F01D11001,F01D1100,20150921,20180424.0,20160114,76201.0
2,9594864.0,ACCEPTED,METHOD FOR ASYMMETRICAL GEOMETRICAL SCALING,A circuit layout data has a start value of a f...,"1. A method for scaling an integrated circuit,...",<SOH> BACKGROUND <EOH>In conventional semicond...,<SOH> SUMMARY <EOH>This Summary identifies fea...,FIELD OF DISCLOSURE The present application is...,G06F175072,G06F1750,20150422,20170314.0,20161027,94249.0
3,9782999.0,ACCEPTED,MISMATCH SIDE RIM AND LOCK RING INTERFACE ANGL...,Systems and methods disclosed herein may be us...,1. A lock ring wheel assembly comprising: a wh...,<SOH> BACKGROUND <EOH>Aircraft wheels often co...,<SOH> SUMMARY <EOH>A lock ring wheel assembly ...,FIELD The present disclosure is related to a l...,B60B2302,B60B2302,20151020,20171010.0,20170420,98691.0
4,,REJECTED,Spy Block,The Spy Block invention is an innovative produ...,1. Spy Block's unique plastic design will be e...,"<SOH> BACKGROUND OF THE INVENTION <EOH>Dec. 1,...",<SOH> BRIEF SUMMARY OF THE INVENTION <EOH>Spy ...,"BACKGROUND OF THE INVENTION Dec. 1, 2013, whil...",H04R1083,H04R108,20150909,,20170309,76968.0


In [None]:
# Show the column labels of the loaded frame.
df_2015.columns

Index(['patent_number', 'decision', 'title', 'abstract', 'claims',
       'background', 'summary', 'full_description', 'main_cpc_label',
       'main_ipcr_label', 'filing_date', 'patent_issue_date', 'date_published',
       'examiner_id'],
      dtype='object')

In [None]:
# yet to be processed: main_cpc_label, main_ipcr_label
def preprocess(df):
    """Encode the decision label and add (year, month) columns, in place.

    The frame is modified in place and also returned, so both
    ``preprocess(df)`` and ``df = preprocess(df)`` work.

    Changes made to ``df``:

    * ``decision`` — string labels are replaced by integer codes
      (REJECTED=0, ACCEPTED=1, PENDING=2, CONT-REJECTED=3, CONT-ACCEPTED=4,
      CONT-PENDING=5). Values that are already one of these codes are kept,
      so calling this function again on the same frame is safe.
    * ``filing_date``, ``patent_issue_date``, ``date_published`` — converted
      with ``pd.to_datetime``.
    * ``filing_time``, ``patent_issue_time``, ``time_published`` — new
      columns holding a ``(year, month)`` tuple per row; a missing date
      yields ``(nan, nan)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``decision`` column and the three date columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    KeyError
        If ``decision`` holds a value that is neither a known label nor a
        known code, or if a required column is missing.
    """
    decision_codes = {
        'REJECTED': 0,
        'ACCEPTED': 1,
        'PENDING': 2,
        'CONT-REJECTED': 3,
        'CONT-ACCEPTED': 4,
        'CONT-PENDING': 5
    }
    known_codes = set(decision_codes.values())

    def decision_map(dec):
        # A value that is already an integer code passes through unchanged;
        # without this, re-running the cell raised KeyError on the first row.
        if not isinstance(dec, str) and dec in known_codes:
            return int(dec)
        return decision_codes[dec]
    df["decision"] = df["decision"].apply(decision_map)

    def date_process(column):
        df[column] = pd.to_datetime(df[column])
        # "filing_date" -> "filing_time", "date_published" -> "time_published".
        # Kept as a per-row apply so present dates give int tuples and NaT
        # gives (nan, nan), exactly as before.
        df[column.replace("date", "time")] = df[column].apply(lambda x: (x.year, x.month))
    date_process("filing_date")
    date_process("patent_issue_date")
    date_process("date_published")
    return df


In [None]:
# preprocess edits the frame in place and hands the same object back,
# so rebinding the name here is equivalent to discarding the return value.
df_2015 = preprocess(df_2015)
df_2015.head()

Unnamed: 0,patent_number,decision,title,abstract,claims,background,summary,full_description,main_cpc_label,main_ipcr_label,filing_date,patent_issue_date,date_published,examiner_id,filing_time,patent_issue_time,time_published
0,,2,APPARATUS AND PROCESSES FOR A MOBILE DEVICE CA...,A mobile device case containing one or more li...,1. A mobile device case comprising: a one or m...,<SOH> BACKGROUND OF THE INVENTION <EOH>Technol...,,CROSS REFERENCE TO RELATED APPLICATION This ap...,H04N52256,H04N5225,2015-10-19,NaT,2016-04-21,91244.0,"(2015, 10)","(nan, nan)","(2016, 4)"
1,9951638.0,1,SHAPED RIM CAVITY WING SURFACE,A shaped rim cavity wing includes an upper sur...,1. A shaped rim cavity wing comprising: a body...,<SOH> BACKGROUND <EOH>The present invention is...,<SOH> SUMMARY <EOH>To prevent air from the rim...,BACKGROUND The present invention is related to...,F01D11001,F01D1100,2015-09-21,2018-04-24,2016-01-14,76201.0,"(2015, 9)","(2018, 4)","(2016, 1)"
2,9594864.0,1,METHOD FOR ASYMMETRICAL GEOMETRICAL SCALING,A circuit layout data has a start value of a f...,"1. A method for scaling an integrated circuit,...",<SOH> BACKGROUND <EOH>In conventional semicond...,<SOH> SUMMARY <EOH>This Summary identifies fea...,FIELD OF DISCLOSURE The present application is...,G06F175072,G06F1750,2015-04-22,2017-03-14,2016-10-27,94249.0,"(2015, 4)","(2017, 3)","(2016, 10)"
3,9782999.0,1,MISMATCH SIDE RIM AND LOCK RING INTERFACE ANGL...,Systems and methods disclosed herein may be us...,1. A lock ring wheel assembly comprising: a wh...,<SOH> BACKGROUND <EOH>Aircraft wheels often co...,<SOH> SUMMARY <EOH>A lock ring wheel assembly ...,FIELD The present disclosure is related to a l...,B60B2302,B60B2302,2015-10-20,2017-10-10,2017-04-20,98691.0,"(2015, 10)","(2017, 10)","(2017, 4)"
4,,0,Spy Block,The Spy Block invention is an innovative produ...,1. Spy Block's unique plastic design will be e...,"<SOH> BACKGROUND OF THE INVENTION <EOH>Dec. 1,...",<SOH> BRIEF SUMMARY OF THE INVENTION <EOH>Spy ...,"BACKGROUND OF THE INVENTION Dec. 1, 2013, whil...",H04R1083,H04R108,2015-09-09,NaT,2017-03-09,76968.0,"(2015, 9)","(nan, nan)","(2017, 3)"


In [None]:
# preprocess(df_2016)
# preprocess(df_2017)
# df_2017.head()

Unnamed: 0,patent_number,decision,title,abstract,claims,background,summary,full_description,main_cpc_label,main_ipcr_label,filing_date,patent_issue_date,date_published,examiner_id,filing_time,patent_issue_time,time_published
0,,2,SYSTEMS AND METHODS PROVIDING CENTRALIZED MEDI...,Systems and methods provide credentialing and ...,1. A system for providing credentialing and pr...,<SOH> BACKGROUND <EOH>The U.S. Department of H...,<SOH> SUMMARY OF THE EMBODIMENTS <EOH>The foll...,CROSS REFERENCE TO RELATED APPLICATIONS This p...,G16H1060,G16H1060,2017-10-05,NaT,2018-05-03,71267.0,"(2017, 10)","(nan, nan)","(2018, 5)"
1,10009147.0,1,APPARATUS AND METHOD FOR SENDING AND RECEIVING...,A broadcast signal receiver is provided. The b...,"1. A broadcast signal receiver, comprising: a ...",<SOH> BACKGROUND OF THE INVENTION <EOH>The pre...,<SOH> SUMMARY OF THE INVENTION <EOH>A broadcas...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,H04L10071,H04L100,2017-03-03,2018-06-26,2017-06-22,67619.0,"(2017, 3)","(2018, 6)","(2017, 6)"
2,,2,Method of processing and fractionating biomass...,The present invention relates to a method of t...,"1. A method of fractionation of biomass, in pa...",<SOH> BACKGROUND ART <EOH>Conversion of biomas...,<SOH> SUMMARY OF INVENTION <EOH>,TECHNICAL FIELD The present invention relates ...,C08B370057,C08B3700,2017-04-24,NaT,2017-10-19,66532.0,"(2017, 4)","(nan, nan)","(2017, 10)"
3,,2,PHASE MEASURING DEVICE AND APPARATUSES USING T...,The inventive phase measuring device includes ...,1. A phase measuring device for measuring a ph...,<SOH> BACKGROUND ART <EOH>A device that measur...,<SOH> SUMMARY <EOH>,TECHNICAL FIELD The present invention relates ...,H04L272331,H04L27233,2017-05-12,NaT,2017-11-09,58719.0,"(2017, 5)","(nan, nan)","(2017, 11)"
4,,2,"TIMEPIECE, METHOD OF DISPLAY CONTROL, AND STOR...",A timepiece includes one or more processors; a...,"1. An electronic device, comprising: one or mo...",<SOH> BACKGROUND OF THE INVENTION <EOH>,<SOH> SUMMARY OF THE INVENTION <EOH>The presen...,BACKGROUND OF THE INVENTION Technical Field Th...,G04G99006,G04G9900,2017-12-28,NaT,2018-06-28,70323.0,"(2017, 12)","(nan, nan)","(2018, 6)"


In [None]:
# Root folder for the processed corpora. Note this rebinds `data_dir`, which
# earlier in the notebook pointed at the raw JSON root.
data_dir = "/usr/project/xtmp/rz95/InterpretableQA-LLMTools/data/external_corpus"
# DataFrame.to_csv does not create missing parent folders, so make sure
# <data_dir>/<dataset_name>/ exists before writing.
os.makedirs(os.path.join(data_dir, dataset_name), exist_ok=True)
df_2015.to_csv(os.path.join(data_dir, dataset_name, "hupd_2015.csv"), index=False)

In [None]:
# df_2016.to_csv(os.path.join(data_dir, dataset_name, "hupd_2016.csv"), index=False) 

In [None]:
# df_2017.to_csv(os.path.join(data_dir, dataset_name, "hupd_2017.csv"), index=False)