In [1]:
import itertools
import pandas as pd

from helpers import remove_tags, seperate_ontology, classification_label

# Loading and checking the data set

In [2]:
# Load the raw RegInsight export into a DataFrame, parsed with the openpyxl engine.
data = pd.read_excel(
    '../data/0_raw/RegInsight_Dataset.xlsx',
    engine='openpyxl',
)

In [3]:
# Preview the first five rows (head() defaults to n=5).
data.head()

Unnamed: 0,CUBEJurisdiction,CUBEIssuingBody,CUBEIssuingDepartment,CUBEPublishedDate,RegInsightDocumentId,RegInsightSourceLink,IssuanceType,Status,RegInsightTitleNative,RegInsightTextNative,RegOntologyId
0,United States of America,United States of America Government,Civil Rights Commission,2021-01-01 00:57:34.107,0F44333D-E462-4CF5-86AE-C3AF781DDABF,https://www.federalregister.gov/documents/2020...,Publication,,Agenda and Notice of Public Meeting of the New...,<div> <span></span> </div> <div></div> <span><...,0F0C4195-41A8-4828-9A80-4EE7B778B157_Market Wi...
1,United States of America,United States of America Government,Environmental Protection Agency (EPA),2021-01-01 00:57:34.107,36CAA786-C1DE-44A9-B9E0-C7CF71F8B2A9,https://www.federalregister.gov/documents/2020...,Regulation,,Proposed Deletion From the National Priorities...,<div> <span></span> </div> <div></div> <span><...,0F0C4195-41A8-4828-9A80-4EE7B778B157_Market Wi...
2,United States of America,United States of America Government,Transportation Department,2021-01-01 00:57:34.107,7A882AC1-E99C-4AE0-8A70-3ABE3893DC5F,https://www.federalregister.gov/documents/2020...,Publication,,Petition for Exemption; Summary of Petition Re...,<div> <span></span> </div> <div></div> <span><...,0F0C4195-41A8-4828-9A80-4EE7B778B157_Market Wi...
3,United States of America,United States of America Government,National Science Foundation,2021-01-01 00:57:34.107,5D02101D-6496-4D87-AB89-BDD2F62E06A1,https://www.federalregister.gov/documents/2020...,Publication,,Alan T. Waterman Award Committee; Notice of Me...,<div> <span></span> </div> <div></div> <span><...,0F0C4195-41A8-4828-9A80-4EE7B778B157_Market Wi...
4,United States of America,United States of America Government,Department of Justice,2021-01-01 04:14:29.283,34E09CE1-5BE7-4030-B1F4-4AA52DF2997D,https://www.govinfo.gov/content/pkg/FR-2020-12...,Publication,Completed,"Federal Register / Vol. 85, No. 251 / Thursday...","Federal Register / Vol. 85, No. 251 / Thursday...",0F0C4195-41A8-4828-9A80-4EE7B778B157_Market Wi...


In [4]:
# Summarise every column explicitly. Without include='all', describe() keeps only the
# numeric-like columns whenever at least one exists, and newer pandas releases count the
# datetime column CUBEPublishedDate as numeric -- which would hide every text column here.
data.describe(include='all')

  """Entry point for launching an IPython kernel.


Unnamed: 0,CUBEJurisdiction,CUBEIssuingBody,CUBEIssuingDepartment,CUBEPublishedDate,RegInsightDocumentId,RegInsightSourceLink,IssuanceType,Status,RegInsightTitleNative,RegInsightTextNative,RegOntologyId
count,8693,8693,2654,8693,8693,8693,8693,2813,8693,8693,8693
unique,8,12,152,5009,8693,8625,80,4,7060,8523,4404
top,United States of America,United States of America Government,U.S. Securities Exchange Commission (SEC),2021-03-10 17:20:20.337000,2C1A315E-FBBD-4866-A1A4-595BC6B13BDD,https://www.federalregister.gov/documents/2021...,Notice,Completed,UNITED STATES OF AMERICA Before the SECURITIES...,Not machine-readable,0F0C4195-41A8-4828-9A80-4EE7B778B157_Market Wi...
freq,6343,2833,1072,42,1,3,1909,1254,75,21,550
first,,,,2021-01-01 00:57:34.107000,,,,,,,
last,,,,2021-08-05 04:18:54.400000,,,,,,,


In [5]:
# Is any cell in the frame missing at all?
print(data.isna().to_numpy().any())
# Number of missing values in each column.
print(data.isnull().sum())

True
CUBEJurisdiction            0
CUBEIssuingBody             0
CUBEIssuingDepartment    6039
CUBEPublishedDate           0
RegInsightDocumentId        0
RegInsightSourceLink        0
IssuanceType                0
Status                   5880
RegInsightTitleNative       0
RegInsightTextNative        0
RegOntologyId               0
dtype: int64


In [6]:
# Show rows that repeat an earlier row across all columns; an empty frame means none.
print(data.loc[data.duplicated()])

Empty DataFrame
Columns: [CUBEJurisdiction, CUBEIssuingBody, CUBEIssuingDepartment, CUBEPublishedDate, RegInsightDocumentId, RegInsightSourceLink, IssuanceType, Status, RegInsightTitleNative, RegInsightTextNative, RegOntologyId]
Index: []


# Data cleaning

In [7]:
# remove_tags and seperate_ontology are project helpers imported from helpers; each is
# applied element-wise and its result stored in a new *_Clean column beside the raw one.
data['RegInsightTextNative_Clean'] = data['RegInsightTextNative'].apply(remove_tags)
data['RegOntologyId_Clean'] = data['RegOntologyId'].apply(seperate_ontology)

# Jurisdictions covered

In [8]:
# Distinct jurisdiction names, kept in order of first appearance in the data.
jurisdictions = data['CUBEJurisdiction'].drop_duplicates().tolist()
print(f'{len(jurisdictions)} jurisdictions are covered in this data set, these being: {", ".join(jurisdictions)}.')

8 jurisdictions are covered in this data set, these being: United States of America, Pakistan, Hong Kong, Canada, United Kingdom, US State - New York, Singapore, Australia.


In [9]:
# Jurisdiction values grouped as North America for this subset.
NorthAmerica = [
    'United States of America',
    'Canada',
    'US State - New York',
]
# Boolean-mask selection of the rows whose jurisdiction is in that list.
df_NorthAmerica = data.loc[data['CUBEJurisdiction'].isin(NorthAmerica)]
df_NorthAmerica.head()

Unnamed: 0,CUBEJurisdiction,CUBEIssuingBody,CUBEIssuingDepartment,CUBEPublishedDate,RegInsightDocumentId,RegInsightSourceLink,IssuanceType,Status,RegInsightTitleNative,RegInsightTextNative,RegOntologyId,RegInsightTextNative_Clean,RegOntologyId_Clean
0,United States of America,United States of America Government,Civil Rights Commission,2021-01-01 00:57:34.107,0F44333D-E462-4CF5-86AE-C3AF781DDABF,https://www.federalregister.gov/documents/2020...,Publication,,Agenda and Notice of Public Meeting of the New...,<div> <span></span> </div> <div></div> <span><...,0F0C4195-41A8-4828-9A80-4EE7B778B157_Market Wi...,AGENCY: Commission on Civil Rights. ACTIO...,"[Market Wide Requirements, HR & Labour Law]"
1,United States of America,United States of America Government,Environmental Protection Agency (EPA),2021-01-01 00:57:34.107,36CAA786-C1DE-44A9-B9E0-C7CF71F8B2A9,https://www.federalregister.gov/documents/2020...,Regulation,,Proposed Deletion From the National Priorities...,<div> <span></span> </div> <div></div> <span><...,0F0C4195-41A8-4828-9A80-4EE7B778B157_Market Wi...,AGENCY: Environmental Protection Agency (...,"[Market Wide Requirements, Definition of Sensi..."
2,United States of America,United States of America Government,Transportation Department,2021-01-01 00:57:34.107,7A882AC1-E99C-4AE0-8A70-3ABE3893DC5F,https://www.federalregister.gov/documents/2020...,Publication,,Petition for Exemption; Summary of Petition Re...,<div> <span></span> </div> <div></div> <span><...,0F0C4195-41A8-4828-9A80-4EE7B778B157_Market Wi...,AGENCY: Federal Aviation Administration (...,"[Market Wide Requirements, Registration / Lice..."
3,United States of America,United States of America Government,National Science Foundation,2021-01-01 00:57:34.107,5D02101D-6496-4D87-AB89-BDD2F62E06A1,https://www.federalregister.gov/documents/2020...,Publication,,Alan T. Waterman Award Committee; Notice of Me...,<div> <span></span> </div> <div></div> <span><...,0F0C4195-41A8-4828-9A80-4EE7B778B157_Market Wi...,In accordance with the Federal Advisory C...,"[Market Wide Requirements, Governance, Registr..."
4,United States of America,United States of America Government,Department of Justice,2021-01-01 04:14:29.283,34E09CE1-5BE7-4030-B1F4-4AA52DF2997D,https://www.govinfo.gov/content/pkg/FR-2020-12...,Publication,Completed,"Federal Register / Vol. 85, No. 251 / Thursday...","Federal Register / Vol. 85, No. 251 / Thursday...",0F0C4195-41A8-4828-9A80-4EE7B778B157_Market Wi...,"Federal Register / Vol. 85, No. 251 / Thursday...","[Market Wide Requirements, Money Transmission ..."


In [10]:
# Keep only the columns used downstream. Selecting with .loc raises KeyError on a
# misspelled column name, whereas pd.DataFrame(frame, columns=[...]) silently adds it as
# an all-NaN column; .copy() makes df independent of df_NorthAmerica.
df = df_NorthAmerica.loc[:, [
    'CUBEIssuingDepartment',
    'RegInsightTitleNative',
    'RegInsightTextNative_Clean',
    'RegOntologyId_Clean',
]].copy()
df.head(5)

Unnamed: 0,CUBEIssuingDepartment,RegInsightTitleNative,RegInsightTextNative_Clean,RegOntologyId_Clean
0,Civil Rights Commission,Agenda and Notice of Public Meeting of the New...,AGENCY: Commission on Civil Rights. ACTIO...,"[Market Wide Requirements, HR & Labour Law]"
1,Environmental Protection Agency (EPA),Proposed Deletion From the National Priorities...,AGENCY: Environmental Protection Agency (...,"[Market Wide Requirements, Definition of Sensi..."
2,Transportation Department,Petition for Exemption; Summary of Petition Re...,AGENCY: Federal Aviation Administration (...,"[Market Wide Requirements, Registration / Lice..."
3,National Science Foundation,Alan T. Waterman Award Committee; Notice of Me...,In accordance with the Federal Advisory C...,"[Market Wide Requirements, Governance, Registr..."
4,Department of Justice,"Federal Register / Vol. 85, No. 251 / Thursday...","Federal Register / Vol. 85, No. 251 / Thursday...","[Market Wide Requirements, Money Transmission ..."


# Ontoligy classifications

I found that there are 273 unique Ontological classifications made by the RegInsite tool that are included within this data set, with some text artifacts having up to 139 labelles applied to them!

In [11]:
# Flatten the per-row category lists into one flat list (duplicates are kept).
classifications = list(itertools.chain.from_iterable(data['RegOntologyId_Clean']))
# Largest number of categories attached to a single row.
print(f'A maximum of {data["RegOntologyId_Clean"].str.len().max()} classifications for one row.')
# Number of distinct categories across the whole data set.
print(f'A total of {len(set(classifications))} Ontological categories.')
classifications[:5]


A maximum of 139 classifications for one row.
A total of 273 Ontological categories.


['Market Wide Requirements',
 'HR & Labour Law',
 'Market Wide Requirements',
 'Definition of Sensitive Data / Personal Information',
 'Sharing of Data Between Affiliates']

# Data processing

In [12]:
# Give each ontology category a stable integer id. Sorting first matters: iterating a bare
# set of strings follows hash order, which changes between kernel sessions, so without it
# the same category would get a different id on every Restart & Run All.
mapping = {clf: i for i, clf in enumerate(sorted(set(classifications)))}
# NOTE(review): classification_label is imported from helpers; presumably it returns the
# list of ids for one row's categories -- confirm against helpers.py.
data['RegOntologyId_Labels'] = data['RegOntologyId_Clean'].apply(lambda cw : classification_label(mapping, cw))
# Expand the per-row label entries into columns (assuming each entry is a list of ids);
# cells left empty for rows with fewer labels are filled with -1.
df_split = pd.DataFrame(data['RegOntologyId_Labels'].tolist()).fillna(-1)
# Put the jurisdiction column alongside the expanded label columns.
df_processed = pd.concat([data['CUBEJurisdiction'], df_split], axis=1)