#### IMPORT MODULES

In [1]:
import os
import sys

# Make the project's helper modules in ../scripts importable.
# The membership guard keeps a re-run of this cell from appending the same
# directory to sys.path a second time.
scripts_dir = os.path.abspath('../scripts')
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

from read_write_util import ReadWriteUtil
from data_cleaning_pipeline import DataCleaningPipeline

In [2]:
# Create the project's read/write helper and load the training job descriptions
# at version 'jdtrain_v1'.
reader = ReadWriteUtil()
data = reader.dvc_get_data('../data/job_description_train.json', 'jdtrain_v1')
# Display the first record to inspect its structure.
data[0]

2022-09-13 19:24:26,701:logger:../data/job_description_train.json with version jdtrain_v1 Loaded


{'document': "Bachelor's degree in Mechanical Engineering or Physical Science 3+ years track record of developing or specifying fiber optic cables and connector related products Knowledge of fiber optic component, cabling, and interconnect products, technologies, and standards Experience in statistical data analysis Experience with product life cycle management (PLM) process Experience providing solutions to problems and meeting deadlines Experience engaging stakeholders PREFERRED Advanced degree Experience using a software tool for statistical data analysis such as JMP Experience using Agile as product life-cycle management tool Data center or other mission critical development experience",
 'tokens': [{'text': 'Bachelor',
   'start': 0,
   'end': 8,
   'token_start': 0,
   'token_end': 0,
   'entityLabel': 'DIPLOMA'},
  {'text': 'Mechanical Engineering',
   'start': 21,
   'end': 43,
   'token_start': 4,
   'token_end': 5,
   'entityLabel': 'DIPLOMA_MAJOR'},
  {'text': 'Physical Scie

In [3]:
# Load the test job descriptions at version 'jdtest_v1' with the same helper.
test_data = reader.dvc_get_data('../data/job_description_test.json', 'jdtest_v1')
# Display the first record to inspect its structure.
test_data[0]

2022-09-13 19:24:26,823:logger:../data/job_description_test.json with version jdtest_v1 Loaded


{'document': '\nCurrently holding a faculty, industry, or government researcher position.\nPh.D. and publications in machine learning, AI, computer science, statistics, applied mathematics, data science, or related technical fields.\nExperience leading a team in solving analytical problems using quantitative approaches.\nExperience manipulating and analyzing data from different sources.\nExperience in theoretical and empirical research and for answering questions with research.\nAbility to communicate research for public audiences of peers.\nKnowledge in a programming language.\nAbility to obtain and maintain work authorization in the country of employment in 2018.\n\nPREFERRED \n1+ year(s) of work experience in a university, industry, or government lab(s), in a role with primary emphasis on AI research.\nExperience driving original scholarship in collaboration with a team.\nFirst-author publications at peer-reviewed AI conferences (e.g. NIPS, CVPR, ICML, ICLR, ICCV, and ACL).\nExperie

#### Clean Train JSON

In [4]:
# Initialize the cleaning pipeline; it is applied to the train data and later to the test data.
clean_pipeline = DataCleaningPipeline()

#### Check for irrelevant key:value pairs

In [5]:
# Inspect the first two token annotations of the first training record.
data[0]['tokens'][:2]
# As the output shows, each token carries the key:value pairs:
    # 'start': 0,
    # 'end': 8,
    # 'token_start': 0,
    # 'token_end': 0,
# which are irrelevant for our use case. We only focus on the key:value pairs:
    # 'text': 'Bachelor',
    # 'entityLabel': 'DIPLOMA'
# So the cleaning pipeline will include a function to remove the irrelevant key:value pairs listed above.

[{'text': 'Bachelor',
  'start': 0,
  'end': 8,
  'token_start': 0,
  'token_end': 0,
  'entityLabel': 'DIPLOMA'},
 {'text': 'Mechanical Engineering',
  'start': 21,
  'end': 43,
  'token_start': 4,
  'token_end': 5,
  'entityLabel': 'DIPLOMA_MAJOR'}]

In [6]:
# Inspect the 'relations' entry of the first training record.
data[0]['relations']
# 'relations' is another key:value pair in the job description data that is irrelevant for our use case.
# We will remove it with the same function we create to remove the irrelevant key:value pairs.

[{'child': 4, 'head': 0, 'relationLabel': 'DEGREE_IN'},
 {'child': 7, 'head': 0, 'relationLabel': 'DEGREE_IN'},
 {'child': 15, 'head': 9, 'relationLabel': 'EXPERIENCE_IN'},
 {'child': 18, 'head': 9, 'relationLabel': 'EXPERIENCE_IN'},
 {'child': 22, 'head': 9, 'relationLabel': 'EXPERIENCE_IN'}]

In [7]:
# Pass the training data through the pipeline to remove the unwanted key:value pairs:
#   irrelevant_data_keys  - presumably keys dropped from each record (here 'relations'); confirm in DataCleaningPipeline
#   irrelevant_token_keys - presumably keys dropped from each entry of a record's 'tokens' list; confirm likewise
# The call's return value is not captured and the record shown next is read from `data`
# itself, so the pipeline appears to modify `data` in place.
# NOTE(review): confirm the in-place behaviour; if so, re-running earlier inspection cells will show cleaned data.
clean_pipeline.runpipeline(data, irrelevant_data_keys=['relations'], irrelevant_token_keys=['start', 'end', 'token_start', 'token_end'])
data[0]

2022-09-13 19:24:29,003:logger:Cleaned Irrelevant key:value pairs in tokens.


{'document': "Bachelor's degree in Mechanical Engineering or Physical Science 3+ years track record of developing or specifying fiber optic cables and connector related products Knowledge of fiber optic component, cabling, and interconnect products, technologies, and standards Experience in statistical data analysis Experience with product life cycle management (PLM) process Experience providing solutions to problems and meeting deadlines Experience engaging stakeholders PREFERRED Advanced degree Experience using a software tool for statistical data analysis such as JMP Experience using Agile as product life-cycle management tool Data center or other mission critical development experience",
 'tokens': [{'text': 'Bachelor', 'entityLabel': 'DIPLOMA'},
  {'text': 'Mechanical Engineering', 'entityLabel': 'DIPLOMA_MAJOR'},
  {'text': 'Physical Science', 'entityLabel': 'DIPLOMA_MAJOR'},
  {'text': '3+ years', 'entityLabel': 'EXPERIENCE'},
  {'text': 'developing', 'entityLabel': 'SKILLS'},
 

#### Save and version data accordingly

In [8]:
import json

# Persist the cleaned training records as JSON indented by two spaces.
# json.dump streams straight into the file handle instead of first building the
# whole document as one string, and the explicit encoding keeps the written
# file independent of the platform's default text encoding.
with open("../data/job_description_train_cleaned.json", 'w', encoding='utf-8') as out_file:
    json.dump(data, out_file, indent=2)

#### Verify versioned data

In [9]:
# Read the cleaned training file back at version 'jdtrain_v1_cleaned' to verify it.
# NOTE(review): the step that creates this version is not shown in the notebook;
# presumably it is done outside the notebook after the file is written — confirm.
cleaned_data = reader.dvc_get_data('../data/job_description_train_cleaned.json', 'jdtrain_v1_cleaned')
# Display the first record; it should contain only the keys kept by the cleaning step.
cleaned_data[0]

2022-09-13 19:31:33,902:logger:../data/job_description_train_cleaned.json with version jdtrain_v1_cleaned Loaded


{'document': "Bachelor's degree in Mechanical Engineering or Physical Science 3+ years track record of developing or specifying fiber optic cables and connector related products Knowledge of fiber optic component, cabling, and interconnect products, technologies, and standards Experience in statistical data analysis Experience with product life cycle management (PLM) process Experience providing solutions to problems and meeting deadlines Experience engaging stakeholders PREFERRED Advanced degree Experience using a software tool for statistical data analysis such as JMP Experience using Agile as product life-cycle management tool Data center or other mission critical development experience",
 'tokens': [{'text': 'Bachelor', 'entityLabel': 'DIPLOMA'},
  {'text': 'Mechanical Engineering', 'entityLabel': 'DIPLOMA_MAJOR'},
  {'text': 'Physical Science', 'entityLabel': 'DIPLOMA_MAJOR'},
  {'text': '3+ years', 'entityLabel': 'EXPERIENCE'},
  {'text': 'developing', 'entityLabel': 'SKILLS'},
 

#### Clean and Version Test File

In [10]:
# Apply the same pipeline, with the same key lists as for the train data, to the test JSON.
# The return value is not captured; the record shown next is read from `test_data` itself,
# so the pipeline appears to modify it in place — NOTE(review): confirm in DataCleaningPipeline.
clean_pipeline.runpipeline(test_data, irrelevant_data_keys=['relations'], irrelevant_token_keys=['start', 'end', 'token_start', 'token_end'])
test_data[0]

2022-09-13 19:34:36,807:logger:Cleaned Irrelevant key:value pairs in tokens.


{'document': '\nCurrently holding a faculty, industry, or government researcher position.\nPh.D. and publications in machine learning, AI, computer science, statistics, applied mathematics, data science, or related technical fields.\nExperience leading a team in solving analytical problems using quantitative approaches.\nExperience manipulating and analyzing data from different sources.\nExperience in theoretical and empirical research and for answering questions with research.\nAbility to communicate research for public audiences of peers.\nKnowledge in a programming language.\nAbility to obtain and maintain work authorization in the country of employment in 2018.\n\nPREFERRED \n1+ year(s) of work experience in a university, industry, or government lab(s), in a role with primary emphasis on AI research.\nExperience driving original scholarship in collaboration with a team.\nFirst-author publications at peer-reviewed AI conferences (e.g. NIPS, CVPR, ICML, ICLR, ICCV, and ACL).\nExperie

#### Save and version data accordingly

In [11]:
# Persist the cleaned test records as JSON indented by two spaces.
# json.dump streams straight into the file handle instead of first building the
# whole document as one string, and the explicit encoding keeps the written
# file independent of the platform's default text encoding.
with open("../data/job_description_test_cleaned.json", 'w', encoding='utf-8') as out_file:
    json.dump(test_data, out_file, indent=2)

In [12]:
# Read the cleaned test file back at version 'jdtest_v1_cleaned' to verify it.
# NOTE(review): the step that creates this version is not shown in the notebook;
# presumably it is done outside the notebook after the file is written — confirm.
cleaned_test_data = reader.dvc_get_data('../data/job_description_test_cleaned.json', 'jdtest_v1_cleaned')
# Display the first record of the reloaded test data.
cleaned_test_data[0]

2022-09-13 19:37:39,908:logger:../data/job_description_test_cleaned.json with version jdtest_v1_cleaned Loaded


{'document': '\nCurrently holding a faculty, industry, or government researcher position.\nPh.D. and publications in machine learning, AI, computer science, statistics, applied mathematics, data science, or related technical fields.\nExperience leading a team in solving analytical problems using quantitative approaches.\nExperience manipulating and analyzing data from different sources.\nExperience in theoretical and empirical research and for answering questions with research.\nAbility to communicate research for public audiences of peers.\nKnowledge in a programming language.\nAbility to obtain and maintain work authorization in the country of employment in 2018.\n\nPREFERRED \n1+ year(s) of work experience in a university, industry, or government lab(s), in a role with primary emphasis on AI research.\nExperience driving original scholarship in collaboration with a team.\nFirst-author publications at peer-reviewed AI conferences (e.g. NIPS, CVPR, ICML, ICLR, ICCV, and ACL).\nExperie