## Generating train and test sets

**Author:** Benjamin Aw  
**Date:** 13 Dec 2021  
**Context:** Extracted data needs to be cleaned and split up for training and testing purposes.  
**Objective:** To apply previously written functions to clean up the data, generate augmented data, and finally split the result into train and test sets.

#### A) Setting up

Importing the libraries and obtaining the file path for the datasets required

In [17]:
import os
# Move the working directory up one level so that relative paths such as
# 'Data/' (defined below) resolve from the parent directory.
# NOTE: the change is relative, so re-running this cell moves up one more level.
os.chdir('..')

In [18]:
import pandas as pd
from ssoc_autocoder.processing import final_cleaning
from ssoc_autocoder.processing import process_text
from ssoc_autocoder.augmentation import data_augmentation
from tqdm.auto import tqdm
import math
from itertools import chain
from sklearn.model_selection import train_test_split
import copy

# Prefix of the data directory. It is joined to file names by plain string
# concatenation, so the trailing slash is required.
#path = "../Data/"
path = 'Data/'

# Registers the `progress_apply` method on pandas objects (used in `augmenting_data`)
tqdm.pandas()

#### B) Functions

We have to create 4 functions: 

1. To clean up the original data, `cleaning_up_dataset`
2. To create augmented data, `augmenting_data`
3. To supplement the data with the SSOC detailed definitions, `supplementing_data`
4. To create the train test split, `generate_train_test`

In [3]:
def cleaning_up_dataset(path):
    """Load the raw labelled job ads and clean their descriptions.

    Parameters
    ----------
    path : str
        Prefix of the data directory including the trailing separator
        (e.g. ``'Data/'``); ``"Raw/Raw_Labelled.csv"`` is appended to it.

    Returns
    -------
    pandas.DataFrame
        The columns ``MCF_Job_Ad_ID``, ``Predicted_SSOC_2020``, ``title`` and
        ``description``, with ``process_text`` applied to every description.
    """
    
    # Reading in the data
    raw = pd.read_csv(path + "Raw/Raw_Labelled.csv")
    
    # Keep only the columns that are used, with Ad_ID as the primary identifier.
    # The explicit copy makes the column assignment below act on an independent
    # frame instead of on a selection of `raw` (avoids chained-assignment warnings).
    df = raw[["MCF_Job_Ad_ID", "Predicted_SSOC_2020", "title", "description"]].copy()
    
    # Applying cleaning function across the description column
    df["description"] = df["description"].apply(process_text)
    
    return df

In [4]:
def augmenting_data(df, prob, edit_phrase):
    """Add augmented versions of every job description as extra columns.

    The caller's dataframe is left untouched; a new dataframe is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``description`` column.
    prob, edit_phrase
        Passed through unchanged as the extra positional arguments of
        ``data_augmentation``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``description`` renamed to ``description_originial``, plus
        one column per entry returned by ``data_augmentation``, except
        ``orginal_text``, which is dropped.
    """
    
    # Applying the augmentation function across the description column.
    # The result is kept in a local Series rather than written into `df`, so
    # the caller's dataframe does not gain a temporary column as a side effect.
    # NOTE(review): `data_augmentation` presumably returns a dict-like object
    # per row that includes an "orginal_text" key - confirm against its source.
    augmented = df["description"].progress_apply(data_augmentation, args = (prob, edit_phrase))
    
    # Renaming description column (the misspelt name is relied on downstream)
    df = df.rename(columns={'description':'description_originial'})
    
    # Expanding each augmentation result into columns and appending them
    df = pd.concat([df, augmented.apply(pd.Series)], axis=1)
    
    # Dropping the repeated copy of the original text
    df = df.drop(columns="orginal_text")
    
    return df

In [5]:
def supplementing_data(df, data_detailed_def):
    """Append the 5-digit SSOC detailed definitions to the job-ad dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Job ads with the columns ``MCF_Job_Ad_ID``, ``Predicted_SSOC_2020``,
        ``title`` and ``description``.
    data_detailed_def : pandas.DataFrame
        SSOC reference table with the columns ``SSOC 2020``,
        ``SSOC 2020 Title`` and ``Detailed Definitions``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` followed by one row per retained SSOC definition
        (with ``MCF_Job_Ad_ID`` set to ``None``), sorted by
        ``Predicted_SSOC_2020`` then ``title``, with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If one of the expected reference columns is missing.
    """
    
    # Getting the relevant columns; copy so the assignment below does not act
    # on a selection of the caller's dataframe
    definitions = data_detailed_def[["SSOC 2020", "SSOC 2020 Title", "Detailed Definitions"]].copy()
    
    # Cast the SSOC code to string BEFORE filtering: the filters below are
    # string operations and would fail on codes that were parsed as numbers
    definitions["SSOC 2020"] = definitions["SSOC 2020"].astype('str')
    
    # Keeping only entries with a 5D SSOC that do not start with X
    codes = definitions["SSOC 2020"]
    is_five_digit = codes.str.len() > 4
    starts_with_x = codes.str.startswith("X", na=False)
    definitions = definitions[is_five_digit & ~starts_with_x]
    
    # Changing column names for merging purposes
    definitions = definitions.rename(columns={"SSOC 2020": "Predicted_SSOC_2020", 
                                              "SSOC 2020 Title": "title", 
                                              "Detailed Definitions": "description"}, 
                                     errors="raise")
    
    # Creating an additional column of ID, default to None for now
    definitions = definitions.assign(MCF_Job_Ad_ID=None)
    
    # Concat the two datasets together (DataFrame.append was removed in pandas 2.0)
    output = pd.concat([df, definitions])
    
    output = output.sort_values(by=['Predicted_SSOC_2020', 'title'])
    
    output = output.reset_index(drop=True)
    
    return output

In [94]:
def generate_train_test(df, 
                        train_set_size, 
                        min_samples = 2,
                        augmented = False):
    """Split the dataset into a training set and a test set.

    Rows without an ``MCF_Job_Ad_ID`` (the SSOC definitions added by
    ``supplementing_data``) always go to the training set, as do job ads whose
    SSOC has fewer than ``min_samples`` ads. The remaining ads are split with
    a stratified ``train_test_split`` so that each SSOC appears in both sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``MCF_Job_Ad_ID``, ``Predicted_SSOC_2020`` and ``title``;
        when ``augmented`` is true it must also contain
        ``description_originial`` and the augmentation output columns.
        It is not modified.
    train_set_size : float or int
        Forwarded to ``train_test_split`` as ``train_size``.
    min_samples : int, default 2
        Minimum number of job ads an SSOC needs in order to be split.
        Stratification needs at least 2 samples per class, so values below 2
        make ``train_test_split`` raise when an SSOC has a single ad.
    augmented : bool, default False
        If true, both outputs are reshaped to long format with an ``Augment``
        column and a ``job_description`` column; the test set keeps only the
        original descriptions.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)``.

    Raises
    ------
    ValueError
        Raised by ``train_test_split`` when the stratified split is not
        possible (e.g. no SSOC has enough samples).
    """
    
    id_columns = ["MCF_Job_Ad_ID", "Predicted_SSOC_2020", "title"]
    original_column = "description_originial"
    augmented_columns = ["wrd_emb_out", 
                         "bk_trans_out", 
                         "synonym_out", 
                         "context_emb_out", 
                         "sent_out", 
                         "summ_out"]
    
    # Enforce the correct data type; `assign` returns a new frame so the
    # caller's dataframe is not modified
    df = df.assign(Predicted_SSOC_2020=df["Predicted_SSOC_2020"].astype('str'))
    
    # All SSOC descriptions are automatically put into the training set
    # Retain only the real job ads in df
    is_ssoc_description = df['MCF_Job_Ad_ID'].isnull()
    ssoc_desc = df[is_ssoc_description]
    df = df[~is_ssoc_description]
    
    # Number of job ads per SSOC, aligned to each row. Counting rows (rather
    # than non-null titles) keeps ads with a missing title in the split.
    ads_per_ssoc = df["Predicted_SSOC_2020"].map(df["Predicted_SSOC_2020"].value_counts())
    has_enough_samples = ads_per_ssoc >= min_samples
    
    # SSOCs with enough samples are split; the others go wholly to training
    df_multiple = df[has_enough_samples]
    df_once = df[~has_enough_samples]
    
    # Stratified shuffle split; the fixed seed keeps the split reproducible
    df_multiple_train, df_multiple_test = train_test_split(df_multiple, 
                                                           train_size = train_set_size, 
                                                           random_state = 2021, 
                                                           stratify = df_multiple['Predicted_SSOC_2020'])
    
    # Append the unsplit rows to the training portion
    df_train = pd.concat([ssoc_desc, df_once, df_multiple_train])
    df_test = df_multiple_test.copy()
    
    # If we are running this on the augmented dataset
    if augmented:

        # Pivot longer df_train: one row per (job ad, description variant)
        df_train = pd.melt(df_train, 
                           id_vars = id_columns, 
                           value_vars = [original_column] + augmented_columns,
                           var_name = 'Augment', 
                           value_name = 'job_description')

        # Pivot longer for df_test; we only test on the original descriptions,
        # so only that column is melted
        df_test = pd.melt(df_multiple_test, 
                          id_vars = id_columns, 
                          value_vars = [original_column],
                          var_name = 'Augment', 
                          value_name = 'job_description')
    
    return df_train, df_test 

#### C) Running functions


Reading in the raw CSV file and subsequently cleaning up the dataset, while extracting only the necessary columns. The function only requires the path of the data directory that contains `Raw/Raw_Labelled.csv`.

In [16]:
# Clean the raw labelled dataset found under `path`
df = cleaning_up_dataset(path)

# Save the cleaned data so that later cells can reload it without re-running the cleaning
df.to_csv(path + "Processed/Processed_Labelled.csv", index = False)

Text length below 100. Return cleaned original text.
Paragraph list detected
Text length below 50. Return cleaned original text.
Paragraph list detected
Text length below 50. Return cleaned original text.
Text length below 100. Return cleaned original text.
List object detected
None detected, returning all
List object detected
Paragraphs detected
List object detected
Text length below 50. Return cleaned original text.
Paragraph list detected
Text length below 50. Return cleaned original text.
List object detected
List object detected
Text length below 100. Return cleaned original text.
List object detected
List object detected
List object detected
List object detected
List object detected
List object detected
Paragraphs detected
List object detected
List object detected
Text length below 50. Return cleaned original text.
List object detected
List object detected
None detected, returning all
Paragraphs detected
List object detected
Paragraph list detected
Paragraph list detected
List ob

KeyboardInterrupt: 

Once the dataset is cleaned, we need to supplement it with information on SSOCs that are not present in the current dataset. This is done by appending the entries from `SSOC2020 Detailed Definitions.xlsx` to the dataset.

In [106]:
# Reload the output saved by the cleaning cell above.
# Only run this if the cleaning function has already been run and its output saved.

df = pd.read_csv(path + "Processed/Processed_Labelled.csv")

In [107]:
# header = 4: the column names are taken from the fifth row of the sheet (0-indexed row 4)
data_detailed_def = pd.read_excel(path + "Reference/SSOC2020 Detailed Definitions.xlsx", header = 4)

# Append the SSOC detailed definitions to the job ads
df = supplementing_data(df, data_detailed_def)

# Save the combined dataset
df.to_csv(path + "Reference/Intermediate_Dataset.csv", index = False)

  warn("""Cannot parse header or footer so it will be ignored""")


Because augmenting all 15k entries will take a while, we augment only a sample of the dataset for testing purposes for now. We have to find a better way to run the augmentation, since it would take about 9-10 days to run locally on my own PC.

In [14]:
# df = df.sample(n = 200)

Augmenting the current dataset, where each row represents a job description and the additional columns contain the augmented data.

In [91]:
# df = augmenting_data(df, 0.5, True)
# df.to_csv(path + "Processed/Processed_Augmented_Labelled_sample.csv", index = False)

Now we want to split the dataset into a train test set.

If an SSOC has only one entry in the dataset, we keep that entry in the training set.

If an SSOC has more than one entry, we split its entries using scikit-learn's `train_test_split` (80% train, stratified by SSOC), and append the training portion to the entries above.

In [92]:
# Chunk to read in output from above cell, only do this if you have run the above function before
# df = pd.read_csv(path + "Processed/Processed_Augmented_Labelled_sample.csv")

In [108]:
# Split the data; 0.8 is passed as `train_set_size`
train, test = generate_train_test(df, 0.8)

Check the number of SSOCs in both the train and test sets

In [109]:
# Print every SSOC that appears in the test set but not in the training set
# (no output means every test SSOC is also present in the training set)
train_ssocs = train['Predicted_SSOC_2020'].values
for ssoc in test['Predicted_SSOC_2020'].unique():
    if ssoc in train_ssocs:
        continue
    print(ssoc)

In [110]:
# Number of distinct SSOC codes in the training set, then in the test set
print(train['Predicted_SSOC_2020'].nunique())
print(test['Predicted_SSOC_2020'].nunique())

997
382


In [112]:
# Write both splits to the Train/ folder, without the dataframe index
train.to_csv(path + "Train/Train.csv", index = False)
test.to_csv(path + "Train/Test.csv", index = False)

#### D) Preparing the MRSD validation set

In [3]:
# Load the extra validation job postings.
# NOTE: the path is hard-coded relative to the working directory instead of using `path`.
extra_validation = pd.read_csv('Data/Reference/MCF_JobPostings_500_ValidatedbyMRSD.csv')

In [28]:
import hashlib

# Derive the 'uuid' column as the MD5 hex digest of each job id string;
# it is the key used to join against the API responses further down
extra_validation['uuid'] = extra_validation['jobid'].map(
    lambda job_id: hashlib.md5(job_id.encode()).hexdigest()
)

In [26]:
def cleaning_text_and_check(text):
    """Return ``text`` after passing it through ``final_cleaning``."""
    
    # add in additional check for proper sentences
    
    return final_cleaning(text)

def output_combined_file(path):
    """Read every CSV file in a directory and stack them into one dataframe.

    Parameters
    ----------
    path : str
        Directory that contains the CSV files.

    Returns
    -------
    pandas.DataFrame
        All files concatenated row-wise in file-name order, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when the directory contains no CSV file.
    """

    # os.listdir returns entries in arbitrary order, so sort them to make the
    # row order reproducible; skip anything that is not a CSV file (hidden
    # files, checkpoint folders, ...) instead of passing it to read_csv
    csv_filenames = sorted(name for name in os.listdir(path)
                           if name.lower().endswith('.csv'))

    df_list = []
    for filename in csv_filenames:

        # '\r' keeps the progress message on a single, overwritten line
        print(f'Processing {filename}...\r', end = '')
        df_list.append(pd.read_csv(os.path.join(path, filename)))
    
    print('')
    combined_df = pd.concat(df_list, ignore_index = True)
    print(f'Shape of combined dataframe: {combined_df.shape}')

    return combined_df

# Stack all the CSV files in the MCF API responses folder into one dataframe
combined_df = output_combined_file('Data/Raw/mcf_api_responses_csv')

Processing raw_2021-05.csv...
Shape of combined dataframe: (882214, 19)


In [30]:
# Left join on 'uuid': every validation posting is kept, even when no row of
# `combined_df` shares its uuid
extra_validation_full = extra_validation.merge(combined_df, how = 'left', on = 'uuid')

In [36]:
# Keep the four columns needed and give them the same names as the training data
output = extra_validation_full[['jobid', 'title', 'description', 'SSOC 2020']].rename(
    columns={'jobid': 'MCF_Job_Ad_ID', 'SSOC 2020': 'Predicted_SSOC_2020'}
)

In [38]:
# Write the validation set next to the train/test files, without the dataframe index
output.to_csv('Data/Train/MRSD_Validation.csv', index = False)