In [None]:
import os
import re
import sys
import random
import time
import json
import farmhash # https://github.com/veelion/python-farmhash
import numpy as np
import pandas as pd
import torch

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Applies ``seed`` to Python's ``random`` module, NumPy's global generator,
    and PyTorch (``torch.manual_seed`` plus ``torch.cuda.manual_seed_all``).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    
def train_test_partition(data, hash_column = 'text', train_pct = 0.8,
                         partition_names = ['Train', 'Test'], seed = 43):
    """Assign every row of ``data`` to one of two partitions by hashing a column.

    Each value of ``data[hash_column]`` is hashed with
    ``farmhash.hash64withseed(value, seed)`` and reduced to a bucket in
    ``0..99``. Buckets below ``train_pct * 100`` receive ``partition_names[0]``
    and the rest receive ``partition_names[1]``, so identical values always
    land in the same partition for a given seed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``hash_column``. It is not modified.
    hash_column : str
        Column whose values are hashed.
    train_pct : float
        Target fraction for the first partition, resolved to whole percent.
    partition_names : sequence of two str
        Names for the first and second partition. Only read, never mutated.
    seed : int
        Seed passed to the hash function and to ``set_seed``.

    Returns
    -------
    numpy.ndarray
        One partition name per row, in row order.
    """
    # Kept for backward compatibility: the original also reseeded the global
    # RNGs here. The assignment below depends only on the seeded hash.
    set_seed(seed)
    # round() rather than int(): float error makes e.g. 0.29 * 100 evaluate to
    # 28.999999999999996, which int() would truncate to 28.
    threshold = round(train_pct * 100)
    hashed = data[hash_column].apply(lambda value: farmhash.hash64withseed(value, seed))
    bucket = np.abs(hashed % 100)
    return np.where(bucket >= threshold, partition_names[1], partition_names[0])

In [None]:
# Use fully-qualified option names. pandas matches option names as patterns, so
# the short forms "max_rows" / "max_columns" become ambiguous (OptionError) on
# pandas versions that also define styler.render.max_rows / max_columns.
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)

In [None]:
# Single seed for the notebook: applied to the global RNGs here and passed to
# train_test_partition in the partitioning cells.
seed = 43
set_seed(seed)

In [None]:
# Run configuration. `task` selects the raw file, the output directory and the
# entry of `out_cols` that is written; `date_tag` identifies the data version.
task = "D0"
date_tag = "2021_05_17"
# Root directory under which the per-version output directory is created.
data_dir = f"/hub/CA-MTL/data/{task}"
# Raw CSV export that this notebook reads.
file = f"/hub/311_text_classifier/data/raw/PW-{task}-{date_tag}-PROD.csv"

In [None]:
# Every artifact of this data version lives in one directory named after the date tag.
out_dir = f"{data_dir}/{date_tag}"
# One TSV per split, named after the split.
train_file_out, train_dev_file_out, dev_file_out, test_file_out = (
    f"{out_dir}/{split}.tsv" for split in ("train", "train-dev", "dev", "test")
)
metadata_file_out = f"{out_dir}/metadata.json"

In [None]:
# Description of this data version; it is serialised to `metadata_file_out`
# once the label list has been added.
metadata = {
    "raw_data_file": file,
    "data_version": date_tag,
    "task_name": task,
    "file_paths": {
        "train": train_file_out,
        "train-dev": train_dev_file_out,
        "dev": dev_file_out,
        "test": test_file_out,
    },
    "partition_rules": [
        'external/daupler seperate; train/train_dev 0.85/0.15; dev/test 0.5/0.5',
    ],
}

In [None]:
# Create the output directory, including missing parents. Only the genuine
# "already exists" case is reported as such; any other OSError (permissions,
# a file in the way, ...) now propagates instead of being mislabelled.
if os.path.isdir(out_dir):
    print("Directory already exists")
else:
    os.makedirs(out_dir, exist_ok=True)

Read the data, remove tabs and newlines from the text, collapse repeated whitespace, and drop duplicate texts

In [None]:
# Load the raw export. Later cells read its 'text', 'category' and
# 'daupler_generated' columns.
data = pd.read_csv(file)

In [None]:
def remove_tabs_newlines(x):
    """Remove tabs, newlines, and carriage returns from ``x``.

    Leading and trailing occurrences are dropped outright. Each interior run
    is replaced by a single space so that words on adjacent lines are not
    glued together ("pot\\nhole" becomes "pot hole", not "pothole").

    Note: for texts with interior line breaks this yields different cleaned
    text than deleting the characters, so regenerated splits can differ from
    ones produced with the old behaviour.
    """
    # `+` rather than `*`: `*` also matches the empty string at every position.
    return re.sub(r"[\n\t\r]+", " ", x.strip("\n\t\r"))
def remove_multi_spaces(x):
    """Collapse each run of two or more whitespace characters in ``x`` to one space.

    A single whitespace character on its own is left untouched.
    """
    whitespace_run = re.compile(r"\s\s+")
    return whitespace_run.sub(" ", x)
# Clean every text with both helpers, then keep one row per distinct cleaned text.
data['text'] = (
    data['text']
    .apply(remove_tabs_newlines)
    .apply(remove_multi_spaces)
)
data = data.drop_duplicates(subset='text').reset_index(drop=True)

Remap categories

In [None]:
# For task D1 only: relabel 'Water Meter Issue' rows as 'Meter Issue'.
# NOTE(review): this edits 'D1_category', while the label list and the output
# columns use 'category' -- confirm which column is intended for D1.
if task == 'D1':
    remap_condition = (data['D1_category'] == 'Water Meter Issue')
    data['D1_category'] = np.where(remap_condition, 'Meter Issue', data['D1_category'])

Split and process

In [None]:
# Rows with daupler_generated == 1 form `dau`; all remaining rows form `ext`.
condition = data['daupler_generated'].eq(1)
dau = data.loc[condition].reset_index(drop=True)
ext = data.loc[~condition].reset_index(drop=True)

In [None]:
# Row/column counts of the full, daupler and external frames, one per line.
print(data.shape, dau.shape, ext.shape, sep="\n")

Partition External Data into Train and Train-Dev

In [None]:
# Label each external row 'Train' or 'Train-Dev' from the hash of its text,
# targeting 85% in 'Train'.
ext['partition'] = train_test_partition(
    ext, hash_column = 'text', train_pct = 0.85, 
    partition_names = ['Train', 'Train-Dev'], seed = seed)

In [None]:
# Materialise the two external splits as separate, re-indexed frames.
train_condition = ext['partition'].eq('Train')
train = ext.loc[train_condition].reset_index(drop=True)
train_dev = ext.loc[~train_condition].reset_index(drop=True)

In [None]:
# ext.groupby(['category', 'partition']).size().unstack().fillna(0).astype(int)

Partition Daupler Data into Dev and Test

In [None]:
# Label each daupler row 'Dev' or 'Test' from the hash of its text,
# targeting an even 50/50 split.
dau['partition'] = train_test_partition(
    dau, hash_column = 'text', train_pct = 0.50, 
    partition_names = ['Dev', 'Test'], seed = seed)

In [None]:
# Materialise the two daupler splits as separate, re-indexed frames.
dev_condition = dau['partition'].eq('Dev')
dev = dau.loc[dev_condition].reset_index(drop=True)
test = dau.loc[~dev_condition].reset_index(drop=True)

In [None]:
# dau.groupby(['category', 'partition']).size().unstack().fillna(0).astype(int)

In [None]:
# Spot check: print every dev text that contains this specific phrase.
phrase_mask = dev['text'].str.contains('The caller hit a couple pot holes on 3205 Martin Way E, Olympia, WA 98506')
for text in dev.loc[phrase_mask, 'text']:
    print(text)

Generate Metadata

In [None]:
# Record the sorted, distinct values of 'category' (taken from the full frame,
# across all splits) as the label list.
metadata['labels'] = data['category'].sort_values().unique().tolist()

In [None]:
# Display the external frame, which now carries the 'partition' column, for a visual check.
ext

In [None]:
# Columns written to each output TSV, keyed by task. Both tasks currently
# export the same columns; each task gets its own list object.
out_cols = {
    task_name: ['text', 'category', 'internal_id', 'external_id']
    for task_name in ('D0', 'D1')
}

In [None]:
# Write each split as a tab-separated file containing only this task's columns.
export_columns = out_cols[task]
split_outputs = (
    (train, train_file_out),
    (train_dev, train_dev_file_out),
    (dev, dev_file_out),
    (test, test_file_out),
)
for split_frame, split_path in split_outputs:
    split_frame[export_columns].to_csv(split_path, sep='\t', index=False)

# Use a context manager so the metadata file is flushed and closed right away
# rather than whenever the handle is garbage-collected.
with open(metadata_file_out, 'w') as metadata_handle:
    json.dump(metadata, metadata_handle)