# 2017.11.14

## finished part
    1. Use the MeCab library to tokenize the wage column into words
    2. Delete rows which contain vague words such as '月', '月給', 'A', and '以降'.
    3. Delete rows which contain vague numbers such as (1), (2), and some unwanted numbers such as '60日'
    4. Normalize some tokens in wage: change '時半' and ':30' into '.5', and delete minute information in times such as '00'
    5. Extract all the numeric tokens in wage and encode them as a binary pattern string ('1' for a number above 500, '0' otherwise), for example: change ['時給', '1137', '円', '〜', '★', '5', '-', '9', '時', '960', '円', '〜', '\n'] into '1001'. This step is used to find recurring text patterns

## under processing
    1. We are identifying and deleting unwanted patterns by analyzing the patterns obtained in step 5 above
    2. We are trying to normalize equivalent patterns; for example, '1001' and '1001001' are actually the same pattern

## future works
    1. Delete all the unwanted rows according to their patterns
    2. Output: expand each pattern into one row per time range. For example, a '1001001' pattern becomes 3 rows: convert ['時給', '883', '円', '★', '22', '-', '翌', '5', '時', '1104', '円', '、', '5', '-', '9', '時', '890', '円', '\u3000', '◎', '昇給', 'あり', '\n'] into: first row 9-22: 883, second row 22-5: 1104, third row 5-9: 890


In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 14 10:36:17 2017

@author: kaku
"""

import pandas as pd
import MeCab

wakachi = MeCab.Tagger("-O wakati")
def tokenize(text):
    """Return tokenized (Japanese wakachi-gaki) text.

    The work is delegated to the module-level MeCab tagger ``wakachi``.

    Parameters
    ----------
    text: string
        Text to be parsed by the tagger.
    """
    tokenized = wakachi.parse(text)
    return tokenized

def wage_split_MeCab(wage):
    """
    Tokenize every wage text and split each result on single spaces.

    Returns one token list per entry of ``wage``, in the same order.
    """
    return [tokenize(text).split(' ') for text in wage]

def wage_del_vague_word(wage):
    """
    Split tokenized wage rows into kept rows and positions of vague rows.

    A row is vague when it mentions '月' or '月給' without '時給', when it
    contains '以降', or when it contains both 'A' and 'B'.

    Returns the kept rows and the positions (in ``wage``) of the dropped rows.
    """
    def has_vague_word(tokens):
        monthly_only = ('月' in tokens or '月給' in tokens) and '時給' not in tokens
        return monthly_only or '以降' in tokens or ('A' in tokens and 'B' in tokens)

    kept_rows = []
    dropped_positions = []
    for position, tokens in enumerate(wage):
        if has_vague_word(tokens):
            dropped_positions.append(position)
        else:
            kept_rows.append(tokens)
    return kept_rows, dropped_positions

def list_num(lis):
    """
    Return True when ``lis`` is a token that parses as a finite number.

    The previous implementation returned after looking at the first
    character only, so tokens such as '3rd' or '1,000' were reported as
    numeric and the callers' later ``float(token)`` raised ValueError; an
    empty string produced None instead of False.

    Parameters
    ----------
    lis: string or iterable of strings
        A single token is checked as a whole. Any other iterable is numeric
        only when it is non-empty and every element is numeric.
    """
    candidates = [lis] if isinstance(lis, str) else list(lis)
    if not candidates:
        return False
    infinities = (float('inf'), float('-inf'))
    for candidate in candidates:
        try:
            value = float(candidate)
        except (TypeError, ValueError):
            return False
        # float() also accepts 'nan' and 'inf'; those are words, not amounts.
        if value != value or value in infinities:
            return False
    return True

def wage_del_vague_num(wage):
    """
    Drop rows with enumeration-like numbers and strip mid-range numbers.

    Rows whose numeric tokens include both '1' and '2' are dropped and their
    positions recorded. In the remaining rows every numeric token strictly
    between 30 and 600 is removed; the row lists are modified in place.

    Returns the remaining rows and the positions of the dropped rows.
    """
    remaining_rows = []
    dropped_positions = []
    for position, tokens in enumerate(wage):
        numbers = [token for token in tokens if list_num(token)]
        if '1' in numbers and '2' in numbers:
            dropped_positions.append(position)
            continue
        remaining_rows.append(tokens)

    # some useless numbers in a specific range still need to be deleted
    for tokens in remaining_rows:
        for number in [token for token in tokens if list_num(token)]:
            if 30 < float(number) < 600:
                tokens.remove(number)
    return remaining_rows, dropped_positions

def wage_convert(wage):
    """
    Standardize time tokens in tokenized wage rows.

    Exactly one rule is applied per row, checked in this order:

    * '時半' present: the token before the first '時半' gets 0.5 added and
      the '時半' token is removed.
    * '00' present: every '00' token is removed.
    * '30' present: when the first '30' follows ':', the hour two tokens
      before it gets 0.5 added and ':' and '30' are removed.

    Rows are modified in place. Every input row appears in the output, so
    the result stays aligned with ``wage``. Previously a row containing '30'
    without a preceding ':' was silently dropped, and adjacent '00' tokens
    were not all removed because the list was edited while being iterated.

    Returns
    -------
    wage_converted: list of token lists, one per input row
    idx_converted: positions of the rows where a half hour was converted

    Raises
    ------
    ValueError
        If the token that should hold the hour is not an integer string.
    """
    wage_converted = []
    idx_converted = []
    for row_no, tokens in enumerate(wage):
        if '時半' in tokens:
            idx_half = tokens.index('時半')
            # Without a preceding token there is no hour to adjust.
            if idx_half >= 1:
                tokens[idx_half - 1] = str(int(tokens[idx_half - 1]) + 0.5)
                del tokens[idx_half]
                idx_converted.append(row_no)
        elif '00' in tokens:
            # Rebuild in place instead of removing during iteration.
            tokens[:] = [token for token in tokens if token != '00']
        elif '30' in tokens:
            idx_half = tokens.index('30')
            # Require real "<hour> : 30" neighbours; negative indices would wrap.
            if idx_half >= 2 and tokens[idx_half - 1] == ':':
                tokens[idx_half - 2] = str(int(tokens[idx_half - 2]) + 0.5)
                del tokens[idx_half - 1:idx_half + 1]
                idx_converted.append(row_no)
        wage_converted.append(tokens)
    return wage_converted, idx_converted

def wage_bool(wage):
    """
    Encode the numeric tokens of every row as a string of '1' and '0'.

    A numeric token above 500 becomes '1', any other numeric token '0';
    non-numeric tokens are ignored.
    """
    wage_booled = []
    for tokens in wage:
        flags = ['1' if float(token) > 500 else '0'
                 for token in tokens if list_num(token)]
        wage_booled.append(''.join(flags))
    return wage_booled
            
# Load the raw table; the CSV is read as Shift_JIS.
data_path = '../data/wage_analysis_ver02.csv'
data = pd.read_csv(data_path, encoding = 'shift_jis')
# Copy the three needed columns so the rename below does not act on a slice of `data`.
data_todo = data[['Unnamed: 0','rqmt_cmpny_nm_txt', 'sal_txt']].copy()
# A flat list yields plain column labels; the nested list built a MultiIndex.
data_todo.columns = ['ID','name', 'salary']
# The last column holds the salary text.
wage = list(data_todo.values[:,-1])

# Pipeline: tokenize -> drop vague words -> drop vague numbers -> normalize -> encode.
wage_split = wage_split_MeCab(wage)
wage_no_vague_word, del_idx_vague_word = wage_del_vague_word(wage_split)
wage_no_vague_num, del_idx_vague_num = wage_del_vague_num(wage_no_vague_word)
wage_converted, idx_converted = wage_convert(wage_no_vague_num)
wage_booled = wage_bool(wage_converted)

# 2017.11.21

In [1]:
import pandas as pd
import MeCab, os
import numpy as np

In [2]:
wakachi = MeCab.Tagger("-O wakati")
def tokenize(text):
    """Return tokenized (Japanese wakachi-gaki) text.

    The work is delegated to the module-level MeCab tagger ``wakachi``.

    Parameters
    ----------
    text: string
        Text to be parsed by the tagger.
    """
    tokenized = wakachi.parse(text)
    return tokenized

In [3]:
def wage_split_MeCab(wage):
    """
    Tokenize every wage text and split each result on single spaces.

    Returns one token list per entry of ``wage``, in the same order.
    """
    return [tokenize(text).split(' ') for text in wage]

In [4]:
def wage_del_vague_word(wage, ID):
    """
    Drop tokenized wage rows that contain vague wording, together with their IDs.

    A row is vague when it mentions '月' or '月給' without '時給', when it
    contains '以降' or '研修', or when it contains both 'A' and 'B'.

    ``ID`` itself is left untouched; a sliced copy is filtered instead.
    Returns the kept rows and the IDs that remain after the deletions.
    """
    def has_vague_word(tokens):
        monthly_only = ('月' in tokens or '月給' in tokens) and '時給' not in tokens
        labelled_variants = 'A' in tokens and 'B' in tokens
        return (monthly_only or '以降' in tokens or '研修' in tokens
                or labelled_variants)

    remaining_ids = ID[:]
    kept_rows = []
    dropped_positions = []
    for position, tokens in enumerate(wage):
        if has_vague_word(tokens):
            dropped_positions.append(position)
        else:
            kept_rows.append(tokens)

    # Delete from the back so earlier positions stay valid.
    for position in reversed(dropped_positions):
        del remaining_ids[position]
    return kept_rows, remaining_ids

In [5]:
def list_num(lis):
    """
    Return True when ``lis`` is a token that parses as a finite number.

    The previous implementation returned after looking at the first
    character only, so tokens such as '3rd' or '1,000' were reported as
    numeric and the callers' later ``float(token)`` raised ValueError; an
    empty string produced None instead of False.

    Parameters
    ----------
    lis: string or iterable of strings
        A single token is checked as a whole. Any other iterable is numeric
        only when it is non-empty and every element is numeric.
    """
    candidates = [lis] if isinstance(lis, str) else list(lis)
    if not candidates:
        return False
    infinities = (float('inf'), float('-inf'))
    for candidate in candidates:
        try:
            value = float(candidate)
        except (TypeError, ValueError):
            return False
        # float() also accepts 'nan' and 'inf'; those are words, not amounts.
        if value != value or value in infinities:
            return False
    return True

In [6]:
def wage_del_vague_num(wage, ID):
    """
    Delete rows or digits with vague numbers in wage.

    Three passes are made; the row lists inside ``wage`` are modified in place.

    1. A numeric token is deleted when the token right after it is one of
       ``trailing_units`` or the token right before it is '月給' or '月'.
    2. A row whose numeric tokens contain '1' together with '2', '3' or '4'
       is dropped, and the ID at the same position is dropped from a copy
       of ``ID``.
    3. In the remaining rows, numeric tokens strictly between 24 and 30 or
       strictly between 30 and 500 are deleted.

    Pass 1 now walks the row by position. The earlier ``list.index`` lookup
    always found the first of several equal numbers, so later duplicates
    were judged by the wrong neighbours. A number at the very start or end
    of a row is simply treated as having no neighbour on that side, instead
    of wrapping to the other end or raising into a bare ``except``.

    Returns the cleaned rows and the IDs that remain.
    """
    trailing_units = {
        'ヶ月', 'ヵ月', '日', '日間', '週間', '週', '時間', '代',
        '％', '%', '.', '倍', 'h', '回', '万',
    }
    leading_markers = {'月給', '月'}

    id_no_vague_num = ID[:]

    # change digits
    for tokens in wage:
        pos = 0
        while pos < len(tokens):
            if list_num(tokens[pos]):
                following = tokens[pos + 1] if pos + 1 < len(tokens) else None
                preceding = tokens[pos - 1] if pos > 0 else None
                if following in trailing_units or preceding in leading_markers:
                    # Stay on this position: the next token has moved into it.
                    del tokens[pos]
                    continue
            pos += 1

    # delete rows
    kept_rows = []
    dropped_positions = []
    for position, tokens in enumerate(wage):
        numbers = [token for token in tokens if list_num(token)]
        if '1' in numbers and ('2' in numbers or '3' in numbers or '4' in numbers):
            dropped_positions.append(position)
        else:
            kept_rows.append(tokens)
    # Delete from the back so earlier positions stay valid.
    for position in reversed(dropped_positions):
        del id_no_vague_num[position]

    # delete digits, some useless numbers in a specific range still need to be deleted
    for tokens in kept_rows:
        tokens[:] = [
            token for token in tokens
            if not (list_num(token)
                    and (24 < float(token) < 30 or 30 < float(token) < 500))
        ]

    return kept_rows, id_no_vague_num

In [7]:
def _add_half_hour(hour_token):
    """Return the hour in ``hour_token`` plus 0.5 as a string, or None if it is not an integer."""
    try:
        return str(int(hour_token) + 0.5)
    except (TypeError, ValueError):
        return None


def wage_convert(wage):
    """
    Standardize time tokens in tokenized wage rows.

    Every token of every row is visited once, left to right:

    * '時半': the preceding hour gets 0.5 added and '時半' is removed. When
      there is no preceding integer token the '時半' token is left as is.
    * '00': the token is removed.
    * '30': when it follows "<hour> :" the hour gets 0.5 added and ':' and
      '30' are removed; otherwise only the '30' token is removed.

    Rows are modified in place and returned in their original order.

    The row is walked by position because the earlier version deleted
    tokens from the list it was iterating over, which skipped the token
    following each deletion (for example the '00' right after a '時半').
    """
    wage_converted = []
    for tokens in wage:
        pos = 0
        while pos < len(tokens):
            token = tokens[pos]
            if token == '時半':
                half_past = _add_half_hour(tokens[pos - 1]) if pos >= 1 else None
                if half_past is not None:
                    tokens[pos - 1] = half_past
                    del tokens[pos]
                    # The following token has moved into ``pos``.
                    continue
            elif token == '00':
                del tokens[pos]
                continue
            elif token == '30':
                half_past = None
                if pos >= 2 and tokens[pos - 1] == ':':
                    half_past = _add_half_hour(tokens[pos - 2])
                if half_past is None:
                    del tokens[pos]
                else:
                    tokens[pos - 2] = half_past
                    del tokens[pos - 1:pos + 1]
                    # Two tokens are gone; the next unseen token is at pos - 1.
                    pos -= 1
                continue
            pos += 1
        wage_converted.append(tokens)
    return wage_converted

In [8]:
def wage_bool(wage):
    """
    Collect the numeric tokens of every row and encode them as '1'/'0' strings.

    A numeric token above 500 becomes '1', any other numeric token '0'.

    Returns the numeric tokens per row and the pattern string per row.
    """
    wage_digit = []
    wage_booled = []
    for tokens in wage:
        numbers = [token for token in tokens if list_num(token)]
        pattern = ''.join('1' if float(number) > 500 else '0' for number in numbers)
        wage_digit.append(numbers)
        wage_booled.append(pattern)
    return wage_digit, wage_booled

In [9]:
def pattern_print(wage, examples=None):
    """
    Print every distinct pattern followed by a representative example row.

    Patterns are printed in sorted order; the example is the row at the
    position where the pattern first occurs in ``wage``.

    Parameters
    ----------
    wage: list of pattern strings
    examples: list of rows aligned with ``wage``, optional
        Defaults to the module-level ``wage_converted`` so existing
        single-argument calls keep working. Previously the lookup also used
        the global ``wage_booled`` instead of the ``wage`` argument.
    """
    if examples is None:
        examples = wage_converted

    # Remember only the first position of each pattern.
    first_position = {}
    for position, pattern in enumerate(wage):
        first_position.setdefault(pattern, position)

    for pattern in sorted(first_position):
        print(pattern)
        print(examples[first_position[pattern]])

In [36]:
# Kept for backward compatibility: holds the 'IDMresultMflag' strings of the
# most recent check_pattern_reconstruction call.
wage_keep = []

# pattern -> list of segments. A segment is (time ranges, wage index): each
# time range is a (start, end) pair and the wage index points into ``origin``.
# An int endpoint is an index into ``origin``; a str endpoint is a literal hour.
_PATTERN_LAYOUTS = {
    '001': [([(0, 1)], 2)],
    '00100001': [([(0, 1)], 2), ([(3, 4), (5, 6)], 7)],
    '00100001001': [([(0, 1)], 2), ([(3, 4), (5, 6)], 7), ([(8, 9)], 10)],
    '001001': [([(0, 1)], 2), ([(3, 4)], 5)],
    '001001001': [([(0, 1)], 2), ([(3, 4)], 5), ([(6, 7)], 8)],
    '001001001001': [([(0, 1)], 2), ([(3, 4)], 5), ([(6, 7)], 8), ([(9, 10)], 11)],
    '0010011': [([(0, 1)], 2), ([(3, 4)], 5), ([(4, 0)], 6)],
    '0011': [([(0, 1)], 2), ([(1, 0)], 3)],
    '1': [([('0', '24')], 0)],
    '100001': [([(4, 1)], 0), ([(1, 2), (3, 4)], 5)],
    '10000100': [([(1, 2), (3, 4)], 0), ([(6, 7)], 5)],
    '1001': [([(2, 1)], 0), ([(1, 2)], 3)],
    '100100': [([(1, 2)], 0), ([(4, 5)], 3)],
    '100100001': [([(1, 2)], 0), ([(4, 5)], 3), ([(6, 7)], 8)],
    '1001001': [([(5, 1)], 0), ([(1, 2)], 3), ([(4, 5)], 6)],
    '100100100': [([(1, 2)], 0), ([(4, 5)], 3), ([(7, 8)], 6)],
    # The base wage (origin[0]) covers the time outside the listed ranges,
    # as in '1001' and '1001001'; it was missing although the flag was 4.
    '1001001001': [([(8, 1)], 0), ([(1, 2)], 3), ([(4, 5)], 6), ([(7, 8)], 9)],
    '100100100100': [([(1, 2)], 0), ([(4, 5)], 3), ([(7, 8)], 6), ([(10, 11)], 9)],
    '1100': [([(3, 2)], 0), ([(2, 3)], 1)],
}


def _render_segment(origin, ranges, wage_idx):
    """Format one segment as 'start-end[,start-end] wage'."""
    def endpoint(ref):
        return ref if isinstance(ref, str) else origin[ref]

    spans = ','.join(endpoint(start) + '-' + endpoint(end) for start, end in ranges)
    return spans + ' ' + origin[wage_idx]


def check_pattern_reconstruction(wage_numlist_pattern):
    """
    Rebuild readable "time range + wage" strings from recognised patterns.

    Parameters
    ----------
    wage_numlist_pattern: DataFrame with three columns
        The columns are renamed in place (by position) to 'ID', 'origin'
        and 'pattern'. 'origin' holds the numeric tokens of a row as
        strings and 'pattern' the matching '1'/'0' string.

    Returns
    -------
    DataFrame with the columns 'ID' (int), 'result' and 'flag'.
        'result' joins the segments with '; ' and 'flag' is the number of
        segments as a string. Rows whose pattern is not in
        ``_PATTERN_LAYOUTS`` are left out.

    Notes
    -----
    * Plain ``str`` formatting replaces ``np.str``, an alias that current
      NumPy releases no longer provide.
    * Results are built per call; ``wage_keep`` is overwritten instead of
      growing with every call.
    * The frame is built from tuples rather than by splitting an
      'M'-joined string, and an empty result no longer raises.
    """
    wage_numlist_pattern.columns = ['ID', 'origin', 'pattern']

    records = []
    rows = zip(wage_numlist_pattern['ID'],
               wage_numlist_pattern['origin'],
               wage_numlist_pattern['pattern'])
    for ident, origin, pattern in rows:
        layout = _PATTERN_LAYOUTS.get(pattern)
        if layout is None:
            continue
        result = '; '.join(_render_segment(origin, ranges, wage_idx)
                           for ranges, wage_idx in layout)
        records.append((ident, result, str(len(layout))))

    wage_keep[:] = ['{}M{}M{}'.format(*record) for record in records]

    wage_result = pd.DataFrame(records, columns=['ID', 'result', 'flag'])
    wage_result['ID'] = wage_result['ID'].astype(int)
    return wage_result

In [10]:
# CSV file that is read as the pipeline input.
data_path = '../data/wage_analysis_ver02.csv'
# Directory that receives the generated output.csv.
result_path = '../result/'
# When True, every distinct pattern is printed with an example row.
show_pattern = True

In [11]:
# Load the raw table; the CSV is read as Shift_JIS.
data_ori = pd.read_csv(data_path, encoding = 'shift_jis')
# Copy the three needed columns so the rename below does not act on a slice of `data_ori`.
data_todo = data_ori[['Unnamed: 0','rqmt_cmpny_nm_txt', 'sal_txt']].copy()
# A flat list yields plain column labels; the nested list built a MultiIndex.
data_todo.columns = ['ID','name', 'salary']
# First column: row IDs; last column: salary text.
ID = list(data_todo.values[:,0])
wage = list(data_todo.values[:,-1])

In [12]:
# Tokenize every salary text; the result holds one token list per row.
wage_split = wage_split_MeCab(wage)

In [13]:
wage_split[0]

['"',
 '時給',
 '1050',
 '円',
 '〜',
 '1313',
 '円',
 '%',
 'n',
 '※',
 '22',
 '時',
 '〜',
 '翌',
 '8',
 '時',
 'は',
 '時給',
 '1313',
 '円',
 '"',
 '\n']

In [14]:
# Drop rows with vague wording and keep the ID list aligned with the remaining rows.
wage_no_vague_word, id_no_vague_word = wage_del_vague_word(wage_split, ID)

In [15]:
wage_no_vague_word[0]

['"',
 '時給',
 '1050',
 '円',
 '〜',
 '1313',
 '円',
 '%',
 'n',
 '※',
 '22',
 '時',
 '〜',
 '翌',
 '8',
 '時',
 'は',
 '時給',
 '1313',
 '円',
 '"',
 '\n']

In [16]:
# Drop vague numbers (and whole rows where needed); IDs are filtered alongside.
wage_no_vague_num, id_no_vague_num = wage_del_vague_num(wage_no_vague_word, id_no_vague_word)

In [17]:
wage_no_vague_num[0]

['"',
 '時給',
 '1050',
 '円',
 '〜',
 '1313',
 '円',
 '%',
 'n',
 '※',
 '22',
 '時',
 '〜',
 '翌',
 '8',
 '時',
 'は',
 '時給',
 '1313',
 '円',
 '"',
 '\n']

In [18]:
# Normalize half-hour ('時半', ':30') and '00' minute tokens.
wage_converted = wage_convert(wage_no_vague_num)

In [19]:
wage_converted[0]

['"',
 '時給',
 '1050',
 '円',
 '〜',
 '1313',
 '円',
 '%',
 'n',
 '※',
 '22',
 '時',
 '〜',
 '翌',
 '8',
 '時',
 'は',
 '時給',
 '1313',
 '円',
 '"',
 '\n']

In [20]:
# Extract the numeric tokens per row and encode them as '1'/'0' pattern strings.
wage_digit, wage_booled = wage_bool(wage_converted)

In [23]:
set(wage_booled)

{'001',
 '00100001',
 '00100001001',
 '001001',
 '001001001',
 '001001001001',
 '00100101',
 '0010011',
 '0011',
 '0011001',
 '00110011',
 '01',
 '01001100',
 '1',
 '100',
 '100001',
 '10000100',
 '100001001',
 '10001',
 '1001',
 '100100',
 '1001000',
 '100100001',
 '10010001',
 '1001001',
 '100100100',
 '1001001001',
 '100100100100',
 '10010011',
 '100101',
 '10011',
 '10011001',
 '101',
 '1010',
 '11',
 '1100',
 '110000',
 '11001',
 '1100100',
 '110011',
 '1101',
 '111'}

In [24]:
# Optionally print each distinct pattern together with one example row.
if show_pattern:
    pattern_print(wage_booled)

001
['22', '時', '〜', '翌', '6', '時', '/', '時給', '1050', '円', '\u3000', '※', '他', '、', '下記', '\n']
00100001
['時給', '5.5', '-', '9', '時', '/', '930', '円', '、', '11', '-', '17', '時', '・', '18', '-', '22', '時', '/', '910', '円', '〜', '※', '昇給', '有', '\n']
00100001001
['時給', '7', '-', '18', '時', '/', '820', '円', '、', '18', '-', '22', '・', '5', '-', '7', '時', '/', '810', '円', '、', '22', '-', '翌', '5', '時', '/', '1013', '円', '\n']
001001
['時給', '10', '-', '16', '時', '789', '円', '、', '16', '-', '22', '時', '800', '円', '\n']
001001001
['時給', '9', '-', '22', '時', '820', '円', '〜', '★', '22', '-', '翌', '6', '時', '1100', '円', '〜', '、', '6', '-', '9', '時', '850', '円', '〜', '\n']
001001001001
['"', '時給', '\u3000', '5', '-', '9', '時', '/', '1100', '円', '\u3000', '9', '-', '17', '時', '/', '990', '円', '%', 'n', '17', '-', '22', '時', '/', '970', '円', '\u3000', '22', '-', '翌', '5', '時', '/', '1250', '円', '"', '\n']
00100101
['時給', '/', '9', '〜', '17', '時', ':', '750', '円', '\u3000', '6', '〜', '10', '時', ':',

In [25]:
# Combine the remaining IDs, their numeric tokens and their patterns in one table.
data_pattern_dict = {'ID': id_no_vague_num, 'origin': wage_digit, 'pattern': wage_booled}
data_pattern = pd.DataFrame(data = data_pattern_dict )

In [26]:
data_pattern.head()

Unnamed: 0,ID,origin,pattern
0,1,"[1050, 1313, 22, 8, 1313]",11001
1,35,"[9, 22, 820, 22, 6, 1100, 6, 9, 850]",1001001
2,37,"[980, 22, 5, 1225]",1001
3,41,"[883, 22, 5, 1104, 5, 9, 890]",1001001
4,46,"[960, 22, 1200]",101


In [39]:
# Name the ID column of the raw data so it can be merged on 'ID' later.
data_ori = data_ori.rename(columns={'Unnamed: 0':'ID'})
# Turn the recognised patterns into "start-end wage" strings.
data_pattern_rec = check_pattern_reconstruction(data_pattern)

In [40]:
data_pattern_rec.head()

Unnamed: 0,ID,result,flag
0,35,9-22 820; 22-6 1100; 6-9 850,3
1,37,5-22 980; 22-5 1225,2
2,41,9-22 883; 22-5 1104; 5-9 890,3
3,106,9-18 800; 18-22 800; 22-5 1000; 5-9 820,4
4,110,7-22 1030; 22-7 1280,2


In [41]:
# Outer join: rows present in only one of the two tables are kept as well.
output = pd.merge(data_pattern_rec, data_ori, on = 'ID', how= 'outer')
# Create the result directory first; to_csv does not create missing folders.
os.makedirs(result_path, exist_ok=True)
output.to_csv(os.path.join(result_path,'output.csv'))

In [42]:
output.head()

Unnamed: 0,ID,result,flag,pre_updt_dt,rqmt_cmpny_nm_txt,jb_type_txt,sal_txt,文字数,時給フラグ,sal_txt(キーワード）除去後,...,給料タイプ,mag_addr_txt,transpo_accs_txt,after_app_prc_txt,catch_txt,read_txt,hour_sal_srt_key,dy_sal_srt_key,mnth_sal_srt_key,22chekcer
0,35,9-22 820; 22-6 1100; 6-9 850,3,2017/10/2 7:39,ファミリーマート　四日市上海老町店,[Ａ][Ｐ]ファミマスタッフ★＜土日＆早朝＞大歓迎!!,時給9-22時820円〜★22-翌6時1100円〜、6-9時850円〜,35.0,1.0,時給980円★22-翌5時1225円,...,Hourly,四日市市上海老町1901-4　※TEL後、履歴書(写貼)を持参,「フリー団地口」バス停スグ！,,,,820.0,0.0,0.0,1
1,37,5-22 980; 22-5 1225,2,2017/10/2 7:39,ファミリーマート　調布インター店,[Ａ][Ｐ]ファミマスタッフ★初バイトさん大歓迎!!,時給980円★22-翌5時1225円,18.0,1.0,時給1150円〜1400円　■規定支給%n★オープン〜2ヶ月は特別時給！★,...,Hourly,調布市下石原1-7-1　※TEL後、履歴書(写貼)を持参,「西調布駅」徒歩7分　◎車・バイク通勤ok,,,,980.0,0.0,0.0,1
2,41,9-22 883; 22-5 1104; 5-9 890,3,2017/10/9 7:07,ファミリーマート　藤井寺小山四丁目店,[Ａ][Ｐ]ファミマスタッフ★フリーター・主婦(夫)大歓迎!!,時給883円★22-翌5時1104円、5-9時890円　◎昇給あり,33.0,1.0,時給960円,...,Hourly,藤井寺市小山4-1-1　※TEL後、履歴書(写貼)を持参,近鉄「藤井寺駅」徒歩12分☆車・バイク通勤ＯＫ,,,,883.0,0.0,0.0,1
3,106,9-18 800; 18-22 800; 22-5 1000; 5-9 820,4,2017/11/4 0:33,ファミリーマートドラッグイレブン志免店,[Ａ][Ｐ]ドラッグストア販売スタッフ,"""時給800円(9〜18時)、800円(18〜22時)%n、1000円(22〜5時)、820...",54.0,1.0,時給956円,...,Hourly,811-2207 福岡県糟屋郡志免町南里４丁目１３番１号,酒殿駅より徒歩26分,応募⇒面接⇒内定。面接時は履歴書(写貼)ご持参下さい。面接から入社までは1週間〜2週間を予定...,自分に合った働き方で、無理なく続けていただける環境でお迎えします★,【充実の待遇！安心のJR九州グループ★】「ドラッグイレブン♪」でおなじみの安定企業で働こう☆...,800.0,0.0,0.0,1
4,110,7-22 1030; 22-7 1280,2,2017/10/16 7:08,ファミリーマート(1)東五反田五丁目店(2)大崎広小路店,[Ａ][Ｐ]コツコツ…忙しさを感じない!?ファミマstaff,時給1030円〜、(22時〜翌7時):時給1280円〜、他下記,31.0,1.0,時給(1)820円〜(2)(3)800円〜(4)(5)985円〜　◎,...,Hourly,(1)東五反田5-28-11(2)大崎4-1-2,(1)五反田駅東口徒歩3分(2)大崎広小路駅徒歩15秒,,"""「平日だけ」「土日のみ働きたい」…ok!!%n夕方・夜勤は品出しが中心◎%n正直、接客苦手...",,1280.0,0.0,0.0,1
