In [1]:
"""
@important note: 
    the script is altered by ChenglongChen 3rd place solution for HomeDepot product search 
    results relevance competition on Kaggle.
@author: Eric Tsai <eric492718@gmail.com>
@brief: process data
        - a bunch of processing
        - automated spelling correction
        - query expansion
        - extract product name for search_term and product_title

"""

'\n@important note: \n    the script is altered by ChenglongChen 3rd place solution for HomeDepot product search \n    results relevance competition on Kaggle.\n@author: Eric Tsai <eric492718@gmail.com>\n@brief: process data\n        - a bunck of processing\n        - automated spelling correction\n        - query expansion\n        - extract product name for search_term and product_title\n\n'

In [2]:
# basic libraries
import numpy as np
import pandas as pd

# utils
import csv  
from importlib.machinery import SourceFileLoader
import multiprocessing
from bs4 import BeautifulSoup  # 處理 html tag
from collections import Counter

# NLP
import nltk  # 一套基於 Python 的自然語言處理工具箱
import regex  # it is necessary because using the re module will get an error

# libraries we write
import config
from utils import ngram_utils, pkl_utils, logging_utils, time_utils
from spelling_checker import GoogleQuerySpellingChecker, AutoSpellingChecker

In [3]:
# to use the multiprocessing package, I must put all related function in the script file, and import it
import parallel_processing as pp

<div class="alert alert-warning" role="alert">
  <strong>Note!</strong> <br>在Python的string前面加上‘r’， 是為了告訴直譯器這個string是個raw string，不要轉義backslash '\' 。 例如，\n 在raw string中，是兩個字元，\和n， 而不會轉義為換行符。由於正則表示式和 \ 會有衝突，因此，當一個字串使用了正則表示式後，最好在前面加上'r'。
</div>

<pre>
                   BaseReplacer()  ==>   Attribute: pattern_replace_pair_list=[], Method: transform(text)
                         /                   \
                        /                     \
              LowerCaseConverter      LowerUpperCaseSplitter
             (method overriding)      (attribute overriding)
                        \                     /
                         \                   /
                                   C        
</pre>

In [4]:
#--------------------------- Processor ---------------------------
## base class
## Most of the processings can be casted into the "pattern-replace" framework
class BaseReplacer:
    '''
    Base class for processors that fit the "pattern-replace" framework.

    Each (pattern, replacement) pair is applied in list order with
    regex.sub(pattern, repl, string), which replaces every match of the
    regular expression `pattern` in `string` by `repl`.

    pattern_replace_pair_list: list of (pattern, replacement) tuples.
        When omitted, the instance gets its own empty list.
    '''
    def __init__(self, pattern_replace_pair_list=None):
        # Use None as the sentinel: a literal [] default would be one list object
        # shared by every instance created without an argument.
        if pattern_replace_pair_list is None:
            pattern_replace_pair_list = []
        self.pattern_replace_pair_list = pattern_replace_pair_list

    def transform(self, text):
        '''
        Apply every pattern/replacement pair to `text`, then collapse each run
        of whitespace into a single space and strip both ends.
        '''
        for pattern, replace in self.pattern_replace_pair_list:
            try:
                text = regex.sub(pattern, replace, text)
            except ValueError as err:
                # Best effort: report which pattern failed and keep going with the
                # remaining pairs (previously only the exception class was printed).
                print(f'{type(err).__name__} while applying pattern {pattern!r}: {err}')
        # \s+ : one or more whitespace characters -> a single space
        return regex.sub(r'\s+', ' ', text).strip()

In [5]:
# text = 'Eric  Tsai Tsai Tsai Tsai is good'
# pattern_replace_pair_list = [('Tsai', 'Eric'), ('good', 'bad')]
# print(text)
# BaseReplacer(pattern_replace_pair_list).transform(text)

In [6]:
def test_BaseReplacer():
    """Pairs are applied in order and repeated spaces collapse to one."""
    replacer = BaseReplacer([('Tsai', 'Eric'), ('good', 'bad')])
    cleaned = replacer.transform('Eric  Tsai Tsai Tsai Tsai is good')
    assert cleaned == 'Eric Eric Eric Eric Eric is bad'

In [7]:
## deal with case
# Inheritance BaseReplacer Attribute and Method
class LowerCaseConverter(BaseReplacer):
    """
    Lower-case the whole text, e.g. Traditional -> traditional.

    This is method overriding: transform() replaces the one inherited from
    BaseReplacer, so no pattern list is consulted and whitespace is left as is.
    """
    def transform(self, text):
        lowered = text.lower()
        return lowered

In [8]:
# text = 'EricTsai Tsai Tsai Tsai is good'
# print(text)
# LowerCaseConverter().transform(text)

In [9]:
def test_LowerCaseConverter():
    """Every upper-case letter is lowered; nothing else changes."""
    converter = LowerCaseConverter()
    lowered = converter.transform('EricTsai Tsai Tsai Tsai is good')
    assert lowered == 'erictsai tsai tsai tsai is good'

In [10]:
class LowerUpperCaseSplitter(BaseReplacer):
    """
    Split words and sentences that were glued together without a space.

    Examples:
    homeBASICS Traditional Real Wood -> homeBASICS Traditional Real Wood
    (unchanged: the word is at the start of the string, so no space precedes it)

    hidden from viewDurable rich finishLimited lifetime warrantyEncapsulated panels ->
    hidden from view Durable rich finish Limited lifetime warranty Encapsulated panels

    Dickies quality has been built into every product.Excellent visibilityDurable ->
    Dickies quality has been built into every product Excellent visibility Durable

    BAD CASE:
    shadeMature height: 36 in. - 48 in.Mature width
    minutesCovers up to 120 sq. ft.Cleans up
    PUT one UnitConverter before LowerUpperCaseSplitter

    Reference:
    https://www.kaggle.com/c/home-depot-product-search-relevance/forums/t/18472/typos-in-the-product-descriptions
    """
    def __init__(self):
        # Sentence glued to the next one: a word character, one of . ? !, then an
        # upper-case letter. The punctuation mark is replaced by a space
        # (\1 and \2 are the two captured characters).
        glued_sentences = (r'(\w)[\.?!]([A-Z])', r'\1 \2')
        # Word glued to the next one: preceded by a space (lookbehind, group 1),
        # a run of lower-case letters (group 2) directly followed by a run of
        # upper-case letters (group 3). A space is inserted between the two runs.
        glued_words = (r'(?<=( ))([a-z]+)([A-Z]+)', r'\2 \3')
        self.pattern_replace_pair_list = [glued_sentences, glued_words]


In [11]:
# # show some example
# display( LowerUpperCaseSplitter().transform('homeBASICS Traditional Real Wood') )
# display( LowerUpperCaseSplitter().transform('hidden from viewDurable rich finishLimited lifetime warrantyEncapsulated panels') )
# ##############################################################################################################################
# display( LowerUpperCaseSplitter().transform('shadeMature height: 36 in. - 48 in.Mature width') )
# # if add a blank character in the beginning, the function will work 
# display( LowerUpperCaseSplitter().transform(' shadeMature height: 36 in. - 48 in.Mature width') )

In [12]:
def test_LowerUpperCaseSplitter():
    """Glued words are split only when a space precedes the lower-case run."""
    splitter = LowerUpperCaseSplitter()
    cases = [
        ('homeBASICS Traditional Real Wood',
         'homeBASICS Traditional Real Wood'),
        ('hidden from viewDurable rich finishLimited lifetime warrantyEncapsulated panels',
         'hidden from view Durable rich finish Limited lifetime warranty Encapsulated panels'),
        ('shadeMature height: 36 in. - 48 in.Mature width',
         'shadeMature height: 36 in. - 48 in Mature width'),
        # a leading blank makes the first word eligible for splitting
        (' shadeMature height: 36 in. - 48 in.Mature width',
         'shade Mature height: 36 in. - 48 in Mature width'),
    ]
    for raw, expected in cases:
        assert splitter.transform(raw) == expected

In [13]:
'''
Create word replacement patterns from a hand-made replacement dictionary.
The input is a CSV file in which each row holds the word to be replaced,
followed by ",", followed by its replacement. Note: a row is treated as a
comment and skipped if it begins with the '#' character.
'''
class WordReplacer(BaseReplacer):
    '''
    Replace whole words using a replacement dictionary stored in a CSV file.

    Each CSV row has the form: word,replacement
    Rows with fewer than two fields (blank lines included) and rows whose
    first field starts with '#' are skipped.

    A word is replaced only when both of its neighbours are either a
    non-word character (anything outside [a-zA-Z0-9_]) or the start/end of
    the string. The first field is inserted into the pattern as is, so it is
    interpreted as a regular expression.

    replace_fname: path of the CSV file (already defined in config)
    '''
    def __init__(self, replace_fname):
        self.replace_fname = replace_fname
        self.pattern_replace_pair_list = []
        # 'with' closes the file once it has been read; newline='' is the way the
        # csv module asks files to be opened.
        with open(self.replace_fname, newline='') as replace_file:
            # csv.reader yields each row as a list of the comma-separated fields
            for line in csv.reader(replace_file):
                # len(line) < 2 also covers blank rows ([]), for which line[0]
                # would raise IndexError
                if len(line) < 2 or line[0].startswith('#'):
                    continue
                # (?<=\W|^) : preceded by a non-word character or the string start
                # (?=\W|$)  : followed by a non-word character or the string end
                pattern = r'(?<=\W|^)%s(?=\W|$)' % line[0]
                replace = line[1]
                self.pattern_replace_pair_list.append((pattern, replace))

In [14]:
# # display how to use csv.reader
# i = -1
# for line in csv.reader(open('../Data/dict/word_replacer.csv')):
#     i=i+1
#     if i == 1:
#         print(line)

In [15]:
# replace_list = WordReplacer(replace_fname=config.WORD_REPLACER_DATA).pattern_replace_pair_list
# replace_list[0]

In [16]:
# # show some example
# text1 = regex.sub(replace_list[0][0], replace_list[0][1],'Eric Tsai want a undercabinet')
# text2 = regex.sub(replace_list[0][0], replace_list[0][1],'Eric Tsai want a 2undercabinet')
# text3 = regex.sub(replace_list[0][0], replace_list[0][1],'Eric Tsai want a $undercabinet')
# text4 = regex.sub(replace_list[0][0], replace_list[0][1],'undercabinet is what you need')
# text5 = regex.sub(replace_list[0][0], replace_list[0][1],'The undercabinet is what you need')
# text6 = regex.sub(replace_list[0][0], replace_list[0][1],'*undercabinet is what you need')
# text7 = regex.sub(replace_list[0][0], replace_list[0][1],'undercabinet% is what you need')
# text8 = regex.sub(replace_list[0][0], replace_list[0][1],'undercabinet_ is what you need')
# display(text1, text2, text3, text4, text5, text6, text7, text8)

In [17]:
def test_WordReplacer():
    """The first dictionary entry (undercabinet -> under cabinet) only fires on whole words."""
    replacer = WordReplacer(replace_fname=config.WORD_REPLACER_DATA)
    pattern, replacement = replacer.pattern_replace_pair_list[0]
    cases = [
        ('Eric Tsai want a undercabinet', 'Eric Tsai want a under cabinet'),
        ('Eric Tsai want a 2undercabinet', 'Eric Tsai want a 2undercabinet'),
        ('Eric Tsai want a $undercabinet', 'Eric Tsai want a $under cabinet'),
        ('undercabinet is what you need', 'under cabinet is what you need'),
        ('The undercabinet is what you need', 'The under cabinet is what you need'),
        ('*undercabinet is what you need', '*under cabinet is what you need'),
        ('undercabinet% is what you need', 'under cabinet% is what you need'),
        ('undercabinet_ is what you need', 'undercabinet_ is what you need'),
    ]
    for raw, expected in cases:
        assert regex.sub(pattern, replacement, raw) == expected

<code style="background:yellow;color:black">***The class below had a bug in Chenglong's version: the pattern was wrong, so it could not handle text containing several '-' characters, such as 'Vinyl-Leather-Rubber'. It is fixed here.***</code>

In [18]:
## deal with letters
class LetterLetterSplitter(BaseReplacer):
    """
    Replace a '/' or '-' that sits between two letters by a space.

    Letter and letter:
    Cleaner/Conditioner -> Cleaner Conditioner
    Vinyl-Leather-Rubber -> Vinyl Leather Rubber

    Digit and digit are kept, because some features are generated later via
    math operations (approximate height/width/area etc.):
    3/4 -> 3/4
    1-1/4 -> 1-1/4
    """
    def __init__(self):
        # The neighbouring letters are checked with lookbehind/lookahead, so they
        # are not consumed and every separator in a chain like a-b-c is matched.
        separator_between_letters = r'(?<=[a-zA-Z])[/\-](?=[a-zA-Z])'
        self.pattern_replace_pair_list = [(separator_between_letters, r' ')]


In [19]:
# # show some example
# display(
#     LetterLetterSplitter().transform('Cleaner/Conditioner'),
#     LetterLetterSplitter().transform('Vinyl-Leather-Rubber'),
#     LetterLetterSplitter().transform('Vinyl-Leather/Rubber'),
#     LetterLetterSplitter().transform('COVID-19 is crazy'),
#     LetterLetterSplitter().transform('3/4'),
#     LetterLetterSplitter().transform('1-1/4'),
# )

In [20]:
def test_LetterLetterSplitter():
    """Separators between letters become spaces; separators next to digits stay."""
    splitter = LetterLetterSplitter()
    cases = [
        ('Cleaner/Conditioner', 'Cleaner Conditioner'),
        ('Vinyl-Leather-Rubber', 'Vinyl Leather Rubber'),
        ('Vinyl-Leather/Rubber', 'Vinyl Leather Rubber'),
        ('COVID-19 is crazy', 'COVID-19 is crazy'),
        ('3/4', '3/4'),
        ('1-1/4', '1-1/4'),
    ]
    for raw, expected in cases:
        assert splitter.transform(raw) == expected

In [21]:
## deal with digits and numbers
class DigitLetterSplitter(BaseReplacer):
    """
    Put a space between a run of digits and an adjacent run of letters.
    Any '.' or '-' characters sitting between the two runs are dropped.

    x:
    1x1x1x1x1 -> 1 x 1 x 1 x 1 x 1
    19.875x31.5x1 -> 19.875 x 31.5 x 1

    -:
    1-Gang -> 1 Gang
    48-Light -> 48 Light

    .:
    includes a tile flange to further simplify installation.60 in. L x 36 in. W x 20 in. ->
    includes a tile flange to further simplify installation 60 in. L x 36 in. W x 20 in.
    """

    def __init__(self):
        # digits, optional . or - separators, letters
        digits_then_letters = (r'(\d+)[\.\-]*([a-zA-Z]+)', r'\1 \2')
        # letters, optional . or - separators, digits
        letters_then_digits = (r'([a-zA-Z]+)[\.\-]*(\d+)', r'\1 \2')
        self.pattern_replace_pair_list = [digits_then_letters, letters_then_digits]


In [22]:
# # show some example
# display(
#     DigitLetterSplitter().transform('1.a'),
#     DigitLetterSplitter().transform('1x'),
#     DigitLetterSplitter().transform('1x1x1x1x1'),
#     DigitLetterSplitter().transform('1-Gang'),
#     DigitLetterSplitter().transform('COVID-19 is crazy'),
#     DigitLetterSplitter().transform('3/4'),
#     DigitLetterSplitter().transform('1-1/4'),
# )

In [23]:
def test_DigitLetterSplitter():
    """Digit/letter boundaries are split; purely numeric expressions are untouched."""
    splitter = DigitLetterSplitter()
    cases = [
        ('1.a', '1 a'),
        ('1x', '1 x'),
        ('1x1x1x1x1', '1 x 1 x 1 x 1 x 1'),
        ('1-Gang', '1 Gang'),
        ('COVID-19 is crazy', 'COVID 19 is crazy'),
        ('3/4', '3/4'),
        ('1-1/4', '1-1/4'),
    ]
    for raw, expected in cases:
        assert splitter.transform(raw) == expected

In [24]:
class DigitCommaDigitMerger(BaseReplacer):
    """
    Remove thousands separators in front of a '000' group.

    1,000,000 -> 1000000
    900,000 -> 900000

    Only a comma that follows a digit and is followed by '000' is removed.
    """
    def __init__(self):
        # (?<=\d+) is a variable-length lookbehind, which the standard re module
        # rejects; this is one reason the regex package is used.
        comma_before_thousands = r"(?<=\d+),(?=000)"
        self.pattern_replace_pair_list = [(comma_before_thousands, r"")]

In [25]:
# # show some example
# display(
#     DigitCommaDigitMerger().transform('1,000,000'),
#     DigitCommaDigitMerger().transform('900,000'),
#     DigitCommaDigitMerger().transform('80,000'),
# )

In [26]:
def test_DigitCommaDigitMerger():
    """Commas in front of '000' groups disappear."""
    merger = DigitCommaDigitMerger()
    cases = [
        ('1,000,000', '1000000'),
        ('900,000', '900000'),
        ('80,000', '80000'),
    ]
    for raw, expected in cases:
        assert merger.transform(raw) == expected

In [27]:
class NumberDigitMapper(BaseReplacer):
    """
    Map lower-case English number words to digits.

    one -> 1
    two -> 2

    Covers zero..twenty and the tens thirty..ninety. Each word is mapped on
    its own, and only when it stands alone (non-word character or string
    boundary on both sides).
    """
    def __init__(self):
        word_value_pairs = [
            ('zero', 0), ('one', 1), ('two', 2), ('three', 3), ('four', 4),
            ('five', 5), ('six', 6), ('seven', 7), ('eight', 8), ('nine', 9),
            ('ten', 10), ('eleven', 11), ('twelve', 12), ('thirteen', 13),
            ('fourteen', 14), ('fifteen', 15), ('sixteen', 16), ('seventeen', 17),
            ('eighteen', 18), ('nineteen', 19), ('twenty', 20), ('thirty', 30),
            ('forty', 40), ('fifty', 50), ('sixty', 60), ('seventy', 70),
            ('eighty', 80), ('ninety', 90),
        ]
        self.pattern_replace_pair_list = []
        for word, value in word_value_pairs:
            whole_word = r'(?<=\W|^)%s(?=\W|$)' % word
            self.pattern_replace_pair_list.append((whole_word, str(value)))

In [28]:
# # show some example
# display(
#     NumberDigitMapper().transform('one'),
#     NumberDigitMapper().transform('ten'),
#     NumberDigitMapper().transform('fifty'),
# )

In [29]:
def test_NumberDigitMapper():
    """Stand-alone number words are replaced by their digits."""
    mapper = NumberDigitMapper()
    for word, digits in [('one', '1'), ('ten', '10'), ('fifty', '50')]:
        assert mapper.transform(word) == digits

<code style="background:yellow;color:black">***The class below had a bug in Chenglong's version: the regular-expression patterns were wrong, so a number followed by a word that merely starts with a unit string (e.g. '100 instagram accounts') was converted. That is fixed here. A known limitation remains: a number followed by the preposition 'in' is still treated as inches, e.g. 'What is 100 in Chinese?'.***</code>

In [30]:
## deal with unit
class UnitConverter(BaseReplacer):
    """
    Normalise "<number> <unit>" expressions to one canonical abbreviation,
    e.g. 36 inches / 36inch / 36 in / 36' -> 36 in.

    shadeMature height: 36 in. - 48 in.Mature width
    PUT one UnitConverter before LowerUpperCaseSplitter

    Every pattern has the shape
        digits, optional spaces, unit, (?=\\W|$), optional '.'
    The lookahead requires the unit to be followed by a non-word character or
    by the end of the string, so '100 instagram' is left alone while a unit
    that ends the text (e.g. a search term such as 'vanity 36 inch') is
    still converted.

    Known limitation: a number followed by the preposition 'in' is treated
    as inches ('What is 100 in Chinese?').
    """
    def __init__(self):
        self.pattern_replace_pair_list = [
            # Compound flow-rate units must come before the plain gallon pattern:
            # that pattern would otherwise rewrite 'gallons' first and these two
            # could never match.
            (r"([0-9]+)( *)(gallons per minute|gallon per minute|gal per minute|gallons/min\.|gallons/min)(?=\W|$)\.?", r"\1 gal. per min. "),
            (r"([0-9]+)( *)(gallons per hour|gallon per hour|gal per hour|gallons/hour|gallons/hr)(?=\W|$)\.?", r"\1 gal. per hr "),
            # NOTE(review): ' is mapped to inches and '' to feet, the reverse of the
            # usual notation, and the '' alternative is shadowed by ' in the inch
            # pattern, which runs first. Kept as is because the notebook tests pin
            # "36'" -> "36 in."; confirm before changing.
            (r"([0-9]+)( *)(inches|inch|in|in\.|')(?=\W|$)\.?", r"\1 in. "),
            (r"([0-9]+)( *)(pounds|pound|lbs|lb|lb\.)(?=\W|$)\.?", r"\1 lb. "),
            (r"([0-9]+)( *)(foot|feet|ft|ft\.|'')(?=\W|$)\.?", r"\1 ft. "),
            # the dots in in\. / ft\. are escaped so they no longer match any character
            (r"([0-9]+)( *)(square|sq|sq\.) ?\.?(inches|inch|in|in\.|')(?=\W|$)\.?", r"\1 sq.in. "),
            (r"([0-9]+)( *)(square|sq|sq\.) ?\.?(feet|foot|ft|ft\.|'')(?=\W|$)\.?", r"\1 sq.ft. "),
            (r"([0-9]+)( *)(cubic|cu|cu\.) ?\.?(inches|inch|in|in\.|')(?=\W|$)\.?", r"\1 cu.in. "),
            (r"([0-9]+)( *)(cubic|cu|cu\.) ?\.?(feet|foot|ft|ft\.|'')(?=\W|$)\.?", r"\1 cu.ft. "),
            (r"([0-9]+)( *)(gallons|gallon|gal)(?=\W|$)\.?", r"\1 gal. "),
            (r"([0-9]+)( *)(ounces|ounce|oz)(?=\W|$)\.?", r"\1 oz. "),
            (r"([0-9]+)( *)(centimeters|cm)(?=\W|$)\.?", r"\1 cm. "),
            # 'milimeters' (sic) is kept for inputs that carry this misspelling
            (r"([0-9]+)( *)(millimeters|milimeters|mm)(?=\W|$)\.?", r"\1 mm. "),
            (r"([0-9]+)( *)(minutes|minute)(?=\W|$)\.?", r"\1 min. "),
            (r"([0-9]+)( *)(°|degrees|degree)(?=\W|$)\.?", r"\1 deg. "),
            (r"([0-9]+)( *)(v|volts|volt)(?=\W|$)\.?", r"\1 volt. "),
            (r"([0-9]+)( *)(wattage|watts|watt)(?=\W|$)\.?", r"\1 watt. "),
            (r"([0-9]+)( *)(amperes|ampere|amps|amp)(?=\W|$)\.?", r"\1 amp. "),
            (r"([0-9]+)( *)(qquart|quart)(?=\W|$)\.?", r"\1 qt. "),
            (r"([0-9]+)( *)(hours|hour|hrs\.)(?=\W|$)\.?", r"\1 hr "),
        ]


In [31]:
# # show some example
# display(
#     UnitConverter().transform('shadeMature height: 36 in. - 48 in.Mature width'),
#     UnitConverter().transform('shadeMature height: 36 in - 48 in.Mature width'),
#     UnitConverter().transform('shadeMature height: 36 inch - 48 in.Mature width'),
#     UnitConverter().transform('shadeMature height: 36in. - 48in.Mature width'),
#     UnitConverter().transform('shadeMature height: 36in - 48in Mature width'),
#     UnitConverter().transform('shadeMature height: 36inyy - 48in Mature width'),
#     UnitConverter().transform("shadeMature height: 36' - 48in Mature width"),  # "inch" abbreviation:  "'" or in
#     UnitConverter().transform('Top 100 instagram Business Accounts sorted by Followers'),  # this case is not well
#     UnitConverter().transform('100 inches is long'),
#     UnitConverter().transform('100 inches is 50 cmiii uu'),
#     UnitConverter().transform('100 vampire is cool'),
# )

In [32]:
def test_UnitConverter():
    """Units after a number are normalised; look-alike words are left alone."""
    converter = UnitConverter()
    normalised = 'shadeMature height: 36 in. - 48 in. Mature width'
    cases = [
        ('shadeMature height: 36 in. - 48 in.Mature width', normalised),
        ('shadeMature height: 36 in - 48 in.Mature width', normalised),
        ('shadeMature height: 36 inch - 48 in.Mature width', normalised),
        ('shadeMature height: 36in. - 48in.Mature width', normalised),
        ('shadeMature height: 36in - 48in Mature width', normalised),
        ('shadeMature height: 36inyy - 48in Mature width',
         'shadeMature height: 36inyy - 48 in. Mature width'),
        # "'" is accepted as an abbreviation of inch
        ("shadeMature height: 36' - 48in Mature width", normalised),
        ('Top 100 instagram Business Accounts sorted by Followers',
         'Top 100 instagram Business Accounts sorted by Followers'),
        ('100 inches is long', '100 in. is long'),
        ('100 inches is 50 cmiii uu', '100 in. is 50 cmiii uu'),
        ('100 vampire is cool', '100 vampire is cool'),
    ]
    for raw, expected in cases:
        assert converter.transform(raw) == expected

## Test why `UnitConverter()` must be applied to the text before `LowerUpperCaseSplitter()`

<code style="background:yellow;color:black">***If 'poundFit' is a single word (assume it is a brand name), running `UnitConverter()` first does not convert 'pound' into a unit.***</code>

### Case1. `Using UnitConverter()` first
```python
text = 'shadeMature height: 36 in. - 48 poundFit width'
text2 = UnitConverter().transform(text)
text3 = LowerUpperCaseSplitter().transform(text2)
display(text2, text3)

output:
    'shadeMature height: 36 in. - 48 poundFit width'
    'shadeMature height: 36 in. - 48 pound Fit width'
```

### Case2. `LowerUpperCaseSplitter()` first
```python
text = 'shadeMature height: 36 in. - 48 poundFit width'
text2 = LowerUpperCaseSplitter().transform(text)
text3 = UnitConverter().transform(text2)

output:
    'shadeMature height: 36 in. - 48 pound Fit width'
    'shadeMature height: 36 in. - 48 lb. Fit width'   
```

In [33]:
## deal with html tags
class HtmlCleaner:
    """
    Strip HTML tags from a text with BeautifulSoup.

    parser: name of the parser handed to BeautifulSoup, e.g. 'html.parser'
    """
    def __init__(self, parser):
        self.parser = parser

    def transform(self, text):
        """Return the text content of `text`, with the pieces joined by a single space."""
        soup = BeautifulSoup(text, self.parser)
        return soup.get_text(separator=' ')


In [34]:
# # show some example
# text = '''<p>Hi. This is a simple example.<br>Yet poweful one.<p><a href="http://example.com/">I linked to <i>example.com</i></a>'''
# HtmlCleaner(parser='html.parser').transform(text)

In [35]:
def test_HtmlCleaner():
    """Tags are dropped and the remaining text pieces are joined by spaces."""
    markup = '''<p>Hi. This is a simple example.<br>Yet poweful one.<p><a href="http://example.com/">I linked to <i>example.com</i></a>'''
    cleaner = HtmlCleaner(parser='html.parser')
    plain = cleaner.transform(markup)
    assert plain == 'Hi. This is a simple example. Yet poweful one. I linked to  example.com'

In [36]:
## deal with some special characters
# 3rd solution in CrowdFlower (Create by Chenglong)
class QuartetCleaner(BaseReplacer):
    """
    Remove HTML leftovers and special characters.

    The characters . / - % are deliberately kept because they are useful in
    numbers, e.g. 1.97, 1-1/2, 10%.
    """
    def __init__(self):
        # anything that looks like a tag: '<', at least one character, '>'
        tag_patterns = [
            (r'<.+?>', r''),
        ]
        # html codes (character entities)
        entity_patterns = [
            (r'&nbsp;', r' '),  # non-breakable space
            (r'&amp;', r'&'),   # ampersand
            (r'&#39;', r"'"),   # apostrophe
        ]
        # fragments of broken tags
        fragment_patterns = [
            (r'/>/Agt/>', r''),
            (r'</a<gt/', r''),
            (r'gt/>', r''),
            (r'/>', r''),
            (r'<br', r''),
        ]
        punctuation_patterns = [
            # a run of these characters becomes one space (Chenglong's version
            # also removed spaces here, which is not needed in this case)
            (r'[&<>)(_,;:!?\+^~@#\$]+', r' '),
            # possessive 's at a word boundary
            ("'s\\b", r''),
            # remaining single and double quotes
            (r"[']+", r''),
            (r'[\"]+', r''),
        ]
        self.pattern_replace_pair_list = (
            tag_patterns + entity_patterns + fragment_patterns + punctuation_patterns
        )


In [37]:
def test_QuartetCleaner():
    '''
    Unit test for the individual patterns and for the whole class.

    Every case uses the exact (pattern, replacement) pair of QuartetCleaner;
    this is verified below, so the cases cannot silently drift away from the
    class (previously the possessive case tested a different regex and the
    punctuation case a different replacement).
    '''
    # (pattern, replacement, input, expected)
    cases = [
        (r'<.+?>', r'', '<remove me> Hello my friends!<a>', ' Hello my friends!'),
        (r'&nbsp;', r' ', 'Hello&nbsp;my friends!<a>', 'Hello my friends!<a>'),
        (r'&amp;', r'&', 'Hello my friends &amp; mentor', 'Hello my friends & mentor'),
        (r'&#39;', r"'", 'I&#39;m a man', "I'm a man"),
        (r'/>/Agt/>', r'', '/>/Agt/> must be remove', ' must be remove'),
        (r'</a<gt/', r'', '</a<gt/ must be remove', ' must be remove'),
        (r'gt/>', r'', 'gt/> must be remove', ' must be remove'),
        (r'/>', r'', '/> must be remove', ' must be remove'),
        (r'<br', r'', '<br must be remove', ' must be remove'),
        # the whole run is replaced by ONE space, which precedes the original space
        (r'[&<>)(_,;:!?\+^~@#\$]+', r' ', '&<>)(_,;:!?+^~@#$ must be remove', '  must be remove'),
        # possessive 's is removed only at a word boundary
        ("'s\\b", r'', "Eric's computer", 'Eric computer'),
        ("'s\\b", r'', "rock'salt", "rock'salt"),
        (r"[']+", r'', "I'm Eric", 'Im Eric'),
        (r'[\"]+', r'', '"Eric Tsai" is my name.', 'Eric Tsai is my name.'),
    ]
    # specific task
    for pattern, replacement, text, expected in cases:
        assert regex.sub(pattern, replacement, text) == expected
    # the pairs tested above are exactly the pairs of the class, in order
    tested_pairs = list(dict.fromkeys((pattern, replacement) for pattern, replacement, _, _ in cases))
    assert tested_pairs == QuartetCleaner().pattern_replace_pair_list
    # whole function
    text = '<remove me> Hello my friends!<a>'
    assert QuartetCleaner().transform(text) == 'Hello my friends'
    

* Lemmatisation<br>
詞性還原

<code style="background:yellow;color:black">***You can find out if poundFit(assume is a brand name) is a word, using `UnitConverter()` won't convert 'pound' to unit.***</code>

<code style="background:yellow;color:black">***Future improvement:<br>Pass the part-of-speech (pos) argument to `nltk.stem.wordnet.WordNetLemmatizer().lemmatize(token, pos)`. Without it, every token is treated as a noun, so only noun forms are lemmatized, e.g. plural nouns are converted to singular.***</code>

In [38]:
## lemmatizing for using pretrained word2vec model
# 2nd solution in CrowdFlower
class Lemmatizer:
    '''
    Tokenize a text and lemmatize every token.

    The tokens are re-joined with single spaces, so extra white space and
    new line characters disappear from the output.

    nltk.stem.wordnet.WordNetLemmatizer().lemmatize(token):
        The part of speech (pos) has to be given manually. It is not passed
        here, so the default (noun) applies and only plural nouns are reduced.
    '''
    def __init__(self):
        self.Tokenizer = nltk.tokenize.TreebankWordTokenizer()
        self.Lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    def transform(self, text):
        # cut the text into words, then lemmatize word by word
        lemmas = []
        for token in self.Tokenizer.tokenize(text):
            lemmas.append(self.Lemmatizer.lemmatize(token))
        return ' '.join(lemmas)  # 'a b c'


In [39]:
# s = '''
#         I found my computers yesterday.
#         It took me along\n time. 
#         You guys know, a data scientist\n has many computers which is a normal thing.
#         By the way, I worked out and running yesterday.
#         I hope is, are, and been can transform to be.
#         Plural nouns: dishes, cities, knives, beliefs, heroes, volcanoes, children, crises
# '''

In [40]:
# Lemmatizer().transform(text = s)

In [41]:
def test_Lemmatizer():
    """Plural nouns are reduced, other word forms are kept, white space is normalised."""
    passage = '''I found my computers yesterday.
              It took me along\n time. 
              You guys know, a data scientist\n has many computers which is a normal thing.
              By the way, I worked out to fit my body yesterday.
              I hope is, are, and been can transform to be.
              dishes cities  knives beliefs heroes volcanoes'''
    expected = 'I found my computer yesterday. It took me along time. You guy know , a data scientist ha many computer which is a normal thing. By the way , I worked out to fit my body yesterday. I hope is , are , and been can transform to be. dish city knife belief hero volcano'
    lemmatized = Lemmatizer().transform(passage)
    assert lemmatized == expected

* Porter Stemmer<br>
這種詞幹算法比較舊。它是從 20 世紀 80 年代開始的，其主要關注點是刪除單詞的共同結尾，以便將它們解析爲通用形式。它不是太複雜，它的開發停止了。
通常情況下，它是一個很好的起始基本詞幹分析器，但並不建議將它用於複雜的應用。相反，它在研究中作爲一種很好的基本詞幹算法，可以保證重複性。與其他算法相比，它也是一種非常溫和的詞幹算法。
* Snowball Stemmer<br>
這種算法也稱爲 Porter2 詞幹算法。它幾乎被普遍認爲比 Porter 更好，甚至發明 Porter 的開發者也這麼認爲。Snowball 在 Porter 的基礎上加了很多優化。Snowball 與 Porter 相比差異約爲 5％。

In [42]:
## stemming
class Stemmer:
    '''
    Stem every token of a text with an NLTK stemmer.

    - Converts uppercase to lowercase and deletes common endings of words.
    - Tokens are produced by text.split(" "): repeated spaces and new line
      characters are preserved, and punctuation stays attached to its token,
      which can prevent stemming (the notebook test expects 'working.' to
      stay 'working.').
    - In general snowball is better than porter, so snowball is the default.

    stemmer_type: 'snowball' (default) or 'porter'
    Raises ValueError for any other stemmer_type.
    '''
    def __init__(self, stemmer_type='snowball'):
        self.stemmer_type = stemmer_type
        if self.stemmer_type == 'porter':
            self.stemmer = nltk.stem.PorterStemmer()
        elif self.stemmer_type == 'snowball':
            self.stemmer = nltk.stem.SnowballStemmer('english')
        else:
            # Fail at construction time; otherwise self.stemmer is never set and
            # the first transform() call dies with an unrelated AttributeError.
            raise ValueError(
                f"Unsupported stemmer_type: {stemmer_type!r} (expected 'porter' or 'snowball')"
            )

    def transform(self, text):
        '''Stem each space-separated token and join the results with single spaces.'''
        tokens = [self.stemmer.stem(token) for token in text.split(" ")]
        return ' '.join(tokens)

In [43]:
# text = '''Good muffin cost $ 3.88 in New York. Please buy me two of them. Thanks . 
#           I worked from home last \n year when COVID-19 disaster.
#           I am working.
#           '''
# Stemmer().transform(text) == 'good muffin cost $ 3.88 in new york. pleas buy me two of them. thank . \n          i work from home last \n year when covid-19 disaster.\n          i am working.\n          '

In [44]:
def test_Stemmer():
    '''
    Snowball stemming lower-cases and stems space-separated tokens while
    keeping new lines and repeated spaces.
    '''
    # Spelled out line by line so that the trailing space after 'Thanks .' and
    # the 10-space indentation are visible.
    text = (
        'Good muffin cost $ 3.88 in New York. Please buy me two of them. Thanks . \n'
        '          I worked from home last \n year when COVID-19 disaster.\n'
        '          I am working.\n'
        '          '
    )
    expected = 'good muffin cost $ 3.88 in new york. pleas buy me two of them. thank . \n          i work from home last \n year when covid-19 disaster.\n          i am working.\n          '
    # The comparison used to be evaluated and thrown away; without `assert`
    # this test could never fail.
    assert Stemmer().transform(text) == expected

* isinstance()<br>
|函數|描述|
|:----:|:----:|
|isinstance(object, classinfo)    |    判斷 object 是否為 classinfo(類別) 的實體|

In [45]:
class ProcessorWrapper:
    '''
    Adapter that lets a text processor accept non-string input.

    - str: passed to processor.transform unchanged
    - float / int: converted with str() first
    - list: every element is transformed in turn (nested lists included) and
      a new list is returned
    - anything else: ValueError
    '''
    def __init__(self, processor):
        self.processor = processor

    def transform(self, input):
        # input is already a str instance
        if isinstance(input, str):
            return self.processor.transform(input)
        # numbers are turned into their string form first
        if isinstance(input, (float, int)):
            return self.processor.transform(str(input))
        # take care when the input is a list (currently for a list of attributes)
        if isinstance(input, list):
            return [ProcessorWrapper(self.processor).transform(element) for element in input]
        raise ValueError(f'Currently not support type: {type(input).__name__}')


In [46]:
# s='lolololoooooo'

In [47]:
# display(
#     ProcessorWrapper(processor=Stemmer()).transform(s),
#     ProcessorWrapper(processor=Stemmer()).transform(3.14159265358),
#     ProcessorWrapper(processor=Stemmer()).transform(['a','b','b','c', 4, 5, 6.589]),
#     )

In [48]:
def test_ProcessorWrapper():
    """Strings, numbers and lists of both are all accepted."""
    wrapper = ProcessorWrapper(processor=Stemmer())
    cases = [
        ('lolololoooooo', 'lolololoooooo'),
        (3.14159265358, '3.14159265358'),
        (['a', 'b', 'b', 'c', 4, 5, 6.589], ['a', 'b', 'b', 'c', '4', '5', '6.589']),
    ]
    for raw, expected in cases:
        assert wrapper.transform(raw) == expected

In [49]:
class ListProcessor:
    """
    Run a chain of processors over every element of a list.

    WARNING: the elements of the input list are overwritten; the list that is
    returned is the very list that was passed in.
    """
    def __init__(self, processors):
        self.processors = processors

    def process(self, lst):
        for position, element in enumerate(lst):
            for processor in self.processors:
                element = ProcessorWrapper(processor).transform(element)
                # written back after every step, so the list always holds the latest state
                lst[position] = element
        return lst


In [50]:
def test_ListProcessor():
    """Lemmatizer then Stemmer are applied to every element of the list."""
    sentences = [
        'Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.',
        'Eric Tsai want a under cabinet.',
    ]
    expected = [
        'good muffin cost $ 3.88 in new york. pleas buy me two of them. thank .',
        'eric tsai want a under cabinet .',
    ]
    pipeline = ListProcessor([Lemmatizer(), Stemmer()])
    assert pipeline.process(sentences) == expected

In [51]:
class DataFrameProcessor:
    """
    Apply a chain of processors to every value of one dataframe column.

    Each step goes through ``Series.apply``; the caller must use the returned
    Series to see the processed values.
    """
    def __init__(self, processors):
        # processors: iterable of objects accepted by ProcessorWrapper,
        # applied in the given order
        self.processors = processors

    def process(self, series):
        """Return ``series`` after running every processor over its values."""
        result = series
        for processor in self.processors:
            wrapper = ProcessorWrapper(processor)
            result = result.apply(wrapper.transform)
        return result

In [52]:
# processors = [Lemmatizer(), Stemmer()]
# dic = {'name': ['Eric', 'Ben'], 
#        'age': ['27', '58'],
#        'education': ['National Chengchi University', 'Northwestern University']}
# df = pd.DataFrame(data = dic)

# DataFrameProcessor(processors).process(df.name)

In [53]:
def test_DataFrameProcessor():
    """Lemmatizer followed by Stemmer must lower-case the ``name`` column."""
    frame = pd.DataFrame(data={
        'name': ['Eric', 'Ben'],
        'age': ['27', '58'],
        'education': ['National Chengchi University', 'Northwestern University'],
    })
    pipeline = DataFrameProcessor([Lemmatizer(), Stemmer()])
    # processing is repeatable, so one pass is enough to check every row
    processed = pipeline.process(frame.name)
    assert processed.tolist() == ['eric', 'ben']

<code style="background:yellow;color:black">***Note: The class below runs its work in parallel with multiprocessing, which cannot be run directly inside a notebook, so no unit test is written for it here.***</code>

In [54]:
## multiprocessing only runs reliably from a terminal, or from a script file that
## holds every related function and is then imported into the notebook
class DataFrameParallelProcessor:
    """
    Run a DataFrameProcessor over several columns of a dataframe through a
    multiprocessing pool, one task per column.

    WARNING: This class will operate on the original input dataframe itself

    https://stackoverflow.com/questions/26520781/multiprocessing-pool-whats-the-difference-between-map-async-and-imap
    """
    def __init__(self, processors, n_jobs=1):  # my notebook only has 2 CPU
        # processors: chain handed to DataFrameProcessor
        # n_jobs: number of worker processes in the pool
        self.processors = processors
        self.n_jobs = n_jobs

    def process(self, dfAll, columns):
        """Process ``columns`` of ``dfAll`` in parallel and write them back.

        Parameters
        ----------
        dfAll : dataframe whose columns are overwritten in place
        columns : list of column names to process

        Returns
        -------
        The same ``dfAll`` object, with the listed columns replaced.
        """
        df_processor = DataFrameProcessor(self.processors)
        # The pool is a context manager so its worker processes are always
        # released, even when a processor raises.  imap is lazy, therefore the
        # results must be consumed inside the ``with`` block.
        with multiprocessing.Pool(self.n_jobs) as pool:
            processed = pool.imap(df_processor.process, [dfAll[col] for col in columns])
            for col, series in zip(columns, processed):
                dfAll[col] = series
        return dfAll

In [55]:
# dic = {'name': ['Eric', 'Ben'], 
#    'age': ['27', '58'],
#    'education': ['National Chengchi University', 'Northwestern University']}
# df = pd.DataFrame(data = dic)

In [56]:
# processors = [pp.Lemmatizer(), pp.Stemmer()]
# test_result = pp. DataFrameParallelProcessor(processors).process(df, columns = ['name', 'education'])

In [57]:
# display(df,test_result)

In [58]:
#------------------- Query Expansion -------------------
# 3rd solution in CrowdFlower (Chenglong team decided to remove the feature which might be a major cause of overfitting.)
class QueryExpansion():
    '''
    Build an alternative query for every search term: the most frequent
    n-gram found in the titles of the products listed under that search term.

    if stopwords_threshold decrease, the number of stop word will raise
    '''
    def __init__(self, df, ngram=3, stopwords_threshold=0.9, base_stopwords=set()):  # base_stopwords: can add some stop word at the start
        # Work on a private copy restricted to the two columns that are used.
        self.df = df[["search_term", "product_title"]].copy()
        self.ngram = ngram
        self.stopwords_threshold = stopwords_threshold
        # base_stopwords is copied, never mutated, so the shared default is safe.
        self.stopwords = set(base_stopwords).union(self._get_customized_stopwords())

    def _get_customized_stopwords(self):
        '''
        Return the int((1 - stopwords_threshold) * n_unique) LEAST frequent
        words of all product titles (ties keep first-seen order).

        NOTE(review): the ascending sort selects the rarest words, although an
        earlier comment here spoke of words whose frequency is too high.  The
        behaviour is pinned by test_QueryExpansion and is left unchanged;
        confirm which one was intended.
        '''
        words = " ".join(list(self.df["product_title"].values)).split(" ")
        # word -> number of occurrences over all titles
        counter = Counter(words)
        num_uniq = len(counter)
        # how many words are turned into stopwords
        num_stop = int((1.-self.stopwords_threshold)*num_uniq)
        stopwords = set()
        for e,(w,c) in enumerate(sorted(counter.items(), key=lambda x: x[1])):
            if e == num_stop:
                break
            stopwords.add(w)
        return stopwords

    def _ngram(self, text):
        '''
        Split text on single spaces, drop the stopwords and return the
        n-grams produced by ngram_utils._ngrams for the remaining tokens.
        '''
        tokens = text.split(" ")
        tokens = [token for token in tokens if token not in self.stopwords]
        return ngram_utils._ngrams(tokens, self.ngram, " ")

    def _get_alternative_query(self, lst):
        '''
        Return the most frequent n-gram over all n-gram lists in lst
        (first seen wins on ties).

        Returns an empty string when lst holds no n-gram at all, e.g. when
        every title token of a search term was a stopword; the previous
        implementation raised IndexError in that case.
        '''
        counts = Counter()
        for ngrams in lst:
            counts.update(ngrams)
        if not counts:
            return ""
        value, count = counts.most_common(1)[0]
        return value

    def build(self):
        '''
        Compute the alternative query of every row.

        Returns an array aligned with the rows of the dataframe given to the
        constructor.  Calling build() repeatedly yields the same result.
        '''
        self.df["title_ngram"] = self.df["product_title"].apply(self._ngram)
        # one alternative query per distinct search term
        corpus = self.df.groupby("search_term")["title_ngram"].apply(self._get_alternative_query)
        corpus = corpus.reset_index()
        corpus.columns = ["search_term", "search_term_alt"]
        # Drop the result of a previous build() so the merge does not create
        # suffixed duplicate columns.
        base = self.df.drop(columns=["search_term_alt"], errors="ignore")
        self.df = pd.merge(base, corpus, on="search_term", how="left")
        return self.df["search_term_alt"].values

### Unit Testing: QueryExpansion 

>### define parameter we need 

In [59]:
# # create data frame
# dic = {'search_term': ['Eric', 'Ben', 'Eric'], 
#    'age': ['27', '58', '66'],
#    'product_title': ['a a a a b b b b c c c c d d d d e e e e f f f f g g g g h h h h i i i i j j j j k k k k l l l l mostA mostA mostA mostA mostA mostA',
#                      'a a a a b b b b c c c c d d d d e e e e f f f f g g g g h h h h i i i i j j j j k k k k l l l l mostB mostB mostB mostB mostB mostB',
#                      'stopword_1 stopword_2 stopword_3 stopword_4 stopword_5']}
# df = pd.DataFrame(data = dic)
# df

>### auto create a stopwords

In [60]:
# stopwords = QueryExpansion(df)._get_customized_stopwords()
# stopwords

>### auto create alternative search term

In [61]:
# df['title_ngram'] = df['product_title'].apply(QueryExpansion(df)._ngram)
# df

In [62]:
# corpus = df.groupby('search_term').apply(lambda df: QueryExpansion(df)._get_alternative_query(df['title_ngram']))
# corpus

>### return the array which is alternative query for search term

In [63]:
# QueryExpansion(df).build()

In [64]:
def test_QueryExpansion():
    """Check stopword extraction, n-gram generation, the most-frequent n-gram
    selection and the end-to-end build() of QueryExpansion."""
    # twelve letters repeated four times each, shared by the first two titles
    shared = ' '.join(letter for letter in 'abcdefghijkl' for _ in range(4))
    df_1 = pd.DataFrame(data={
        'search_term': ['Eric', 'Ben', 'Eric'],
        'age': ['27', '58', '66'],
        'product_title': [shared + ' mostA' * 6,
                          shared + ' mostB' * 6,
                          'stopword_1 stopword_2 stopword_3 stopword_4 stopword_5'],
    })

    # automatically derived stopwords
    assert QueryExpansion(df_1)._get_customized_stopwords() == {'stopword_1'}

    # n-grams of a plain token sequence
    expected_ngrams = ['a b c', 'b c d', 'c d e', 'd e f', 'e f g', 'f g h']
    assert QueryExpansion(df_1)._ngram('a b c d e f g h') == expected_ngrams

    # Assume text = aaaaabc and n = 3, then
    # ngram_list = ['a a a', 'a a a', 'a a a', 'a a b', 'a b c'].
    # '_get_alternative_query' is applied per search term, so every row here
    # uses the same search term.
    df_2 = pd.DataFrame(data={
        'search_term': ['Eric', 'Eric', 'Eric'],
        'product_title': ['aaaaabc', 'bbbbbac', 'aaaaabc'],
    })
    a_grams = ['a a a', 'a a a', 'a a a', 'a a b', 'a b c']
    b_grams = ['b b b', 'b b b', 'b b b', 'b b a', 'b a c']
    ngram_series = pd.Series([a_grams, b_grams, list(a_grams)])
    # 'a a a' is the most frequent compound, so it is the expected output
    assert QueryExpansion(df_2)._get_alternative_query(ngram_series) == 'a a a'

    # end to end: one alternative query per row, in row order
    expected_alt = np.array(['mostA mostA mostA', 'mostB mostB mostB', 'mostA mostA mostA'], dtype='object')
    built = QueryExpansion(df_1).build()
    for position in range(3):
        assert built[position] == expected_alt[position]

### Extract Product Name
1. Find color strings, using a hand-written pattern.
2. Find unit strings, using the `UnitConverter()` class defined earlier.
3. Find the other patterns that are not part of the product name and remove them.

In [65]:
#------------------- Extract Product Name -------------------
# 3rd solution in CrowdFlower
color_data = SourceFileLoader('COLOR_LIST', config.COLOR_DATA).load_module()  # import specific .py file which is not has same path with this file
# The color alternation is wrapped in (?:...) so that the look-behind and the
# look-ahead apply to EVERY color.  Without the group, `|` has the lowest
# precedence: the look-behind guarded only the first color and the look-ahead
# only the last one, so colors were also matched inside longer words.
COLORS_PATTERN = r'(?<=\W|^)(?:%s)(?=\W|$)'%('|'.join(color_data.COLOR_LIST))
# Unit text of each UnitConverter replacement: everything after its first
# space-separated token.
UNITS = [' '.join(r.strip().split(' ')[1:]) for p,r in UnitConverter().pattern_replace_pair_list]
# A number optionally followed by " <unit>" and trailing dots.  The unit
# alternation is grouped for the same precedence reason as above: ungrouped,
# the leading space belonged to the first unit only and the trailing dots to
# the last unit only.
# NOTE(review): `[?:.,]` is a character class (it also accepts '?' and ':'),
# and the '.' inside the unit strings is not escaped; both are left as they were.
UNITS_PATTERN = r'(?:\d+[?:.,]?\d*)(?: (?:%s)\.*)?'%('|'.join(UNITS))
# Dimensions such as "N x N x N" and "N x N", each N being a number with an optional unit.
DIM_PATTERN_NxNxN = r'%s ?x %s ?x %s'%(UNITS_PATTERN, UNITS_PATTERN, UNITS_PATTERN)
DIM_PATTERN_NxN = r'%s ?x %s'%(UNITS_PATTERN, UNITS_PATTERN)

In [66]:
# color_data = imp.load_source('', config.COLOR_DATA)
# COLORS_PATTERN = r'(?<=\W|^)%s(?=\W|$)'%('|'.join(color_data.COLOR_LIST))
# COLORS_PATTERN

In [67]:
# # p: original pattern  r: replace pattern
# # strip(): remove head and tail white spaces
# # find all unit which set before by UnitConverter().pattern_replace_pair_list
# UNITS = [' '.join(r.strip().split(' ')[1:]) for p,r in UnitConverter().pattern_replace_pair_list]
# UNITS_PATTERN = r'(?:\d+[?:.,]?\d*)(?: %s\.*)?'%('|'.join(UNITS))
# display(UNITS, UNITS_PATTERN)

In [68]:
# DIM_PATTERN_NxNxN = r'%s ?x %s ?x %s'%(UNITS_PATTERN, UNITS_PATTERN, UNITS_PATTERN)
# DIM_PATTERN_NxN = r'%s ?x %s'%(UNITS_PATTERN, UNITS_PATTERN)
# display(DIM_PATTERN_NxNxN)
# print('===================================================================================================')
# display(DIM_PATTERN_NxN)

<code style="background:yellow;color:black">***In the original version the code is not correct. I think the problem is that the 3rd-place solution in CrowdFlower does not use the 'regex' module (it only uses the 're' module). So, in this case, we need to add 'r' in front of the string and replace `\\b` with `\b`. This task is not specifically designed for this case, but the fix is still helpful.***</code>

\b: 匹配一個詞的邊界。一個詞的邊界就是一個詞不被另外一個“字”字符跟隨的位置或者前面跟其他“字”字符的位置，例如在字母和空格之間。注意，匹配中不包括匹配的字邊界。換句話說，一個匹配的詞的邊界的內容的長度是0
```python
text = '1234 5687 1918711'
# match '1' only where a word boundary follows it (the end of a word)
regex.sub(r'1\b', '', text) == '1234 5687 191871'
# match '1' only where a word boundary precedes it (the start of a word)
regex.sub(r'\b1', '', text) == '234 5687 918711'
```

In [69]:
# 3rd solution in CrowdFlower
class ProductNameExtractor(BaseReplacer):
    """
    Reduce a search term or product title to (at most) its last two words
    after stripping descriptions, colors, sizes, dimensions, units and numbers.

    All helper processors are created once in __init__ and reused for every
    text; they used to be rebuilt on each call.
    """
    def __init__(self):
        # rules applied by BaseReplacer.transform after preprocess()
        self.pattern_replace_pair_list = [
            # Remove descriptions (text between paranthesis'()'/brackets'[]')
            (r'[ ]?[[(].+?[])]', r''),
            # Remove 'made in...'
            (r'made in [a-z]+\b', r''),
            # Remove descriptions (hyphen'-' or comma',' followed by space then at most 2 words, repeated)
            (r'([,-]( ([a-zA-Z0-9]+\b)){1,2}[ ]?){1,}$', r''),
            # Remove descriptions (prepositions staring with: with, for, by, in )
            (r'\b(with|for|by|in|w/) .+$', r''),
            # colors & sizes
            (r'size: .+$', r''),
            (r'size [0-9]+[.]?[0-9]+\b', r''),
            (COLORS_PATTERN, r''),
            # dimensions
            (DIM_PATTERN_NxNxN, r''),
            (DIM_PATTERN_NxN, r''),
            # measurement units
            (UNITS_PATTERN, r''),
            # others
            (r'(value bundle|warranty|brand new|excellent condition|one size|new in box|authentic|as is)', r''),
            # stop words
            (r'\b(in)\b', r''),
            # hyphenated words
            (r'([a-zA-Z])-([a-zA-Z])', r'\1\2'),
            # special characters
            (r'[ &<>)(_,.;:!?/+#*-]+', r' '),
            # numbers that are not part of a word
            (r'\b[0-9]+\b', r''),
        ]
        # rules applied first by preprocess(), before lower-casing
        self._preprocess_replacer = BaseReplacer([
            # Remove single & double apostrophes
            (r'[\']+', r''),
            # Remove product codes (words of 5 or more characters that are all caps, numbers or a mix of both)
            (r'[ ]?\b[0-9A-Z-]{5,}\b', r''),
        ])
        # remaining preprocess() steps, in execution order
        # (UnitConverter runs twice, as it did before)
        unit_converter = UnitConverter()
        self._preprocess_steps = [
            LowerCaseConverter(),
            DigitLetterSplitter(),
            unit_converter,
            DigitCommaDigitMerger(),
            NumberDigitMapper(),
            unit_converter,
        ]
        # post-processing applied by transform()
        self._lemmatizer = Lemmatizer()
        self._stemmer = Stemmer(stemmer_type='snowball')

    def preprocess(self, text):
        """Normalise text before the extraction rules are applied."""
        text = self._preprocess_replacer.transform(text)
        for step in self._preprocess_steps:
            text = step.transform(text)
        return text

    def transform(self, text):
        """Return the extracted product name: the last two space-separated
        words left after preprocessing, rule replacement, lemmatizing and
        stemming."""
        text = super().transform(self.preprocess(text))
        text = self._lemmatizer.transform(text)
        text = self._stemmer.transform(text)
        # last two words in product
        text = ' '.join(text.split(' ')[-2:])
        return text


In [70]:
#------------------- Process Attributes -------------------
def _split_attr_to_text(text):
    attrs = text.split(config.ATTR_SEPARATOR)  # ' | '
    return ' '.join(attrs)

def _split_attr_to_list(text):
    attrs = text.split(config.ATTR_SEPARATOR)        
    if len(attrs) == 1:
        # missing
        return [[attrs[0], attrs[0]]]
    else:  # attrs[::2]: means return values which are according to indexes order 0,2,4,6,... 
        return [[n,v] for n,v in zip(attrs[::2], attrs[1::2])]  # attrs[1::2]: means return values which are according to indexes order 1,3,5,... 


In [71]:
def test_split_attr_to_text():
    """The ' | ' separators must be replaced by single spaces."""
    concatenated = 'A | A_attr | B | B_attr | C | C_attr'
    flattened = _split_attr_to_text(concatenated)
    assert flattened == 'A A_attr B B_attr C C_attr'

In [72]:
def test_split_attr_to_list():
    """Alternating name / value pieces must be grouped into [name, value] pairs."""
    concatenated = 'A | A_attr | B | B_attr | C | C_attr'
    expected_pairs = [['A', 'A_attr'], ['B', 'B_attr'], ['C', 'C_attr']]
    assert _split_attr_to_list(concatenated) == expected_pairs

In [73]:
# dfAll = pkl_utils._load(config.ALL_DATA_RAW)

In [74]:
# dfAll.head(3)

In [75]:
# text = dfAll['product_attribute_concat'][0]
# text

In [76]:
# _split_attr_to_text(text)

In [77]:
# _split_attr_to_list(text)

### 1. Record Time

In [78]:
# Timestamp of this run; reused below in the log and output file names.
now = time_utils._timestamp()
now

'2021-01-31-18-11'

### 2. Setup

In [79]:
###########
## Setup ##
###########
# One log file per run, named after the run timestamp.
logname = f'data_processor_{now}.log'
logger = logging_utils._get_logger(config.LOG_DIR, logname)


# Put product_attribute_list, product_attribute and product_description first as they are
# quite time consuming to process.
# Choose the columns by check data_preparer.ipynb. In the end, the notebook will show the clean data frame.
columns_to_proc = [
    # # product_attribute_list is very time consuming to process
    # # so we just process product_attribute which is of the form 
    # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ...
    # # and split it into a list afterwards
    # 'product_attribute_list',
    'product_attribute_concat',
    'product_description',
    'product_brand', 
    'product_color',
    'product_title',
    'search_term', 
]
# On Linux the job count is set to the number of columns to process.
if config.PLATFORM == 'Linux':
    config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc)

In [80]:
# clean using a list of processors
processors = [
    LowerCaseConverter(), 
    # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here
    # In practice it makes no difference, unless a number followed by the
    # preposition "in" could be kept from being replaced by the unit "in." (inch)
    UnitConverter(),
    LowerUpperCaseSplitter(), 
    WordReplacer(replace_fname=config.WORD_REPLACER_DATA), 
    LetterLetterSplitter(),
    DigitLetterSplitter(), 
    DigitCommaDigitMerger(), 
    NumberDigitMapper(),
    UnitConverter(), 
    QuartetCleaner(), 
    HtmlCleaner(parser='html.parser'), 
    Lemmatizer(),
]
stemmers = [
    Stemmer(stemmer_type='snowball'), 
    Stemmer(stemmer_type='porter')
][0:1]  # means only use Stemmer(stemmer_type='snowball')

## simple test
# Run one sample string through the whole processor chain and print it
# before and after.
text = '1/2 inch rubber lep tips Bullet07'
print('Original:')
print(text)
list_processor = ListProcessor(processors)
print('After:')
print(list_processor.process([text]))

Original:
1/2 inch rubber lep tips Bullet07
After:
['1/2 in. rubber lep tip bullet 07']


### 2. Load Data

In [81]:
#############
## Process ##
#############
## load raw data
dfAll = pkl_utils._load(config.ALL_DATA_RAW)
# keep only the columns that actually exist in the loaded data
columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns]

In [82]:
# In 'sample' mode only the first config.SAMPLE_SIZE rows are processed.
if config.TASK == 'sample':
    # .copy() detaches the sample from the full frame, so the column
    # assignments made in later cells do not write into a slice
    # (pandas SettingWithCopyWarning).
    dfAll = dfAll.iloc[0:config.SAMPLE_SIZE].copy()
    print(f'data length: {len(dfAll)}')

data length: 200


### 3. Extract Product Name from `search_term` and `product_title`

In [83]:
## extract product name from search_term and product_title
# The same extractor instance is applied to both columns; the results are
# stored in two new columns.
ext = ProductNameExtractor()
dfAll['search_term_product_name'] = dfAll['search_term'].apply(ext.transform)
dfAll['product_title_product_name'] = dfAll['product_title'].apply(ext.transform)
if config.TASK == 'sample':
    print(dfAll[['search_term', 'search_term_product_name', 'product_title_product_name']])

                                 search_term search_term_product_name  \
0                              angle bracket             angl bracket   
1                                  l bracket                l bracket   
2                                  deck over                deck over   
3                           rain shower head              shower head   
4                         shower only faucet              onli faucet   
..                                       ...                      ...   
195  rigid lithium ion batteries fuego drill              fuego drill   
196                                  chipper                  chipper   
197                                    bidet                    bidet   
198                            shark cleaner            shark cleaner   
199                             shark vacuum             shark vacuum   

    product_title_product_name  
0                    gaug angl  
1                    gaug angl  
2                 concre

In [84]:
# config.GOOGLE_CORRECTING_QUERY

In [85]:
## clean using GoogleQuerySpellingChecker(Chenglong team not used in final submission)
# MUST BE IN FRONT OF ALL THE PROCESSING
# Only runs when config.GOOGLE_CORRECTING_QUERY is set; overwrites search_term.
if config.GOOGLE_CORRECTING_QUERY:
    logger.info('Run GoogleQuerySpellingChecker at search_term')
    checker = GoogleQuerySpellingChecker()
    dfAll['search_term'] = dfAll['search_term'].apply(checker.correct)

In [86]:
# dfAll.head(3)

In [87]:
## clean using a list of processors
# The processor list comes from the imported parallel_processing module (pp),
# not from the `processors` list defined in this notebook.
df_processor = pp.DataFrameParallelProcessor(pp.processors, config.DATA_PROCESSOR_N_JOBS)
df_processor.process(dfAll, columns_to_proc)
# split product_attribute_concat into product_attribute and product_attribute_list
dfAll['product_attribute'] = dfAll['product_attribute_concat'].apply(_split_attr_to_text)
dfAll['product_attribute_list'] = dfAll['product_attribute_concat'].apply(_split_attr_to_list)
if config.TASK == 'sample':
    print(dfAll[['product_attribute', 'product_attribute_list']])

                                     product_attribute  \
0    bullet 01 versatile connector for various 90 d...   
1    bullet 01 versatile connector for various 90 d...   
2    application method brush roller spray assemble...   
3    bath faucet type combo tub and shower built in...   
4    bath faucet type combo tub and shower built in...   
..                                                 ...   
195  battery included yes battery amp hour 1.5 batt...   
196  amperage amp 13 bag maximum load capacity bush...   
197  bowl height in. 17 bowl shape elongated bullet...   
198                                       missingvalue   
199                                       missingvalue   

                                product_attribute_list  
0    [[bullet 01, versatile connector for various 9...  
1    [[bullet 01, versatile connector for various 9...  
2    [[application method, brush roller spray], [as...  
3    [[bath faucet type, combo tub and shower], [bu...  
4    [[bath faucet

In [88]:
# dfAll[['product_attribute', 'product_attribute_list']]

In [89]:
# config.TASK

In [90]:
# config.QUERY_EXPANSION

In [91]:
# query expansion (Chenglong team decided to remove the feature which might be a major cause of overfitting.)
if config.QUERY_EXPANSION:
    list_processor = ListProcessor(processors)
    # The stop words must go through the same processing as the data. E.g. NumberDigitMapper
    # replaces 'one' with '1', so a stop word 'one' has to become '1' as well.
    base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))  # a set of stop word
    qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
    dfAll['search_term_alt'] = qe.build()
    if config.TASK == 'sample':
        print(dfAll[['search_term', 'search_term_alt']])

In [92]:
# save data
# Every column except the intermediate 'product_attribute_concat' is written
# to config.ALL_DATA_LEMMATIZED.
logger.info(f'Save to {config.ALL_DATA_LEMMATIZED}')
columns_to_save = [col for col in dfAll.columns if col != 'product_attribute_concat']
pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])


In [93]:
# config.AUTO_CORRECTING_QUERY

In [94]:
## auto correcting query(Chenglong team not used in final submission)
if config.AUTO_CORRECTING_QUERY:
    logger.info('Run AutoSpellingChecker at search_term')
    checker = AutoSpellingChecker(dfAll, exclude_stopwords=False, min_len=4)
    dfAll['search_term_auto_corrected'] = list(dfAll['search_term'].apply(checker.correct))
    # the new column is added to the columns processed by the stemmers below
    columns_to_proc += ['search_term_auto_corrected']
    if config.TASK == 'sample':
        print(dfAll[['search_term', 'search_term_auto_corrected']])
    # save query_correction_map and spelling checker
    fname = '%s/auto_spelling_checker_query_correction_map_%s.log'%(config.LOG_DIR, now)
    checker.save_query_correction_map(fname)
    # save data
    # (writes config.ALL_DATA_LEMMATIZED again, now including the corrected column)
    logger.info('Save to %s'%config.ALL_DATA_LEMMATIZED)
    columns_to_save = [col for col in dfAll.columns if col != 'product_attribute_concat']
    pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])

In [95]:
## clean using stemmers
# The stemmer list comes from the imported parallel_processing module (pp).
df_processor = pp.DataFrameParallelProcessor(pp.stemmers, config.DATA_PROCESSOR_N_JOBS)
df_processor.process(dfAll, columns_to_proc)
# split product_attribute_concat into product_attribute and product_attribute_list
# (recomputed from the stemmed product_attribute_concat)
dfAll['product_attribute'] = dfAll['product_attribute_concat'].apply(_split_attr_to_text)
dfAll['product_attribute_list'] = dfAll['product_attribute_concat'].apply(_split_attr_to_list)

In [96]:
# config.QUERY_EXPANSION

In [97]:
# query expansion
# Same as the earlier query expansion, but the stop words are run through the
# stemmers and the result is built from the stemmed data.
if config.QUERY_EXPANSION:
    list_processor = ListProcessor(stemmers)
    base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
    qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
    dfAll['search_term_alt'] = qe.build()
    if config.TASK == 'sample':
        print(dfAll[['search_term', 'search_term_alt']])

In [98]:
# save data
# Every column except the intermediate 'product_attribute_concat' is written
# to config.ALL_DATA_LEMMATIZED_STEMMED.
logger.info('Save to %s'%config.ALL_DATA_LEMMATIZED_STEMMED)
columns_to_save = [col for col in dfAll.columns if col != 'product_attribute_concat']
pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll[columns_to_save])

In [99]:
# Run the unit tests of the exported script.
# NOTE(review): data_processor.py is regenerated by the jupytext cell at the end
# of this notebook, so on a top-to-bottom run this tests the previously
# exported file.
!py.test -vv data_processor.py -Wignore 

platform win32 -- Python 3.7.7, pytest-6.1.1, py-1.9.0, pluggy-0.13.1 -- D:\Anaconda3\envs\HomeDepotProductSearchRelevance\python.exe
cachedir: .pytest_cache
rootdir: D:\python\kaggle_compete\Home_Depot_Product_Search_Relevance\Code
plugins: dash-1.14.0
collecting ... collected 19 items

data_processor.py::test_BaseReplacer PASSED                              [  5%]
data_processor.py::test_LowerCaseConverter PASSED                        [ 10%]
data_processor.py::test_LowerUpperCaseSplitter PASSED                    [ 15%]
data_processor.py::test_WordReplacer PASSED                              [ 21%]
data_processor.py::test_LetterLetterSplitter PASSED                      [ 26%]
data_processor.py::test_DigitLetterSplitter PASSED                       [ 31%]
data_processor.py::test_DigitCommaDigitMerger PASSED                     [ 36%]
data_processor.py::test_NumberDigitMapper PASSED                         [ 42%]
data_processor.py::test_UnitConverter PASSED                           

## Data Processing Modular Design
---

In [100]:
def _add_attribute_columns(dfAll):
    """Split product_attribute_concat into product_attribute (flat text) and
    product_attribute_list (name/value pairs); adds both columns in place."""
    dfAll['product_attribute'] = dfAll['product_attribute_concat'].apply(_split_attr_to_text)
    dfAll['product_attribute_list'] = dfAll['product_attribute_concat'].apply(_split_attr_to_list)


def _add_query_expansion(dfAll, stopword_processors):
    """Add the search_term_alt column to dfAll in place.

    The stop words must go through the same processing as the data (e.g.
    NumberDigitMapper replaces 'one' with '1', so a stop word 'one' has to
    become '1' as well); stopword_processors is the chain used for that.
    """
    list_processor = ListProcessor(stopword_processors)
    base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))  # a set of stop word
    qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
    dfAll['search_term_alt'] = qe.build()
    if config.TASK == 'sample':
        print(dfAll[['search_term', 'search_term_alt']])


def _save_processed_data(dfAll, fname, logger):
    """Log the target and save every column of dfAll except the intermediate
    product_attribute_concat to fname."""
    logger.info(f'Save to {fname}')
    columns_to_save = [col for col in dfAll.columns if col != 'product_attribute_concat']
    pkl_utils._save(fname, dfAll[columns_to_save])


def main():
    """Run the whole data processing pipeline.

    Loads config.ALL_DATA_RAW, extracts product names, cleans the text
    columns with the processor chain, saves the lemmatized data, applies the
    stemmers and saves the lemmatized + stemmed data.  Spelling correction and
    query expansion run only when enabled in config.
    """
    ### 1. Record Time
    now = time_utils._timestamp()
    ###########
    ## Setup ##
    ###########
    logname = f'data_processor_{now}.log'
    logger = logging_utils._get_logger(config.LOG_DIR, logname)


    # Put product_attribute_list, product_attribute and product_description first as they are
    # quite time consuming to process.
    # Choose the columns by check data_preparer.ipynb. In the end, the notebook will show the clean data frame.
    columns_to_proc = [
        # # product_attribute_list is very time consuming to process
        # # so we just process product_attribute which is of the form 
        # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ...
        # # and split it into a list afterwards
        # 'product_attribute_list',
        'product_attribute_concat',
        'product_description',
        'product_brand', 
        'product_color',
        'product_title',
        'search_term', 
    ]
    if config.PLATFORM == 'Linux':
        config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc)

    # clean using a list of processors
    processors = [
        LowerCaseConverter(), 
        # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here
        # In practice it makes no difference, unless a number followed by the
        # preposition "in" could be kept from being replaced by the unit "in." (inch)
        UnitConverter(),
        LowerUpperCaseSplitter(), 
        WordReplacer(replace_fname=config.WORD_REPLACER_DATA), 
        LetterLetterSplitter(),
        DigitLetterSplitter(), 
        DigitCommaDigitMerger(), 
        NumberDigitMapper(),
        UnitConverter(), 
        QuartetCleaner(), 
        HtmlCleaner(parser='html.parser'), 
        Lemmatizer(),
    ]
    stemmers = [
        Stemmer(stemmer_type='snowball'), 
        Stemmer(stemmer_type='porter')
    ][0:1]  # means only use Stemmer(stemmer_type='snowball')

    ## simple test
    text = '1/2 inch rubber lep tips Bullet07'
    print('Original:')
    print(text)
    list_processor = ListProcessor(processors)
    print('After:')
    print(list_processor.process([text]))

    #############
    ## Process ##
    #############
    ## load raw data
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns]

    if config.TASK == 'sample':
        # .copy() detaches the sample from the full frame, so the column
        # assignments below do not write into a slice (SettingWithCopyWarning)
        dfAll = dfAll.iloc[0:config.SAMPLE_SIZE].copy()
        print(f'data length: {len(dfAll)}')

    ## extract product name from search_term and product_title
    ext = ProductNameExtractor()
    dfAll['search_term_product_name'] = dfAll['search_term'].apply(ext.transform)
    dfAll['product_title_product_name'] = dfAll['product_title'].apply(ext.transform)
    if config.TASK == 'sample':
        print(dfAll[['search_term', 'search_term_product_name', 'product_title_product_name']])

    ## clean using GoogleQuerySpellingChecker(Chenglong team not used in final submission)
    # MUST BE IN FRONT OF ALL THE PROCESSING
    if config.GOOGLE_CORRECTING_QUERY:
        logger.info('Run GoogleQuerySpellingChecker at search_term')
        checker = GoogleQuerySpellingChecker()
        dfAll['search_term'] = dfAll['search_term'].apply(checker.correct)

    ## clean using a list of processors
    df_processor = pp.DataFrameParallelProcessor(pp.processors, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    _add_attribute_columns(dfAll)
    if config.TASK == 'sample':
        print(dfAll[['product_attribute', 'product_attribute_list']])

    # query expansion (Chenglong team decided to remove the feature which might be a major cause of overfitting.)
    if config.QUERY_EXPANSION:
        _add_query_expansion(dfAll, processors)

    # save data
    _save_processed_data(dfAll, config.ALL_DATA_LEMMATIZED, logger)

    ## auto correcting query(Chenglong team not used in final submission)
    if config.AUTO_CORRECTING_QUERY:
        logger.info('Run AutoSpellingChecker at search_term')
        checker = AutoSpellingChecker(dfAll, exclude_stopwords=False, min_len=4)
        dfAll['search_term_auto_corrected'] = list(dfAll['search_term'].apply(checker.correct))
        # the new column is stemmed together with the others below
        columns_to_proc += ['search_term_auto_corrected']
        if config.TASK == 'sample':
            print(dfAll[['search_term', 'search_term_auto_corrected']])
        # save query_correction_map and spelling checker
        fname = '%s/auto_spelling_checker_query_correction_map_%s.log'%(config.LOG_DIR, now)
        checker.save_query_correction_map(fname)
        # save data
        _save_processed_data(dfAll, config.ALL_DATA_LEMMATIZED, logger)

    ## clean using stemmers
    df_processor = pp.DataFrameParallelProcessor(pp.stemmers, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    _add_attribute_columns(dfAll)

    # query expansion
    if config.QUERY_EXPANSION:
        _add_query_expansion(dfAll, stemmers)

    # save data
    _save_processed_data(dfAll, config.ALL_DATA_LEMMATIZED_STEMMED, logger)

    
    
    
    
    
    
# Run the pipeline when this file is executed as a script.
if __name__ == "__main__":
    main()

Original:
1/2 inch rubber lep tips Bullet07
After:
['1/2 in. rubber lep tip bullet 07']
data length: 200
                                 search_term search_term_product_name  \
0                              angle bracket             angl bracket   
1                                  l bracket                l bracket   
2                                  deck over                deck over   
3                           rain shower head              shower head   
4                         shower only faucet              onli faucet   
..                                       ...                      ...   
195  rigid lithium ion batteries fuego drill              fuego drill   
196                                  chipper                  chipper   
197                                    bidet                    bidet   
198                            shark cleaner            shark cleaner   
199                             shark vacuum             shark vacuum   

    product_title_

In [101]:
# Export this notebook (data_processor.ipynb) to a .py script with jupytext.
!jupytext --to py data_processor.ipynb

[jupytext] Reading data_processor.ipynb in format ipynb
[jupytext] Writing data_processor.py (destination file replaced)
