In [None]:
import pandas as pd
from databaker.framework import *
from tutorialResources.scraper import Scraper

# Build the tutorial scraper for the example landing page.
scraper = Scraper("https://www.fake-website.com/example1")

# Load the scraped distribution as databaker tabs; the cells below iterate over `tabs`.
tabs = scraper.distribution.as_databaker()


# Apply

The databaker apply keyword is a means to pass a callable (a function, a class or a lambda function) into an HDim constructor.

There are two drivers for this:
    
- Simplification of transforms
- Performance - if you modify something before you pivot it, it's **very** efficient compared to post processing.

The following is just the unmodified transform.

In [None]:

tidied_sheets = []
for tab in tabs:

    # Observations: the non-blank cells reached by expanding down and right from C5.
    observations = (
        tab.excel_ref('C5')
        .expand(DOWN)
        .expand(RIGHT)
        .is_not_blank()
    )

    # Cell selections that will act as dimensions for those observations.
    assets = tab.excel_ref('C3').expand(RIGHT).is_not_blank()  # row starting at C3
    names = tab.excel_ref('B5').expand(DOWN).is_not_blank()    # column starting at B5
    group = tab.excel_ref('A5').expand(DOWN).is_not_blank()    # column starting at A5

    # How each dimension selection relates to an observation cell.
    dimensions = [
        HDim(assets, "Assets", DIRECTLY, ABOVE),
        HDim(names, "Name", DIRECTLY, LEFT),
        HDim(group, "Group", CLOSEST, ABOVE),
    ]

    # Process the relationships for this tab (this step rarely changes).
    tidy_sheet = ConversionSegment(tab, dimensions, observations)
    # savepreviewhtml(tidy_sheet)

    # Keep this tab's result so all tabs can be concatenated afterwards.
    tidied_sheets.append(tidy_sheet.topandas())

output = pd.concat(tidied_sheets)
output.head(10)

# Example 1:
    
Let's add a "The" prefix to the group names:

Note - the only change is the "group" HDim

In [None]:
tidied_sheets = []
for tab in tabs:

    # Observations: the non-blank cells reached by expanding down and right from C5.
    observations = (
        tab.excel_ref('C5')
        .expand(DOWN)
        .expand(RIGHT)
        .is_not_blank()
    )

    # Cell selections that will act as dimensions for those observations.
    assets = tab.excel_ref('C3').expand(RIGHT).is_not_blank()  # row starting at C3
    names = tab.excel_ref('B5').expand(DOWN).is_not_blank()    # column starting at B5
    group = tab.excel_ref('A5').expand(DOWN).is_not_blank()    # column starting at A5

    # How each dimension selection relates to an observation cell.
    # Only the "Group" HDim differs from the baseline: `apply` prefixes each value with "The ".
    dimensions = [
        HDim(assets, "Assets", DIRECTLY, ABOVE),
        HDim(names, "Name", DIRECTLY, LEFT),
        HDim(group, "Group", CLOSEST, ABOVE, apply=lambda x: f'The {x}'),
    ]

    # Process the relationships for this tab (this step rarely changes).
    tidy_sheet = ConversionSegment(tab, dimensions, observations)
    # savepreviewhtml(tidy_sheet)

    # Keep this tab's result so all tabs can be concatenated afterwards.
    tidied_sheets.append(tidy_sheet.topandas())

output = pd.concat(tidied_sheets)
output.head(10)

# Example 2:
    
Let's pass in a more complicated function that URL-ifies the Beatles but ignores the Rolling Stones.
    
Note - the only change is the "name" HDim

In [None]:
def beatles_as_urls(value):
    """Map a Beatle's first name to a URL-style path; pass anything else through.

    Args:
        value: The cell value to transform (e.g. "John").

    Returns:
        The matching "beatles/<name>/<role>" path when `value` is one of the
        four known names, otherwise `value` unchanged.
    """
    beatle_paths = {
        "John": "beatles/john/singer",
        "Paul": "beatles/paul/bass",
        "Ringo": "beatles/ringo/drums",
        # Fixed typo: this path previously read "beatles/geaoge/guitar".
        "George": "beatles/george/guitar",
    }
    # Unknown names (e.g. the Rolling Stones) fall through untouched.
    return beatle_paths.get(value, value)

tidied_sheets = []
for tab in tabs:

    # Observations: the non-blank cells reached by expanding down and right from C5.
    observations = (
        tab.excel_ref('C5')
        .expand(DOWN)
        .expand(RIGHT)
        .is_not_blank()
    )

    # Cell selections that will act as dimensions for those observations.
    assets = tab.excel_ref('C3').expand(RIGHT).is_not_blank()  # row starting at C3
    names = tab.excel_ref('B5').expand(DOWN).is_not_blank()    # column starting at B5
    group = tab.excel_ref('A5').expand(DOWN).is_not_blank()    # column starting at A5

    # How each dimension selection relates to an observation cell.
    # Only the "Name" HDim differs from the baseline: `apply` is given a named function.
    dimensions = [
        HDim(assets, "Assets", DIRECTLY, ABOVE),
        HDim(names, "Name", DIRECTLY, LEFT, apply=beatles_as_urls),
        HDim(group, "Group", CLOSEST, ABOVE),
    ]

    # Process the relationships for this tab (this step rarely changes).
    tidy_sheet = ConversionSegment(tab, dimensions, observations)
    # savepreviewhtml(tidy_sheet)

    # Keep this tab's result so all tabs can be concatenated afterwards.
    tidied_sheets.append(tidy_sheet.topandas())

output = pd.concat(tidied_sheets)
output.head(10)

# Example 3

You can pass in a sequence of callables via a tuple if you want to.

In this case we're going to pass in two lambda functions:

- Add a prefix of "lots of"
- Replace the term "lots" with "many types" but _only_ for houses

For this one look at the Assets HDim constructor.

_Note - pretty ugly but this is just a quick example_.

In [None]:
tidied_sheets = []
for tab in tabs:

    # Observations: the non-blank cells reached by expanding down and right from C5.
    observations = (
        tab.excel_ref('C5')
        .expand(DOWN)
        .expand(RIGHT)
        .is_not_blank()
    )

    # Cell selections that will act as dimensions for those observations.
    assets = tab.excel_ref('C3').expand(RIGHT).is_not_blank()  # row starting at C3
    names = tab.excel_ref('B5').expand(DOWN).is_not_blank()    # column starting at B5
    group = tab.excel_ref('A5').expand(DOWN).is_not_blank()    # column starting at A5

    # How each dimension selection relates to an observation cell.
    # Only the "Assets" HDim differs from the baseline: `apply` is given a tuple of two lambdas.
    dimensions = [
        HDim(
            assets,
            "Assets",
            DIRECTLY,
            ABOVE,
            apply=(
                lambda x: f'lots of {x}',
                lambda x: x.replace("lots", "many types") if "Houses" in x else x,
            ),
        ),
        HDim(names, "Name", DIRECTLY, LEFT),
        HDim(group, "Group", CLOSEST, ABOVE),
    ]

    # Process the relationships for this tab (this step rarely changes).
    tidy_sheet = ConversionSegment(tab, dimensions, observations)
    # savepreviewhtml(tidy_sheet)

    # Keep this tab's result so all tabs can be concatenated afterwards.
    tidied_sheets.append(tidy_sheet.topandas())

output = pd.concat(tidied_sheets)
output.head(10)