### Extracting and Simple Preprocessing

What I did:
- Extracted the tables with bs4, then pulled the query, value and percent (link, total visits and % visited) from each table's rows
- Expanded abbreviated values like "27k" to 27000 and converted them to integers
- Transformed the data into a new dataframe with one row for each count in the value column (optional step, commented out below)
- Extracted Query Params from query column
- Transformed query params to individual columns

In [3]:
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, unquote

# Read the 9 exported HTML files, keep only each page's data-table <div>,
# and join the extracted markup into a single HTML string.
full_html = []
for i in range(1, 10):
    # `with` guarantees the file handle is closed, even if reading fails
    with open('./umami/umami-mentorship-all-time-{}.htm'.format(i), "r") as file:
        html = file.read()

    soup = BeautifulSoup(html, 'lxml')
    # NOTE(review): if a page has no matching div, find() returns None and the
    # literal string "None" is appended — confirm every page contains the table.
    html_table = str(soup.find('div', attrs={'class': 'DataTable_body__tW6Gx'}))
    full_html.append(html_table)

print(len(full_html))
full_html = " ".join(full_html)

9


In [4]:
soup = BeautifulSoup(full_html, 'lxml')

# find every table body with the class DataTable_body__tW6Gx
tables = soup.find_all('div', attrs={'class': 'DataTable_body__tW6Gx'})

print(len(tables))

# Collect one record per table row and build the DataFrame once at the end;
# concatenating inside the loop re-copies the whole frame for every row.
records = []
for table in tables:
    # the label / value / percent cells of every row in this table
    query = table.find_all('div', attrs={'class': 'DataTable_label__nk_tp'})
    value = table.find_all('div', attrs={'class': 'DataTable_value__dgeWm'})
    percent = table.find_all('div', attrs={'class': 'DataTable_percent__EBsIj'})

    # every label needs a matching value and percent cell, otherwise the
    # columns would be misaligned
    if len(value) < len(query) or len(percent) < len(query):
        raise ValueError(
            "table has {} labels but only {} values and {} percents".format(
                len(query), len(value), len(percent)))

    # zip stops at the shortest list, which is `query` after the check above
    for query_cell, value_cell, percent_cell in zip(query, value, percent):
        records.append({'query': query_cell.text,
                        'value': value_cell.text,
                        'percent': percent_cell.text})

# create df (explicit columns keep the schema even when no rows were found)
df = pd.DataFrame(records, columns=['query', 'value', 'percent'])

9


Preprocessing

In [5]:
def expandValue(value):
    """Convert a visit count such as "27.8k" or "980" to an integer.

    Parameters
    ----------
    value : str or number
        Raw count text. A trailing 'k' means thousands. Values that are
        already numeric are passed through as ``int`` so the cell can be
        re-run on a converted column.

    Returns
    -------
    int
        The expanded count.

    Raises
    ------
    ValueError
        If the text (without the 'k' suffix) is not a valid number.
    """
    # Already numeric: nothing to expand.
    if not isinstance(value, str):
        return int(value)

    text = value.strip()
    if text.endswith('k'):
        # round() instead of int(): a float product can land just below the
        # intended integer, and int() would truncate it down by one.
        return round(float(text[:-1]) * 1000)

    # Plain counts must become integers too, otherwise the column ends up
    # with a mix of str and int.
    return int(text)

# Normalise every raw count in the 'value' column, then preview the result.
df['value'] = df['value'].map(expandValue)
df.head()

# show full dataset and pandas show full width everything
# pd.set_option('display.max_colwidth', None)

Unnamed: 0,query,value,percent
0,/?size=n_20_n,27800,25%
1,/,27800,12%
2,/?size=n_20_n&filters[0][field]=industries&fil...,13600,1%
3,/?current=n_2_n&size=n_20_n,13600,1%
4,/?size=n_60_n,1290,1%


In [6]:
# The 'query' column holds the visited URL, so give it a descriptive name.
df = df.rename(columns={'query': 'visitor_url'})

def extract_query_params(url):
    """Parse the query string of ``url`` into a dict of parameter lists.

    Parameters
    ----------
    url : str
        A URL or path such as ``/?size=n_20_n``.

    Returns
    -------
    dict
        Maps each parameter name to the list of its values, as returned by
        ``urllib.parse.parse_qs``. Empty when the URL has no query string.
    """
    # parse_qs already percent-decodes names and values. Decoding the whole
    # URL first would turn an encoded '&', '=', '#' or '+' inside a value into
    # a real delimiter and split or truncate the parameters.
    # NOTE(review): assumes the URLs are percent-encoded once — confirm.
    return parse_qs(urlparse(url).query)

# Store the parsed query parameters of every visited URL as a dict per row.
df['query_params'] = df['visitor_url'].map(extract_query_params)

In [7]:
# Turn each row's query-param dict into a Series so every parameter name
# becomes its own column, then attach those columns to the original frame.
param_columns = df['query_params'].apply(
    lambda params: pd.Series(params, dtype='object')
)
df = pd.concat([df, param_columns], axis=1)

df.head()

Unnamed: 0,visitor_url,value,percent,query_params,size,filters[0][field],filters[0][values][0],filters[0][type],current,sort-field,...,filters[1][type],filters[1][values][1],filters[1][values][2],filters[1][values][3],filters[1][values][4],filters[0][values][2],filters[0][values][3],filters[0][values][4],filters[0][values][5],filters[0][values][6]
0,/?size=n_20_n,27800,25%,{'size': ['n_20_n']},[n_20_n],,,,,,...,,,,,,,,,,
1,/,27800,12%,{},,,,,,,...,,,,,,,,,,
2,/?size=n_20_n&filters[0][field]=industries&fil...,13600,1%,"{'size': ['n_20_n'], 'filters[0][field]': ['in...",[n_20_n],[industries],[Banking and Finance],[all],,,...,,,,,,,,,,
3,/?current=n_2_n&size=n_20_n,13600,1%,"{'current': ['n_2_n'], 'size': ['n_20_n']}",[n_20_n],,,,[n_2_n],,...,,,,,,,,,,
4,/?size=n_60_n,1290,1%,{'size': ['n_60_n']},[n_60_n],,,,,,...,,,,,,,,,,


In [8]:
# Report how many distinct URLs were collected, then write the frame to CSV
# without the index column.
row_count = len(df)
print("Total Queries: {}".format(row_count))
df.to_csv('umami_preprocessed.csv', index=False)

Total Queries: 191


If we want the full ~115k rows (one row per visit instead of one row per URL), uncomment and run the code below.

In [105]:
# # sum up all values in value column
# df['value'] = df['value'].astype(int)
# df['value'].sum()

115517

In [None]:
# # transform the data into a new dataframe with one row for each count of value column
# df['value'] = df['value'].astype(int)
# df['copy'] = df['value'].apply(lambda x: list(range(x)))
# df = df.explode('copy').drop(columns='value')
# df.drop(columns='copy', inplace=True)
# df = df.reset_index(drop=True)