In [1]:
import requests
import pandas as pd
import time
from tqdm import tqdm
import re

from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
from selenium import webdriver

from sklearn.preprocessing import OneHotEncoder

In [2]:
def get_tables(html_table, tag=['th', 'tr', 'td']):
    """Flatten a table element into a dict built from alternating cell texts.

    Every element under ``html_table`` that matches ``tag`` is visited, and for
    each of those, every matching element beneath it is read in turn. The
    collected texts are then paired up in order: items at even positions become
    keys, items at odd positions become the corresponding values.

    Parameters
    ----------
    html_table : object exposing ``findChildren`` (e.g. a BeautifulSoup tag)
        The table element to read.
    tag : list of str
        Tag names handed to ``findChildren`` at both nesting levels.

    Returns
    -------
    dict
        Mapping made from consecutive pairs of cell texts. A cell whose
        ``.string`` is ``None`` or empty contributes the literal text "None".
    """
    texts = []
    for outer in html_table.findChildren(tag):
        for inner in outer.findChildren(tag):
            raw = inner.string
            # A placeholder keeps the cell's slot in the alternating sequence.
            texts.append(raw.strip() if raw else "None")

    labels, values = texts[0::2], texts[1::2]
    return dict(zip(labels, values))

In [3]:
def parse_page(url, timeout=30):
    """Download one review page and return its spec and rating fields.

    Parameters
    ----------
    url : str
        Address of the review page to fetch.
    timeout : float, optional
        Seconds handed to ``requests.get`` so a stalled connection cannot
        block the scrape indefinitely.

    Returns
    -------
    dict
        Entries of the first table on the page (read with ``get_tables``)
        merged with the label/value pairs taken from the ``rating*`` spans of
        the second table; on a key clash the second table wins. An empty
        dict is returned when the response status is not 200.

    Raises
    ------
    IndexError
        If the page contains fewer than two ``<table>`` elements.
    """
    page = requests.get(url, timeout=timeout)

    if page.status_code != 200:
        # Report the status and return a defined (empty) result instead of
        # falling through to an unbound local variable.
        print(page.status_code)
        return {}

    soup = BeautifulSoup(page.content, 'html.parser')

    # Only the first two tables are read; the lower tables on the page are
    # not parsed.
    tables = soup.find_all('table')

    # UPPER LEFT TABLE
    table_ul = get_tables(tables[0])

    # UPPER RIGHT TABLE: spans whose class starts with "rating" alternate
    # between a label and its value.
    rows_html = tables[1].find_all(
        "span", {"class": lambda x: x and x.startswith("rating")}
    )
    rows = [x.get_text() for x in rows_html]
    table_ur = dict(zip(rows[::2], rows[1::2]))

    return {**table_ul, **table_ur}

In [4]:
def get_date(input_text):
    """Return every run of four consecutive ASCII digits in ``input_text``.

    Matches are non-overlapping and reported left to right, so the result is
    a list of strings that may be empty or hold several entries.
    """
    four_digits = re.compile('[0-9]{4}')
    return [match.group(0) for match in four_digits.finditer(input_text)]

In [7]:
def get_all_boards(gender, load_wait=400):
    """Open the review listing for ``gender`` in Safari and return its HTML.

    Parameters
    ----------
    gender : str
        Value substituted into the listing URL's query string
        (``?<gender>=1``).
    load_wait : float, optional
        Seconds to wait after clicking the listing button before the page
        source is captured. Defaults to the original fixed wait of 400.

    Returns
    -------
    str
        The browser's page source after the wait.
    """
    # Imported here so the function works even when only `webdriver` was
    # imported at the top of the notebook.
    from selenium.webdriver.common.by import By

    browser = webdriver.Safari()
    try:
        browser.get("https://thegoodride.com/snowboard-reviews/?{}=1".format(gender))
        time.sleep(1)

        # NOTE(review): presumably the control that expands the listing to
        # every board -- the absolute XPath is brittle; confirm against the
        # live page.
        button = browser.find_element(
            By.XPATH,
            '/html/body/div[5]/div/div/div/div[3]/form/div[1]/div[42]/a',
        )
        # Click through JavaScript rather than a native click event.
        browser.execute_script("arguments[0].click();", button)
        time.sleep(load_wait)
        return browser.page_source
    finally:
        # Always end the driver session, even if navigation or the click
        # raised, so no browser/driver process is left behind.
        browser.quit()


In [8]:
# Number of fields parse_page is expected to return for every review page;
# used as a sanity check so a changed page layout is noticed immediately.
EXPECTED_FIELD_COUNT = 18

genders = ['mens', 'womens']

for g in genders:

    # Render the full listing in a browser, then parse the resulting HTML.
    raw_html = get_all_boards(g)

    soup = BeautifulSoup(raw_html, "html.parser")
    rows_html = soup.find_all("div", {"class": "board-reviews animate"})

    all_boards = []

    for board in rows_html:
        board_name = board.select('h4')[0].text.strip()
        # select() takes a CSS selector, so the "must have an href"
        # requirement is expressed in the selector itself.
        review_url = board.select('a[href]')[0]['href']
        all_boards.append([board_name, review_url])

    df_url = pd.DataFrame(all_boards, columns=['board_name', 'url'])
    print(df_url.shape)

    # Save the name/URL list before the slow per-page scrape starts.
    df_url.to_csv('../data/all_{}_boards.csv'.format(g), index=False)

    rating_list = []
    for url in tqdm(df_url['url']):
        d = parse_page(url)
        assert len(d) == EXPECTED_FIELD_COUNT, (
            "expected {} fields from {}, got {}".format(EXPECTED_FIELD_COUNT, url, len(d))
        )
        rating_list.append(d)
        # Short pause between requests.
        time.sleep(.1)

    df_rating = pd.DataFrame(rating_list)
    # Both frames carry a default positional index, so a column-wise concat
    # lines each board up with its scraped fields.
    df_final = pd.concat([df_url, df_rating], axis=1)
    # get_date returns a list, so 'year' holds a list of strings per row.
    df_final['year'] = df_final['url'].apply(get_date)
    df_final.to_csv('../data/{}_board_20201215.csv'.format(g), index=False)

  0%|          | 0/424 [00:00<?, ?it/s]

(424, 2)


100%|██████████| 424/424 [24:01<00:00,  3.40s/it]
  0%|          | 0/114 [00:00<?, ?it/s]

(114, 2)


100%|██████████| 114/114 [05:48<00:00,  3.06s/it]


In [11]:
# Show the distinct values of every column after the first two
# (board_name and url).
for col in df_final.columns[2:]:
    print(df_final[col].unique())

In [10]:
# Preview the first five rows of the combined frame.
df_final.head(n=5)

Unnamed: 0,board_name,url,Overall Rating,Riding Style,Riding Level,Fits Boot size (US),Manufactured in,Shape,Camber Profile,Stance,...,Powder,Turning Experience,Carving,Speed,Uneven Terrain,Switch,Jumps,Jibbing,Pipe,year
0,Alloy Pandora,https://thegoodride.com/snowboard-reviews/allo...,,All Mountain Freestyle,Expert,Women's,Tunisia by Nidecker,Directional Twin,Traditional Camber,Centered,...,Poor,Great,Excellent,Great,Good,Great,Excellent,Average,Excellent,[2019]
1,Arbor Ethos,https://thegoodride.com/snowboard-reviews/arbo...,Liked it!,All Mountain Freestyle,Beginner - Intermediate,"Women's, < 8, 8-10",Dubai by SWS,Directional Twin,Continuous Rocker,Setback -5mm,...,Good,Good,Poor,Average,Good,Great,Good,Great,Average,[2021]
2,Arbor Poparazzi Camber,https://thegoodride.com/snowboard-reviews/arbo...,Liked it!,All Mountain,Intermediate - Advanced,"Women's, < 8, 8-10",Dubai by SWS,Directional Twin,Traditional Camber,Centered,...,Average,Great,Good,Good,Great,Excellent,Great,Good,Great,[2021]
3,Arbor Swoon,https://thegoodride.com/snowboard-reviews/arbo...,Pretty Good,All Mountain Freestyle,Beginner - Expert,Women's,Dubai by SWS,Directional Twin,Continuous Rocker,Centered,...,Average,Average,Average,Average,Excellent,Great,Excellent,Good,Good,"[2011, 2018]"
4,Arbor Swoon Camber,https://thegoodride.com/snowboard-reviews/arbo...,Pretty Good,All Mountain Freestyle,Advanced - Expert,Women's,Dubai by SWS,Directional Twin,Mostly Camber,Centered,...,Average,Good,Great,Great,Good,Great,Excellent,Good,Great,[]


# Board Similarity

In [None]:
# Columns used for the similarity features below.
filter_cols = [
    'Riding Style',
    'Riding Level',
    'Shape',
    'Camber Profile',
    'Stance',
    'Approx. Weight',
    'Powder',
    'Turning Experience',
    'Carving',
    'Speed',
    'Uneven Terrain',
    'Switch',
    'Jumps',
    'Jibbing',
    'Pipe',
]

In [None]:
# Preview only the selected feature columns.
df_final.loc[:, filter_cols].head()

In [None]:
# List the distinct values of each selected feature column.
for col in filter_cols:
    print(df_final.loc[:, col].unique())

In [None]:
# One-hot encode the selected feature columns. df_final is rebound to the
# encoded frame, so the original columns listed in filter_cols are gone after
# this cell; re-running it without rebuilding df_final will fail.
df_final = pd.get_dummies(data=df_final, columns=filter_cols)

In [None]:
# Display the frame with one row per board name.
# NOTE(review): the result is only displayed, not assigned -- df_final itself
# still contains the duplicate rows. Confirm whether that is intended.
df_final.drop_duplicates(subset='board_name')