In [5]:
import requests
import pandas as pd
import time
from tqdm import tqdm
import re

from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
from selenium import webdriver

from sklearn.preprocessing import OneHotEncoder
from annoy import AnnoyIndex

In [None]:
def get_tables(html_table, tag=['th', 'tr', 'td']):
    table_value = []


    # for my_table in tables:

    # You can find children with multiple tags by passing a list of strings
    rows = html_table.findChildren(tag)

    for row in rows:
        cells = row.findChildren(tag)
        for cell in cells:
            value = cell.string
            if value:
                table_value.append(value.strip())
                # print("The value in this cell is %s" % value)
            else:
                table_value.append("None")
    return dict(zip(table_value[::2], table_value[1::2]))

In [None]:
def parse_page(url):
    
    # get the html file
    page = requests.get(url)
    
    if page.status_code == 200:
        soup = BeautifulSoup(page.content, 'html.parser')

        # get all the tables
        tables = soup.findChildren('table')

        # only the first 4 tables are useful
        tables = tables[:4]


        # UPPER LEFT TABLE
        table_ul = get_tables(tables[0])

        # UPPER RIGHT TABLE
        rows_html = tables[1].findAll("span", {"class": lambda x: x and x.startswith("rating")})
        rows = [x.get_text() for x in rows_html]
        table_ur = dict(zip(rows[::2], rows[1::2]))

#         # BUTTOM TABLES

#         table_b = {}
#         for t in tables[2:]:
#             l = get_tables(t, ['td', 'p', 'tr'])
#             table_b.update(l)


        all_items = {**table_ul, **table_ur}
    
    else:
        print(page.status_code)


    return all_items

In [None]:
def get_date(input_text):
    
    year = re.findall('[0-9]{4}', input_text)
    
    return year

In [None]:
def get_all_boards(gender):
    
    browser = webdriver.Safari()
    browser.get("https://thegoodride.com/snowboard-reviews/?{}=1".format(gender))
    time.sleep(1)

    button = browser.find_element_by_xpath('/html/body/div[5]/div/div/div/div[3]/form/div[1]/div[42]/a')
    browser.execute_script("arguments[0].click();", button)
    time.sleep(400)
    html = browser.page_source

    browser.close()
    
    return html


In [None]:
genders = ['mens', 'womens']

for g in genders:
    
    raw_html = get_all_boards(g)
    
    soup = BeautifulSoup(raw_html, "html.parser")
    rows_html = soup.findAll("div", {"class": "board-reviews animate"})

    all_boards = []
        
    for board in rows_html:
        board_name = board.select('h4')[0].text.strip()
        review_url = board.select('a', href=True)[0]['href']
        all_boards.append([board_name, review_url])
        
    df_url = pd.DataFrame(all_boards, columns=['board_name', 'url'])
    print(df_url.shape)

    df_url.to_csv('../data/all_{}_boards.csv'.format(g), index=False)
    
    rating_list = []
    for url in tqdm(df_url['url']):
        d = parse_page(url)
        assert len(d) == 18
        rating_list.append(d)
        time.sleep(.1)
        
    df_rating = pd.DataFrame(rating_list)
    df_final = pd.concat([df_url, df_rating], axis=1)
    df_final['year'] = df_final['url'].apply(get_date)
    df_final.to_csv('../data/{}_board_20201215.csv'.format(g), index=False)

In [None]:
for col in df_final.iloc[:, 2:]:
    print(df_final[col].unique())

In [None]:
df_final.head()

# Board Recomendation

In [91]:
df_final = pd.read_csv('../data/mens_board_20201215.csv')
df_final.shape

(424, 21)

In [92]:
filter_cols = ['Riding Style', 'Riding Level', 'Shape', 'Camber Profile',
       'Stance', 'Approx. Weight', 'Powder', 'Turning Experience', 'Carving',
       'Speed', 'Uneven Terrain', 'Switch', 'Jumps', 'Jibbing', 'Pipe' ]

In [93]:
df_final[filter_cols].head()

Unnamed: 0,Riding Style,Riding Level,Shape,Camber Profile,Stance,Approx. Weight,Powder,Turning Experience,Carving,Speed,Uneven Terrain,Switch,Jumps,Jibbing,Pipe
0,All Mountain Freestyle,Beginner - Expert,True Twin,Hybrid Camber,Centered,Feels Normal,Average,Good,Good,Great,Good,Great,Good,Average,Great
1,Freestyle,Intermediate - Expert,True Twin,Hybrid Camber,Centered,Feels Normal,Average,Average,Average,Average,Good,Great,Good,Great,Good
2,Freeride,Expert,Tapered Directional,Traditional Camber,Setback over 20mm,Feels Heavy,Poor,Great,Excellent,Excellent,Poor,Average,Good,Poor,Poor
3,Alternative Freeride,Advanced - Expert,Tapered Directional,Directional Camber,Setback over 20mm,Feels Normal,Good,Great,Great,Great,Great,Poor,Average,Poor,Average
4,Freeride,Expert,Tapered Directional,Mostly Camber,Setback over 20mm,Feels Normal,Poor,Excellent,Excellent,Great,Average,Average,Average,Poor,Poor


In [94]:
# for col in filter_cols:
#     print(df_final[col].unique())

In [95]:
# df_final[filter_cols].get_dummies()
df_final_dummies = pd.get_dummies(df_final, columns=filter_cols)
df_final_dummies.shape

(424, 92)

In [96]:
meta_cols = ['board_name', 'url', 'Overall Rating',
             'Fits Boot size (US)', 'Manufactured in', 'year']

### ANNOY

In [12]:
f = df_final.shape[1] - len(meta_cols)

In [15]:
a = AnnoyIndex(f, 'euclidean')

In [16]:
for idx, row in df_final.drop(labels=meta_cols, axis=1).iterrows():
     a.add_item(idx, row.to_list())
#     print(row.values)

In [None]:
a.build(10)

In [None]:
a.save('test.ann')

# ...

u = AnnoyIndex(f, 'euclidean')
u.load('test.ann') # super fast, will just mmap the file
print(u.get_nns_by_item(0, 1000)) # will find the 1000 nearest neighbors

### Pairwise distance

In [97]:
from scipy.spatial.distance import squareform, pdist

def distance_function(r):
    dists = pdist(r, 'jaccard')
    
    return pd.DataFrame(squareform(dists))

In [98]:
df_dist = distance_function(df_final_dummies.drop(labels=meta_cols, axis=1).values)

In [99]:
reference_index = df_final[df_final['board_name'].str.contains('Jones  Mountain Twin')].index[0]

In [100]:
target = df_dist.iloc[:, reference_index].nsmallest(6)
target = target.iloc[1:]
target

234    0.235294
177    0.333333
368    0.333333
205    0.421053
229    0.421053
Name: 173, dtype: float64

In [109]:
df_rec = df_final.iloc[taget_index, :]
df_rec['similarity'] =  target.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [110]:
df_rec

Unnamed: 0,board_name,url,Overall Rating,Riding Style,Riding Level,Fits Boot size (US),Manufactured in,Shape,Camber Profile,Stance,...,Turning Experience,Carving,Speed,Uneven Terrain,Switch,Jumps,Jibbing,Pipe,year,similarity
234,Lib Tech Terrain Wrecker,https://thegoodride.com/snowboard-reviews/lib-...,Loved it!,All Mountain,Beginner - Expert,"< 8, 8-10, 10-12, > 12",USA by Mervin,Directional Twin,Hybrid Rocker,Centered,...,Good,Good,Good,Good,Great,Great,Good,Great,"['2018', '2020']",0.235294
177,Jones Ultra Mountain Twin,https://thegoodride.com/snowboard-reviews/jone...,Loved it!,All Mountain,Intermediate - Expert,"8-10, 10-12",Dubai by SWS,Directional Twin,Hybrid Camber,Centered,...,Good,Good,Great,Good,Great,Great,Average,Good,"['2016', '2021']",0.333333
368,Sims Dealers Choice,https://thegoodride.com/snowboard-reviews/sims...,Liked it!,All Mountain Freestyle,Intermediate - Expert,"8-10, 10-12",USA by Never Summer,Directional Twin,Hybrid Camber,Centered,...,Good,Good,Good,Good,Great,Good,Good,Good,['2017'],0.333333
205,Lib Tech Box Knife,https://thegoodride.com/snowboard-reviews/lib-...,,Freestyle,Intermediate - Expert,"< 8, 8-10, 10-12",USA by Mervin,True Twin,Hybrid Camber,Centered,...,Good,Good,Good,Good,Great,Great,Good,Great,"['2018', '2020']",0.421053
229,Lib Tech Skunk Ape HP,https://thegoodride.com/snowboard-reviews/lib-...,Liked it!,All Mountain,Beginner - Expert,> 12,USA by Mervin,Directional Twin,Hybrid Rocker,Setback -12.5mm,...,Good,Good,Good,Good,Good,Great,Good,Great,"['2013', '2020']",0.421053
