In [1]:
import logging
import re
import time

import pandas as pd
import requests
from annoy import AnnoyIndex
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
from selenium import webdriver
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

In [None]:
def get_tables(html_table, tag=['th', 'tr', 'td']):
    """Flatten a parsed HTML table into a ``{label: value}`` dictionary.

    Every element matched by ``html_table.findChildren(tag)`` is visited, and
    for each of them the text of its own matching children is collected in
    document order. The collected texts are then paired up two by two: the
    1st, 3rd, 5th ... become keys and the 2nd, 4th, 6th ... their values.

    :param html_table: object exposing ``findChildren`` (e.g. a BeautifulSoup tag)
    :param tag: tag name, or list of tag names, handed to ``findChildren``
    :return: dict built from consecutive (label, value) cell texts; a trailing
        unpaired text is dropped by ``zip``
    """
    texts = []

    # findChildren accepts a list of tag names and matches any of them.
    for outer in html_table.findChildren(tag):
        for inner in outer.findChildren(tag):
            content = inner.string
            # Cells without a single text node get the literal placeholder "None".
            texts.append(content.strip() if content else "None")

    labels = texts[::2]
    values = texts[1::2]
    return dict(zip(labels, values))

In [None]:
def parse_page(url):
    """Scrape the spec and rating tables of one board review page.

    :param url: URL of the review page to download
    :return: dict merging the label/value pairs of the page's first table
        (via ``get_tables``) with the pairs built from the ``<span>`` elements
        of the second table whose class starts with ``rating``. An empty dict
        is returned when the server does not answer with HTTP 200.
    :raises requests.exceptions.RequestException: on connection errors or
        when the server does not respond within the timeout
    """
    # Bounded wait so a single stalled connection cannot hang a long crawl.
    page = requests.get(url, timeout=30)

    if page.status_code != 200:
        # Report the failure and return an empty result. Previously this path
        # fell through to `return all_items` with the name never assigned,
        # which raised UnboundLocalError.
        print(page.status_code)
        return {}

    soup = BeautifulSoup(page.content, 'html.parser')

    # get all the tables; only the first 4 are useful
    tables = soup.findChildren('table')[:4]

    # UPPER LEFT TABLE
    table_ul = get_tables(tables[0])

    # UPPER RIGHT TABLE: the texts of the rating spans alternate label, value
    rows_html = tables[1].findAll("span", {"class": lambda x: x and x.startswith("rating")})
    rows = [x.get_text() for x in rows_html]
    table_ur = dict(zip(rows[::2], rows[1::2]))

    # BOTTOM TABLES (tables[2:]) are currently not parsed; disabled code kept
    # for reference:
    #     table_b = {}
    #     for t in tables[2:]:
    #         l = get_tables(t, ['td', 'p', 'tr'])
    #         table_b.update(l)

    return {**table_ul, **table_ur}

In [None]:
def get_date(input_text):
    """Return every non-overlapping run of four digits found in ``input_text``.

    :param input_text: string to scan (the notebook passes review URLs)
    :return: list of 4-character digit strings in order of appearance;
        empty list when there is none
    """
    return re.findall('[0-9]{4}', input_text)

In [None]:
def get_all_boards(gender, load_wait=400):
    """Open the review listing for ``gender`` in Chrome and return its HTML.

    The page is loaded, one element located by a fixed XPath is clicked through
    JavaScript, and after ``load_wait`` seconds the rendered page source is
    captured.

    :param gender: query-string flag inserted into the listing URL
        (the notebook uses 'mens' and 'womens')
    :param load_wait: seconds to sleep after the click before reading the page
        source; defaults to the original hard-coded 400
    :return: HTML source of the page as a string
    """
#     browser = webdriver.Safari()
    browser = webdriver.Chrome()
    try:
        browser.get("https://thegoodride.com/snowboard-reviews/?{}=1".format(gender))
        time.sleep(1)

        # NOTE(review): presumably the control that expands the listing to all
        # boards -- confirm; the absolute XPath breaks on any layout change.
        # The generic find_element call is used because the
        # find_element_by_* helpers no longer exist in Selenium 4.
        button = browser.find_element(
            'xpath',
            '/html/body/div[5]/div/div/div/div[3]/form/div[1]/div[42]/a')
        browser.execute_script("arguments[0].click();", button)
        time.sleep(load_wait)
        return browser.page_source
    finally:
        # Always end the driver session, even when a step above raised;
        # quit() also shuts down the driver process, which close() does not.
        browser.quit()


In [None]:
# Scrape the listing and every review page for each gender, saving one CSV of
# board URLs and one CSV of URLs + ratings per gender.
genders = ['mens', 'womens']

# Number of label/value pairs a fully parsed review page is expected to yield.
expected_fields = 18

for g in genders:

    raw_html = get_all_boards(g)

    soup = BeautifulSoup(raw_html, "html.parser")
    rows_html = soup.findAll("div", {"class": "board-reviews animate"})

    all_boards = []

    for board in rows_html:
        board_name = board.select('h4')[0].text.strip()
        # `select` takes a CSS selector, so the "has an href" condition has to
        # live in the selector itself rather than in an `href=True` keyword.
        review_url = board.select('a[href]')[0]['href']
        all_boards.append([board_name, review_url])

    df_url = pd.DataFrame(all_boards, columns=['board_name', 'url'])
    print(df_url.shape)

    df_url.to_csv('../data/all_{}_boards.csv'.format(g), index=False)

    rating_list = []
    for url in tqdm(df_url['url']):
        d = parse_page(url)
        # Stop early on a page that did not parse into the expected fields,
        # and say which one it was.
        assert len(d) == expected_fields, \
            "expected {} fields, got {} for {}".format(expected_fields, len(d), url)
        rating_list.append(d)
        time.sleep(.1)  # small pause between requests

    df_rating = pd.DataFrame(rating_list)
    # Both frames carry the default 0..n-1 index and one row per URL, so a
    # column-wise concat lines each board up with its ratings.
    df_final = pd.concat([df_url, df_rating], axis=1)
    # `get_date` returns a list of 4-digit strings found in the URL.
    df_final['year'] = df_final['url'].apply(get_date)
    df_final.to_csv('../data/{}_board_20201215.csv'.format(g), index=False)

In [None]:
# Print the distinct values of every column after the first two
# (board_name and url) to eyeball the scraped categories.
for col in df_final.columns[2:]:
    print(df_final[col].unique())

In [None]:
# Preview the first rows of df_final, i.e. the frame built in the last
# iteration of the scraping loop.
df_final.head()

In [37]:
import boto3
import os
import re
from pathlib import Path
from tqdm import tqdm


from botocore.exceptions import ClientError

def create_bucket(bucket_name, region=None):
    """Create an S3 bucket in a specified region

    If a region is not specified, the client is created without an explicit
    region and no location constraint is sent, so the bucket is created in
    the S3 default region (us-east-1).

    :param bucket_name: Bucket to create
    :param region: String region to create bucket in, e.g., 'us-west-2'
    :return: True if bucket created, else False
    """

    client_kwargs = {} if region is None else {'region_name': region}
    create_kwargs = {'Bucket': bucket_name}
    # us-east-1 is the default location and is rejected by S3 when passed as
    # an explicit LocationConstraint, so the constraint is only sent for
    # other regions.
    if region not in (None, 'us-east-1'):
        create_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': region}

    # Create bucket
    try:
        s3_client = boto3.client('s3', **client_kwargs)
        s3_client.create_bucket(**create_kwargs)
    except ClientError as e:
        # Log and report failure through the return value instead of raising.
        logging.error(e)
        return False
    return True

def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name (the path exactly as
    # given, including any directory part)
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        s3_client.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        # Log and report failure through the return value instead of raising.
        logging.error(e)
        return False
    return True

In [36]:
# Create the bucket that receives the scraped CSV; no region is passed, so
# create_bucket takes its default-region path.
create_bucket("snowboard-finder")

True

In [39]:
# Upload the men's ratings CSV; the object name is given explicitly so the
# S3 key does not carry the local '../data/' prefix.
upload_file('../data/mens_board_20201215.csv', "snowboard-finder", "mens_board_20201215.csv")

True

# Board Recommendation

In [2]:
# Load the men's ratings from disk (same path the scraping loop writes for
# 'mens') so this section can run without re-scraping.
df_final = pd.read_csv('../data/mens_board_20201215.csv')
df_final.shape

(424, 21)

In [3]:
# Categorical columns that describe how a board rides; these are the ones
# one-hot encoded and compared when looking for similar boards.
filter_cols = [
    'Riding Style',
    'Riding Level',
    'Shape',
    'Camber Profile',
    'Stance',
    'Approx. Weight',
    'Powder',
    'Turning Experience',
    'Carving',
    'Speed',
    'Uneven Terrain',
    'Switch',
    'Jumps',
    'Jibbing',
    'Pipe',
]

In [4]:
# Preview the first rows of the loaded ratings.
df_final.head()

Unnamed: 0,board_name,url,Overall Rating,Riding Style,Riding Level,Fits Boot size (US),Manufactured in,Shape,Camber Profile,Stance,...,Powder,Turning Experience,Carving,Speed,Uneven Terrain,Switch,Jumps,Jibbing,Pipe,year
0,Alloy B-Bomb,https://thegoodride.com/snowboard-reviews/allo...,Liked it!,All Mountain Freestyle,Beginner - Expert,8-10,Tunisia by Nidecker,True Twin,Hybrid Camber,Centered,...,Average,Good,Good,Great,Good,Great,Good,Average,Great,"['2016', '2020']"
1,Alloy B-Bomb GT,https://thegoodride.com/snowboard-reviews/allo...,,Freestyle,Intermediate - Expert,8-10,Tunisia by Nidecker,True Twin,Hybrid Camber,Centered,...,Average,Average,Average,Average,Good,Great,Good,Great,Good,['2020']
2,Alloy D.O Carbon,https://thegoodride.com/snowboard-reviews/allo...,Pretty Good,Freeride,Expert,8-10,Tunisia by Nidecker,Tapered Directional,Traditional Camber,Setback over 20mm,...,Poor,Great,Excellent,Excellent,Poor,Average,Good,Poor,Poor,"['2019', '2020']"
3,Alloy Darwin Flow,https://thegoodride.com/snowboard-reviews/allo...,,Alternative Freeride,Advanced - Expert,8-10,Tunisia by Nidecker,Tapered Directional,Directional Camber,Setback over 20mm,...,Good,Great,Great,Great,Great,Poor,Average,Poor,Average,['2020']
4,Alloy DO 7X,https://thegoodride.com/snowboard-reviews/allo...,Liked it!,Freeride,Expert,8-10,Tunisia by Nidecker,Tapered Directional,Mostly Camber,Setback over 20mm,...,Poor,Excellent,Excellent,Great,Average,Average,Average,Poor,Poor,['2017']


In [5]:
# Preview only the columns that will be one-hot encoded.
df_final[filter_cols].head()

Unnamed: 0,Riding Style,Riding Level,Shape,Camber Profile,Stance,Approx. Weight,Powder,Turning Experience,Carving,Speed,Uneven Terrain,Switch,Jumps,Jibbing,Pipe
0,All Mountain Freestyle,Beginner - Expert,True Twin,Hybrid Camber,Centered,Feels Normal,Average,Good,Good,Great,Good,Great,Good,Average,Great
1,Freestyle,Intermediate - Expert,True Twin,Hybrid Camber,Centered,Feels Normal,Average,Average,Average,Average,Good,Great,Good,Great,Good
2,Freeride,Expert,Tapered Directional,Traditional Camber,Setback over 20mm,Feels Heavy,Poor,Great,Excellent,Excellent,Poor,Average,Good,Poor,Poor
3,Alternative Freeride,Advanced - Expert,Tapered Directional,Directional Camber,Setback over 20mm,Feels Normal,Good,Great,Great,Great,Great,Poor,Average,Poor,Average
4,Freeride,Expert,Tapered Directional,Mostly Camber,Setback over 20mm,Feels Normal,Poor,Excellent,Excellent,Great,Average,Average,Average,Poor,Poor


In [None]:
# for col in filter_cols:
#     print(df_final[col].unique())

In [6]:
# One-hot encode the categorical feature columns; columns not listed in
# filter_cols are carried over unchanged.
df_final_dummies = pd.get_dummies(df_final, columns=filter_cols)
df_final_dummies.shape

(424, 92)

In [7]:
# Descriptive columns that are not part of the feature vector; they are
# dropped before building the index and the distance matrix.
meta_cols = [
    'board_name',
    'url',
    'Overall Rating',
    'Fits Boot size (US)',
    'Manufactured in',
    'year',
]

### ANNOY

In [11]:
# Length of each feature vector: all columns of the one-hot frame minus the
# metadata columns.
f = df_final_dummies.shape[1] - len(meta_cols)

In [27]:
# Empty Annoy index for vectors of length f, compared with the 'angular' metric.
a = AnnoyIndex(f, 'angular')

In [28]:
# Add each board's one-hot feature vector to the index under its row label.
feature_rows = df_final_dummies.drop(labels=meta_cols, axis=1)
for idx, row in feature_rows.iterrows():
    a.add_item(idx, row.to_list())

In [31]:
# Build the index (10 is the number of trees) and write it to disk.
a.build(10)
a.save('../model/annoy_men_angular_20201216.ann')

True

In [33]:
# Reload the saved index into a fresh object; f and the metric must match the
# values used when the file was built.
u = AnnoyIndex(f, 'angular')
u.load('../model/annoy_men_angular_20201216.ann') # super fast, will just mmap the file

True

In [34]:
# Neighbours of item 173, nearest first. More neighbours are requested than
# the frame has rows (424), so this effectively ranks every board.
print(u.get_nns_by_item(173, 1000)) # will find the 1000 nearest neighbors

[173, 234, 177, 368, 205, 229, 240, 271, 283, 303, 306, 315, 417, 421, 423, 51, 108, 109, 145, 209, 293, 297, 316, 327, 332, 342, 351, 413, 22, 80, 81, 111, 118, 119, 154, 156, 169, 203, 264, 287, 318, 323, 335, 341, 350, 354, 369, 374, 385, 406, 411, 412, 414, 379, 0, 15, 18, 23, 24, 27, 32, 36, 59, 60, 61, 69, 74, 91, 120, 123, 124, 125, 133, 159, 163, 167, 176, 179, 237, 239, 241, 246, 247, 251, 276, 277, 280, 325, 340, 344, 355, 371, 390, 393, 38, 54, 78, 89, 96, 101, 103, 104, 105, 106, 115, 127, 149, 153, 165, 214, 248, 268, 281, 284, 288, 305, 309, 311, 320, 322, 329, 333, 339, 345, 346, 352, 362, 365, 376, 381, 407, 410, 1, 6, 12, 20, 25, 45, 50, 68, 70, 71, 77, 92, 94, 98, 107, 114, 121, 157, 168, 175, 180, 181, 184, 185, 188, 208, 216, 222, 230, 232, 244, 245, 250, 267, 278, 308, 321, 328, 349, 356, 361, 373, 377, 383, 403, 408, 7, 13, 14, 19, 28, 35, 37, 40, 48, 75, 93, 143, 148, 158, 166, 193, 201, 204, 206, 211, 213, 228, 235, 236, 238, 266, 270, 272, 274, 275, 282, 290, 2

### Pairwise distance

In [21]:
from scipy.spatial.distance import squareform, pdist

def distance_function(r):
    """Return the square matrix of pairwise Jaccard distances between rows of ``r``.

    :param r: 2-D array-like, one observation per row
    :return: DataFrame of shape (n_rows, n_rows) where entry (i, j) is the
        Jaccard distance between row i and row j
    """
    # pdist yields the condensed (upper-triangle) form; squareform expands it
    # into the full symmetric matrix with zeros on the diagonal.
    return pd.DataFrame(squareform(pdist(r, 'jaccard')))

In [22]:
# Pairwise Jaccard distances between the boards' one-hot feature rows
# (metadata columns excluded).
df_dist = distance_function(df_final_dummies.drop(labels=meta_cols, axis=1).values)

In [23]:
# Row label of the first board whose name matches the pattern; raises
# IndexError when nothing matches.
# NOTE(review): the pattern contains two consecutive spaces -- presumably it
# mirrors the scraped name exactly; confirm before changing it.
reference_index = df_final[df_final['board_name'].str.contains('Jones  Mountain Twin')].index[0]

In [24]:
# The 5 boards closest to the reference board by Jaccard distance.
# The reference is removed by label instead of slicing off the first row of
# `nsmallest(6)`: another board with an identical feature vector also sits at
# distance 0, and if it had a lower index it would be the row sliced away.
# Assumes df_final keeps its default 0..n-1 index, so the label equals the
# position used by `iloc`.
target = df_dist.iloc[:, reference_index].drop(reference_index).nsmallest(5)
target

234    0.235294
177    0.333333
368    0.333333
205    0.421053
229    0.421053
Name: 173, dtype: float64

In [None]:
# Rows of the recommended boards, in the same order as `target`.
# `taget_index` was an undefined name (NameError); the intended labels are the
# index of `target`. `.copy()` makes the column assignment below act on an
# independent frame rather than on a slice of df_final.
df_rec = df_final.loc[target.index].copy()
# NOTE(review): these values are Jaccard distances (smaller = closer), even
# though the column is labelled 'similarity'.
df_rec['similarity'] = target.values

In [None]:
# Display the recommended boards together with their distance to the reference.
df_rec