In [None]:
import requests
import pandas as pd
import boto3
import os
from pathlib import Path
import time
from tqdm import tqdm
import re
import hashlib
import base64
from datetime import datetime

from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# Opens a Chrome session as soon as this cell runs; the value returned by
# ChromeDriverManager().install() is passed to webdriver.Chrome.
# NOTE(review): get_all_boards() creates its own local `driver`, and no other
# visible cell reads this module-level one — confirm whether it is still needed.
driver = webdriver.Chrome(ChromeDriverManager().install())

# from sklearn.preprocessing import OneHotEncoder
# from annoy import AnnoyIndex

# from botocore.exceptions import ClientError

In [None]:
def get_tables(html_table, tag=['th', 'tr', 'td']):
    """Flatten an HTML table into a ``{label: value}`` dictionary.

    Every element returned by ``html_table.findChildren(tag)`` is treated as a
    row, and every element returned by ``row.findChildren(tag)`` as a cell.
    The cell texts are collected in that order and then paired off: the 1st
    with the 2nd, the 3rd with the 4th, and so on.

    :param html_table: Object exposing ``findChildren`` (e.g. a BeautifulSoup tag)
    :param tag: Tag name(s) handed to ``findChildren``
    :return: dict mapping each even-positioned cell text to the text after it
    """
    def _cell_text(cell):
        # A cell without a usable ``.string`` is recorded as the literal
        # "None" so the label/value pairing below stays aligned.
        text = cell.string
        return text.strip() if text else "None"

    texts = [
        _cell_text(cell)
        for row in html_table.findChildren(tag)
        for cell in row.findChildren(tag)
    ]

    # Even positions are used as keys, odd positions as their values.
    return dict(zip(texts[::2], texts[1::2]))

In [None]:
def parse_page(url):
    """Download one review page and extract its first two tables.

    :param url: Address of the review page to fetch
    :return: dict combining the label/value pairs of the first table with the
        rating pairs of the second table; an empty dict when the response
        status is not 200
    """
    # A timeout keeps a single unresponsive page from stalling the whole scrape.
    page = requests.get(url, timeout=30)

    if page.status_code != 200:
        # This path used to fall through to ``return all_items`` with the name
        # never assigned (UnboundLocalError). Report the status and return an
        # empty result instead.
        print(page.status_code)
        return {}

    soup = BeautifulSoup(page.content, 'html.parser')

    # only the first 4 tables are useful
    tables = soup.findChildren('table')[:4]

    # UPPER LEFT TABLE
    table_ul = get_tables(tables[0])

    # UPPER RIGHT TABLE: texts of the <span> elements whose class starts with
    # "rating", paired off as (1st, 2nd), (3rd, 4th), ...
    rating_spans = tables[1].findAll("span", {"class": lambda x: x and x.startswith("rating")})
    rating_texts = [span.get_text() for span in rating_spans]
    table_ur = dict(zip(rating_texts[::2], rating_texts[1::2]))

    # BOTTOM TABLES (tables[2:]) are currently not parsed.

    # On duplicate keys the upper-right table wins.
    return {**table_ul, **table_ur}

In [None]:
def get_date(input_text):
    """Return every run of four consecutive digits found in ``input_text``.

    :param input_text: String to scan
    :return: list of four-character digit strings in left-to-right order
        (non-overlapping); an empty list when there is none
    """
    return [match.group() for match in re.finditer('[0-9]{4}', input_text)]

In [None]:
def hashme(x):
    """Return the Base64-encoded SHA-1 digest of ``x``.

    :param x: Bytes-like data to hash
    :return: Base64 encoding of the SHA-1 digest, as ``bytes``
    """
    sha = hashlib.sha1()
    sha.update(x)
    return base64.b64encode(sha.digest())

In [None]:
def get_all_boards(gender, load_wait=100):
    """Open the review listing for ``gender`` in Chrome and return its HTML.

    The page is loaded, one element located by a fixed XPath is clicked through
    JavaScript, and after ``load_wait`` seconds the rendered page source is
    returned.

    :param gender: Name of the query-string flag to set, e.g. 'mens' or 'womens'
    :param load_wait: Seconds to wait after the click before reading the page
    :return: HTML source of the page as rendered by the browser
    """
    # Imported locally so this function does not rely on a notebook-level import.
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome(ChromeDriverManager().install())
    try:
        driver.get("https://thegoodride.com/snowboard-reviews/?{}=1".format(gender))
        time.sleep(1)

        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer
        # Selenium releases no longer provide.
        # NOTE(review): presumably this element expands the listing to show all
        # boards — verify against the live page; the absolute XPath is brittle.
        button = driver.find_element(
            By.XPATH, '/html/body/div[5]/div/div/div/div[3]/form/div[1]/div[42]/a')
        driver.execute_script("arguments[0].click();", button)
        time.sleep(load_wait)
        return driver.page_source
    finally:
        # Always shut the browser down; previously every call left a Chrome
        # session running.
        driver.quit()


In [None]:
def create_bucket(bucket_name, region=None):
    """Create an S3 bucket in a specified region

    If a region is not specified, the bucket is created in the S3 default
    region (us-east-1).

    :param bucket_name: Bucket to create
    :param region: String region to create bucket in, e.g., 'us-west-2'
    :return: True if bucket created, else False
    """
    # Imported locally: the except clause needs both names, but the botocore
    # import at the top of the notebook is commented out and logging is never
    # imported, so any failure used to surface as a NameError.
    import logging
    from botocore.exceptions import ClientError

    # Create bucket
    try:
        if region is None:
            s3_client = boto3.client('s3')
            s3_client.create_bucket(Bucket=bucket_name)
        else:
            s3_client = boto3.client('s3', region_name=region)
            location = {'LocationConstraint': region}
            s3_client.create_bucket(Bucket=bucket_name,
                                    CreateBucketConfiguration=location)
    except ClientError as e:
        logging.error(e)
        return False
    return True

def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    Errors are not caught here: any exception raised by the S3 client
    propagates to the caller, so the function never returns False.

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True once the upload call has returned
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file; the call's return value is not needed.
    s3_client = boto3.client('s3')
    s3_client.upload_file(file_name, bucket, object_name)
    return True

# Scrape the data

In [None]:
# Run date, used as a suffix in the exported file names.
today = datetime.today().strftime('%Y%m%d')
genders = ['mens', 'womens']

# One frame per gender, combined after the loop. Assigning df_final inside the
# loop would keep only the last gender processed.
gender_frames = []

for g in genders:

    raw_html = get_all_boards(g)

    soup = BeautifulSoup(raw_html, "html.parser")
    rows_html = soup.findAll("div", {"class": "board-reviews animate"})

    # Board name (first <h4>) and review link (first <a> that carries an href).
    # 'a[href]' does the filtering that select('a', href=True) did not apply.
    all_boards = [
        [board.select('h4')[0].text.strip(), board.select('a[href]')[0]['href']]
        for board in rows_html
    ]

    df_url = pd.DataFrame(all_boards, columns=['board_name', 'url'])
    print(df_url.shape)

    df_url.to_csv('../data/all_{}_boards.csv'.format(g), index=False)

    rating_list = []
    for url in tqdm(df_url['url']):
        d = parse_page(url)
        # Every review page is expected to yield exactly 18 fields.
        assert len(d) == 18, url
        rating_list.append(d)
        # Short pause between requests.
        time.sleep(.1)

    df_rating = pd.DataFrame(rating_list)
    # df_url and df_rating share the same default row order, so a column-wise
    # concat lines each board up with its ratings.
    df_gender = pd.concat([df_url, df_rating], axis=1)
    df_gender['year'] = df_gender['url'].apply(get_date)
    df_gender['gender'] = g
    gender_frames.append(df_gender)

# All genders stacked into one frame with a fresh 0..n-1 index.
df_final = pd.concat(gender_frames, ignore_index=True)

In [None]:
# Show the scraped frame as this cell's output for a visual check.
df_final

In [None]:
# Identifier per board: the URL rendered as text, UTF-8 encoded, passed through
# hashme(), and the resulting bytes decoded back to text.
df_final['id'] = [
    hashme(str(url).encode('UTF-8')).decode('utf-8')
    for url in df_final['url']
]

In [None]:
# Keep only rows without any missing value. The name df_final is rebound, so
# the unfiltered frame is no longer reachable after this cell; the surviving
# rows keep their original index labels.
df_final = df_final.dropna()

In [None]:
# index=False: the row index carries no information. Writing it makes the
# pd.read_csv() reload in the recommendation section pick it up as an extra
# "Unnamed: 0" column, which is not listed in meta_cols and therefore ends up
# among the features used for the pairwise distances.
df_final.to_csv('../data/all_boards_board_{}.csv'.format(today), index=False)

# Save to S3

In [None]:
# Create the destination bucket; no region is passed, so the region=None branch
# of create_bucket() runs. The True/False result is not checked, it is only
# shown as the cell output.
create_bucket("snowboard-finder")

In [None]:
# Upload today's export; the object key drops the "_board" part of the local
# file name.
upload_file(
    file_name='../data/all_boards_board_{}.csv'.format(today),
    bucket="snowboard-finder",
    object_name="all_boards_{}.csv".format(today),
)

# Board Recommendation

In [None]:
# Reload today's export from disk, replacing the in-memory df_final, so the
# recommendation section can start from the saved file.
df_final = pd.read_csv('../data/all_boards_board_{}.csv'.format(today))
# Show (rows, columns) as the cell output.
df_final.shape

In [None]:
# Columns that get one-hot encoded and are used as features when comparing boards.
filter_cols = ['Riding Style', 'Riding Level', 'Shape', 'Camber Profile',
       'Stance', 'Approx. Weight', 'Powder', 'Turning Experience', 'Carving',
       'Speed', 'Uneven Terrain', 'Switch', 'Jumps', 'Jibbing', 'Pipe', 'gender']

# Descriptive columns that are dropped before the distance computation.
meta_cols = ['board_name', 'url', 'Overall Rating',
             'Fits Boot size (US)', 'Manufactured in', 'year', 'id']

In [None]:
# One-hot encode the columns listed in filter_cols; all other columns are
# carried over unchanged.
df_final_dummies = pd.get_dummies(df_final, columns=filter_cols)
# Show (rows, columns) of the encoded frame as the cell output.
df_final_dummies.shape

### ANNOY

In [None]:
# f = df_final_dummies.shape[1] - len(meta_cols)

# a = AnnoyIndex(f, 'angular')

# for idx, row in df_final_dummies.drop(labels=meta_cols, axis=1).iterrows():
#      a.add_item(idx, row.to_list())
# #     print(row.values)

# a.build(10)
# a.save('../model/annoy_all_angular_20201223.ann')

# upload_file('../model/annoy_all_angular_20201223.ann', "snowboard-finder", "/model/annoy_all_angular_20201223.ann")

# u = AnnoyIndex(f, 'angular')
# u.load('../model/annoy_all_angular_20201223.ann') # super fast, will just mmap the file

# print(u.get_nns_by_item(173, 20)) # will find the 1000 nearest neighbors

### Pairwise distance

In [None]:
from scipy.spatial.distance import squareform, pdist

def distance_function(r):
    """Return the square matrix of pairwise Jaccard distances between rows.

    :param r: 2-D array-like with one observation per row
    :return: DataFrame whose entry (i, j) is the Jaccard distance between
        row i and row j of ``r``
    """
    condensed = pdist(r, metric='jaccard')
    # squareform expands the condensed distance vector into a full matrix.
    square = squareform(condensed)
    return pd.DataFrame(square)

In [None]:
# Pairwise Jaccard distances between boards, computed on every column of the
# one-hot frame that is not listed in meta_cols.
df_dist = distance_function(df_final_dummies.drop(labels=meta_cols, axis=1).values)

### Add similar boards to each row

In [None]:
# For every board, keep the 10 smallest distances from its column of the
# distance matrix as an {id: distance} mapping, in ascending distance order.
board_ids = df_final['id']
sim_boards = []
for idx in df_dist.index:
    nearest = df_dist.iloc[:, idx].nsmallest(10)
    # Look the ids up by the row labels of the selected distances.
    neighbour_ids = board_ids[nearest.index]
    sim_boards.append(
        pd.Series(nearest.values, index=neighbour_ids.values).to_dict()
    )

In [None]:
# Attach the per-board {id: distance} dictionaries as a new column; the list is
# assigned positionally, one entry per row.
df_final['similar_boards'] = sim_boards

In [None]:
# Write the frame to disk without the row index.
# NOTE(review): the date in this file name is hard-coded (20201222), whereas the
# earlier export derives it from `today` — confirm whether it should follow the
# run date, and keep the upload cell in sync if it changes.
df_final.to_csv('../data/all_boards_similarity_20201222.csv', index=False)

In [None]:
# Upload the similarity export to the bucket under the same base file name.
# NOTE(review): the path and object key carry a hard-coded date (20201222) and
# must match the file written by the export cell — confirm before changing either.
upload_file('../data/all_boards_similarity_20201222.csv',
            "snowboard-finder",
            "all_boards_similarity_20201222.csv")

In [None]:
# NOTE(review): this import sits mid-notebook and requires the sagemaker package;
# presumably this part is meant to run inside SageMaker — confirm.
from sagemaker import get_execution_role

# Fetch the execution role via sagemaker and print it for inspection.
role = get_execution_role()
print(role)

In [None]:
# boto3 is already imported in the first cell; this repeated import is redundant
# but harmless.
import boto3
# Handle on the DynamoDB table named 'SnowboardDatabase'.
ddb = boto3.resource('dynamodb')
table = ddb.Table('SnowboardDatabase')
# Attribute definitions reported for that table.
attrs = table.attribute_definitions

In [None]:
# Show the table's attribute definitions as the cell output.
attrs

In [None]:
table.