# Rock Climbing Analysis and Recommendation System

This notebook collects rock climbing route data from Mountain Project. It performs analysis on this data and creates a recommendation system to predict how much a user would like a route. 

In [None]:
import xml.etree.ElementTree as ET
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.neighbors import NearestNeighbors
from lightfm.cross_validation import random_train_test_split
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from scipy.sparse import coo_matrix

Mountain Project contains hundreds of thousands of climbs. Set the limits below for how many climbs to collect for the analysis section and for the recommendation algorithm section. The analysis data is mainly limited by the length of time to crawl the web pages. The recommendation algorithm is mainly limited by the memory and runtime speed.

In [None]:
# Maximum number of route pages crawled for the analysis section.
number_of_climbs_data_to_collect = 25_000
# Maximum number of routes whose per-user ratings feed the recommendation system.
number_of_climbs_for_recommendation_system = 2_500

This scans through the sitemap files of Mountain Project to collect a list of route ids. You must have the sitemap files already downloaded and stored in a `Site Maps` folder next to this notebook, named `sitemap0.xml` through `sitemap99.xml`.

Input: Sitemap files

Output: List of valid climb ids

In [None]:
# Route ids harvested from the sitemap files, in the order they are encountered.
valid_climbs = []
#valid_users = []

# The files read are 'Site Maps/sitemap0.xml' ... 'Site Maps/sitemap99.xml'.
sitemap_ids = list(range(0,100))

for sitemap_id in sitemap_ids:
    file = 'Site Maps/sitemap' + str(sitemap_id) + '.xml'
    root = ET.parse(file).getroot()
    for child in root:
        # The first sub-element of each entry is read as the page URL
        # (presumably the sitemap <loc> tag -- confirm against the files).
        # Entries with no sub-element or no text are skipped instead of crashing.
        url = child[0].text if len(child) else None
        if not url:
            continue
        text = url.split('/')
        # 'https://host/route/<id>/...' splits into
        # ['https:', '', host, 'route', '<id>', ...]; the id segment must exist
        # and be non-empty before it is read.
        if len(text) > 4 and text[4]:
            if text[3] == 'route':
                valid_climbs.append(text[4])
            # Uncomment this to collect user ids as well
            #elif text[3] == 'user':
                #valid_users.append(text[4])

This collects the basic data about routes from the Mountain Project website

Input: List of valid climb ids

Output: CSV of climb data

In [None]:
# One HTTP session is reused for every route page request.
s = requests.Session()

route_columns = ['Id', 'Name', 'Difficulty', 'Star Rating', 'Number of Ratings', 'Type', 'Page Views', 'Description']

# TODO: double check no more processing is needed

# Cap the crawl at the configured number of routes (this also shortens
# valid_climbs for the cells that follow).
if number_of_climbs_data_to_collect < len(valid_climbs):
    valid_climbs = valid_climbs[:number_of_climbs_data_to_collect]

# Rows are accumulated in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
route_records = []

for climb_id in valid_climbs:
    URL = 'https://www.mountainproject.com/route/' + str(climb_id) + '/'
    page = s.get(URL)

    main = (
        BeautifulSoup(page.content, 'html.parser')
        .find('body', id='body-climb')
        .find('div', class_='main-content-container')
        .find('div', class_='container-fluid')
        .find('div', id='route-page')
        .find('div', class_='row pt-main-content')
    )

    top = main.find('div', class_='col-md-9 float-md-right mb-1')
    climb_name = top.find('h1').text.strip()
    difficulty_rating = top.find('h2', class_='inline-block mr-2')
    yds_span = difficulty_rating.find('span', class_='rateYDS')
    # Routes without a YDS rating element are skipped entirely.
    if yds_span is not None:
        # Drops the last four characters of the span text
        # (presumably a ' YDS' suffix -- confirm against a live page).
        YDS_difficulty_rating = yds_span.text[:-4]
        star_rating = re.split(
            ' |\n',
            top.find('span', id='route-star-avg')
            .find('span')
            .find('a', class_='show-tooltip')
            .find('span', id=('starsWithAvgText-'+str(climb_id)))
            .text.strip(),
        )
        # Tokens 1 and 3 of the split text are taken as the average and the vote count.
        star_rating_numeric = star_rating[1]
        star_rating_people = star_rating[3]

        body = main.find('div', class_='col-md-9 main-content float-md-right').find('div', class_='row')

        # Row 0 of the info table supplies the type string, row 2 the page views.
        info = body.find('div').find('div', class_='small mb-1').table.find_all('tr')
        info_type = info[0].find_all('td')[1].text.strip()
        info_page_views = re.split(' ',info[2].find_all('td')[1].text.strip())[0]

        description = body.find('div', class_='col-xs-12')
        description_text = description.find('div', class_='mt-2 max-height max-height-md-800 max-height-xs-600').find('div', class_='fr-view').text

        route_records.append({'Id':climb_id, 'Name':climb_name, 'Difficulty':YDS_difficulty_rating, 'Star Rating':star_rating_numeric, 'Number of Ratings':star_rating_people, 'Type':info_type, 'Page Views':info_page_views, 'Description':description_text})

df = pd.DataFrame(route_records, columns=route_columns)
df.to_csv('route_data.csv')

This converts the climb data into a more useable format

Input: CSV of climb data

Output: CSV of processed climb data

In [None]:
def _two_digit_grade_to_int(match):
    """Map a 5.10-5.15 grade (with optional a-d letter) onto the integer scale.

    Args:
        match: regex match whose group 1 is the two-digit grade number
            ('10'..'15') and whose group 2 is the letter ('a'..'d') or ''.

    Returns:
        The integer grade as a string: 10a -> '10', 10b -> '11', ...,
        15d -> '33'. A grade without a letter is treated as the 'a' grade,
        so an unlettered 5.11 becomes '14' instead of colliding with 5.10b.
    """
    major = int(match.group(1))
    letter = match.group(2) or 'a'
    return str(10 + 4 * (major - 10) + 'abcd'.index(letter))


route_data_processed = pd.read_csv('route_data.csv', index_col='Id').drop('Unnamed: 0', axis=1)

# Page Views is parsed as integers when no value contains a thousands
# separator, so it is cast to str before the .str accessor is used.
route_data_processed['Page Views'] = route_data_processed['Page Views'].astype(str).str.replace(',', '', regex=False)

route_data_processed = route_data_processed.astype({'Name': 'str', 'Difficulty': 'str', 'Star Rating': 'float64', 'Number of Ratings': 'str','Type': 'str','Page Views': 'int32','Description': 'str'})

#Converts Difficulty

#Removes difficulties containing V
# .copy() makes the filtered frame independent so the column assignments below
# do not write into a slice of the original.
has_v_grade = route_data_processed['Difficulty'].str.contains('V', regex=False)
route_data_processed = route_data_processed[~has_v_grade].copy()

difficulty = route_data_processed['Difficulty']
#Changes + to c and - to b
difficulty = difficulty.str.replace('+', 'c', regex=False)
difficulty = difficulty.str.replace('-', 'b', regex=False)
#Changes /s to lower value (drops the slash and the single character after it)
difficulty = difficulty.str.replace(r'/.', '', regex=True)
#Converts grade letters to integer system to make it easier for analysis
difficulty = difficulty.str.replace('5.', '', regex=False)
for class_label, class_value in (('3rd', '-2'), ('4th', '-1'), ('Easy 5th', '0')):
    difficulty = difficulty.str.replace(class_label, class_value, regex=False)
# A single pass converts every 10-15 grade; chaining one replace per grade let
# already-converted values be confused with unlettered grades.
difficulty = difficulty.str.replace(r'(1[0-5])([a-d]?)', _two_digit_grade_to_int, regex=True)
# Letters on single-digit grades (e.g. 9c from 5.9+) carry no value on this scale.
difficulty = difficulty.str.replace(r'[a-d]', '', regex=True)

#Fixes Difficulty column to proper type
route_data_processed['Difficulty'] = difficulty.astype('int32')

#Fixes Number of Ratings column to proper type
route_data_processed['Number of Ratings'] = route_data_processed['Number of Ratings'].str.replace(',', '', regex=False).astype('int32')

#Breaks up the type column
type_data = route_data_processed['Type'].str.split(', ', expand=True)

# Keyword looked for in each type item -> boolean column it switches on.
# Order matters: the first matching keyword wins, as in an if/elif chain.
type_flag_columns = {
    'Trad': 'Trad',
    'Sport': 'Sport',
    'TR': 'Top Rope',
    'Alpine': 'Alpine',
    'Aid': 'Aid',
    'Boulder': 'Boulder',
    'Snow': 'Snow',
    'Ice': 'Ice',
    'Mixed': 'Mixed',
}
# Roman-numeral grade -> integer; any other grade text leaves the default of 1.
roman_grades = {'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5, 'VI': 6, 'VII': 7}

for flag_column in type_flag_columns.values():
    route_data_processed[flag_column] = False
route_data_processed['Length (m)'] = 0
route_data_processed['Pitches'] = 1
route_data_processed['Grade'] = 1

for index, row in type_data.iterrows():
    for item in row:
        if pd.isnull(item):
            continue
        flag_column = next((column for keyword, column in type_flag_columns.items() if keyword in item), None)
        if flag_column is not None:
            route_data_processed.loc[index, flag_column] = True
        elif 'ft' in item:
            # Third space-separated token minus its first character,
            # e.g. '(24' -> 24 (presumably the metre value in '80 ft (24 m)').
            # Converted to int so the integer column keeps its dtype.
            route_data_processed.loc[index, 'Length (m)'] = int(re.split(' ', item)[2][1:])
        elif 'pitches' in item:
            route_data_processed.loc[index, 'Pitches'] = int(re.split(' ', item)[0])
        elif 'Grade' in item:
            grade = re.split(' ', item)[1]
            if grade in roman_grades:
                route_data_processed.loc[index, 'Grade'] = roman_grades[grade]

route_data_processed['Length (m)'] = route_data_processed['Length (m)'].astype('int32')
route_data_processed['Pitches'] = route_data_processed['Pitches'].astype('int32')
route_data_processed['Grade'] = route_data_processed['Grade'].astype('int32')

route_data_processed = route_data_processed.drop(['Type'], axis=1)

route_data_processed.to_csv('route_data_processed.csv')

This outputs an exploratory analysis of the climb data

Input: CSV of processed climb data

Output: Basic Analysis

In [None]:
def _save_histogram(values, bins, title, xlabel, filename):
    """Draw a histogram of `values`, save it to `filename`, and clear the figure."""
    values.hist(bins=bins)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Count")
    plt.savefig(filename)
    plt.clf()


def _save_breakdown(values, title, filename):
    """Draw a bar chart of value counts (sorted by value), save it, and clear the figure."""
    values.value_counts().sort_index().plot.bar()
    plt.title(title)
    plt.savefig(filename)
    plt.clf()


route_data_processed = pd.read_csv('route_data_processed.csv', index_col='Id')

print(route_data_processed.dtypes)
# info() prints its report itself and returns None, so it is not wrapped in print().
route_data_processed.info()

#Distribution Analysis
_save_histogram(route_data_processed['Star Rating'], 8, 'Star Rating Histogram', "Star Ratings", 'star_rating_histogram.png')
_save_histogram(route_data_processed['Number of Ratings'], 20, 'Number of Ratings Histogram', "Number of Ratings", 'number_of_ratings_histogram.png')
_save_histogram(route_data_processed['Page Views'], 20, 'Page Views Histogram', "Page Views", 'page_views_histogram.png')
_save_histogram(route_data_processed['Length (m)'], 10, 'Length (m) Histogram', "Length (m)", 'length_histogram.png')

_save_breakdown(route_data_processed['Difficulty'], 'Difficulty Breakdown', 'difficulty_breakdown.png')
_save_breakdown(route_data_processed['Pitches'], 'Pitches Breakdown', 'pitches_breakdown.png')
_save_breakdown(route_data_processed['Grade'], 'Grade Breakdown', 'grade_breakdown.png')
_save_breakdown(route_data_processed['Trad'], 'Trad Breakdown', 'trad_breakdown.png')
_save_breakdown(route_data_processed['Sport'], 'Sport Breakdown', 'sport_breakdown.png')
_save_breakdown(route_data_processed['Top Rope'], 'Top Rope Breakdown', 'top_rope_breakdown.png')

# The rarer route types are only summarised as counts.
for rare_type in ['Alpine', 'Aid', 'Boulder', 'Snow', 'Ice', 'Mixed']:
    print(route_data_processed[rare_type].value_counts())

# The correlation is computed on the frame loaded in this cell: the normalized
# frame is only created by a later cell, so referencing it here fails on a
# fresh kernel. Pearson correlation is unchanged by per-column standardization,
# so the matrix is the same. Text columns are excluded explicitly because
# recent pandas versions no longer drop them silently.
corr = route_data_processed.select_dtypes(include=['number', 'bool']).corr()
# Styler.set_precision was removed in pandas 2.0; format() is the replacement.
corr.style.background_gradient(cmap='coolwarm', axis=None).format('{:.2f}')

This normalizes the climb data

Input: CSV of processed climb data

Output: CSV of normalized, processed climb data

In [None]:
route_data_processed = pd.read_csv('route_data_processed.csv', index_col='Id')

normalizer = StandardScaler()
# Work on a copy: a plain assignment would alias the same frame, and scaling
# would then overwrite the un-normalized values in route_data_processed too.
route_data_processed_normalized = route_data_processed.copy()
# Every column except the text fields and the prediction target is standardized.
columns_to_normalize = route_data_processed_normalized.columns.drop(['Name', 'Description','Star Rating'])
route_data_processed_normalized[columns_to_normalize] = normalizer.fit_transform(route_data_processed[columns_to_normalize])

route_data_processed_normalized.to_csv('route_data_processed_normalized.csv')

This performs linear and ridge regression to predict the overall star rating of a route

Input: CSV of normalized, processed climb data

Output: Prediction performance analysis

In [None]:
route_data_processed_normalized = pd.read_csv('route_data_processed_normalized.csv', index_col='Id')
# Features are every column except the text fields and the target.
route_data_features = route_data_processed_normalized.drop(['Name','Description','Star Rating'], axis=1)
route_data_rating = route_data_processed_normalized['Star Rating']
(
    route_data_features_train,
    route_data_features_test,
    route_data_rating_train,
    route_data_rating_test,
) = train_test_split(route_data_features, route_data_rating, test_size=0.2, random_state=np.random.RandomState(123))

print(route_data_features.columns)


def _fit_and_report(model, label):
    """Fit `model` on the training split and print its parameters and test errors.

    Args:
        model: an unfitted estimator exposing fit/predict, intercept_ and coef_.
        label: prefix used in the printed lines (e.g. 'Linear').

    Returns:
        The model's predictions for the test features.
    """
    model.fit(route_data_features_train, route_data_rating_train)

    print(label + " Bias is " + str(model.intercept_))
    print(label + " Coefficients are " + str(model.coef_))

    predictions = model.predict(route_data_features_test)
    mse = mean_squared_error(route_data_rating_test, predictions)

    print(label + ' MAE: ' + str(mean_absolute_error(route_data_rating_test, predictions)))
    print(label + ' MSE: ' + str(mse))
    # RMSE is derived from the MSE directly; the `squared=False` argument of
    # mean_squared_error is deprecated and absent from recent scikit-learn.
    print(label + ' RMSE: ' + str(np.sqrt(mse)))
    return predictions


lr = LinearRegression()
lr_route_data_rating_test_pred = _fit_and_report(lr, 'Linear')

print('')
rr = Ridge()
rr_route_data_rating_test_pred = _fit_and_report(rr, 'Ridge')

This collects the climb ratings of routes from the Mountain Project website

Input: List of valid climb ids

Output: CSV of climb ratings by user

In [None]:
s = requests.Session()

# Cap the number of routes whose rating pages are crawled.
if number_of_climbs_for_recommendation_system < len(valid_climbs):
    valid_climbs = valid_climbs[:number_of_climbs_for_recommendation_system]

# Ratings are gathered as (row position, column position, rating) triples and
# written into one matrix at the end. Assigning climb_ratings.loc[climb, user]
# cell by cell adds a new column for every unseen user, which fragments the
# frame and gets slower as it grows.
user_columns = {}      # user id -> column position, in order of first appearance
observed_ratings = []

for row_position, climb_id in enumerate(valid_climbs):
    URL = 'https://www.mountainproject.com/route/stats/' + str(climb_id) + '/'
    page = s.get(URL)
    main = (
        BeautifulSoup(page.content, 'html.parser')
        .find('body', id='body-climb')
        .find('div', class_='main-content-container')
        .find('div', class_='container-fluid')
        .find('div', id='route-stats')
        .find_all('div', class_='row')[1]
        .find('div')
    )
    # Only a block whose heading mentions 'Star' is read as the ratings table.
    if 'Star' in main.find('h3').text:
        for row in main.table.find_all('tr'):
            columns = row.find_all('td')
            # The user id is the fifth '/'-separated piece of the profile link.
            user = re.split('/',columns[0].find('a').get('href'))[4]
            stars = columns[1].find('span', class_="scoreStars").find_all('img')
            # A 'bombBlue' first image is stored as rating 0; otherwise the
            # rating is the number of star images.
            rating = 0 if 'bombBlue' in stars[0]['src'] else len(stars)
            column_position = user_columns.setdefault(user, len(user_columns))
            observed_ratings.append((row_position, column_position, rating))

# Routes without ratings keep an all-NaN row, so every route in valid_climbs is present.
ratings_matrix = np.full((len(valid_climbs), len(user_columns)), np.nan)
for row_position, column_position, rating in observed_ratings:
    ratings_matrix[row_position, column_position] = rating

climb_ratings = pd.DataFrame(ratings_matrix, index=valid_climbs, columns=list(user_columns))
climb_ratings.to_csv('climb_ratings.csv', index_label='Id')

This generates an item-based recommendation system for the climb data

Input: CSV of climb ratings by user

Output: Performance data

In [None]:
climb_ratings = pd.read_csv('climb_ratings.csv', index_col='Id')

# Missing ratings are turned into 0 before building the sparse matrix so that
# only real ratings are stored and split. NaN counts as a non-zero value, so
# leaving it in made every cell of the matrix an explicit entry.
climb_ratings_train, climb_ratings_test = random_train_test_split(
    coo_matrix(climb_ratings.fillna(0).to_numpy()),
    test_percentage=0.2,
    random_state=np.random.RandomState(123),
)

climb_ratings_train = pd.DataFrame.sparse.from_spmatrix(climb_ratings_train, index=climb_ratings.index, columns=climb_ratings.columns).sparse.to_dense()
climb_ratings_test = pd.DataFrame.sparse.from_spmatrix(climb_ratings_test, index=climb_ratings.index, columns=climb_ratings.columns).sparse.to_dense()

#Temp Fix
# Zeros mark cells that are absent from the split. A scraped rating of 0 is
# indistinguishable from a missing one here.
climb_ratings_train = climb_ratings_train.replace(0, np.nan)
climb_ratings_test = climb_ratings_test.replace(0, np.nan)

#adds averages and replaces nans with row average
# Values stay floating point: casting to int truncated both the averages and
# the filled-in values.
climb_ratings_train['avg'] = climb_ratings_train.mean(axis=1)
climb_ratings_train = climb_ratings_train.dropna(how='all').T.fillna(climb_ratings_train['avg'], axis=0).T

# NOTE(review): the 'avg' column is part of the vectors compared here -- confirm that is intended.
correlation = 1-pairwise_distances(climb_ratings_train, metric="cosine")

train_model = NearestNeighbors(n_neighbors=10)
train_model.fit(correlation)

neighbors_distance, neighbors_ind = train_model.kneighbors()
# Cosine similarity between each climb and its neighbours. This is the
# prediction weight; the neighbour distance grows as climbs become less alike,
# so weighting by it favoured the least similar neighbours.
neighbors_similarity = correlation[np.arange(len(correlation))[:, None], neighbors_ind]

neighbor_ranks = range(1, 11)
# N* = neighbour row position, D* = neighbour distance, S* = cosine similarity.
climb_neighbors = pd.DataFrame(
    np.hstack([neighbors_ind, neighbors_distance, neighbors_similarity]),
    columns=['N' + str(k) for k in neighbor_ranks] + ['D' + str(k) for k in neighbor_ranks] + ['S' + str(k) for k in neighbor_ranks],
    index=climb_ratings_train.index,
)

climb_predictions = []
climb_actual = []
climb_success = 0
climb_unable_to_estimate = 0

# Plain arrays and lookup tables keep the per-rating work cheap.
climb_train_values = climb_ratings_train.to_numpy(dtype=float)
climb_train_avg = climb_ratings_train['avg'].to_numpy(dtype=float)
climb_train_row_of = {label: position for position, label in enumerate(climb_ratings_train.index)}
climb_train_col_of = {label: position for position, label in enumerate(climb_ratings_train.columns)}

for climb_id, row in climb_ratings_test.iterrows():
    held_out = row.dropna()
    train_row = climb_train_row_of.get(climb_id)
    if train_row is None:
        # The climb has no training ratings, hence no neighbours to predict from.
        climb_unable_to_estimate += len(held_out)
        continue
    nghb_rows = neighbors_ind[train_row]
    nghb_sims = neighbors_similarity[train_row]
    nghb_avgs = climb_train_avg[nghb_rows]
    for user, rating in held_out.items():
        nghb_ratings = climb_train_values[nghb_rows, climb_train_col_of[user]]
        usable = ~np.isnan(nghb_ratings)
        sum_of_sim = nghb_sims[usable].sum()
        if (sum_of_sim != 0):
            # Similarity-weighted mean of the neighbours' deviations from their
            # own averages, added to this climb's average.
            predicted_rating = np.dot(nghb_sims[usable], nghb_ratings[usable] - nghb_avgs[usable]) / sum_of_sim
            predicted_rating += climb_train_avg[train_row]
            climb_predictions.append(predicted_rating)
            climb_actual.append(rating)
            climb_success += 1
        else:
            climb_unable_to_estimate += 1

This generates a user-based recommendation system for the climb data

Input: CSV of climb ratings by user

Output: Performance data

In [None]:
# Transposed so that each row is a user and each column is a climb.
user_ratings = pd.read_csv('climb_ratings.csv', index_col='Id').T

# Missing ratings are turned into 0 before building the sparse matrix so that
# only real ratings are stored and split. NaN counts as a non-zero value, so
# leaving it in made every cell of the matrix an explicit entry.
user_ratings_train, user_ratings_test = random_train_test_split(
    coo_matrix(user_ratings.fillna(0).to_numpy()),
    test_percentage=0.2,
    random_state=np.random.RandomState(123),
)

user_ratings_train = pd.DataFrame.sparse.from_spmatrix(user_ratings_train, index=user_ratings.index, columns=user_ratings.columns).sparse.to_dense()
user_ratings_test = pd.DataFrame.sparse.from_spmatrix(user_ratings_test, index=user_ratings.index, columns=user_ratings.columns).sparse.to_dense()

#Temp Fix
# Zeros mark cells that are absent from the split. A scraped rating of 0 is
# indistinguishable from a missing one here.
user_ratings_train = user_ratings_train.replace(0, np.nan)
user_ratings_test = user_ratings_test.replace(0, np.nan)

#adds averages and replaces nans with row average
# Values stay floating point: casting to int truncated both the averages and
# the filled-in values.
user_ratings_train['avg'] = user_ratings_train.mean(axis=1)
user_ratings_train = user_ratings_train.dropna(how='all').T.fillna(user_ratings_train['avg'], axis=0).T

# NOTE(review): the 'avg' column is part of the vectors compared here -- confirm that is intended.
correlation = 1-pairwise_distances(user_ratings_train, metric="cosine")

train_model = NearestNeighbors(n_neighbors=10)
train_model.fit(correlation)

neighbors_distance, neighbors_ind = train_model.kneighbors()
# Cosine similarity between each user and their neighbours. This is the
# prediction weight; the neighbour distance grows as users become less alike,
# so weighting by it favoured the least similar neighbours.
neighbors_similarity = correlation[np.arange(len(correlation))[:, None], neighbors_ind]

neighbor_ranks = range(1, 11)
# N* = neighbour row position, D* = neighbour distance, S* = cosine similarity.
user_neighbors = pd.DataFrame(
    np.hstack([neighbors_ind, neighbors_distance, neighbors_similarity]),
    columns=['N' + str(k) for k in neighbor_ranks] + ['D' + str(k) for k in neighbor_ranks] + ['S' + str(k) for k in neighbor_ranks],
    index=user_ratings_train.index,
)

user_predictions = []
user_actual = []
user_success = 0
user_unable_to_estimate = 0

# Plain arrays and lookup tables keep the per-rating work cheap.
user_train_values = user_ratings_train.to_numpy(dtype=float)
user_train_avg = user_ratings_train['avg'].to_numpy(dtype=float)
user_train_row_of = {label: position for position, label in enumerate(user_ratings_train.index)}
user_train_col_of = {label: position for position, label in enumerate(user_ratings_train.columns)}

for user_id, row in user_ratings_test.iterrows():
    held_out = row.dropna()
    train_row = user_train_row_of.get(user_id)
    if train_row is None:
        # The user has no training ratings, hence no neighbours to predict from.
        user_unable_to_estimate += len(held_out)
        continue
    nghb_rows = neighbors_ind[train_row]
    nghb_sims = neighbors_similarity[train_row]
    nghb_avgs = user_train_avg[nghb_rows]
    for climb, rating in held_out.items():
        nghb_ratings = user_train_values[nghb_rows, user_train_col_of[climb]]
        usable = ~np.isnan(nghb_ratings)
        sum_of_sim = nghb_sims[usable].sum()
        if (sum_of_sim != 0):
            # Similarity-weighted mean of the neighbours' deviations from their
            # own averages, added to this user's average.
            predicted_rating = np.dot(nghb_sims[usable], nghb_ratings[usable] - nghb_avgs[usable]) / sum_of_sim
            predicted_rating += user_train_avg[train_row]
            user_predictions.append(predicted_rating)
            user_actual.append(rating)
            user_success += 1
        else:
            user_unable_to_estimate += 1

This analyzes the performance of the recommendation systems

Input: Performance data

Output: Performance analysis

In [None]:
def _report_recommendation_performance(label, actual, predictions, estimated_count, not_estimated_count, title, filename):
    """Print error metrics for one recommender and save a histogram of its errors.

    Args:
        label: prefix for the printed lines (e.g. 'Climb').
        actual: held-out ratings.
        predictions: predicted ratings, in the same order as `actual`.
        estimated_count: number of ratings a prediction was produced for.
        not_estimated_count: number of ratings that could not be predicted.
        title: title of the saved histogram.
        filename: path the histogram image is written to.

    Returns:
        Array of prediction minus actual for every predicted rating.
    """
    mse = mean_squared_error(actual, predictions)
    print(label + ' MAE: ' + str(mean_absolute_error(actual, predictions)))
    print(label + ' MSE: ' + str(mse))
    # RMSE is derived from the MSE directly; the `squared=False` argument of
    # mean_squared_error is deprecated and absent from recent scikit-learn.
    print(label + ' RMSE: ' + str(np.sqrt(mse)))
    print(label + ' Number Could Estimate: ' + str(estimated_count))
    print(label + ' Number Could Not Estimate: ' + str(not_estimated_count))
    over_estimate = np.array(predictions)-np.array(actual)
    plt.hist(over_estimate, bins=4)
    plt.xlabel('Deviation from Actual')
    plt.title(title)
    plt.savefig(filename)
    plt.clf()
    return over_estimate


climb_over_estimate = _report_recommendation_performance(
    'Climb', climb_actual, climb_predictions, climb_success, climb_unable_to_estimate,
    'Item Based Recommendation Performance', 'item_based_recommendation_performance.png',
)

user_over_estimate = _report_recommendation_performance(
    'User', user_actual, user_predictions, user_success, user_unable_to_estimate,
    'User Based Recommendation Performance', 'user_based_recommendation_performance.png',
)