In [7]:
import numpy as np
import pandas as pd
from scipy.stats.mstats import gmean
import plotly.express as px
import pickle
import math
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import ast
from bs4 import BeautifulSoup
import lxml
import cchardet

In [8]:
# Folder that receives the Plotly HTML exports (it is prepended to the file names
# passed to fig.write_html further down).
# NOTE(review): hard-coded absolute Windows path — must be edited to run elsewhere.
html_folder = 'C:\\Users\\Brayden\\Desktop\\Personal Website\\Brayden-L.github.io\\_includes\\linear_regression_routes\\'

In [9]:
# Load the raw route table from the working directory. Later cells read its
# URL, Location, Pitches, Avg Stars, Route Type, Length and Route columns.
df = pd.read_csv('All_Loc.csv')

# Basic Cleaning

In [10]:
# Build the Route ID column (used later as the join key) from the fifth
# "/"-separated piece of each URL, converted to int. Plain assignment appends
# the column at the end when it does not exist yet and overwrites it otherwise.
df["Route ID"] = df["URL"].apply(lambda url: int(url.split("/")[4]))

# Keep only the first row for every Route ID.
df.drop_duplicates(subset='Route ID', inplace=True)

In [11]:
# Turn the ">"-delimited Location string into a list of sublocation names,
# trimming the whitespace around each name in the same pass.
df["Location List"] = df["Location"].apply(
    lambda location: [part.strip() for part in location.split(">")]
)

In [12]:
# Remove erroneous pitch values (rows whose Pitches is negative)
negative_pitches = df['Pitches'] < 0
df.drop(df[negative_pitches].index, inplace=True)

In [13]:
# For Avg Stars, -1 is equivalent to None. We will assign these a value of 0.
# (Alternative not used: dropping the rows whose Avg Stars is NaN.)
unrated = df['Avg Stars'] == -1
df.loc[unrated, 'Avg Stars'] = 0

In [14]:
# Exclude every row whose Route Type is exactly 'Boulder'.
is_boulder = df['Route Type'] == 'Boulder'
df.drop(df[is_boulder].index, inplace=True)

# Length Value Cleaning

In [15]:
# Distribution of the raw Length column with a marginal box plot; the x-range is
# fixed to -100..4000 so negative entries stay visible.
fig = px.histogram(df['Length'], range_x=[-100,4000], marginal='box', title='Length Histogram', width=800)
fig.update_layout(showlegend=False, title_x=0.5)
fig.update_xaxes(title="Length (ft.)", row=1,col=1)
# HTML export of this figure is currently disabled.
# fig.write_html(html_folder + 'Length_Hist.html')

In [16]:
# Length per pitch, computed from the Length values as they are at this point
# (negative and very small lengths are only handled in a later cell).
df['Length/Pitches'] = df['Length'] / df['Pitches']
# Same histogram layout as the Length plot, applied to the per-pitch ratio.
fig = px.histogram(df['Length/Pitches'], range_x=[-100,4000], marginal='box', title='Length/Pitches Histogram', width=800)
fig.update_layout(showlegend=False, title_x=0.5)
fig.update_xaxes(title="Length/Pitches (ft/Pitch)", row=1,col=1)

In [17]:
# Handle negative and unreasonably low values
# Negative lengths are made positive (sign flipped).
is_negative = df['Length'] < 0
df.loc[is_negative, 'Length'] = -df.loc[is_negative, 'Length']

# Whatever is still below 10 is blanked out, so it counts as missing from here on.
is_too_short = df['Length'] < 10
df.loc[is_too_short, 'Length'] = None

In [18]:
# Drop ridge traverses and other erroneous values
# The Length/Pitches column was computed before negative lengths were flipped to
# positive, so it is stale for those rows (their ratio is still negative and can
# never exceed the threshold). Recompute it from the corrected Length first.
df['Length/Pitches'] = df['Length'] / df['Pitches']
df.drop(df[df['Length/Pitches'] > 260].index, inplace=True)

In [19]:
# Flag missing length values prior to imputation
# Assign the mask directly so the column has a real boolean dtype. Seeding the
# column with "" and filling it through .loc leaves an object column of Python
# bools, on which `~column` evaluates to -2 / -1 instead of a logical negation.
df['Length Missing'] = df['Length'].isna()

In [20]:
# Per-pitch-count statistics over the rows that have a Length:
#   pitch_metric     -> median Length for each number of pitches
#   pitch_groupcount -> how many rows back each of those medians
known_length_by_pitch = df[df['Length'].notna()].groupby(['Pitches'])['Length']
pitch_metric = known_length_by_pitch.median()
pitch_groupcount = known_length_by_pitch.count()

# Bars (primary axis) show 1/n per pitch count; the line (secondary axis) shows
# the median length itself.
inverse_count_bars = go.Bar(
    x=pitch_groupcount.index,
    y=1/pitch_groupcount.values,
    name='(1/n) | n = Group Count',
)
median_line = go.Scatter(
    x=pitch_metric.index,
    y=pitch_metric.values,
    name='Global Median Length',
)

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(inverse_count_bars, secondary_y=False)
fig.add_trace(median_line, secondary_y=True)
fig.update_layout(width=800, title='Global Median of Length by Number of Pitches', title_x=0.5)
fig.update_xaxes(title_text="Number of Pitches")
fig.update_yaxes(title_text="(1/n) | n = Group Count", secondary_y=False)
fig.update_yaxes(title_text="Median Length (ft)", secondary_y=True)
fig.write_html(html_folder + 'Global_Med_Length_Plot.html')

In [21]:
# Median Length and row count per number of pitches, over rows that have a Length.
pitch_metric = df[~df['Length'].isna()].groupby(['Pitches'])['Length'].median()
pitch_groupcount = df[~df['Length'].isna()].groupby(['Pitches'])['Length'].count()

# Same layout as the median-length figure, but the line shows the median divided
# by the pitch count, i.e. length per pitch.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Bar(x=pitch_groupcount.index, y=1/pitch_groupcount.values, name='(1/n) | n = Group Count'), secondary_y=False)
# The legend entry names the per-pitch quantity that is actually plotted (it
# previously carried the label of the plain median-length trace).
fig.add_trace(go.Scatter(x=pitch_metric.index, y=pitch_metric.values/pitch_metric.index, name='Global Median Length/Pitch'), secondary_y=True)
fig.update_layout(width=800, title='Global Median Length/Pitch of length by Number of Pitches', title_x=0.5)
fig.update_xaxes(title_text="Number of Pitches")
fig.update_yaxes(title_text="Median Len/Pitch (ft/pitch)", secondary_y=True)
fig.update_yaxes(title_text="(1/n) | n = Group Count", secondary_y=False)
fig.write_html(html_folder + 'Global_Med_Length_Pitch_Plot.html')

In [22]:
# We want the row labels of routes whose (Location Leaf, Pitches) group is at or
# below a critical size; those rows receive the global median length for their
# pitch count instead of a local estimate.
comp_brkpt = 5 # Number of local comparable routes considered to be sufficient, otherwise use global value.

# The first entry of each Location List is used as the grouping location.
df['Location Leaf'] = df['Location List'].apply(lambda x: x[0])

# Size of each (Location Leaf, Pitches) group among the rows that are missing a
# Length, broadcast back onto those rows. transform() keeps the original row
# labels, so no index <-> string round trip is needed to recover them.
# NOTE(review): the count is taken over rows that are MISSING a Length, not over
# rows with a known Length in the same group — confirm this matches the intent
# of "comparable routes".
missing_length = df['Length'].isna()
group_sizes = (
    df[missing_length]
    .groupby(['Location Leaf', 'Pitches'])['Route']
    .transform('count')
)
no_lcomp_list = group_sizes[group_sizes <= comp_brkpt].index.tolist()

df_no_lcomp_subset = df.index.isin(no_lcomp_list)

# Assign those values with no comparable lengths the global median length.
# Series.map looks each pitch count up in pitch_metric; a pitch count that has
# no entry there yields NaN instead of raising KeyError.
df.loc[df_no_lcomp_subset, 'Length'] = df.loc[df_no_lcomp_subset, 'Pitches'].map(pitch_metric)

In [23]:
# Now remaining NaN values are assumed to be entries with comparable lengths
still_missing = df['Length'].isna()

# First we get the geometric mean (rounded to a whole number) of the existing
# lengths per (Location Leaf, Pitches) as a lookup table.
gmean_grouped_df = (
    df[~still_missing]
    .groupby(['Location Leaf', 'Pitches'])['Length']
    .apply(gmean)
    .round(0)
    .reset_index()
)

# Look the local value up for every row that is still missing a Length. join()
# keeps the row labels of the left frame, so the result lines up with df
# directly; rows whose group has no known lengths get NaN and stay missing.
local_lookup = gmean_grouped_df.set_index(['Location Leaf', 'Pitches'])['Length']
local_fill = (
    df.loc[still_missing, ['Location Leaf', 'Pitches']]
    .join(local_lookup, on=['Location Leaf', 'Pitches'])['Length']
)
df.loc[still_missing, 'Length'] = local_fill

# What remains are groups with enough common entries to pass our initial breakpoint, but all entries within the local group are NaN!
# Simply reapply the global median (a pitch count absent from pitch_metric yields NaN).
remaining = df['Length'].isna()
df.loc[remaining, 'Length'] = df.loc[remaining, 'Pitches'].map(pitch_metric)

In [24]:
# Checkpoint the frame with repaired/imputed lengths to the working directory.
df.to_pickle('length_fix.pkl')

In [25]:
# Load the full route frame from a local pickle; later cells read its
# 'Re Statpage' and 'Route ID' columns.
# NOTE(review): unpickling executes arbitrary code — only load files you created.
df_all = pd.read_pickle('All_Loc.pkl')

In [None]:
# Create num_star_ratings
def get_num_star_ratings(res):
    """Read the number of star ratings out of the HTML given in ``res``.

    Args:
        res: Markup handed to BeautifulSoup (parsed with lxml), or None.

    Returns:
        None when ``res`` is None; 0 when the rating-count element is not found
        or its value parses to -1; otherwise the element text as an int, with
        thousands separators removed.
    """
    if res is None:
        return None

    count_node = BeautifulSoup(res, "lxml").select_one(
        '#route-stats > div.onx-stats-table > div > div:nth-child(1) > div > h3 > span'
    )
    if count_node is None:
        return 0

    parsed_count = int(count_node.text.replace(',', ''))
    # -1 is folded into 0, the same value used when the element is absent.
    return 0 if parsed_count == -1 else parsed_count
# Parse every value of the 'Re Statpage' column into its star-rating count.
df_all["Num Star Ratings"] = df_all["Re Statpage"].apply(get_num_star_ratings)

In [None]:
# Bring the repaired Length and its Length Missing flag over from df, matching on
# Route ID. The inner join keeps only routes present in both frames; the Length
# column that exists on both sides comes out as Length_x (df_all) / Length_y (df).
length_columns = df[['Route ID', 'Length', 'Length Missing']]
df_all = df_all.merge(length_columns, on='Route ID', how='inner')

In [None]:
# Discard the Length column that came from df_all's side of the merge.
df_all.drop(columns=['Length_x'], inplace=True)

In [None]:
# The surviving merged length column becomes the one and only Length.
df_all.rename(columns={'Length_y': 'Length'}, inplace=True)

# Fix the column set and order of the cleaned frame.
cleaned_columns = [
    'Route', 'Location', 'URL',
    'Avg Stars', 'Your Stars', 'Num Star Ratings',
    'Route Type', 'Original Rating', 'Rating',
    'Pitches', 'Length', 'Length Missing',
    'Area Latitude', 'Area Longitude',
    'Route ID', 'Risk', 'Base Location',
    'Re Mainpage', 'Re Statpage',
    'SP/MP', 'Route Ticks', 'Num Ticks', 'Num Tickers',
    'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio',
    'Mean Attempts To RP', 'Tick Counts',
]
df_all = df_all[cleaned_columns]

In [None]:
# Persist the cleaned, reordered frame to the working directory.
df_all.to_pickle('All_Loc_Cleaned.pkl')

# Strip less useful columns

In [None]:
# Strip Unwanted Data
df_all.drop(['URL', 'Your Stars', 'Original Rating', 'Re Mainpage', 'Re Statpage', 'Route Ticks', 'Tick Counts'], axis=1, inplace=True)

# Save the frame that was just stripped. This previously pickled `df`, the
# intermediate length-fix frame, so the "stripped" file did not contain the
# cleaned df_all columns at all.
pd.to_pickle(df_all, 'All_Loc_Cleaned_Stripped.pkl')