In [73]:
import pandas as pd
import uuid
import json
import yaml
import random



In [103]:
df = pd.read_csv('../data/raw.csv')
meta = pd.read_csv('../data/feature_metadata.csv')

with open('../data/name_gen.yml') as f:
    names = yaml.safe_load(f)

df.rename(columns={k: v for k,v in meta[['SurveyKey', 'DataFrameKey']].values}, inplace=True)

def clean_height(value):
    value = value.replace('cm', '')
    try:
        value = float(value)
        if 220 > value > 100:
            return value
    except:
        return None
    
def clean_sex(value):
    return {
        'Male': 'M',
        'Female': 'F'
    }[value]

def clean_weight(value):
    value = value.replace('kg', '')
    try:
        value = float(value)
        if 100 > value > 30:
            return value
    except:
        return None

def clean_wingspan(value):
    value = str(value).replace('cm', '')
    try:
        value = float(value)
        if 220 > value > 100:
            return value
    except:
        return None 

def clean_Vgrade(value):
    return int(value[1:])

def clean_I_dont(value):
    empty = (
        "I don't boulder",
        "I don't climb routes", 
    )
    if value in empty:
        value = None
    return value

def clean_list(value):
    empty = (
        "I don't train for endurance",
        "I don't hangboard",
        "No other strength training"
    )
    if value in empty or not isinstance(value, str):
        value = None
    else:
        value = value.split(', ')
    return value

def clean_maxload(value):
    value = str(value).replace('kg', '')
    try:
        value = float(value)
        if 70 > value:
            return value
    except:
        return None
    
def clean_activities(value):
    cats = {
        'yoga': ['yoga', 'yogs', 'yoha'],
        'stretching': ['stretching'],
        'mobility': ['mobility'],
        'pilates': ['pilates'],
        'cardio': ['cardio', 'stairs', 'jump rope', 'hiit running'], 
        'trail running': ['trail running'],
        'running': ['treadmill', 'running', 'jogging'],
        'mountain biking': ['mountain biking', 'MTB'],
        'biking': ['cycling', 'bike', 'spinning', 'BMX'],
        'hiking': ['hiking'],
        'walking': ['walking'],
        'acrobatics': ['gymnastics', 'trapeze', 'acro', 'parkour'],
        'martial arts': ['muay thai', 'grappling'],
        'calisthenics': ['calisthenics', 'ring work'],
        'weight training': ['weight lifting', 'weights', 'strongman', 'crossfit'],
        'soccer': ['soccer'],
        'skiing': ['ski'],
        'surfing': ['surfing'],
        'swimming': ['swimming'],
        'none': ['nope', 'non', 'fuck that', '0', 'na', 'none']
    }
    convert_cat = {}
    for act, v in cats.items():
        for kw in v:
            convert_cat[kw] = act

    singletons = [
        'rowing', 'volley', 'basket', 'squash', 'badminton', 'ultimate frisbee',
        'core', 'dance', 'paddling', 'sailing', 'skateboard', 'discgolf', 'skating'
    ]

    acts = []
    if isinstance(value, str):
        for act in value.split(', '):
            for kw in convert_cat.keys():
                if act in kw:
                    acts.append(convert_cat[kw])
    
    if acts == ['none'] or acts == []:
        acts = None
    
    return acts

def clean_maxcount(value):
    try:
        value = int(value)
        return value
    except:
        return None

rules = {
    'sex': clean_sex,
    'height': clean_height,
    'weight': clean_weight,
    'wingspan': clean_wingspan,
    # 'hardest_boulder_ever': clean_Vgrade,
    # 'hardest_boulder_recent': clean_Vgrade,
    # 'hardest_boulder_confident': clean_Vgrade,
    'hangboard_grips': clean_list,
    'hangboard_style': clean_list,
    'halfcrimp_maxload': clean_maxload,
    'opencrimp_maxload': clean_maxload,
    'endurance_style': clean_list,
    'strength_style': clean_list,
    'other_activities': clean_activities,
    'pullups_maxcount': clean_maxcount,
    'pullups_maxload': clean_maxload,
    'pushups_maxcount': clean_maxcount,
    # 'lsit_maxtime': 
}

dont_trust = ['halfcrimp_maxload', 'opencrimp_maxload', 'opencrimp_minedge', 'pullups_maxload', 'lsit_maxtime']
df.drop(columns=dont_trust, inplace=True)
for col in df.columns:
    if col in rules:
        df[col] = [rules[col](val) for val in df[col].values]


In [104]:
df

Unnamed: 0,timestamp,sex,height,weight,wingspan,years_climbing,indoor_outdoor,hardest_boulder_ever,hardest_boulder_recent,hardest_boulder_confident,...,weekly_count_campus,weekly_hours_campus,weekly_count_endurance,endurance_style,weekly_count_strength,session_hours_strength,strength_style,other_activities,pullups_maxcount,pushups_maxcount
0,29/01/2017 20:12:46,M,173.0,77.0,178.0,4.5 - 5 years,Indoor and outdoor climbing,V8,V8,V6,...,0,0,1,[4x4],3,4,"[Antagonists, Legs, Core]",,15.0,40.0
1,29/01/2017 20:17:27,M,180.0,81.0,180.0,3 - 3.5 years,Indoor Climbing only,V3,V3,V1,...,0,0,1,[Laps of routes],2,2,"[Antagonists, Legs, Core, Upper body pulling, ...",[stretching],11.0,24.0
2,29/01/2017 20:28:14,M,178.0,67.0,175.0,.5 - 1 years,Indoor and outdoor climbing,V7,V6,V5,...,0,0,2,"[4x4, ARC, route climbing intervals]",3,2,"[Antagonists, Core, Upper body pulling, Upper ...",[soccer],17.0,
3,29/01/2017 20:51:08,M,173.0,70.0,178.0,9 - 9.5 years,Indoor and outdoor climbing,V5,V4,V3,...,0,0,1,"[Laps of routes, route climbing intervals]",0,0,"[Antagonists, Legs, Core, No other strength tr...",,8.0,30.0
4,29/01/2017 21:03:19,M,184.0,84.0,197.0,6.5 - 7 years,Indoor and outdoor climbing,V10,V10,V7,...,0,0,2,"[4x4, Max moves, threshold intervals]",2,1,"[Core, Upper body pushing]",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,14/11/2023 13:22:09,M,167.6,54.4,160.0,3 - 3.5 years,Indoor and outdoor climbing,V7,V6,V3,...,0,0,0,,5,5,"[Antagonists, Legs, Upper body pulling, Upper ...",,10.0,36.0
622,28/11/2023 05:37:05,M,,66.0,,.5 - 1 years,Indoor Climbing only,V7,V7,V5,...,1,1,2,"[Max moves, systems boards, other]",2,2,"[Antagonists, Legs, Core, Upper body pulling, ...",[biking],32.0,45.0
623,21/12/2023 23:29:51,M,196.0,75.0,199.0,3 - 3.5 years,Indoor Climbing only,V10,V10,V8,...,0,0,0,,0,0,,,23.0,50.0
624,02/01/2024 09:21:16,F,152.0,38.4,154.0,5.5 - 6 years,Indoor Climbing only,V7,V7,V5,...,0,0,2,"[4x4, Laps of routes, other]",2,1,"[Antagonists, Legs, Core, Upper body pulling, ...",,17.0,44.0


In [106]:
name_bank = [adj+' '+noun for adj in names['front'] for noun in names['back']]
cids = []
for i in range(len(df)):
    cid = random.choice(name_bank)
    cids.append(cid)
    name_bank.remove(cid)

df['cid'] = cids
