In [2]:
import pandas as pd
import yaml
import random

# Load the metadata table first, then the raw survey export it describes.
meta = pd.read_csv('../data/feature_metadata.csv')
df = pd.read_csv('../data/raw.csv')

# YAML file holding the 'front' and 'back' word lists used to build ids below.
with open('../data/name_gen.yml') as f:
    names = yaml.safe_load(f)

# Rename every column listed under SurveyKey to its DataFrameKey counterpart.
column_names = dict(zip(meta['SurveyKey'], meta['DataFrameKey']))
df = df.rename(columns=column_names)

def clean_height(value):
    """Parse a raw height answer into a float.

    A literal ``'cm'`` is stripped before parsing. The value is converted
    with ``str`` first, so non-string input (for example the float NaN that
    pandas uses for a missing answer) is rejected instead of raising
    ``AttributeError``.

    Args:
        value: Raw cell value, e.g. ``'180'`` or ``'180cm'``.

    Returns:
        The parsed float when it lies strictly between 100 and 220,
        otherwise ``None`` (missing, unparseable, or out of range).
    """
    text = str(value).replace('cm', '')
    try:
        height = float(text)
    except ValueError:
        return None
    # NaN fails both comparisons, so a missing answer ends up as None.
    if 100 < height < 220:
        return height
    return None
    
def clean_sex(value):
    """Translate the spelled-out sex answer into a one-letter code.

    'Male' becomes 'M' and 'Female' becomes 'F'. Any other value raises
    ``KeyError`` because it is looked up directly in the code table.
    """
    codes = dict(Male='M', Female='F')
    return codes[value]

def clean_weight(value):
    """Parse a raw weight answer into a float.

    A literal ``'kg'`` is stripped before parsing. The value is converted
    with ``str`` first, so non-string input (for example the float NaN that
    pandas uses for a missing answer) is rejected instead of raising
    ``AttributeError``.

    Args:
        value: Raw cell value, e.g. ``'70'`` or ``'70kg'``.

    Returns:
        The parsed float when it lies strictly between 30 and 100,
        otherwise ``None`` (missing, unparseable, or out of range).
    """
    text = str(value).replace('kg', '')
    try:
        weight = float(text)
    except ValueError:
        return None
    # NaN fails both comparisons, so a missing answer ends up as None.
    if 30 < weight < 100:
        return weight
    return None

def clean_wingspan(value):
    """Parse a raw wingspan answer into a float.

    The value is stringified, a literal ``'cm'`` is stripped, and the rest
    is parsed as a float. Only ``ValueError`` from the parse is handled, so
    unrelated exceptions (e.g. ``KeyboardInterrupt``) are no longer
    swallowed.

    Args:
        value: Raw cell value; may be a string, a number, or NaN.

    Returns:
        The parsed float when it lies strictly between 100 and 220,
        otherwise ``None`` (missing, unparseable, or out of range).
    """
    text = str(value).replace('cm', '')
    try:
        wingspan = float(text)
    except ValueError:
        return None
    # NaN fails both comparisons, so a missing answer ends up as None.
    if 100 < wingspan < 220:
        return wingspan
    return None

def clean_Vgrade(value):
    """Convert a grade string to an int by dropping its first character.

    For example ``'V7'`` yields ``7``.
    """
    numeric_part = value[1:]
    return int(numeric_part)

def clean_I_dont(value):
    """Replace the "I don't ..." opt-out answers with ``None``.

    Any other value is returned unchanged.
    """
    opt_outs = ("I don't boulder", "I don't climb routes")
    return None if value in opt_outs else value

def clean_list(value):
    """Split a comma-separated multi-select answer into a list of strings.

    Returns ``None`` for the "not applicable" answers listed below and for
    any non-string value; otherwise returns ``value.split(', ')``.
    """
    not_applicable = (
        "I don't train for endurance",
        "I don't hangboard",
        "No other strength training",
    )
    if value in not_applicable:
        return None
    if not isinstance(value, str):
        return None
    return value.split(', ')

def clean_maxload(value):
    """Parse a raw max-load answer into a float.

    The value is stringified, a literal ``'kg'`` is stripped, and the rest
    is parsed as a float. Only ``ValueError`` from the parse is handled, so
    unrelated exceptions (e.g. ``KeyboardInterrupt``) are no longer
    swallowed.

    Args:
        value: Raw cell value; may be a string, a number, or NaN.

    Returns:
        The parsed float when it is below 70, otherwise ``None``. There is
        no lower bound, so negative loads are returned as-is.
    """
    text = str(value).replace('kg', '')
    try:
        load = float(text)
    except ValueError:
        return None
    # NaN fails the comparison, so a missing answer ends up as None.
    if load < 70:
        return load
    return None
    
def clean_activities(value):
    """Extract normalised activity categories from a free-text answer.

    The answer is lower-cased and scanned for known keywords (including a
    few misspellings). Each keyword found contributes its category, and the
    matched text is removed so a more general keyword listed later cannot
    match the same words again (e.g. 'trail running' is not also counted as
    'running').

    Args:
        value: Raw answer; anything that is not a string is treated as
            missing.

    Returns:
        A list of unique category names in keyword order, or ``None`` when
        the value is not a string or no keyword matches.
    """
    # Order matters: specific phrases ('hiit running', 'trail running',
    # 'mountain biking') come before the general keywords they overlap with.
    cats = {
        'yoga': ['yoga', 'yogs', 'yoha'],
        'stretching': ['stretching'],
        'mobility': ['mobility'],
        'pilates': ['pilates'],
        'cardio': ['cardio', 'stairs', 'jump rope', 'hiit running'],
        'trail running': ['trail running'],
        'running': ['treadmill', 'running', 'jogging'],
        'mountain biking': ['mountain biking', 'MTB'],
        'biking': ['cycling', 'bike', 'spinning', 'BMX'],
        'hiking': ['hiking'],
        'walking': ['walking'],
        'acrobatics': ['gymnastics', 'trapeze', 'acro', 'parkour'],
        'martial arts': ['muay thai', 'grappling'],
        'calisthenics': ['calisthenics', 'ring work'],
        'weight training': ['weight lifting', 'weights', 'strongman', 'crossfit'],
        'soccer': ['soccer'],
        'skiing': ['ski'],
        'surfing': ['surfing'],
        'swimming': ['swimming'],
        # A 'none' category for answers such as 'nope' / 'na' / '0' was left
        # disabled; those answers are not mapped to any category here.
    }
    # NOTE(review): an unused `singletons` list previously named further
    # activities (rowing, volley, basket, squash, badminton, ultimate frisbee,
    # core, dance, paddling, sailing, skateboard, discgolf, skating). They are
    # not recognised here; add categories above if they should be kept.

    if not isinstance(value, str):
        return None

    remaining = value.lower()
    acts = []
    for act, keywords in cats.items():
        for kw in keywords:
            # The text is lower-cased, so keywords such as 'MTB' must be too.
            kw = kw.lower()
            if kw in remaining:
                if act not in acts:
                    acts.append(act)
                # str.replace returns a new string; keep it so the consumed
                # words cannot be matched again by a later keyword.
                remaining = remaining.replace(kw, '')

    return acts or None

def clean_maxcount(value):
    """Parse a raw repetition count into an int.

    Only the conversion errors ``int`` can raise for bad input are handled
    (``TypeError`` for ``None``, ``ValueError`` for NaN or non-numeric text,
    ``OverflowError`` for infinity), so unrelated exceptions are no longer
    swallowed.

    Args:
        value: Raw cell value; may be a string, a number, NaN, or ``None``.

    Returns:
        ``int(value)`` when it is below 100, otherwise ``None``. There is no
        lower bound.
    """
    try:
        count = int(value)
    except (TypeError, ValueError, OverflowError):
        return None
    if count < 100:
        return count
    return None

# Column name -> function applied to every value of that column.
# One-off cleaners first, then the cleaners shared by several columns.
rules = {
    'sex': clean_sex,
    'height': clean_height,
    'weight': clean_weight,
    'wingspan': clean_wingspan,
    'other_activities': clean_activities,
}
# Comma-separated multi-select answers.
rules.update(dict.fromkeys(
    ('hangboard_grips', 'hangboard_style', 'endurance_style', 'strength_style'),
    clean_list,
))
# Added-load answers.
rules.update(dict.fromkeys(
    ('halfcrimp_maxload', 'opencrimp_maxload', 'pullups_maxload'),
    clean_maxload,
))
# Repetition counts.
rules.update(dict.fromkeys(
    ('pullups_maxcount', 'pushups_maxcount'),
    clean_maxcount,
))
# Not cleaned yet (entries left disabled):
#   'hardest_boulder_ever', 'hardest_boulder_recent',
#   'hardest_boulder_confident' -> clean_Vgrade
#   'lsit_maxtime' -> no cleaner chosen

# Columns listed here are removed before cleaning, so any rule registered
# for them is never applied.
dont_trust = [
    'halfcrimp_maxload',
    'halfcrimp_minedge',
    'opencrimp_maxload',
    'opencrimp_minedge',
    'pullups_maxload',
    'lsit_maxtime',
]
df = df.drop(columns=dont_trust)

# Run the registered cleaner over each remaining column, value by value.
for col in df.columns:
    cleaner = rules.get(col)
    if cleaner is None:
        continue
    df[col] = [cleaner(raw) for raw in df[col].values]

# Every 'front' + ' ' + 'back' combination is a candidate id.
name_bank = [adj+' '+noun for adj in names['front'] for noun in names['back']]

# Fail with a clear message instead of an IndexError from an exhausted bank.
if len(df) > len(name_bank):
    raise ValueError(
        f"name bank has {len(name_bank)} names but {len(df)} rows need an id"
    )

# random.sample draws without replacement in one pass, replacing the
# choose-then-remove loop, which rescanned the list on every removal.
# NOTE(review): no seed is set, so ids differ between runs; confirm that
# is intended.
cids = random.sample(name_bank, len(df))

df['cid'] = cids



In [25]:
def bin(series, n_bins, unit):
    """Map each distinct non-missing value of ``series`` to an equal-width bin label.

    The range ``[series.min(), series.max()]`` is cut into ``n_bins`` equal
    intervals. Each label has the form ``"<start>-<end> <unit>"`` with both
    edges truncated to ints. The maximum value is placed in the last bin.

    Args:
        series: Numeric pandas Series; NaN entries are skipped.
        n_bins: Number of bins, at least 1.
        unit: Text appended to every label.

    Returns:
        Dict from value to its bin label. Empty when the series has no
        non-missing values. When all values are equal (zero-width range)
        every value maps to the first bin.

    Raises:
        ValueError: If ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be at least 1")

    # Named to avoid shadowing the builtins min/max inside the function.
    lowest = series.min()
    highest = series.max()
    if lowest != lowest:
        # min() is NaN when there is nothing but missing values to bin.
        return {}

    bin_size = (highest - lowest) / n_bins
    levels = [lowest + n * bin_size for n in range(n_bins + 1)]
    labels = [
        f"{int(start)}-{int(end)} {unit}"
        for start, end in zip(levels, levels[1:])
    ]

    bin_map = {}
    for value in series:
        if value != value:
            # NaN is the only value not equal to itself.
            continue
        if bin_size == 0:
            # All values identical: dividing would give NaN, so use bin 0.
            bindex = 0
        else:
            # The maximum lands exactly on n_bins; clamp it into the last bin.
            bindex = min(int((value - lowest) / bin_size), n_bins - 1)
        bin_map[value] = labels[bindex]

    return bin_map

# (column, unit) pairs; each column gets a '<column>_binned' companion.
bin_me = [
    ('height', 'cm'),
    ('weight', 'kg'),
    ('wingspan', 'cm'),
    ('pullups_maxcount', 'reps'),
    ('pushups_maxcount', 'reps'),
]
for attr, unit in bin_me:
    bin_map = bin(df[attr], 8, unit)
    # NaN is the only value not equal to itself; missing values stay None.
    df[f"{attr}_binned"] = [
        None if raw != raw else bin_map[raw]
        for raw in df[attr].values
    ]

In [26]:
# Write the processed frame to disk; index=False leaves out the row index.
df.to_csv('../data/clean.csv', index=False)