In [73]:
import pandas as pd
import uuid
import json
import yaml
import random

# Load the raw survey export and the per-feature metadata table.
df = pd.read_csv('../data/raw.csv')
meta = pd.read_csv('../data/feature_metadata.csv')

# Word lists ('front' / 'back') used further down to build pseudonyms.
with open('../data/name_gen.yml') as f:
    names = yaml.safe_load(f)

# Relabel the columns: each SurveyKey header becomes its DataFrameKey name.
df.rename(
    columns=dict(zip(meta['SurveyKey'], meta['DataFrameKey'])),
    inplace=True,
)

In [78]:
# Collect the distinct raw values of every column so they can be eyeballed
# before writing cleaning rules. The timestamp column is skipped.
distinct_values = {}
for col in df.columns:
    # The trailing comma matters: ('timestamp') is just a string, which would
    # turn `in` into a substring test and wrongly skip columns such as 'time'.
    if col not in ('timestamp',):
        distinct_values[col] = pd.unique(df[col])

In [86]:
# Scratch check: float() parses a decimal string directly.
float('173.43')

173.43

In [79]:
def clean_height(value):
    """Parse a raw height answer into a float.

    Any literal 'cm' is removed from string input before conversion, so
    answers such as '173', '173 cm' and '167cm' are all accepted.

    Args:
        value: Raw answer, normally a string. Plain numbers are accepted too,
            and missing values (None / NaN) are tolerated.

    Returns:
        The height as a float when it lies strictly between 100 and 220,
        otherwise None (unparseable, missing, or out of range).
    """
    if isinstance(value, str):
        value = value.replace('cm', '')
    try:
        height = float(value)
    except (TypeError, ValueError):
        # Free-text answers and None cannot be converted.
        return None
    # A chained comparison is False for NaN, so missing values fall through.
    if 100 < height < 220:
        return height
    return None
    
def clean_sex(value):
    """Translate the survey's sex answer into a one-letter code.

    'Male' becomes 'M' and 'Female' becomes 'F'. Any other value raises
    KeyError.
    """
    codes = {'Male': 'M', 'Female': 'F'}
    return codes[value]

def clean_weight(value):
    """Parse a raw weight answer into a float.

    Any literal 'kg' is removed from string input before conversion.

    Args:
        value: Raw answer, normally a string. Plain numbers are accepted too,
            and missing values (None / NaN) are tolerated.

    Returns:
        The weight as a float when it lies strictly between 30 and 100,
        otherwise None (unparseable, missing, or out of range).
    """
    if isinstance(value, str):
        value = value.replace('kg', '')
    try:
        weight = float(value)
    except (TypeError, ValueError):
        # Free-text answers and None cannot be converted.
        return None
    # NOTE(review): both bounds are exclusive, so an answer of exactly 100 is
    # rejected - confirm that this is intended.
    if 30 < weight < 100:
        return weight
    return None

def clean_wingspan(value):
    """Parse a raw wingspan answer into a float.

    Any literal 'cm' is removed from string input before conversion.

    Args:
        value: Raw answer, normally a string. Plain numbers are accepted too,
            and missing values (None / NaN) are tolerated.

    Returns:
        The wingspan as a float when it lies strictly between 100 and 220,
        otherwise None (unparseable, missing, or out of range).
    """
    if isinstance(value, str):
        value = value.replace('cm', '')
    try:
        wingspan = float(value)
    except (TypeError, ValueError):
        # Free-text answers and None cannot be converted.
        return None
    # A chained comparison is False for NaN, so missing values fall through.
    if 100 < wingspan < 220:
        return wingspan
    return None

def clean_Vgrade(value):
    """Turn a grade label into its integer part.

    The first character of ``value`` is dropped and the remainder is handed
    to int(), e.g. 'V5' -> 5.
    """
    number = value[1:]
    return int(number)

def clean_I_dont(value):
    """Replace the "I don't ..." placeholder answers with None.

    Any other value is returned unchanged.
    """
    placeholders = ("I don't boulder", "I don't climb routes")
    return None if value in placeholders else value

def clean_list(value):
    """Split a multi-choice answer into a list of its options.

    The placeholder answers that mean "nothing selected" yield None; every
    other value is split on ', '.
    """
    nothing_selected = (
        "I don't train for endurance",
        "I don't hangboard",
        "No other strength training",
    )
    if value in nothing_selected:
        return None
    return value.split(', ')

def clean_maxload(value):
    """Parse a raw max-load answer into a float.

    Any literal 'kg' is removed from string input before conversion.

    Args:
        value: Raw answer, normally a string. Plain numbers are accepted too,
            and missing values (None / NaN) are tolerated.

    Returns:
        The load as a float when it is below 70, otherwise None (unparseable,
        missing, or too large).
    """
    if isinstance(value, str):
        value = value.replace('kg', '')
    try:
        load = float(value)
    except (TypeError, ValueError):
        # Free-text answers and None cannot be converted.
        return None
    # Only an upper bound is enforced; the comparison is False for NaN, so
    # missing values fall through to None.
    if load < 70:
        return load
    return None
    
def clean_activities(value):
    """Map a free-text activities answer onto canonical activity categories.

    The answer is split on ', ' and each piece is looked up, by exact match,
    among the known keywords. Pieces that match no keyword are dropped.

    Args:
        value: Raw answer string. Non-string input (e.g. None or NaN for a
            missing answer) is treated as "no activities".

    Returns:
        A list with one category name per recognised piece, in answer order.
        Duplicates are kept.
    """
    cats = {
        'yoga': ['yoga', 'yogs', 'yoha'],
        'stretching': ['stretching'],
        'mobility': ['mobility'],
        'pilates': ['pilates'],
        '"cardio"': ['cardio', 'stairs', 'jump rope', 'hiit running'],
        'trail running': ['trail running'],
        'running': ['treadmill', 'running', 'jogging'],
        'mountain biking': ['mountain biking', 'MTB'],
        'biking': ['cycling', 'bike', 'spinning', 'BMX'],
        'hiking': ['hiking'],
        'walking': ['walking'],
        'acrobatics': ['gymnastics', 'trapeze', 'acro', 'parkour'],
        'martial arts': ['muay thai', 'grappling'],
        'calisthenics': ['calisthenics', 'ring work'],
        'weight training': ['weight lifting', 'weights', 'strongman', 'crossfit'],
        'soccer': ['soccer'],
        'skiing': ['ski'],
        'surfing': ['surfing'],
        'swimming': ['swimming'],
        'none': ['nope', 'non', 'fuck that', '0', 'na', 'none']
    }
    # NOTE(review): an earlier draft also listed one-off activities (rowing,
    # volley, basket, squash, badminton, ultimate frisbee, core, dance,
    # paddling, sailing, skateboard, discgolf, skating) but never used them,
    # so those answers are still dropped. Add them to `cats` if wanted.

    if not isinstance(value, str):
        return []

    # Invert the table: keyword -> category.
    keyword_to_category = {
        keyword: category
        for category, keywords in cats.items()
        for keyword in keywords
    }
    return [
        keyword_to_category[piece]
        for piece in value.split(', ')
        if piece in keyword_to_category
    ]


# Column name -> cleaning function for that column's raw values.
# Entries that are commented out are currently disabled.
# NOTE(review): the code that applies this mapping is not in this cell -
# presumably each function is mapped over its column; confirm at the call site.
rules = {
    'sex': clean_sex,
    'height': clean_height,
    'weight': clean_weight,
    'wingspan': clean_wingspan,
    # 'hardest_boulder_ever': clean_Vgrade,
    # 'hardest_boulder_recent': clean_Vgrade,
    # 'hardest_boulder_confident': clean_Vgrade,
    # 'hardest_route'
    'hangboard_grips': clean_list,
    'hangboard_style': clean_list,
    # 'halfcrimp_maxload': clean_maxload,
    # 'opencrimp_maxload': clean_maxload,
    'endurance_style': clean_list,
    'strength_style': clean_list,
    'other_activities': clean_activities
}

# Columns flagged as untrustworthy.
# NOTE(review): how this list is consumed is not visible here - confirm.
dont_trust = ['halfcrimp_maxload', 'opencrimp_maxload', 'opencrimp_minedge']

{'sex': array(['Male', 'Female'], dtype=object),
 'height': array(['173', '180', '178', '184', '176', '186', '185', '175', '168',
        '157.5', '163', '174', '188', '164', '190', '177', '179', '185.4',
        '165.1', '157', '183', '167', '182', '170', '62', '193', '172.72',
        '195', '165', '182.8', '171', '160', '172', '187.96', '181',
        '1.68', '177.4', '191', '180.5', '153', '152', '194', '187', '154',
        '180.3', '192', '190.5', '198', '167cm', '189',
        '5 ft 8inches. Im amurican i dont know what centimeters are',
        '196', '159.38', '166', '182.88', '1.67', '159', '173 cm', '162.5',
        '172.7', '162', '177.8', '167.5', '155', '156', '169', '158',
        '182.9', '177.5', '1295', '175.26', '162.56', '167.64', '168.5',
        '185.3', '174.3', '110', '158.75', '201.1', '150', '8', '167.6',
        '1.75'], dtype=object),
 'weight': array(['77', '81', '67', '70', '84', '75', '88', '60', '80', '81.6', '54',
        '50.8', '65', '100', '53', '63.

In [56]:
# Every "<front> <back>" combination from the name-generator word lists.
name_bank = [adj + ' ' + noun for adj in names['front'] for noun in names['back']]

# Fail early with a clear message instead of an IndexError half-way through.
if len(df) > len(name_bank):
    raise ValueError(
        'name bank has %d names but %d rows need one' % (len(name_bank), len(df))
    )

# Draw one pseudonym per row without replacement. random.sample does this in
# a single pass, where repeated choice + list.remove was quadratic.
cids = random.sample(name_bank, k=len(df))
df['cid'] = cids

# As before, leave only the names that were not handed out in name_bank.
assigned = set(cids)
name_bank = [name for name in name_bank if name not in assigned]
