# Rename columns

The goal of this notebook is to rename columns and their values.

In [None]:
import glob
import pandas as pd

In [None]:
# Get all column key files.
# glob returns paths in arbitrary, filesystem-dependent order; sort them so
# the combined key table (and any last-one-wins mapping built from it) is
# reproducible from run to run.
files = sorted(glob.glob('resources/column-keys/*.csv'))

In [None]:
# Fix the column key for the symptom checklist. The symptom checklist is so
# different from the other keys that it does not fit a common function.
def fix_symptom_checklist(df):
    """Fold the 'When' column of the symptom checklist key into 'Question'.

    Every question is rewritten as '<When>--<Question>' and the 'When'
    column is then dropped.

    The update is done column-wise on purpose: assigning to the rows
    produced by ``DataFrame.iterrows()`` is not guaranteed to write back to
    the frame, so a row-by-row loop can silently leave 'Question' unchanged.

    Args:
        df: key dataframe containing 'When' and 'Question' columns. It is
            modified in place.

    Returns:
        The same dataframe object, without the 'When' column.
    """
    df['Question'] = df['When'] + '--' + df['Question']
    del df['When']
    return df

In [None]:
# Generate a single dataframe to map column shorthands to actual questions.
# Only key files that end up with exactly this column layout are kept; any
# other file is reported and skipped.
expected_columns = ['Question', 'Variable Name', 'Variable Type', 'Values', 'Value Labels']
dfs = []
for key_file in files:
    # header=1: the column names are taken from the second line of the file.
    df = pd.read_csv(key_file, header=1)
    # Some keys call the label column 'Value Scale'; move it to 'Value Labels'.
    if 'Value Scale' in df:
        df['Value Labels'] = df['Value Scale']
        del df['Value Scale']
    if 'Symptom_Checklist.csv' in key_file:
        df = fix_symptom_checklist(df)
    # Normalise the remaining header variants (note the trailing space in 'Question ').
    df = df.rename(columns={'Variable':'Variable Name', 'Value':'Values', 'Value Label':'Value Labels', 'Question ':'Question'})
    if df.columns.tolist() == expected_columns:
        dfs.append(df)
    else:
        # Single-argument print(...) calls behave the same on Python 2 and 3.
        # Four blank lines visually separate the report of each rejected file.
        print('\n\n\n')
        print(key_file)
        print(df.columns.tolist())
all_keys = pd.concat(dfs)

In [None]:
# Preview the first 100 rows of the shorthand-to-question mapping.
all_keys.head(n=100)

In [None]:
# Remove initial numbering.
def remove_number_prefix(name):
    """Strip a leading '<digits>.' numbering prefix from a question.

    Only the first period is treated as the separator, so periods that occur
    later in the text (e.g. in 'e.g.') are preserved.

    Args:
        name: question text.

    Returns:
        The text after the numbering prefix with surrounding whitespace
        removed, or ``name`` unchanged when it does not start with a run of
        digits followed by a period.
    """
    prefix, dot, remainder = name.partition('.')
    if dot and prefix.isdigit():
        return remainder.strip()
    return name

In [None]:
# Remove byte sequences that are not valid UTF-8.
def encode_correctly(question):
    """Return ``question`` as clean text in the interpreter's native str type.

    Byte strings are decoded as UTF-8 with undecodable bytes dropped. On
    Python 2 the result is re-encoded to a UTF-8 byte string, as before; on
    Python 3, where ``str`` has no ``decode`` method, already-decoded text
    is returned unchanged.

    Args:
        question: question text as a byte string or text string.

    Returns:
        The cleaned question.
    """
    if isinstance(question, bytes):
        text = question.decode('utf-8', 'ignore')
    else:
        text = question
    if str is bytes:
        # Python 2: the native string type is a byte string.
        return text.encode('utf-8')
    return text
# Clean up the text of every question in the key table.
raw_questions = all_keys['Question']
all_keys['Question'] = raw_questions.apply(encode_correctly)

In [None]:
# Remove superfluous numbering.
# The column is rebuilt as a whole instead of assigning to the rows yielded by
# iterrows(): those rows can be copies, in which case the edits are silently
# lost. A plain list is assigned positionally, so this also works when the
# concatenated key table carries repeated index labels.
unnumbered_questions = [remove_number_prefix(question) for question in all_keys['Question']]
all_keys['Question'] = [
    # A question that is only 'Item <n>' carries no information; fall back to
    # the variable name.
    variable_name if question.split('Item ')[-1].isdigit() else question
    for question, variable_name in zip(unnumbered_questions, all_keys['Variable Name'])
]

In [None]:
# Load the table whose columns are going to be renamed.
subsample_path = 'resources/questions_and_mri_structured_subsample.csv'
df = pd.read_csv(subsample_path)

In [None]:
# Replace each shorthand column name with its full question text.
shorthand_to_question = dict(zip(all_keys['Variable Name'], all_keys['Question']))
df = df.rename(columns=shorthand_to_question)

In [None]:
# Rename the diagnosis columns to something more readable.
diagnosis_names = {'Anx': 'Anxiety', 'ASD': 'Autism Spectrum Disorder'}
df = df.rename(columns=diagnosis_names)

In [None]:
# Turn the binary sex codes into readable gender descriptions.
def replace_binaries_gender(value):
    """Map 0 to 'Male' and 1 to 'Female'; return any other value unchanged."""
    if value == 0.0:
        return 'Male'
    if value == 1.0:
        return 'Female'
    return value
# Relabel every entry of the 'Sex' column.
sex_codes = df['Sex']
df['Sex'] = sex_codes.apply(replace_binaries_gender)

In [None]:
# Write the renamed table to a new .csv, leaving out the row index.
output_path = 'resources/questions_and_mri_structured_subsample_rename.csv'
df.to_csv(output_path, index=False)