In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../data/01-raw/dag.csv")
df

Unnamed: 0,SUBJID,SATMath,SATWriting,SATVerbal,GPA,hab1,hab2,hab3,hab4,state,gender,race,income,choice,accepted,school,numapply
0,884230,450,420,430,3.3,1,1,2,2,AL,Female,Black,"$25,000-$29,999",4.0,1.0,2192.0,9
1,884232,430,430,470,2.7,2,2,1,1,AL,Male,Black,"$150,000-$199,999",1.0,1.0,2192.0,6
2,884233,430,320,320,2.0,1,2,0,1,AL,Male,Black,"$40,000-$49,999",4.0,0.0,2192.0,4
3,884247,560,540,580,4.0,2,2,2,2,AL,Female,Black,"$100,000-$149,999",1.0,1.0,2192.0,9
4,884307,520,570,510,3.3,2,1,2,2,AL,Male,Black,"$10,000-$14,999",4.0,1.0,2192.0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91259,886635,300,500,400,3.7,1,1,1,2,FL,Female,Two or more race/ethnicity,"$250,000 or more",1.0,1.0,1691.0,6
91260,886640,670,660,650,3.7,1,2,2,2,FL,Female,White,"$75,000-$99,999'",1.0,1.0,1691.0,3
91261,886642,610,780,740,4.0,1,2,2,1,FL,Female,Two or more race/ethnicity,"$60,000-$74,999",1.0,1.0,1691.0,9
91262,886648,580,510,550,3.3,2,2,2,1,FL,Male,White,"$60,000-$74,999",2.0,1.0,1691.0,6


In [3]:
habit_cols = ['hab1', 'hab2', 'hab3', 'hab4']
df['habits'] = np.sum(df[habit_cols], axis=1)
df.drop(habit_cols, axis=1, inplace=True)

In [4]:
def parse_income(string):
    """Parse income string and return the average value."""
    try:
        lower, upper = "".join(string.replace("$", "")).replace(",","_").split('-')
        return np.mean([ int(lower[:2]), int(upper[:2])+1 ]) * 1_000
    except ValueError:
        if "or more" in string:
            return 250_000
        elif "or less" in string:
            return 10_000

In [5]:
# recode income
df['income'] = df['income'].apply(lambda x: parse_income(x))

In [6]:
def normalize_col(col):
    """Normalize a column so it has mean 0 and std 1."""
    m = np.mean(col)
    std = np.std(col)
    norm_vals = (col - m) / std

    assert np.round( np.mean(norm_vals), 5) == 0, "Mean is not 0"
    assert np.round( np.std(norm_vals), 5) == 1, "Std is not 1"

    return norm_vals

In [7]:
df

Unnamed: 0,SUBJID,SATMath,SATWriting,SATVerbal,GPA,state,gender,race,income,choice,accepted,school,numapply,habits
0,884230,450,420,430,3.3,AL,Female,Black,27500.0,4.0,1.0,2192.0,9,6
1,884232,430,430,470,2.7,AL,Male,Black,17500.0,1.0,1.0,2192.0,6,6
2,884233,430,320,320,2.0,AL,Male,Black,45000.0,4.0,0.0,2192.0,4,4
3,884247,560,540,580,4.0,AL,Female,Black,12500.0,1.0,1.0,2192.0,9,8
4,884307,520,570,510,3.3,AL,Male,Black,12500.0,4.0,1.0,2192.0,9,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91259,886635,300,500,400,3.7,FL,Female,Two or more race/ethnicity,250000.0,1.0,1.0,1691.0,6,5
91260,886640,670,660,650,3.7,FL,Female,White,87500.0,1.0,1.0,1691.0,3,7
91261,886642,610,780,740,4.0,FL,Female,Two or more race/ethnicity,67500.0,1.0,1.0,1691.0,9,6
91262,886648,580,510,550,3.3,FL,Male,White,67500.0,2.0,1.0,1691.0,6,7


In [8]:
df['SAT'] = df['SATMath'] + df['SATVerbal'] + df['SATWriting']
cols_to_normalize = ["SATMath", "SATWriting", "SATVerbal", "GPA", "SAT"]
for col in cols_to_normalize:
    df[col] = normalize_col(df[col])

In [9]:
df['income'] = np.log10(df['income'])
df

Unnamed: 0,SUBJID,SATMath,SATWriting,SATVerbal,GPA,state,gender,race,income,choice,accepted,school,numapply,habits,SAT
0,884230,-1.652536,-1.777725,-1.719744,-0.688423,AL,Female,Black,4.439333,4.0,1.0,2192.0,9,6,-1.918234
1,884232,-1.845287,-1.681540,-1.324951,-2.113622,AL,Male,Black,4.243038,1.0,1.0,2192.0,6,6,-1.809764
2,884233,-1.845287,-2.739570,-2.805425,-3.776353,AL,Male,Black,4.653213,4.0,0.0,2192.0,4,4,-2.749835
3,884247,-0.592404,-0.623510,-0.239270,0.974309,AL,Female,Black,4.096910,1.0,1.0,2192.0,9,8,-0.544284
4,884307,-0.977906,-0.334957,-0.930158,-0.688423,AL,Male,Black,4.096910,4.0,1.0,2192.0,9,7,-0.833536
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91259,886635,-3.098170,-1.008248,-2.015839,0.261710,FL,Female,Two or more race/ethnicity,5.397940,1.0,1.0,1691.0,6,5,-2.279800
91260,886640,0.467728,0.530704,0.451618,0.261710,FL,Female,White,4.942008,1.0,1.0,1691.0,3,7,0.540414
91261,886642,-0.110526,1.684918,1.339902,0.974309,FL,Female,Two or more race/ethnicity,4.829304,1.0,1.0,1691.0,9,6,1.082762
91262,886648,-0.399653,-0.912064,-0.535365,-0.688423,FL,Male,White,4.829304,2.0,1.0,1691.0,6,7,-0.688910


In [10]:
df.to_csv("../data/02-processed/normalized_data.csv", index=False)