# Imports

In [1]:
using CSV
using DataFrames
using Dates
using CategoricalArrays
using Pipe: @pipe

In [2]:
ENV["COLUMNS"]=2200

2200

# Load Data

In [3]:
df = CSV.read("Data/frmgham2.csv", DataFrame)
first(df, 5)

Unnamed: 0_level_0,RANDID,SEX,TOTCHOL,AGE,SYSBP,DIABP,CURSMOKE,CIGPDAY,BMI,DIABETES,BPMEDS,HEARTRTE,GLUCOSE,educ,PREVCHD,PREVAP,PREVMI,PREVSTRK,PREVHYP,TIME,PERIOD,HDLC,LDLC,DEATH,ANGINA,HOSPMI,MI_FCHD,ANYCHD,STROKE,CVD,HYPERTEN,TIMEAP,TIMEMI,TIMEMIFC,TIMECHD,TIMESTRK,TIMECVD,TIMEDTH,TIMEHYP
Unnamed: 0_level_1,Int64,Int64,Int64?,Int64,Float64,Float64,Int64,Int64?,Float64?,Int64,Int64?,Int64?,Int64?,Int64?,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64?,Int64?,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
1,2448,1,195,39,106.0,70.0,0,0,26.97,0,0,80,77,4,0,0,0,0,0,0,1,missing,missing,0,0,1,1,1,0,1,0,8766,6438,6438,6438,8766,6438,8766,8766
2,2448,1,209,52,121.0,66.0,0,0,missing,0,0,69,92,4,0,0,0,0,0,4628,3,31,178,0,0,1,1,1,0,1,0,8766,6438,6438,6438,8766,6438,8766,8766
3,6238,2,250,46,121.0,81.0,0,0,28.73,0,0,95,76,2,0,0,0,0,0,0,1,missing,missing,0,0,0,0,0,0,0,0,8766,8766,8766,8766,8766,8766,8766,8766
4,6238,2,260,52,105.0,69.5,0,0,29.43,0,0,80,86,2,0,0,0,0,0,2156,2,missing,missing,0,0,0,0,0,0,0,0,8766,8766,8766,8766,8766,8766,8766,8766
5,6238,2,237,58,108.0,66.0,0,0,28.5,0,0,80,71,2,0,0,0,0,0,4344,3,54,141,0,0,0,0,0,0,0,0,8766,8766,8766,8766,8766,8766,8766,8766


# Processing

In [4]:
# Label the apporpriate sex
sexDict = Dict(1=>"Male", 2=>"Female")
df.SEX = [sexDict[key] for key in df.SEX]
# Label the period
periodDict = Dict(1=>"One", 2=>"Two", 3=>"Three")
df.PERIOD = [periodDict[key] for key in df.PERIOD];

In [5]:
first(df, 5)

Unnamed: 0_level_0,RANDID,SEX,TOTCHOL,AGE,SYSBP,DIABP,CURSMOKE,CIGPDAY,BMI,DIABETES,BPMEDS,HEARTRTE,GLUCOSE,educ,PREVCHD,PREVAP,PREVMI,PREVSTRK,PREVHYP,TIME,PERIOD,HDLC,LDLC,DEATH,ANGINA,HOSPMI,MI_FCHD,ANYCHD,STROKE,CVD,HYPERTEN,TIMEAP,TIMEMI,TIMEMIFC,TIMECHD,TIMESTRK,TIMECVD,TIMEDTH,TIMEHYP
Unnamed: 0_level_1,Int64,String,Int64?,Int64,Float64,Float64,Int64,Int64?,Float64?,Int64,Int64?,Int64?,Int64?,Int64?,Int64,Int64,Int64,Int64,Int64,Int64,String,Int64?,Int64?,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
1,2448,Male,195,39,106.0,70.0,0,0,26.97,0,0,80,77,4,0,0,0,0,0,0,One,missing,missing,0,0,1,1,1,0,1,0,8766,6438,6438,6438,8766,6438,8766,8766
2,2448,Male,209,52,121.0,66.0,0,0,missing,0,0,69,92,4,0,0,0,0,0,4628,Three,31,178,0,0,1,1,1,0,1,0,8766,6438,6438,6438,8766,6438,8766,8766
3,6238,Female,250,46,121.0,81.0,0,0,28.73,0,0,95,76,2,0,0,0,0,0,0,One,missing,missing,0,0,0,0,0,0,0,0,8766,8766,8766,8766,8766,8766,8766,8766
4,6238,Female,260,52,105.0,69.5,0,0,29.43,0,0,80,86,2,0,0,0,0,0,2156,Two,missing,missing,0,0,0,0,0,0,0,0,8766,8766,8766,8766,8766,8766,8766,8766
5,6238,Female,237,58,108.0,66.0,0,0,28.5,0,0,80,71,2,0,0,0,0,0,4344,Three,54,141,0,0,0,0,0,0,0,0,8766,8766,8766,8766,8766,8766,8766,8766


In [6]:
riskFactors = ["AGE_One",
               "PREVCHD_One",
               "BMI_One",
               "BPMEDS_One",
               "SYSBP_One",
               "CURSMOKE_One",
               "TOTCHOL_One",
               "DIABETES_One",
               "SEX_One",
               "TIME",
               "STROKE"]
nms = ["Age",
       "Prevalent CHD",
       "BMI",
       "Anti-Hypertensives",
       "Systolic blood pressure",
       "Smoke",
       "Cholesterol",
       "Diabetes",
       "Sex",
       "Time",
       "Stroke"]
tenYears = (365*8) + (366*2) # Two leap years in ten years

@time @pipe df |>
  filter(:RANDID => x -> !(x in [filter([:PREVSTRK, :PERIOD] => (x,y) -> x==1 && y=="One", df).RANDID]), _) |>
  outerjoin([unstack(_, :RANDID, :PERIOD, x, renamecols=i->Symbol(x*"_", i)) for x in filter(!in(("RANDID", "PERIOD")), names(df))]..., on=:RANDID) |>
  transform(_, 
            [:TIMESTRK_One, :STROKE_One] => ByRow((b, x) -> (b <= tenYears && b > 0 ? x : 0)) => :STROKE,
            :TIMESTRK_One => ByRow(x -> x <= tenYears ? x : tenYears) => :TIME
        ) |>
  select(_, riskFactors) |>
  rename(_, nms)

 18.745868 seconds (43.99 M allocations: 10.332 GiB, 3.43% gc time, 14.65% compilation time)


Unnamed: 0_level_0,Age,Prevalent CHD,BMI,Anti-Hypertensives,Systolic blood pressure,Smoke,Cholesterol,Diabetes,Sex,Time,Stroke
Unnamed: 0_level_1,Int64?,Int64?,Float64?,Int64?,Float64?,Int64?,Int64?,Int64?,String?,Int64,Int64
1,39,0,26.97,0,106.0,0,195,0,Male,3652,0
2,46,0,28.73,0,121.0,0,250,0,Female,3652,0
3,48,0,25.34,0,127.5,1,245,0,Male,3652,0
4,61,0,28.58,0,150.0,1,225,0,Female,2089,1
5,46,0,23.1,0,130.0,1,285,0,Female,3652,0
6,43,0,30.3,0,180.0,0,228,0,Female,3652,0
7,63,0,33.11,0,138.0,0,205,0,Female,3652,0
8,45,0,21.68,0,100.0,1,313,0,Female,3652,0
9,52,0,26.36,0,141.5,0,260,0,Male,3652,0
10,43,0,23.61,0,162.0,1,225,0,Male,3652,0
