# Adding Further Covariates to Overall Dataset

### Sources:




## Adding Further Covariates

This adds several covariates that will be used in the formal analysis and sensitivity analysis. 

In [None]:
import pyspark
import dxpy
import dxdata
import pandas as pd
import os

from pyspark.sql.functions import when, concat_ws
from re import sub

In [None]:
# Reuse the kernel's SparkContext if one already exists so that re-running
# this cell does not fail with "Cannot run multiple SparkContexts at once";
# on a fresh kernel this creates a new context exactly as before.
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

In [None]:
# Find one "database" object under "/" whose name matches the glob "app*"
# and keep the name reported in its describe output.
_database_record = dxpy.find_one_data_object(
    classname="database",
    name="app*",
    folder="/",
    name_mode="glob",
    describe=True,
)
dispensed_database_name = _database_record["describe"]["name"]

# Find one object of type "Dataset" under "/" whose name matches the glob
# "app*.dataset" and keep its ID.
_dataset_record = dxpy.find_one_data_object(
    typename="Dataset",
    name="app*.dataset",
    folder="/",
    name_mode="glob",
)
dispensed_dataset_id = _dataset_record["id"]



In [None]:
# Load the dataset whose ID was looked up in the previous cell.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

In [None]:
# Select the "participant" entity of the dataset; fields are retrieved from it below.
participant = dataset["participant"]

In [None]:

# Load the saved cohort by name; its SQL (cohort.sql) is used below to filter participants.
cohort = dxdata.load_cohort("Processed Cohort ALL ANCESTRIES")


# Fields to retrieve for every participant, in retrieval order.
# One line per field ID; the "_iN" suffix is the instance number.
field_names = [
    "eid",
    "p6142_i0", "p6142_i1", "p6142_i2", "p6142_i3",
    "p189",
    "p1289_i0", "p1289_i1", "p1289_i2", "p1289_i3",
    "p1299_i0", "p1299_i1", "p1299_i2", "p1299_i3",
    "p1309_i0", "p1309_i1", "p1309_i2", "p1309_i3",
    "p1319_i0", "p1319_i1", "p1319_i2", "p1319_i3",
    "p1349_i0", "p1349_i1", "p1349_i2", "p1349_i3",
    "p1478_i0", "p1478_i1", "p1478_i2", "p1478_i3",
    "p1329_i0", "p1329_i1", "p1329_i2", "p1329_i3",
    "p1558_i0", "p1558_i1", "p1558_i2", "p1558_i3",
    "p1160_i0", "p1160_i1", "p1160_i2", "p1160_i3",
    "p20116_i0", "p20116_i1", "p20116_i2", "p20116_i3",
    "p20161_i0", "p20161_i1", "p20161_i2", "p20161_i3",
    "p20107_i0", "p20107_i1", "p20107_i2", "p20107_i3",
    "p20110_i0", "p20110_i1", "p20110_i2", "p20110_i3",
    "p120098",
    "p6177_i0", "p6177_i1", "p6177_i2", "p6177_i3",
    "p22032_i0",
    "p22039_i0",
    "p22038_i0",
    "p22040_i0",
    "p816_i0", "p816_i1", "p816_i2", "p816_i3",
    "p806_i0", "p806_i1", "p806_i2", "p806_i3",
    "p21000_i0", "p21000_i1", "p21000_i2",
    "p104920_i0", "p104920_i1", "p104920_i2", "p104920_i3", "p104920_i4",
    "p104910_i0", "p104910_i1", "p104910_i2", "p104910_i3", "p104910_i4",
    "p104900_i0", "p104900_i1", "p104900_i2", "p104900_i3", "p104900_i4",
    "p894_i0", "p894_i1", "p894_i2", "p894_i3",
    "p914_i0", "p914_i1", "p914_i2", "p914_i3",
]

In [None]:
# Retrieve the listed fields for the rows selected by the cohort's SQL filter,
# with coding_values="replace", over a fresh dxdata connection.
df = participant.retrieve_fields(
    names=field_names,
    filter_sql=cohort.sql,
    coding_values="replace",
    engine=dxdata.connect(),
)


In [None]:
# Convert the retrieved result to a pandas DataFrame and preview the first rows.
df_pandas = df.toPandas()
df_pandas.head()

In [None]:
# Write the covariates to CSV. The pandas index is written too (to_csv default),
# as an unnamed first column; the R cell further down reads it back as "X".
df_pandas.to_csv("FINALCovarsforCAD.csv")

In [None]:
# Bash kernel: upload the covariate CSV written above using the dx CLI
dx upload FINALCovarsforCAD.csv

In [None]:
# Shell commands (dx CLI): fetch the two CSVs that the R cells below read
dx download FINALCovarsforCAD.csv
dx download FINALANALYSISDATAPAPER3.csv

In [None]:
# In R Kernel
# PADATA: existing analysis dataset; COVARS: the covariates exported from Python above.
PADATA <- read.csv("FINALANALYSISDATAPAPER3.csv")
COVARS <- read.csv("FINALCovarsforCAD.csv")

In [None]:
# Rename the field-ID columns of COVARS to readable names, drop the CSV index
# column, and merge the covariates onto PADATA by participant ID (eid).

# Showing for subset of variables but process is the same for all
field_names <- c(
  "X", "eid",
  "p22032_i0", "p22039_i0", "p22038_i0", "p22040_i0",
  "p816_i0", "p816_i1", "p816_i2", "p816_i3",
  "p806_i0", "p806_i1", "p806_i2", "p806_i3",
  "p21000_i0", "p21000_i1", "p21000_i2"
)

# New names, position-for-position with field_names.
# NOTE(review): "WalkorStandwork_Inst1" has a lower-case "w" unlike its
# siblings; left unchanged in case later code relies on it -- confirm.
covarvector <- c(
  "X", "eid",
  "IPAQGroupInst1", "METVigorous", "METMVPA", "METTotal",
  "ManLabor_Inst0", "ManLabor_Inst1", "ManLabor_Inst2", "ManLabor_Inst3",
  "WalkorStandWork_Inst0", "WalkorStandwork_Inst1",
  "WalkorStandWork_Inst2", "WalkorStandWork_Inst3",
  "Ethnicity_Inst0", "Ethnicity_Inst1", "Ethnicity_Inst2"
)

# The two vectors are paired by position, so they must be the same length.
stopifnot(length(field_names) == length(covarvector))

# Locate every field in COVARS by name BEFORE renaming and fail loudly if any
# is absent, instead of silently mislabelling columns.
MATCH <- match(field_names, colnames(COVARS))
if (anyNA(MATCH)) {
  stop(
    "Columns missing from COVARS: ",
    paste(field_names[is.na(MATCH)], collapse = ", "),
    call. = FALSE
  )
}

# Rename by matched position rather than by column order: this is correct
# whatever the column order of COVARS is, and columns not listed in
# field_names keep their original names.
colnames(COVARS)[MATCH] <- covarvector

# Dropping unnecessary X variable (the CSV row index), by name not position
COVARS <- COVARS[, colnames(COVARS) != "X", drop = FALSE]


colnames(PADATA)
dim(PADATA)
# 77474 x 67

# ALSO NOTE there are some unprocessed redundancies in PADATA (un-renamed
# covars that overlap w accelerometer)
# UsedinPCA merging to get ONLY those that make kinship criteria
# all = FALSE keeps only eids present in both tables.
FINALDATAMERGE <- merge(PADATA, COVARS, by = "eid", all = FALSE)

dim(FINALDATAMERGE)
# 77474 x 957


In [None]:
# Save the merged dataset. write.csv's default row.names = TRUE also writes a
# row-number column, which read.csv will read back as "X".
write.csv(FINALDATAMERGE, "FINALIZEDPADATASET.csv")

In [None]:
# In Bash Kernel: upload the merged dataset written by the R cell above
dx upload FINALIZEDPADATASET.csv