# Combining Phenotypic and Genotypic Data

This code was run on the UK Biobank Research Analysis Platform to combine phenotypic and genotypic data. For simplicity, results are shown only for chromosome 1.

In [None]:
# Fetch the two CSV inputs that the R code below reads (Chrom1Vars.csv and
# FULLPHENODATA.csv). NOTE(review): presumably these land in the current
# working directory, which is where the R code looks for them - confirm.
dx download Chrom1Vars.csv
dx download FULLPHENODATA.csv


In [None]:
# To enable fast reading of large genomic datasets
# Install only when the package is missing, so re-running this cell does not
# reinstall (and possibly upgrade) it in the middle of a session.
if (!requireNamespace("data.table", quietly = TRUE)) {
  install.packages("data.table")
}
library(data.table)

# To work with regular expressions
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library(stringr)

In [None]:
# ----------------
# UKB Processing Dataset and Creating Final Variables
# ----------------

# stringr supplies str_extract(), used below to trim the genotype IDs.
# (Installation is handled once in the package set-up cell.)
library(stringr)

# Phenotype table and chromosome 1 genotype table. phenodata must contain an
# "ID" column, because that is the merge key used further down.
phenodata <- read.csv("FULLPHENODATA.csv")
genodata <- fread("Chrom1Vars.csv")


# Transposing genodata
# t() returns a matrix, and `$` cannot be used on a matrix ("$ operator is
# invalid for atomic vectors"), so convert the result to a data frame before
# any column is accessed by name.
genodatatranspose <- as.data.frame(t(genodata), stringsAsFactors = FALSE)


# Renaming first column to ID for consistency with phenodata
geno_colnames <- c(
  "ID", "1:47684677_", "1:49589847", "1:50559820_", "1:72751185",
  "1:75002193_", "1:78446761_", "1:96924097_", "1:110154688_",
  "1:177889480_", "1:201784287_"
)
# Stop early if the file does not have the expected number of columns,
# rather than failing inside colnames<- with a less helpful message.
stopifnot(ncol(genodatatranspose) == length(geno_colnames))
colnames(genodatatranspose) <- geno_colnames

# Comparing shape of IDs
# Preview only the first few values of each key; printing a whole ID column
# would flood the notebook output.
head(phenodata$ID)
head(genodatatranspose$ID)

# IF geno is eid_eid - NEED TO REMOVE SECOND PART
# ACTUAL code to remove second part for geno data: keep the leading run of
# characters that are not underscores.
trimmed_ids <- str_extract(genodatatranspose$ID, "[^_]+")
# ALSO made an EID version so I can ensure these line up at the end - just to
# doubly confirm
genodatatranspose$EID <- trimmed_ids
genodatatranspose$ID <- trimmed_ids

head(genodatatranspose$ID)
# NOTE(review): the first row appears to be a header-like row rather than a
# participant. It is not removed here; the inner merge below keeps only rows
# whose ID also occurs in phenodata, so a non-matching row is dropped there.


# Inner join: keep only IDs present in both tables.
fulldata <- merge(phenodata, genodatatranspose, by = "ID", all = FALSE)
# How many IDs left over? Is geno data still interpretable with variants as
# add'l columns?

dim(fulldata)
dim(phenodata)
dim(genodatatranspose)
# First is 256,982 x 127
# Second is 256,982 x 116
# Third is 487,410 x 12


# Save as csv
write.csv(fulldata, "UPDATEDPhenoGenoChrom1.csv")

In [None]:
# Upload the merged phenotype/genotype CSV written by the R code above.
dx upload UPDATEDPhenoGenoChrom1.csv
