# Adding Up Effect Alleles in Variants

This code was run in the UK Biobank Research Analysis Platform to recode each variant's genotype from allele letters to 0/1/2, i.e. the number of effect alleles carried. The results are shown only for chromosome 1, but the process is identical for the remaining chromosomes.

In [None]:
# Download the input CSV that the R cell below reads with read.csv()
dx download UPDATEDPhenoGenoAssayPCA.csv


In [None]:
# ----------------
# UKB dataset processing: creating the final effect-allele variables
# ----------------


# Read the input CSV into the data frame used by every step below
fulldataclean <- read.csv(file = "UPDATEDPhenoGenoAssayPCA.csv")


# --------
# Variant 1: 1:47684677 (rs977747)
# Effect Allele = T
# Variants are taken in chromosome order. Position order within a chromosome
# may differ slightly (Excel sorts by first digit, R by full position value);
# this does not affect the coding.
# The position used here was corrected to the one that corresponds to rs977747.
# --------

# Peek at the raw genotype strings for this variant
head(fulldataclean$X1.47684677)
# 1:47684677 or rs977747
# Effect Allele = T

# Print the genotype column as a factor to see which levels occur
as.factor(fulldataclean$X1.47684677)


# Effect-allele count: 2 for T/T, 1 for a heterozygote, 0 for any other
# non-missing genotype. Both heterozygote spellings ("T/G" and "G/T") are
# matched so that a reversed-order entry is not silently coded as 0.
# Missing genotypes stay NA because the outer `==` test evaluates to NA.
fulldataclean$EffAllele1.47684677 <- ifelse(
  fulldataclean$X1.47684677 == "T/T", 2,
  ifelse(fulldataclean$X1.47684677 %in% c("T/G", "G/T"), 1, 0)
)


# Compare the numeric coding against the raw genotype counts
summary(fulldataclean$EffAllele1.47684677)
summary(as.factor(fulldataclean$X1.47684677))
# Open question: why are there 1,113 NAs here? Are these a casualty of imputation?
# The data SHOULD already contain ONLY genotyped individuals. Check whether other
# variants have missingness and whether it occurs in the same individuals.




# ------------------------------
# From here on, the other variants are handled one at a time
# ------------------------------

# Show the variant 1 coding and raw genotype counts again for the check below
summary(fulldataclean[["EffAllele1.47684677"]])
summary(as.factor(fulldataclean[["X1.47684677"]]))


# Total number of rows in the dataset
nrow(fulldataclean)


# Hand arithmetic on figures typed in by the analyst.
# NOTE(review): these look like effect-allele totals divided by a non-missing
# sample count, to be compared with the mean reported above - confirm.
174078 + 124659
298737 / 256982
# Checks out - Variant 1 works as it should




# ----
# Variant 2
# 1:110154688 (rs17024393)
# ----

# Peek at the raw genotype strings for this variant
head(fulldataclean$X1.110154688)
# 1:110154688 or rs17024393
# Effect Allele = C

# Print the genotype column as a factor to see which levels occur
as.factor(fulldataclean$X1.110154688)


# Effect-allele count: 2 for C/C, 1 for a heterozygote, 0 for any other
# non-missing genotype. Both heterozygote spellings are matched so that a
# reversed-order entry ("C/T") is not silently coded as 0.
# Missing genotypes stay NA because the outer `==` test evaluates to NA.
fulldataclean$EffAllele1.110154688 <- ifelse(
  fulldataclean$X1.110154688 == "C/C", 2,
  ifelse(fulldataclean$X1.110154688 %in% c("T/C", "C/T"), 1, 0)
)


# Compare the numeric coding against the raw genotype counts
summary(fulldataclean$EffAllele1.110154688)
summary(as.factor(fulldataclean$X1.110154688))


# ----
# Variant 3
# 1:177889480 (rs543874)
# ----

# Peek at the raw genotype strings for this variant
head(fulldataclean$X1.177889480)
# 1:177889480 or rs543874
# Effect Allele = G

# Print the genotype column as a factor to see which levels occur
as.factor(fulldataclean$X1.177889480)


# Effect-allele count: 2 for G/G, 1 for a heterozygote, 0 for any other
# non-missing genotype. Both heterozygote spellings are matched so that a
# reversed-order entry ("G/A") is not silently coded as 0.
# Missing genotypes stay NA because the outer `==` test evaluates to NA.
fulldataclean$EffAllele1.177889480 <- ifelse(
  fulldataclean$X1.177889480 == "G/G", 2,
  ifelse(fulldataclean$X1.177889480 %in% c("A/G", "G/A"), 1, 0)
)


# Compare the numeric coding against the raw genotype counts
summary(fulldataclean$EffAllele1.177889480)
summary(as.factor(fulldataclean$X1.177889480))


# ----
# Variant 4
# 1:201784287 (rs2820292)
# ----

# Peek at the raw genotype strings for this variant
head(fulldataclean$X1.201784287)
# 1:201784287 or rs2820292
# Effect Allele = C

# Print the genotype column as a factor to see which levels occur
as.factor(fulldataclean$X1.201784287)


# Effect-allele count: 2 for C/C, 1 for a heterozygote, 0 for any other
# non-missing genotype. Both heterozygote spellings are matched so that a
# reversed-order entry ("C/A") is not silently coded as 0.
# Missing genotypes stay NA because the outer `==` test evaluates to NA.
fulldataclean$EffAllele1.201784287 <- ifelse(
  fulldataclean$X1.201784287 == "C/C", 2,
  ifelse(fulldataclean$X1.201784287 %in% c("A/C", "C/A"), 1, 0)
)


# Compare the numeric coding against the raw genotype counts
summary(fulldataclean$EffAllele1.201784287)
summary(as.factor(fulldataclean$X1.201784287))



# ----
# Variant 5
# 1:49589847 (rs657452)
# ----

# Peek at the raw genotype strings for this variant
head(fulldataclean$X1.49589847)
# 1:49589847 or rs657452
# Effect Allele = A

# Print the genotype column as a factor to see which levels occur
as.factor(fulldataclean$X1.49589847)


# Effect-allele count: 2 for A/A, 1 for a heterozygote, 0 for any other
# non-missing genotype. Both heterozygote spellings are matched so that a
# reversed-order entry ("G/A") is not silently coded as 0.
# Missing genotypes stay NA because the outer `==` test evaluates to NA.
fulldataclean$EffAllele1.49589847 <- ifelse(
  fulldataclean$X1.49589847 == "A/A", 2,
  ifelse(fulldataclean$X1.49589847 %in% c("A/G", "G/A"), 1, 0)
)


# Compare the numeric coding against the raw genotype counts
summary(fulldataclean$EffAllele1.49589847)
summary(as.factor(fulldataclean$X1.49589847))



# ----
# Variant 6
# 1:50559820 (rs11583200)
# ----

# Peek at the raw genotype strings for this variant
head(fulldataclean$X1.50559820)
# 1:50559820 or rs11583200
# Effect Allele = C

# Print the genotype column as a factor to see which levels occur
as.factor(fulldataclean$X1.50559820)


# Effect-allele count: 2 for C/C, 1 for a heterozygote, 0 for any other
# non-missing genotype. Both heterozygote spellings are matched so that a
# reversed-order entry ("T/C") is not silently coded as 0.
# Missing genotypes stay NA because the outer `==` test evaluates to NA.
fulldataclean$EffAllele1.50559820 <- ifelse(
  fulldataclean$X1.50559820 == "C/C", 2,
  ifelse(fulldataclean$X1.50559820 %in% c("C/T", "T/C"), 1, 0)
)


# Compare the numeric coding against the raw genotype counts
summary(fulldataclean$EffAllele1.50559820)
summary(as.factor(fulldataclean$X1.50559820))



# ----
# Variant 7
# 1:72751185 (rs3101336)
# ----

# Peek at the raw genotype strings for this variant
head(fulldataclean$X1.72751185)
# 1:72751185 or rs3101336
# Effect Allele = C

# Print the genotype column as a factor to see which levels occur
as.factor(fulldataclean$X1.72751185)


# Effect-allele count: 2 for C/C, 1 for a heterozygote, 0 for any other
# non-missing genotype. Both heterozygote spellings are matched so that a
# reversed-order entry ("C/T") is not silently coded as 0.
# Missing genotypes stay NA because the outer `==` test evaluates to NA.
fulldataclean$EffAllele1.72751185 <- ifelse(
  fulldataclean$X1.72751185 == "C/C", 2,
  ifelse(fulldataclean$X1.72751185 %in% c("T/C", "C/T"), 1, 0)
)


# Compare the numeric coding against the raw genotype counts
summary(fulldataclean$EffAllele1.72751185)
summary(as.factor(fulldataclean$X1.72751185))




# ----
# Variant 8
# 1:75002193 (rs12566985)
# ----

# Peek at the raw genotype strings for this variant
head(fulldataclean$X1.75002193)
# 1:75002193 or rs12566985
# Effect Allele = G

# Print the genotype column as a factor to see which levels occur
as.factor(fulldataclean$X1.75002193)


# Effect-allele count: 2 for G/G, 1 for a heterozygote, 0 for any other
# non-missing genotype. Both heterozygote spellings are matched so that a
# reversed-order entry ("A/G") is not silently coded as 0.
# Missing genotypes stay NA because the outer `==` test evaluates to NA.
fulldataclean$EffAllele1.75002193 <- ifelse(
  fulldataclean$X1.75002193 == "G/G", 2,
  ifelse(fulldataclean$X1.75002193 %in% c("G/A", "A/G"), 1, 0)
)


# Compare the numeric coding against the raw genotype counts
summary(fulldataclean$EffAllele1.75002193)
summary(as.factor(fulldataclean$X1.75002193))




# ----
# Variant 9
# 1:78446761 (rs12401738)
# ----

# Peek at the raw genotype strings for this variant
head(fulldataclean$X1.78446761)
# 1:78446761 or rs12401738
# Effect Allele = A

# Print the genotype column as a factor to see which levels occur
as.factor(fulldataclean$X1.78446761)


# Effect-allele count: 2 for A/A, 1 for a heterozygote, 0 for any other
# non-missing genotype. Both heterozygote spellings are matched so that a
# reversed-order entry ("A/G") is not silently coded as 0.
# Missing genotypes stay NA because the outer `==` test evaluates to NA.
fulldataclean$EffAllele1.78446761 <- ifelse(
  fulldataclean$X1.78446761 == "A/A", 2,
  ifelse(fulldataclean$X1.78446761 %in% c("G/A", "A/G"), 1, 0)
)


# Compare the numeric coding against the raw genotype counts
summary(fulldataclean$EffAllele1.78446761)
summary(as.factor(fulldataclean$X1.78446761))



# ----
# Variant 10
# 1:96924097 (rs11165643)
# ----

# Peek at the raw genotype strings for THIS variant (the column inspected
# here previously pointed at variant 9, X1.78446761, by copy-paste mistake)
head(fulldataclean$X1.96924097)
# 1:96924097 or rs11165643
# Effect Allele = T

# Print the genotype column as a factor to see which levels occur
as.factor(fulldataclean$X1.96924097)


# Effect-allele count: 2 for T/T, 1 for a heterozygote, 0 for any other
# non-missing genotype. Both heterozygote spellings are matched so that a
# reversed-order entry ("T/C") is not silently coded as 0.
# Missing genotypes stay NA because the outer `==` test evaluates to NA.
fulldataclean$EffAllele1.96924097 <- ifelse(
  fulldataclean$X1.96924097 == "T/T", 2,
  ifelse(fulldataclean$X1.96924097 %in% c("C/T", "T/C"), 1, 0)
)


# Compare the numeric coding against the raw genotype counts
summary(fulldataclean$EffAllele1.96924097)
summary(as.factor(fulldataclean$X1.96924097))




# Save the dataset, now including the EffAllele* columns, to CSV.
# write.csv() keeps its default row.names = TRUE here, so the row names are
# written as an unnamed first column.
write.csv(x = fulldataclean, file = "UPDATEDFULLPROCDATASET.csv")

In [None]:
# Upload the CSV written by the R cell above
dx upload UPDATEDFULLPROCDATASET.csv
