# AllOfUs: Carrier Counts for GBA1
- **Project:** Large-scale Genetic Characterization of PD in the AFR and AAC
- **Last updated:** December 2024
- **Version:** Bash and Python 3.9
- **Data:** AllofUs

## Summary
Get the sample IDs of individuals with PD from condition files

In [1]:
# Show the CSV header, then collect the distinct values of the first
# comma-separated column (person_id) from every row that mentions
# "Parkinson's disease", and report how many distinct IDs were written.
!head -n 1 pd_condition_df.csv
!grep "Parkinson's disease" pd_condition_df.csv | cut -d "," -f 1 | sort -u > all_PD_cases1422
!wc -l < all_PD_cases1422

person_id,condition_concept_id,standard_concept_name,standard_concept_code,standard_vocabulary,condition_start_datetime,condition_end_datetime,condition_type_concept_id,condition_type_concept_name,stop_reason,visit_occurrence_id,visit_occurrence_concept_name,condition_source_value,condition_source_concept_id,source_concept_name,source_concept_code,source_vocabulary,condition_status_source_value,condition_status_concept_id,condition_status_concept_name
1422


# Obtain the genetically predicted ancestry for PD cases

In [2]:
# Copy ancestry_preds.tsv from the fc-aou-datasets-controlled bucket (v7) into
# the current working directory.
!gsutil -m -u $GOOGLE_PROJECT cp \
    gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv \
    .

Copying gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv...
\ [1/1 files][ 96.7 MiB/ 96.7 MiB] 100% Done                                    
Operation completed over 1 objects/96.7 MiB.                                     


In [3]:
# Pull the ancestry_preds.tsv rows that belong to the PD case IDs.
# One fixed-string, whole-word grep replaces the per-ID loop:
#   * a single pass over the table instead of one grep process per ID;
#   * no bare $i on the line, which IPython would replace with the value of a
#     Python variable named i if one exists in the notebook;
#   * ">" instead of ">>", so re-running the cell does not append duplicates.
# Blank lines are removed from the ID list first, because an empty pattern
# would match far more than intended.
# NOTE(review): as in the original loop, an ID is matched anywhere on the line,
# not only in the first column. Rows are now emitted in table order rather
# than in ID-list order.
!grep -v '^$' all_PD_cases1422 | grep -w -F -f - ancestry_preds.tsv > pd_ancestry

In [4]:
# From the first two tab-separated columns of pd_ancestry, keep the rows that
# contain "afr", store their first column (the sample ID) in
# pd_african_caseIDs, then print how many IDs were stored.
!cut -f 1,2 pd_ancestry | grep afr | cut -f 1 > pd_african_caseIDs
!wc -l < pd_african_caseIDs

93


# Download exonic vcf files

In [None]:
# Download the per-chromosome exome VCFs (chr1-22 and chrX) plus any companion
# files matched by the trailing wildcard.
# The continuation line must NOT start with "!": backslash-continued lines are
# joined into one shell command, so a second "!" would reach gsutil as the
# literal word "!cp" instead of the "cp" sub-command.
# $GOOGLE_PROJECT is quoted so an empty or unusual value cannot shift arguments.
!for i in {1..22} X ; do gsutil -u "$GOOGLE_PROJECT" -m \
  cp gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/exome_v7.1/vcf/exome.chr"$i".vcf.bgz* . ; done

# Extract PD cases and controls and split multiallelic variants into multiple rows

In [None]:
# For every chromosome, subset the exome VCF to the listed samples and split
# multiallelic sites into one record per alternate allele
# (bcftools norm -m -any), writing an uncompressed VCF.
#
# Fixes relative to the previous version of this cell:
#   * "bcftools view" is now piped into "bcftools norm"; without the "|" the
#     words "bcftools norm -m -any ..." were handed to "bcftools view" as
#     extra arguments.
#   * Continuation lines no longer start with "!", which would otherwise end
#     up as a literal "!bcftools" word once the lines are joined.
#   * "-" makes it explicit that norm reads the stream from view.
# NOTE(review): pd_african_controlIDs is not created in the cells shown here;
# confirm it exists before running the controls loop.
#cases
!for i in {1..22} X ; do bcftools view -S pd_african_caseIDs --force-samples exome.chr"$i".vcf.bgz | \
  bcftools norm -m -any --threads 6 - > exome.chr"$i"_african_split.vcf ; done
#controls
!for i in {1..22} X ; do bcftools view -S pd_african_controlIDs --force-samples exome.chr"$i".vcf.bgz | \
  bcftools norm -m -any --threads 6 - > exome.chr"$i"_african_controls_split.vcf ; done

# Extract variants present in at least one case and with an allele frequency below 0.01 in the general AoU cohort.

In [None]:
# Build the candidate variant list from the per-chromosome case VCFs:
#   1. drop contig header lines, keep lines containing PASS;
#   2. keep the first 10 columns and only lines for the current chromosome;
#   3. drop variants with AC=0 (absent from the subset) and variants whose AF
#      field starts with 0.01 ... 0.9 (i.e. AF >= 0.01).
#
# Fixes relative to the previous version of this cell:
#   * reads exome.chr<N>_african_split.vcf, the file name actually written by
#     the case-extraction step (the old name ..._african.vcf_splited.vcf is
#     never produced);
#   * the AF pattern now covers 0.03x, which was missing from the alternation,
#     and the decimal point is escaped so it only matches a literal ".";
#   * continuation lines no longer start with "!", which would be passed to
#     the next command as a literal word once the lines are joined;
#   * "grep -E" replaces the deprecated "egrep";
#   * the loop output is redirected once with ">", so re-running the cell
#     rebuilds the list instead of appending duplicates.
# NOTE(review): an AF written as exactly 1 (or 1.0) is not matched by this
# pattern and would be kept; confirm whether such records can occur.
!for i in {1..22} X ; do grep -v contig exome.chr"$i"_african_split.vcf | grep PASS | \
  cut -f1-10 | grep chr"$i" | \
  grep -v "AC=0;" | grep -Ev "AF=0\.(0[1-9]|[1-9])" ; done > all_africanpdvariants

# Analysis of the intronic rs3115534-G variant in cases and controls

In [None]:
mt_wgs_path = os.getenv("WGS_ACAF_MULTI_HAIL_PATH")
mt_wgs_path
mt_wgs_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold/multiMT/hail.mt"
mt = hl.read_matrix_table(mt_wgs_path)
test_intervals = ['chr1:155134452-155334452']
mt = hl.filter_intervals(
    mt,
    [hl.parse_locus_interval(x,)
     for x in test_intervals])
mt.count()
mt.locus.summarize()
mt = mt.filter_rows(mt.variant_qc.call_rate > 0.80, keep = True)
out_vcf = f'{bucket}/data/gba.vcf.bgz'
out_vcf
hl.export_vcf(mt, out_vcf, tabix=False)
!gunzip -c gba.vcf.bgz > gba.vcf

In [None]:
# Case/control-stratified allele frequencies for the 3-bp window
# chr1:155235877-155235879 of gba.vcf, with phenotypes read from pheno.txt.
# The continuation line must NOT start with "!": the two lines are joined into
# one command, so a second "!" would hand plink the literal argument "!--freq".
!./plink --vcf gba.vcf --chr 1 --from-bp 155235877 --to-bp 155235879 \
  --freq case-control --pheno pheno.txt --out chr1_155235878_G_T_afr_freq