This Python program takes GWAS data from /dors/capra_lab/users/yand1/PTB_evo_analysis/data/2018_05_29_PheWAS_and_GWAS/Zhang_et_al_PTB_10000/pre_top10000.tab
and PheWAS data from 
/dors/capra_lab/users/yand1/PTB_evo_analysis/data/2018_05_29_PheWAS_and_GWAS/PheWAS_Catalog/phewas-catalog.csv, 
finds the intersection of their SNPs, and stores the results in the 
/dors/capra_lab/users/yand1/PTB_evo_analysis/results/2018_06_11/combined folder. It then splits the combined data into individual files, one per SNP, and stores those files in the 
/dors/capra_lab/users/yand1/PTB_evo_analysis/results/2018_06_11/snps folder.

In [None]:
# Import pandas for data combining
import pandas as pd

# For csv file writing
import csv

In [None]:
# Load the tab-delimited GWAS table into a dataframe.
gwas_df = pd.read_csv(
    "/dors/capra_lab/users/yand1/PTB_evo_analysis/data/2018_05_29_PheWAS_and_GWAS/Zhang_et_al_PTB_10000/pre_top10000.tab",
    sep="\t",
)

# Keep only the 'snp' column, as a one-column dataframe, to serve as the join key.
gwas_snps_df = gwas_df[["snp"]]

# Load the comma-delimited PheWAS catalog into a dataframe.
phewas_df = pd.read_csv(
    "/dors/capra_lab/users/yand1/PTB_evo_analysis/data/2018_05_29_PheWAS_and_GWAS/PheWAS_Catalog/phewas-catalog.csv"
)


In [None]:
# Inner-join on 'snp': only PheWAS rows whose SNP also appears in the GWAS list are kept.
combined_df = pd.merge(
    left=gwas_snps_df,
    right=phewas_df,
    on="snp",
    how="inner",
)

# Save the joined table as a tab-delimited file, without the dataframe index.
combined_df.to_csv(
    "/dors/capra_lab/users/yand1/PTB_evo_analysis/results/2018_06_11/combined/Combined_GWAS_PheWAS.csv",
    index=False,
    sep="\t",
)

In [None]:
# Select the GWAS SNPs that have no entry in the PheWAS catalog:
# build a membership mask against the PheWAS 'snp' column and keep the rows where it is False.
missing_snps_df = gwas_snps_df.loc[~gwas_snps_df["snp"].isin(phewas_df["snp"])]

# Save the unmatched SNPs as a tab-delimited file, without the dataframe index.
missing_snps_df.to_csv(
    "/dors/capra_lab/users/yand1/PTB_evo_analysis/results/2018_06_11/combined/missing_snps.csv",
    index=False,
    sep="\t",
)

In [None]:
# Tally how many rows of the combined table each SNP accounts for.
# Note: despite its name, repetitions_df is a pandas Series (SNP -> row count).
repetitions_df = combined_df.snp.value_counts()

# Save the tallies as a tab-delimited file.
repetitions_df.to_csv(
    "/dors/capra_lab/users/yand1/PTB_evo_analysis/results/2018_06_11/combined/repetitions.csv",
    sep="\t",
)

In [None]:
# Cross-check the merge by computing the SNP overlap independently with plain Python sets.
phewas_set = set(phewas_df["snp"])
gwas_set = set(gwas_df["snp"])
intersection_set = phewas_set & gwas_set

# Write the shared SNPs as a single tab-delimited row.
# Set iteration order is arbitrary and can change between interpreter runs, so the
# values are sorted first to make the output file reproducible and diff-able.
# key=str keeps the ordering well-defined even if the column holds non-string values.
with open("/dors/capra_lab/users/yand1/PTB_evo_analysis/results/2018_06_11/combined/set_output.csv", 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter='\t')
    csv_writer.writerow(sorted(intersection_set, key=str))

In [None]:
# Split the combined dataframe into one group per SNP.
grouped_snps_df = combined_df.groupby(by="snp")

# Within each SNP, order rows by ascending p-value, break ties by descending odds ratio,
# and break remaining ties by ascending phewas_string.
# A single multi-key sort expresses this priority directly; it replaces three chained
# stable sorts (least-significant key first) and their throwaway intermediate dataframes.
for snp, group in grouped_snps_df:
    sorted_p_df = group.sort_values(
        by=["p", "odds_ratio", "phewas_string"],
        ascending=[True, False, True],
    )
    # Write this SNP's rows to their own tab-delimited file, named after the SNP.
    sorted_p_df.to_csv(
        "/dors/capra_lab/users/yand1/PTB_evo_analysis/results/2018_06_11/snps/" + str(snp) + ".csv",
        index=False,
        sep='\t',
    )