In [1]:
# Before we start, create and activate a conda environment.
## In a terminal, run: 'conda create --name myenv numpy pandas scipy matplotlib'
## Then activate the environment: 'conda activate myenv'
## Finally, check that the notebook is using that environment's Python interpreter.

# First step: import the libraries we're going to use (pandas, NumPy, Matplotlib).
import os

# `plt` is the conventional alias for the pyplot interface; importing the bare
# `matplotlib` package under that name would not expose plt.plot / plt.subplots.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Second step: load the template CSV and preview it to confirm it can be read.
# The path is relative, so the notebook must be run with the Data_Extraction_Code
# folder as the working directory.
SNP_file = pd.read_csv(filepath_or_buffer='Resources/Template.csv')
SNP_file.head(5)

Unnamed: 0,DOI,GENE,SNP ID,P-VALUE (GWAS)


In [3]:
# Third step: create a second, independent dataframe so that later edits never
# touch the original. Plain assignment (`SNP_editable = SNP_file`) would only give
# the same object a second name, so we take an explicit copy instead.
SNP_editable = SNP_file.copy()
SNP_editable.head()

Unnamed: 0,DOI,GENE,SNP ID,P-VALUE (GWAS)


In [4]:
# Fourth step: read the second .csv file, which holds our extracted SNP list.
# It is merged with the template in the next cell, using SNP ID among the keys.
Our_SNP_List = pd.read_csv(filepath_or_buffer='Resources/Input_Files/AA_SNP_Extraction.csv')
Our_SNP_List.head(5)

Unnamed: 0,DOI,STUDY,GENE,SNP ID,HGVS NOMENCLATURE,"PROTECTIVE EFFECT (0=YES, 1=NO)",MAF,ALLELE,P-VALUE (GWAS),SAMPLE SIZE,POPULATION / ETHNICITY,"TYPE OF ANEURYSM\n(0=UNDEFINED, 1=SACCULAR, 2=FUSIFORM, 3=MULTIPLE ANEURYSMS (UNDEFINED), 4= MULTIPLE ANEURYSMS (SACCULAR), 5=MULTIPLE ANEURYSMS (FUSIFORM), 6=MULTIPLE ANEURYSMS (MIXED)",ANEURYSM SITE,NOTES
0,,www.ncbi.nlm.nih.gov/pubmed/37845353,CDKN2B-AS1,rs4977574,,,,,2e-114,,,,Abdominal Aorta,
1,,www.ncbi.nlm.nih.gov/pubmed/37845353,LPA,rs10455872,,,,,2e-55,,,,Abdominal Aorta,
2,,www.ncbi.nlm.nih.gov/pubmed/37845353,ZNF335,rs3827066,,,,,1e-48,,,,Abdominal Aorta,
3,,www.ncbi.nlm.nih.gov/pubmed/37845353,CELSR2,rs12740374,,,,,1.9999999999999998e-42,,,,Abdominal Aorta,
4,,www.ncbi.nlm.nih.gov/pubmed/37845353,LINC00540 - FTH1P7,rs7994761,,,,,1.0000000000000001e-39,,,,Abdominal Aorta,


In [5]:
# MERGE!
# Left join: every row of Our_SNP_List is kept, and columns from SNP_file are
# attached where all four key columns match.
# NOTE(review): 'P-VALUE (GWAS)' is a numeric key, so rows only match on exact
# equality of the stored values -- confirm this is intended.
merge_data = pd.merge(
    Our_SNP_List,
    SNP_file,
    on=["SNP ID", "GENE", "DOI", "P-VALUE (GWAS)"],
    how='left',
)

# Now we have our own database set up as a dataframe and filled with our columns.
# If you merged two databases, great! If you did not, boo! Let's output merge_data
# (if you did not merge two databases, then ignore the output).
output_path = 'Resources/OutputFromExtractionCode'
# to_csv does not create missing folders, so make sure the output directory exists.
os.makedirs(output_path, exist_ok=True)
merge_data.to_csv(os.path.join(output_path, 'Merged_data_1.csv'), index=False)

# Keep the preview as the last expression so the notebook actually displays it.
merge_data.head(10)

In [8]:
# Keep only the two columns that matter most to us from the merged data:
# the gene and the SNP ID.
SNP_ID_And_Gene_From_Merged_Data = merge_data.loc[:, ['GENE', 'SNP ID']]
SNP_ID_And_Gene_From_Merged_Data.head(10)

Unnamed: 0,GENE,SNP ID
0,CDKN2B-AS1,rs4977574
1,LPA,rs10455872
2,ZNF335,rs3827066
3,CELSR2,rs12740374
4,LINC00540 - FTH1P7,rs7994761
5,APOE,rs429358
6,SMARCA4,rs73015011
7,ADAMTS8,rs4936098
8,CHRNA5,rs17486278
9,CDKN2B-AS1,rs2891168


In [9]:
### This table feeds the rentrez step in R. Write it out twice under different
### names: one copy to keep untouched and one that can be modified safely.
for csv_name in ('Original_Gene_And_dbSNP_ID_Merged.csv',
                 'Editable_Gene_And_dbSNP_ID_Merged.csv'):
    SNP_ID_And_Gene_From_Merged_Data.to_csv(os.path.join(output_path, csv_name), index=False)

In [11]:
### A first useful view: order the rows by gene name so that all SNPs reported
### for the same gene sit next to each other.
SNP_Sorted_By_Gene = SNP_ID_And_Gene_From_Merged_Data.sort_values('GENE', ascending=True)
SNP_Sorted_By_Gene.head(10)

Unnamed: 0,GENE,SNP ID
119,ABCA6,rs117753190
61,ADAM10,rs11856657
40,ADAMTS8,rs4936098
142,ADAMTS8,rs7936928
7,ADAMTS8,rs4936098
147,ADH1C - ADH7,rs1229849
149,"ANKRD44-IT1, ANKRD44",rs919433
114,ANTXR1,rs13384676
118,APOA5 - LNC-RHL1,rs662799
44,APOE,rs429358


In [12]:
### Count how many rows (reported SNPs) each gene has, most frequent first.
### The result has one row per gene with the columns 'GENE' and 'Reported SNPs'.
Unique_Genes = (
    SNP_Sorted_By_Gene['GENE']
    .value_counts()
    .rename_axis('GENE')
    .reset_index(name='Reported SNPs')
)
Unique_Genes.head(20)

Unnamed: 0,GENE,Reported SNPs
0,CDKN2B-AS1,4
1,LPA,4
2,DAB2IP,3
3,LINC00540 - FTH1P7,3
4,ADAMTS8,3
5,ERG,3
6,RPS4XP18 - RNU6-1032P,3
7,IL6R,3
8,GDF7,2
9,LDLR,2


In [13]:
### Collect the SNP IDs of each gene into one list, so we can see which SNPs
### belong together:
# groupby('GENE') groups the rows by the 'GENE' column.
# agg(list) turns each group's 'SNP ID' values into a single Python list.
# reset_index() turns the result back into a DataFrame.
snp_ids_by_gene = SNP_Sorted_By_Gene.groupby('GENE')['SNP ID']
grouped_SNPs = snp_ids_by_gene.agg(list).reset_index()
grouped_SNPs.head(20)

Unnamed: 0,GENE,SNP ID
0,ABCA6,[rs117753190]
1,ADAM10,[rs11856657]
2,ADAMTS8,"[rs4936098, rs7936928, rs4936098]"
3,ADH1C - ADH7,[rs1229849]
4,"ANKRD44-IT1, ANKRD44",[rs919433]
5,ANTXR1,[rs13384676]
6,APOA5 - LNC-RHL1,[rs662799]
7,APOE,"[rs429358, rs429358]"
8,ATOH8,[rs113626898]
9,ATP7BP1 - RPS4XP18,[rs7240701]


In [14]:
### Write each of the three gene-level tables to its own CSV file in the output folder:
for frame, csv_name in (
    (SNP_Sorted_By_Gene, 'SNP_Sorted_By_Gene.csv'),
    (Unique_Genes, 'Unique_Genes.csv'),
    (grouped_SNPs, 'grouped_SNPs.csv'),
):
    frame.to_csv(os.path.join(output_path, csv_name), index=False)


In [None]:
#### Now let's turn to our python R file