In [106]:
#### Since this is a new file, we need to import our modules again (YAY!), including rpy2, our bridge to R
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri

### and let's not forget the usual gang:
import pandas as pd
import numpy as np
import matplotlib as plt
import re
import os 

In [107]:
#### Before we get started in an R environment, we need to talk about something important.
#### rentrez does not search by SNP ID; it uses the UID. As in, not 'rs2000'
#### but rather '2000'. So we need to take our previous output and split the SNP ID column.

# Folder holding the extraction outputs; defined once so reads and later writes share it.
output_path = 'Resources/OutputFromExtractionCode'
Output_GeneSNP_file = pd.read_csv(os.path.join(output_path, 'Editable_Gene_And_dbSNP_ID_Merged.csv'))

# The preview is the last expression of the cell so the notebook actually renders it.
Output_GeneSNP_file.head(10)

In [108]:
### Now we split each SNP ID on its 'rs' prefix so the numeric UID gets its own column.
### Work on a copy so the frame loaded from disk is not modified as a side effect.
Split_dbSNP_ID = Output_GeneSNP_file.copy()

# n=1 caps the split at two pieces, matching the two target columns.
# IDs without an 'rs' prefix (e.g. 'chr6:112398675') end up with None as their UID.
Split_dbSNP_ID[['rs', 'UID']] = Split_dbSNP_ID['SNP ID'].str.split('rs', n=1, expand=True)

# The 'rs' column only holds the text found before the prefix, so delete it:
UID_SNP = Split_dbSNP_ID.drop(columns=['rs'])
UID_SNP.head(10)

## Now we have the UID

Unnamed: 0,GENE,SNP ID,UID
0,CDKN2B-AS1,rs4977574,4977574
1,LPA,rs10455872,10455872
2,ZNF335,rs3827066,3827066
3,CELSR2,rs12740374,12740374
4,LINC00540 - FTH1P7,rs7994761,7994761
5,APOE,rs429358,429358
6,SMARCA4,rs73015011,73015011
7,ADAMTS8,rs4936098,4936098
8,CHRNA5,rs17486278,17486278
9,CDKN2B-AS1,rs2891168,2891168


In [109]:
### Load the rentrez R package into our Python session through rpy2.
rentrez = importr('rentrez')

### The R snippet below attaches rentrez and stores a contact email in the R session.
### Hot tip: a triple-quoted string keeps multi-line R code clean and easy to read.
r_setup_code = '''
library(rentrez)
entrez_email <- "kenji.ponsm@gmail.com"
'''

robjects.r(r_setup_code)

In [110]:
## Let's give it a test, shall we? Fetch the dbSNP summary for the first SNP in our table.
## (Row 0 is rs4977574 / GENE: CDKN2B-AS1 -- not rs429358 / APOE, which sits further down.)
test_snp = UID_SNP['UID'].iloc[0]  # select the column by name rather than by position
print(test_snp)
search_test = rentrez.entrez_summary(db='snp', id=test_snp)

# Keep the first element of every summary field as a string. A field that comes
# back empty becomes None instead of failing on the [0] lookup, which is the
# same rule the batch retrieval loop applies.
search_test_dict = {
    key: str(search_test.rx2(key)[0]) if len(search_test.rx2(key)) > 0 else None
    for key in search_test.names
}

search_test_df = pd.DataFrame([search_test_dict])

search_test_df.head(10)



4977574


Unnamed: 0,uid,snp_id,allele_origin,global_mafs,global_population,global_samplesize,suspected,clinical_significance,genes,acc,...,allele,snp_class,chrpos,chrpos_prev_assm,text,snp_id_sort,clinical_sort,cited_sort,chrpos_sort,merged_sort
0,4977574,4977574,,"[1] ""1000Genomes"" ""ALSPAC"" ""Chilea...",,,,risk-factor,"[1] ""CDKN2B-AS1""\n",NC_000009.12,...,D,snv,9:22098575,9:22098574,,4977574,1,,22098575,0


In [111]:
### Print the field names of the summary record so we can choose which ones to keep.
print(search_test.names)


 [1] "uid"                   "snp_id"                "allele_origin"        
 [4] "global_mafs"           "global_population"     "global_samplesize"    
 [7] "suspected"             "clinical_significance" "genes"                
[10] "acc"                   "chr"                   "handle"               
[13] "spdi"                  "fxn_class"             "validated"            
[16] "docsum"                "tax_id"                "orig_build"           
[19] "upd_build"             "createdate"            "updatedate"           
[22] "ss"                    "allele"                "snp_class"            
[25] "chrpos"                "chrpos_prev_assm"      "text"                 
[28] "snp_id_sort"           "clinical_sort"         "cited_sort"           
[31] "chrpos_sort"           "merged_sort"          



In [112]:
### And let's filter for the actual values we want to extract.
### The order of this list is also the column order we want in the final table.
desired_dbSNP_values = [
    'genes', 'snp_id', 'uid', 'snp_class',
    'fxn_class', 'chr', 'chrpos', 'global_population',
    'allele', 'clinical_significance', 'acc',
]

# Keep only the desired fields, taking the first element of each as a string.
# Empty fields become None instead of raising on the [0] lookup, matching the
# guard used by the batch retrieval loop.
filtered_search_test_dict = {
    key: str(search_test.rx2(key)[0]) if len(search_test.rx2(key)) > 0 else None
    for key in search_test.names
    if key in desired_dbSNP_values
}
filtered_search_test = pd.DataFrame([filtered_search_test_dict])
filtered_search_test

Unnamed: 0,uid,snp_id,global_population,clinical_significance,genes,acc,chr,fxn_class,allele,snp_class,chrpos
0,4977574,4977574,,risk-factor,"[1] ""CDKN2B-AS1""\n",NC_000009.12,9,"genic_downstream_transcript_variant,intron_var...",D,snv,9:22098575


In [113]:
# Put the columns in the order given by desired_dbSNP_values, skipping any
# field that is not present in the filtered frame.
ordered_columns = [name for name in desired_dbSNP_values if name in filtered_search_test.columns]
filtered_and_ordered_search_test = filtered_search_test[ordered_columns]
filtered_and_ordered_search_test


Unnamed: 0,genes,snp_id,uid,snp_class,fxn_class,chr,chrpos,global_population,allele,clinical_significance,acc
0,"[1] ""CDKN2B-AS1""\n",4977574,4977574,snv,"genic_downstream_transcript_variant,intron_var...",9,9:22098575,,D,risk-factor,NC_000009.12


In [114]:
# One dict per successfully retrieved SNP, with keys in desired_dbSNP_values order
all_snp_data = []

# Open a text file in write mode to log skipped rows and retrieval errors
with open("skipped_rows_log.txt", "w") as log_file:
    log_file.write("Details of rows skipped due to None UID:\n\n")

    for index, row in UID_SNP.iterrows():
        test_snp = row['UID']

        # A missing UID may be stored as None or as NaN depending on how the
        # column was produced; pd.isna catches both, `is None` only the first.
        if pd.isna(test_snp):
            # Print to console and write to log file
            log_message = f"Skipping row {index} because UID is None. Row details:\n{row}\n\n"
            print(log_message)
            log_file.write(log_message)
            continue  # Nothing to query for this row

        try:
            # Retrieve SNP data from the SNP database
            search_test = rentrez.entrez_summary(db='snp', id=test_snp)

            # Keep only the desired fields; each field is fetched once, and an
            # empty field is recorded as None rather than failing on [0].
            filtered_search_test_dict = {}
            for key in search_test.names:
                if key not in desired_dbSNP_values:
                    continue
                field = search_test.rx2(key)
                filtered_search_test_dict[key] = str(field[0]) if len(field) > 0 else None

            # Order the data to match the desired column order; fields missing
            # from the response are filled with None.
            ordered_data = {key: filtered_search_test_dict.get(key, None) for key in desired_dbSNP_values}

            # Append ordered data to the results list
            all_snp_data.append(ordered_data)

        except Exception as e:
            # Best effort: record the failure and carry on with the next SNP.
            error_message = f"Error retrieving data for UID {test_snp}: {e}\n"
            print(error_message)
            log_file.write(error_message)

# Convert results to DataFrame
final_df = pd.DataFrame(all_snp_data)

# Display the first 50 rows
final_df.head(50)


Skipping row 53 because UID is None. Row details:
GENE                 NaN
SNP ID    chr6:112398675
UID                 None
Name: 53, dtype: object


Skipping row 116 because UID is None. Row details:
GENE                NaN
SNP ID    chr2:20679060
UID                None
Name: 116, dtype: object


Skipping row 140 because UID is None. Row details:
GENE                  NaN
SNP ID    chr11:130410772
UID                  None
Name: 140, dtype: object




Unnamed: 0,genes,snp_id,uid,snp_class,fxn_class,chr,chrpos,global_population,allele,clinical_significance,acc
0,"[1] ""CDKN2B-AS1""\n",4977574,4977574,snv,"genic_downstream_transcript_variant,intron_var...",9,9:22098575,,D,risk-factor,NC_000009.12
1,"[1] ""LPA""\n",10455872,10455872,snv,intron_variant,6,6:160589086,,R,benign,NC_000006.12
2,"[1] ""ZNF335""\n",3827066,3827066,snv,"downstream_transcript_variant,intron_variant,g...",20,20:45957384,,Y,benign,NC_000020.11
3,"[1] ""CELSR2""\n",12740374,12740374,snv,3_prime_UTR_variant,1,1:109274968,,K,association,NC_000001.11
4,,7994761,7994761,snv,,13,13:22297307,,D,,NC_000013.11
5,"[1] ""APOE""\n",429358,429358,snv,"missense_variant,coding_sequence_variant",19,19:44908684,,Y,"pathogenic,risk-factor,association,conflicting...",NC_000019.10
6,,73015011,73015011,snv,,19,19:11079088,,Y,,NC_000019.10
7,"[1] ""ADAMTS8""\n",4936098,4936098,snv,intron_variant,11,11:130410772,,D,,NC_000011.10
8,"[1] ""CHRNA5""\n",17486278,17486278,snv,intron_variant,15,15:78575140,,M,,NC_000015.10
9,"[1] ""CDKN2B-AS1""\n",2891168,2891168,snv,"genic_downstream_transcript_variant,intron_var...",9,9:22098620,,R,,NC_000009.12


In [115]:
#### Save an untouched copy of the RAW results (without the index column) in the output folder:
raw_results_csv = os.path.join(output_path, 'compiled_SNPs_RENTREZ_RAW.csv')
final_df.to_csv(raw_results_csv, index=False)


In [116]:
#### And let's make it beautiful by rearranging and sorting each column:
#### First, let's delete all variants that are not SNVs.
#### .copy() makes the result an independent frame, so assigning to its columns
#### later is unambiguous and does not trigger SettingWithCopyWarning.
filtered_final_df = final_df.loc[final_df['snp_class'] == 'snv'].copy()
filtered_final_df

Unnamed: 0,genes,snp_id,uid,snp_class,fxn_class,chr,chrpos,global_population,allele,clinical_significance,acc
0,"[1] ""CDKN2B-AS1""\n",4977574,4977574,snv,"genic_downstream_transcript_variant,intron_var...",9,9:22098575,,D,risk-factor,NC_000009.12
1,"[1] ""LPA""\n",10455872,10455872,snv,intron_variant,6,6:160589086,,R,benign,NC_000006.12
2,"[1] ""ZNF335""\n",3827066,3827066,snv,"downstream_transcript_variant,intron_variant,g...",20,20:45957384,,Y,benign,NC_000020.11
3,"[1] ""CELSR2""\n",12740374,12740374,snv,3_prime_UTR_variant,1,1:109274968,,K,association,NC_000001.11
4,,7994761,7994761,snv,,13,13:22297307,,D,,NC_000013.11
...,...,...,...,...,...,...,...,...,...,...,...
145,"[1] ""C5orf67""\n",464605,464605,snv,non_coding_transcript_variant,5,5:56511543,,H,,NC_000005.10
146,"[1] ""ANKRD44"" ""ANKRD44-IT1""\n",919433,919433,snv,"genic_upstream_transcript_variant,intron_variant",2,2:197301841,,V,,NC_000002.12
147,,530948381,530948381,snv,,22,22:25507825,,R,,NC_000022.11
148,"[1] ""MON1A""\n",142344547,142344547,snv,intron_variant,3,3:49918603,,R,,NC_000003.12


In [118]:
### Now let's clean up the grime (la mugre) left in each cell of the genes column

# Define a function to clean up each cell
def clean_cell(value):
    r"""Strip R print decoration from a stringified R character vector.

    Turns text such as '[1] "ANKRD44" "ANKRD44-IT1"\n' into
    'ANKRD44 ANKRD44-IT1'. Every double-quoted element is kept and the
    elements are joined with a single space, so extra padding between
    elements and any '[k]' index prefix on later lines are discarded.

    Parameters
    ----------
    value : object
        Cell content; only str values are cleaned.

    Returns
    -------
    object
        The cleaned string, or ``value`` unchanged when it is not a str.
    """
    if not isinstance(value, str):
        return value

    # Text that starts with an '[n]' index marker: pull out the quoted elements.
    if re.match(r'\s*\[\d+\]', value):
        elements = re.findall(r'"([^"]*)"', value)
        if elements:
            return ' '.join(elements)

    # Anything else: drop a leading '[n] "' marker and stray quotes, then trim.
    return re.sub(r'^\[\d+\]\s*"', '', value).replace('"', '').strip()

# Apply the cleaning function to the 'genes' column. assign() returns a new
# frame, so nothing is written into the slice taken from final_df (an in-place
# .loc assignment on that slice can trigger SettingWithCopyWarning).
filtered_final_df = filtered_final_df.assign(genes=filtered_final_df['genes'].apply(clean_cell))

filtered_final_df.head(200)

Unnamed: 0,genes,snp_id,uid,snp_class,fxn_class,chr,chrpos,global_population,allele,clinical_significance,acc
0,CDKN2B-AS1,4977574,4977574,snv,"genic_downstream_transcript_variant,intron_var...",9,9:22098575,,D,risk-factor,NC_000009.12
1,LPA,10455872,10455872,snv,intron_variant,6,6:160589086,,R,benign,NC_000006.12
2,ZNF335,3827066,3827066,snv,"downstream_transcript_variant,intron_variant,g...",20,20:45957384,,Y,benign,NC_000020.11
3,CELSR2,12740374,12740374,snv,3_prime_UTR_variant,1,1:109274968,,K,association,NC_000001.11
4,,7994761,7994761,snv,,13,13:22297307,,D,,NC_000013.11
...,...,...,...,...,...,...,...,...,...,...,...
145,C5orf67,464605,464605,snv,non_coding_transcript_variant,5,5:56511543,,H,,NC_000005.10
146,ANKRD44 ANKRD44-IT1,919433,919433,snv,"genic_upstream_transcript_variant,intron_variant",2,2:197301841,,V,,NC_000002.12
147,,530948381,530948381,snv,,22,22:25507825,,R,,NC_000022.11
148,MON1A,142344547,142344547,snv,intron_variant,3,3:49918603,,R,,NC_000003.12


In [None]:
#### First step towards our RENTREZ template: rename the gene and SNP ID columns.
#### (This statement only renames; it does not reorder anything.)
rentrez_column_names = {'genes': 'Gene ID (GENE_ID)', 'snp_id': 'dbSNP ID (RS)'}
RENTREZ_rename = filtered_final_df.rename(columns=rentrez_column_names)
