In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that later
# cells can read input files from it by path.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import pandas and read the gene-neighbors table from the mounted Drive.
# The preview shows the columns source_gene, gene_set and neighbor_type, plus
# an "Unnamed: 0" column.
# NOTE(review): "Unnamed: 0" is presumably an index saved by an earlier
# to_csv call — confirm; it is not used by the visible cells.

import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/gene_neighbors_genes.csv")
df.head()

Unnamed: 0,source_gene,gene_set,neighbor_type
0,Gene::8546,COVID-19,seed
1,Gene::23476,COVID-19,seed
2,Gene::6046,COVID-19,seed
3,Gene::10283,COVID-19,seed
4,Gene::124245,COVID-19,seed


In [None]:
import requests
import time

def get_gene_name_ncbi(gene_id, max_retries=5):
    """
    Fetch the gene name for a gene identifier using NCBI's Entrez ESummary API.

    Parameters
    ----------
    gene_id : str
        Identifier of the form "<prefix>::<numeric id>" (e.g. "Gene::8546").
        Only the part after "::" is sent to NCBI.
    max_retries : int, optional
        Maximum number of requests attempted when NCBI keeps answering with
        HTTP 429 (rate limiting). Defaults to 5.

    Returns
    -------
    str
        The "name" field of the matching NCBI record on success. On failure a
        descriptive message is returned instead of raising:
        "Gene name not found.", "Gene not found in NCBI database.",
        "Error: Received status code <code>.",
        "Error: Max retries exceeded due to rate limiting." or
        "An error occurred: <exception text>".
    """
    try:
        # Extract the numeric part of the gene_id and build the API request
        numeric_id = gene_id.split("::")[1]
        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
        params = {
            "db": "gene",  # Search in the gene database
            "id": numeric_id,  # The numeric part of the gene ID
            "retmode": "json"  # Get the response in JSON format
        }

        # Try the request at most max_retries times.
        for attempt in range(max_retries):
            # A timeout keeps one stalled connection from hanging the whole run;
            # a timeout error is reported through the except branch below.
            response = requests.get(base_url, params=params, timeout=30)

            if response.status_code == 200:
                data = response.json()

                # Check if the result exists and contains the requested gene ID
                if "result" in data and numeric_id in data["result"]:
                    gene_data = data["result"][numeric_id]
                    return gene_data.get("name", "Gene name not found.")

                # Handle cases where the gene ID is not found in the response
                return "Gene not found in NCBI database."

            if response.status_code != 429:
                # Handle other HTTP errors by returning the status code
                return f"Error: Received status code {response.status_code}."

            # Rate limited (HTTP 429). Do not wait if no attempt is left.
            if attempt == max_retries - 1:
                break

            # Exponential backoff for retries (1, 2, 4, etc. seconds).
            wait_time = 2 ** attempt
            print(f"Rate limit hit. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

        # If retries are exhausted, return an error message.
        return "Error: Max retries exceeded due to rate limiting."
    except Exception as e:
        # Best-effort lookup: report unexpected errors (malformed gene_id,
        # network failure, invalid JSON) as a message instead of raising.
        return f"An error occurred: {str(e)}"

def get_covid_19_gene_names(df):
    """
    Look up the NCBI gene name for every row of `df` whose gene_set is 'COVID-19'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'gene_set' and 'source_gene'.

    Returns
    -------
    pandas.DataFrame
        One row per matching input row, with the columns 'source_gene' and
        'gene_name'. 'gene_name' holds whatever get_gene_name_ncbi returned for
        that gene. The two columns are present even when no row matches.

    Notes
    -----
    Prints the number of matching rows, then one progress line per row.
    get_gene_name_ncbi is called once per matching row.
    """
    # Filter rows where gene_set is COVID-19
    covid_genes = df[df['gene_set'] == 'COVID-19']
    print(len(covid_genes))

    results = []

    # Only the source_gene column is needed; .items() yields (index label, value)
    for index, source_gene in covid_genes['source_gene'].items():
        print(f"Analyzing gene: {source_gene} (Index {index})")
        gene_name = get_gene_name_ncbi(source_gene)
        results.append({'source_gene': source_gene, 'gene_name': gene_name})

    # Passing the column names explicitly keeps the schema stable when
    # `results` is empty, so later column selections do not fail.
    return pd.DataFrame(results, columns=['source_gene', 'gene_name'])

# Run the lookup for all COVID-19 rows of df (get_gene_name_ncbi is called once
# per matching row) and print the resulting table
result_df = get_covid_19_gene_names(df)
print(result_df)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Analyzing gene: Gene::92235 (Index 16434)
Analyzing gene: Gene::83932 (Index 16435)
Analyzing gene: Gene::3175 (Index 16436)
Rate limit hit. Retrying in 1 seconds...
Analyzing gene: Gene::253582 (Index 16437)
Analyzing gene: Gene::9752 (Index 16438)
Analyzing gene: Gene::64180 (Index 16439)
Rate limit hit. Retrying in 1 seconds...
Analyzing gene: Gene::255027 (Index 16440)
Analyzing gene: Gene::55957 (Index 16441)
Analyzing gene: Gene::286144 (Index 16442)
Rate limit hit. Retrying in 1 seconds...
Analyzing gene: Gene::219902 (Index 16443)
Analyzing gene: Gene::79946 (Index 16444)
Analyzing gene: Gene::7975 (Index 16445)
Rate limit hit. Retrying in 1 seconds...
Analyzing gene: Gene::100500938 (Index 16446)
Analyzing gene: Gene::312 (Index 16447)
Analyzing gene: Gene::54910 (Index 16448)
Rate limit hit. Retrying in 1 seconds...
Analyzing gene: Gene::3208 (Index 16449)
Analyzing gene: Gene::3642 (Index 16450)
Analyzing gene:

In [None]:
# Collect the fetched gene names into a plain Python list, then add a
# 'gene_names' column in which every row holds that same full list.
# This mutates result_df in place.
# NOTE(review): repeating the whole list on every row looks unintentional — confirm it is needed.
gene_names_list = result_df['gene_name'].tolist()
result_df['gene_names'] = [gene_names_list] * len(result_df)
print(gene_names_list)

# Save the DataFrame (including the 'gene_names' column) to a pickle file
result_df.to_pickle("final.pkl")

['AP3B1', 'BRD4', 'BRD2', 'CWC27', 'ZC3H18', 'SLC44A2', 'PMPCB', 'YIF1A', 'ATP1B1', 'ACADM', 'ETFA', 'STOM', 'GGCX', 'ATP6V1A', 'PSMD8', 'REEP5', 'PMPCA', 'ANO6', 'PITRM1', 'SLC30A9', 'FASTKD5', 'SLC30A7', 'TUBGCP3', 'COQ8B', 'SAAL1', 'REEP6', 'INTS4', 'SLC25A21', 'TUBGCP2', 'TARS2', 'RTN4', 'FAM8A1', 'AASS', 'AKAP8L', 'AAR2', 'BZW2', 'RRP9', 'PABPC1', 'CSNK2A2', 'CSNK2B', 'G3BP1', 'PABPC4', 'LARP1', 'FAM98A', 'SNIP1', 'UPF1', 'MOV10', 'G3BP2', 'DDX21', 'RBM28', 'RPL36', 'GOLGA7', 'ZDHHC5', 'POLA1', 'PRIM1', 'PRIM2', 'POLA2', 'COLGALT1', 'PKP2', 'AP2A2', 'GFER', 'ERGIC1', 'AP2M1', 'GRPEL1', 'TBCA', 'SBNO1', 'BCKDK', 'AKAP8', 'MYCBP2', 'SLU7', 'RIPK1', 'UBAP2L', 'TYSND1', 'PDZD11', 'PRRC2B', 'UBAP2', 'ZNF318', 'CRTC3', 'USP54', 'ZC3H7A', 'LARP4B', 'RBM41', 'TCF12', 'PPIL3', 'PLEKHA5', 'TBKBP1', 'CIT', 'HSBP1', 'PCNT', 'CEP43', 'PRKAR2A', 'PRKACA', 'PRKAR2B', 'RDX', 'CENPF', 'TLE1', 'TLE3', 'TLE5', 'GOLGA3', 'GOLGA2', 'GOLGB1', 'GRIPAP1', 'CEP350', 'PDE4DIP', 'CEP135', 'CEP68', 'CNTRL', 

In [None]:
def split_and_print_list(gene_names_list, chunk_size=2000):
    """
    Print `gene_names_list` in consecutive slices of at most `chunk_size` items.

    Each slice is printed as a list followed by a blank line, which makes a
    long list easier to read than a single print of the whole thing.
    """
    total = len(gene_names_list)
    # Lazily produce the slices; the last one may be shorter than chunk_size.
    chunks = (
        gene_names_list[start:start + chunk_size]
        for start in range(0, total, chunk_size)
    )
    for chunk in chunks:
        print(chunk, "\n")

# Print the fetched gene names using the default chunk size (2000 per chunk).
split_and_print_list(gene_names_list)

['AP3B1', 'BRD4', 'BRD2', 'CWC27', 'ZC3H18', 'SLC44A2', 'PMPCB', 'YIF1A', 'ATP1B1', 'ACADM', 'ETFA', 'STOM', 'GGCX', 'ATP6V1A', 'PSMD8', 'REEP5', 'PMPCA', 'ANO6', 'PITRM1', 'SLC30A9', 'FASTKD5', 'SLC30A7', 'TUBGCP3', 'COQ8B', 'SAAL1', 'REEP6', 'INTS4', 'SLC25A21', 'TUBGCP2', 'TARS2', 'RTN4', 'FAM8A1', 'AASS', 'AKAP8L', 'AAR2', 'BZW2', 'RRP9', 'PABPC1', 'CSNK2A2', 'CSNK2B', 'G3BP1', 'PABPC4', 'LARP1', 'FAM98A', 'SNIP1', 'UPF1', 'MOV10', 'G3BP2', 'DDX21', 'RBM28', 'RPL36', 'GOLGA7', 'ZDHHC5', 'POLA1', 'PRIM1', 'PRIM2', 'POLA2', 'COLGALT1', 'PKP2', 'AP2A2', 'GFER', 'ERGIC1', 'AP2M1', 'GRPEL1', 'TBCA', 'SBNO1', 'BCKDK', 'AKAP8', 'MYCBP2', 'SLU7', 'RIPK1', 'UBAP2L', 'TYSND1', 'PDZD11', 'PRRC2B', 'UBAP2', 'ZNF318', 'CRTC3', 'USP54', 'ZC3H7A', 'LARP4B', 'RBM41', 'TCF12', 'PPIL3', 'PLEKHA5', 'TBKBP1', 'CIT', 'HSBP1', 'PCNT', 'CEP43', 'PRKAR2A', 'PRKACA', 'PRKAR2B', 'RDX', 'CENPF', 'TLE1', 'TLE3', 'TLE5', 'GOLGA3', 'GOLGA2', 'GOLGB1', 'GRIPAP1', 'CEP350', 'PDE4DIP', 'CEP135', 'CEP68', 'CNTRL', 

In [None]:
# Keep only the identifier and the fetched name; this leaves out the
# 'gene_names' column if it has been added to result_df.
filtered_result_df = result_df[['source_gene', 'gene_name']]

# Save the two-column table to a pickle file
pickle_file_path = 'result_genes.pkl'  # Specify the path where you want to save the file
filtered_result_df.to_pickle(pickle_file_path)

In [None]:
# Print the two-column table and write it to final_genes.csv
# (index=False leaves the row index out of the file)
print(filtered_result_df)
filtered_result_df.to_csv("final_genes.csv", index=False)

           source_gene gene_name
0           Gene::8546     AP3B1
1          Gene::23476      BRD4
2           Gene::6046      BRD2
3          Gene::10283     CWC27
4         Gene::124245    ZC3H18
...                ...       ...
20181  Gene::100188891   ZNF123P
20182      Gene::12505      Cd44
20183       Gene::2561    GABRB2
20184     Gene::256586    LYSMD2
20185       Gene::1415    CRYBB2

[20186 rows x 2 columns]
