In [14]:
import os, shutil, csv, hashlib
import os
import glob
import numpy as np
import pandas as pd
from IPython.display import display

import re


In [15]:
# Reduced pangenome fasta file 
fasta_path = "/Users/921623492/Ecoli_Project/Data/reduced_pangenome_blast.fa"

In [16]:
def extract_gene_names_from_fasta(fasta_path):
    with open(fasta_path, "r") as f:
        gene_names = [line[1:].strip() for line in f if line.startswith(">")]
    return gene_names


In [17]:
gene_names = extract_gene_names_from_fasta(fasta_path)

In [18]:
len(gene_names)

15695

In [19]:
# Optional: validate
assert len(gene_names) == 15629, f"Expected 15629 gene names, got {len(gene_names)}"

AssertionError: Expected 15629 gene names, got 15695

In [20]:
gene_names = gene_names[:15629]

In [21]:

assert len(gene_names) == 15629, f"Expected 15629 gene names, got {len(gene_names)}"

# Step 2: Replace the Columns with Actual Gene Names

In [22]:
INPUT_DIR  = "/Users/921623492/Ecoli_Project/Data/merged_alignment"
OUTPUT_DIR = "/Users/921623492/Ecoli_Project/Data/MAF_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [23]:
file_paths = glob.glob(INPUT_DIR+"/*_pangenome_alignment.npy")
if len(file_paths) == 0:
    raise FileNotFoundError("No .npy files found in the 'alignment' directory!")

In [24]:
len(file_paths)

600

In [25]:
file_paths = glob.glob(INPUT_DIR+"/*_pangenome_alignment.npy")
if len(file_paths) == 0:
    raise FileNotFoundError("No .npy files found in the 'alignment' directory!")

In [13]:
length_matrix = np.zeros((len(file_paths), 15629), dtype=int)
for i, p in enumerate(file_paths):
    arr = np.load(p, allow_pickle=True)
    for j in range(len(arr)):
        length_matrix[i, j] = len(arr[j])

In [26]:
# Create DataFrame
df_length_matrix = pd.DataFrame(length_matrix, columns=gene_names)
df_length_matrix.head()

Unnamed: 0,FAHFDEJI_02221,GNOIHJGF_04610,NHLJOMBL_05016,KCNLHOEA_03403,HHCOCAHC_02102,KMKPFFJF_01017,MAOGMHBA_04903,CKMBOFAF_03765,FAHFDEJI_00308,ECKICHIN_03110,...,NHLJOMBL_00571,GHOPNDEC_03154,AEGEGBCH_03641,DMMNDODG_04815,EDEFNFDI_05015,EDEFNFDI_05016,BPDJECED_04485,DMLEFFEM_04762,EDHNABJO_03523,LBOLKHKO_00606
0,579,948,381,198,549,501,2262,141,294,1350,...,189,294,207,1638,1638,714,204,318,330,264
1,579,948,381,198,549,501,2262,141,294,1350,...,189,294,207,1638,1638,714,204,318,330,264
2,579,948,381,198,549,501,2262,141,294,1350,...,189,294,207,1638,1638,728,204,318,330,264
3,579,948,381,198,549,501,2262,141,294,1350,...,189,294,207,1638,1638,714,204,318,330,264
4,579,948,381,198,549,501,2262,141,294,1350,...,189,294,207,1638,1638,714,204,318,330,264


In [27]:
df_length_matrix.index = [f"SAMPLE_{i+1}" for i in range(length_matrix.shape[0])]

In [28]:
df_length_matrix.head()

Unnamed: 0,FAHFDEJI_02221,GNOIHJGF_04610,NHLJOMBL_05016,KCNLHOEA_03403,HHCOCAHC_02102,KMKPFFJF_01017,MAOGMHBA_04903,CKMBOFAF_03765,FAHFDEJI_00308,ECKICHIN_03110,...,NHLJOMBL_00571,GHOPNDEC_03154,AEGEGBCH_03641,DMMNDODG_04815,EDEFNFDI_05015,EDEFNFDI_05016,BPDJECED_04485,DMLEFFEM_04762,EDHNABJO_03523,LBOLKHKO_00606
SAMPLE_1,579,948,381,198,549,501,2262,141,294,1350,...,189,294,207,1638,1638,714,204,318,330,264
SAMPLE_2,579,948,381,198,549,501,2262,141,294,1350,...,189,294,207,1638,1638,714,204,318,330,264
SAMPLE_3,579,948,381,198,549,501,2262,141,294,1350,...,189,294,207,1638,1638,728,204,318,330,264
SAMPLE_4,579,948,381,198,549,501,2262,141,294,1350,...,189,294,207,1638,1638,714,204,318,330,264
SAMPLE_5,579,948,381,198,549,501,2262,141,294,1350,...,189,294,207,1638,1638,714,204,318,330,264


In [29]:
# Create new column names with index prefix and rename columns 
indexed_columns = [f"{i+1:05d}_{name}" for i, name in enumerate(df_length_matrix.columns)]
df_length_matrix.columns = indexed_columns

In [30]:
df_length_matrix.head()

Unnamed: 0,00001_FAHFDEJI_02221,00002_GNOIHJGF_04610,00003_NHLJOMBL_05016,00004_KCNLHOEA_03403,00005_HHCOCAHC_02102,00006_KMKPFFJF_01017,00007_MAOGMHBA_04903,00008_CKMBOFAF_03765,00009_FAHFDEJI_00308,00010_ECKICHIN_03110,...,15620_NHLJOMBL_00571,15621_GHOPNDEC_03154,15622_AEGEGBCH_03641,15623_DMMNDODG_04815,15624_EDEFNFDI_05015,15625_EDEFNFDI_05016,15626_BPDJECED_04485,15627_DMLEFFEM_04762,15628_EDHNABJO_03523,15629_LBOLKHKO_00606
SAMPLE_1,579,948,381,198,549,501,2262,141,294,1350,...,189,294,207,1638,1638,714,204,318,330,264
SAMPLE_2,579,948,381,198,549,501,2262,141,294,1350,...,189,294,207,1638,1638,714,204,318,330,264
SAMPLE_3,579,948,381,198,549,501,2262,141,294,1350,...,189,294,207,1638,1638,728,204,318,330,264
SAMPLE_4,579,948,381,198,549,501,2262,141,294,1350,...,189,294,207,1638,1638,714,204,318,330,264
SAMPLE_5,579,948,381,198,549,501,2262,141,294,1350,...,189,294,207,1638,1638,714,204,318,330,264


In [31]:
df_length_matrix['00008_CKMBOFAF_03765']

SAMPLE_1      141
SAMPLE_2      141
SAMPLE_3      141
SAMPLE_4      141
SAMPLE_5      141
             ... 
SAMPLE_596    141
SAMPLE_597    141
SAMPLE_598    141
SAMPLE_599    141
SAMPLE_600    141
Name: 00008_CKMBOFAF_03765, Length: 600, dtype: int64

In [32]:
df_length_matrix['00008_CKMBOFAF_03765'].unique()

array([141, 142])

# Step 3: Generate and Save gene_metadata.csv 

In [33]:
# full_column_names = df_length_matrix.columns.to_list()
# gene_ids = [name.split("_", 1)[-1] for name in full_column_names]
# position = list(range(1, len(full_column_names) + 1))
# gene_metada = pd.DataFrame({
#     "Gene_Position": position,
#     "Gene_ID": gene_ids,
#     "Full_Column_Name": full_column_names
# })
# gene_metada.to_csv("gene_metadata.csv", index=False)

In [34]:
# df_gene_metada = pd.DataFrame(gene_metada)
# df_gene_metada.tail()

# Step 4: Explotora Data Analysis 

- To summarize how a gene behaves across population
- To find outliers genes or strange annotation lengths genes
Sumarry : 
Genes with low standard deviation have consistent lengths across all samples → likely core, highly conserved genes.

Genes with high standard deviation vary in length across samples → these might be:

Associated with structural variation (e.g., insertions/deletions)

Possibly mobile elements, phage genes, or accessory genes

Good candidates for further biological investigation or predictors in classification models

In [35]:
styled_stats = (
    df_length_matrix.describe()
    .T
    .sort_values("std", ascending=False)
    .head(200)
    .style
    .background_gradient(subset=["std"], cmap="Reds")
    .format("{:.2f}")
)

display(styled_stats)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
08193_FHCDHGOE_02712,600.0,11430.23,51.0,11421.0,11421.0,11421.0,11421.0,11712.0
08205_KCNLHOEA_04186,600.0,6500.21,38.25,6492.0,6492.0,6498.0,6498.0,6801.0
08216_IHAIOHNP_03683,600.0,5470.38,26.99,5466.0,5466.0,5466.0,5466.0,5658.0
08211_GHOPNDEC_03413,600.0,5953.95,21.15,5949.0,5949.0,5955.0,5955.0,6249.0
08222_HHCOCAHC_03062,600.0,5037.73,17.61,5025.0,5025.0,5025.0,5061.0,5073.0
08786_DFDLJMNG_03427,600.0,1878.51,12.3,1872.0,1872.0,1872.0,1873.0,1926.0
08230_MLKACMOG_02311,600.0,4650.51,11.65,4650.0,4650.0,4650.0,4650.0,4935.0
11526_CCKCHNGL_04532,600.0,765.12,11.6,750.0,750.0,774.0,774.0,774.0
13073_GGCADGEP_02086,600.0,1137.71,11.04,1128.0,1128.0,1128.0,1149.0,1155.0
08334_MGMMNLMA_03882,600.0,3137.96,10.4,3129.0,3129.0,3129.0,3150.0,3150.0


In [26]:
arr.shape

(15629,)

In [27]:
# for i, a in enumerate(arr):
#     print(f"arr[{i}] shape: {a.shape}")

In [28]:
len(arr)

15629

In [29]:
arr.shape

(15629,)

In [30]:
arr

array([array(['a', 't', 'g', 't', 'c', 't', 'a', 't', 'a', 'c', 'a', 'g', 'a',
              'a', 'c', 'g', 'a', 'a', 'a', 't', 'g', 'c', 'c', 't', 'g', 'g',
              't', 't', 'a', 'c', 'a', 'a', 'c', 'g', 'a', 'a', 'a', 't', 'g',
              'a', 'a', 'c', 'c', 'a', 'g', 't', 'a', 't', 'c', 't', 'g', 'a',
              'a', 'c', 'c', 'a', 'a', 'c', 'a', 'a', 'g', 'g', 'g', 'a', 'c',
              'g', 'g', 'g', 't', 'c', 't', 'g', 'a', 'c', 'c', 'c', 'c', 't',
              'g', 'c', 't', 'g', 'a', 'g', 'a', 't', 'g', 'c', 'a', 't', 'g',
              'g', 't', 't', 't', 'a', 'a', 't', 'c', 'a', 'g', 'c', 'g', 'g',
              'g', 'a', 't', 'g', 'a', 't', 'a', 't', 'g', 't', 'g', 'g', 'c',
              'g', 'g', 't', 'a', 'a', 'c', 'g', 'a', 't', 'g', 'a', 'c', 'a',
              'g', 'c', 't', 'c', 'a', 't', 'g', 'g', 'c', 't', 'g', 'c', 'c',
              'g', 'c', 't', 'a', 'c', 't', 't', 'c', 'a', 'c', 'g', 'a', 'c',
              'c', 't', 'g', 'a', 'c', 'g', 'a', 'a'

In [31]:
print(type(arr), arr.shape, arr.dtype)
# → <class 'numpy.ndarray'>, (1,), object

inner = arr[0]
print(type(inner), inner.shape, inner.dtype)
# → <class 'numpy.ndarray'>, (15629,), '<U1'


<class 'numpy.ndarray'> (15629,) object
<class 'numpy.ndarray'> (579,) <U1


In [36]:
combined_matrix = []
sample_ids = []

# Step 4: Load arrays from .npy files
for path in file_paths:
    arr = np.load(path, allow_pickle=True)

    combined_matrix.append(arr)

    # Extract sample ID from filename
    sample_id = os.path.basename(path).split(".")[0]
    sample_ids.append(sample_id)

# Step 5: Convert to DataFrame (genes as columns, samples as rows)
combined_df = pd.DataFrame(np.array(combined_matrix), index=sample_ids, columns=indexed_columns)
combined_df.index.name = "Sample_ID"

In [37]:
combined_df.shape

(600, 15629)

In [38]:
combined_df.head()

Unnamed: 0_level_0,00001_FAHFDEJI_02221,00002_GNOIHJGF_04610,00003_NHLJOMBL_05016,00004_KCNLHOEA_03403,00005_HHCOCAHC_02102,00006_KMKPFFJF_01017,00007_MAOGMHBA_04903,00008_CKMBOFAF_03765,00009_FAHFDEJI_00308,00010_ECKICHIN_03110,...,15620_NHLJOMBL_00571,15621_GHOPNDEC_03154,15622_AEGEGBCH_03641,15623_DMMNDODG_04815,15624_EDEFNFDI_05015,15625_EDEFNFDI_05016,15626_BPDJECED_04485,15627_DMLEFFEM_04762,15628_EDHNABJO_03523,15629_LBOLKHKO_00606
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mills_cip_200__GCA_025783615,"[a, t, g, t, c, t, a, t, a, c, a, g, a, a, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, g, t, a, a, g, a, t, t, a, t, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, t, c, c, c, a, g, a, t, a, g, a, a, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
kallonen_cip_200__ERR434265_pangenome_alignment,"[a, t, g, t, c, t, a, t, a, c, a, g, a, a, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, g, t, a, a, g, a, t, t, a, t, c, ...","[a, t, g, c, c, a, c, t, a, a, t, t, a, t, c, ...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, t, c, c, c, a, g, a, t, a, g, a, a, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
kallonen_cip_200__ERR439575_pangenome_alignment,"[a, t, g, t, c, t, a, t, a, c, a, g, a, a, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, g, t, a, a, g, a, t, t, a, t, c, ...","[a, t, g, c, c, a, c, t, a, a, t, t, a, t, c, ...",...,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[-, -, -, -, -, -, c, t, g, t, a, c, a, a, t, ...","[a, t, g, t, c, c, c, a, g, a, t, a, g, a, a, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
gladstone-cip-200__ERR4036619_pangenome_alignment,"[a, t, g, t, c, t, a, t, a, c, a, g, a, a, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, a, c, g, t, t, a, t, a, a, a, a, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, g, t, a, a, g, a, t, t, a, t, c, ...","[a, t, g, c, c, a, c, t, a, a, t, t, a, t, c, ...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, t, c, c, c, a, g, a, t, a, g, a, a, ...","[a, t, g, c, g, g, c, t, t, g, c, c, c, g, g, ...","[a, t, g, c, a, g, t, t, t, g, t, c, a, t, g, ...","[a, t, g, t, c, g, g, c, c, a, c, t, g, a, g, ..."
kallonen_cip_200__ERR435151_pangenome_alignment,"[a, t, g, t, c, t, a, t, a, c, a, g, a, a, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, g, t, a, a, g, a, t, t, a, t, c, ...","[a, t, g, c, c, a, c, t, a, a, t, t, a, t, c, ...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, t, c, c, c, a, g, a, t, a, g, a, a, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."


In [39]:
print(combined_df.memory_usage(deep=True).sum() / 1_048_576, "MiB")


1248.4336729049683 MiB


In [43]:

import math
from tqdm.auto import tqdm
import pandas as pd

chunksize = 250000                # tweak to keep CPU busy but RAM happy
rows      = len(combined_df)
n_chunks  = math.ceil(rows / chunksize)

csv_path  = "600_samples_with_genes.csv"

with tqdm(total=n_chunks, desc="Saving CSV") as pbar:
    for i, start in enumerate(range(0, rows, chunksize)):
        end   = start + chunksize
        chunk = combined_df.iloc[start:end]

        # write first chunk with header, later chunks in append-mode without header
        chunk.to_csv(
            csv_path,
            mode="w" if i == 0 else "a",
            header=(i == 0),
            index=False
        )

Saving CSV:   0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [34]:
combined_df.head()

Unnamed: 0_level_0,00001_FAHFDEJI_02221,00002_GNOIHJGF_04610,00003_NHLJOMBL_05016,00004_KCNLHOEA_03403,00005_HHCOCAHC_02102,00006_KMKPFFJF_01017,00007_MAOGMHBA_04903,00008_CKMBOFAF_03765,00009_FAHFDEJI_00308,00010_ECKICHIN_03110,...,15620_NHLJOMBL_00571,15621_GHOPNDEC_03154,15622_AEGEGBCH_03641,15623_DMMNDODG_04815,15624_EDEFNFDI_05015,15625_EDEFNFDI_05016,15626_BPDJECED_04485,15627_DMLEFFEM_04762,15628_EDHNABJO_03523,15629_LBOLKHKO_00606
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mills_cip_200__GCA_025783615,"[a, t, g, t, c, t, a, t, a, c, a, g, a, a, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, g, t, a, a, g, a, t, t, a, t, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, t, c, c, c, a, g, a, t, a, g, a, a, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
kallonen_cip_200__ERR434265_pangenome_alignment,"[a, t, g, t, c, t, a, t, a, c, a, g, a, a, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, g, t, a, a, g, a, t, t, a, t, c, ...","[a, t, g, c, c, a, c, t, a, a, t, t, a, t, c, ...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, t, c, c, c, a, g, a, t, a, g, a, a, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
kallonen_cip_200__ERR439575_pangenome_alignment,"[a, t, g, t, c, t, a, t, a, c, a, g, a, a, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, g, t, a, a, g, a, t, t, a, t, c, ...","[a, t, g, c, c, a, c, t, a, a, t, t, a, t, c, ...",...,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[-, -, -, -, -, -, c, t, g, t, a, c, a, a, t, ...","[a, t, g, t, c, c, c, a, g, a, t, a, g, a, a, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
gladstone-cip-200__ERR4036619_pangenome_alignment,"[a, t, g, t, c, t, a, t, a, c, a, g, a, a, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, a, c, g, t, t, a, t, a, a, a, a, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, g, t, a, a, g, a, t, t, a, t, c, ...","[a, t, g, c, c, a, c, t, a, a, t, t, a, t, c, ...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, t, c, c, c, a, g, a, t, a, g, a, a, ...","[a, t, g, c, g, g, c, t, t, g, c, c, c, g, g, ...","[a, t, g, c, a, g, t, t, t, g, t, c, a, t, g, ...","[a, t, g, t, c, g, g, c, c, a, c, t, g, a, g, ..."
kallonen_cip_200__ERR435151_pangenome_alignment,"[a, t, g, t, c, t, a, t, a, c, a, g, a, a, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, g, t, a, a, g, a, t, t, a, t, c, ...","[a, t, g, c, c, a, c, t, a, a, t, t, a, t, c, ...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, t, c, c, c, a, g, a, t, a, g, a, a, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."


In [35]:


def extract_acc(idx):
    # split off the part after the double‐underscore
    part = idx.split("__", 1)[1]
    # grab either GCA_###… or ERR###…
    m = re.match(r'^(GCA_\d+|ERR\d+)', part)
    return m.group(1) if m else part

In [36]:
# apply it
combined_df.index = [extract_acc(i) for i in combined_df.index]

In [37]:
# give your index a name
combined_df.index.name = "Sample_ID"


In [38]:
combined_df.tail()

Unnamed: 0_level_0,00001_FAHFDEJI_02221,00002_GNOIHJGF_04610,00003_NHLJOMBL_05016,00004_KCNLHOEA_03403,00005_HHCOCAHC_02102,00006_KMKPFFJF_01017,00007_MAOGMHBA_04903,00008_CKMBOFAF_03765,00009_FAHFDEJI_00308,00010_ECKICHIN_03110,...,15620_NHLJOMBL_00571,15621_GHOPNDEC_03154,15622_AEGEGBCH_03641,15623_DMMNDODG_04815,15624_EDEFNFDI_05015,15625_EDEFNFDI_05016,15626_BPDJECED_04485,15627_DMLEFFEM_04762,15628_EDHNABJO_03523,15629_LBOLKHKO_00606
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR4034483,"[a, t, g, t, c, t, a, t, a, c, a, g, a, a, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, g, t, a, a, g, a, t, t, a, t, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, a, a, a, c, a, a, a, a, c, a, a, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, t, c, c, c, a, g, a, t, a, g, a, a, ...","[a, t, g, c, g, g, c, t, t, g, c, c, c, g, g, ...","[a, t, g, c, a, g, t, t, t, g, t, c, a, t, g, ...","[a, t, g, t, c, g, g, c, c, a, c, t, g, a, g, ..."
ERR434535,"[a, t, g, t, c, t, a, t, a, c, a, g, a, a, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, g, t, a, a, g, a, t, t, a, t, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, t, c, c, c, a, g, a, t, a, g, a, a, ...","[a, t, g, c, g, g, c, t, t, g, c, c, c, g, g, ...","[a, t, g, c, a, g, t, t, t, g, t, c, a, t, g, ...","[a, t, g, t, c, g, g, c, c, a, c, t, g, a, g, ..."
ERR4035923,"[a, t, g, t, c, t, a, t, a, c, a, g, a, a, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, g, t, a, a, g, a, t, t, a, t, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",...,"[-, -, g, a, t, g, a, a, a, a, a, c, c, g, t, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[g, t, g, g, c, g, c, t, g, t, a, c, a, a, t, ...","[a, t, g, t, c, c, c, a, g, a, t, a, g, a, a, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
GCA_023858765,"[a, t, g, t, c, t, a, t, a, c, a, g, a, a, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, g, t, a, a, g, a, t, t, a, t, c, ...","[a, t, g, c, c, a, c, t, a, a, t, t, a, t, c, ...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[a, t, g, c, g, g, a, a, g, a, g, t, a, t, g, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, t, c, c, c, a, g, a, t, a, g, a, a, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
ERR4034123,"[a, t, g, t, c, t, a, t, a, c, a, g, a, a, c, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, a, g, t, a, a, g, a, t, t, a, t, c, ...","[a, t, g, c, c, a, c, t, a, a, t, t, a, t, c, ...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[-, t, g, t, t, a, g, c, g, a, a, a, c, t, t, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[a, t, g, t, c, c, c, a, g, a, t, a, g, a, a, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."


In [39]:
combined_df.iloc[0]['00001_FAHFDEJI_02221']

array(['a', 't', 'g', 't', 'c', 't', 'a', 't', 'a', 'c', 'a', 'g', 'a',
       'a', 'c', 'g', 'a', 'a', 'a', 't', 'g', 'c', 'c', 't', 'g', 'g',
       't', 't', 'a', 'c', 'a', 'a', 'c', 'g', 'a', 'a', 'a', 't', 'g',
       'a', 'a', 'c', 'c', 'a', 'g', 't', 'a', 't', 'c', 't', 'g', 'a',
       'a', 'c', 'c', 'a', 'a', 'c', 'a', 'a', 'g', 'g', 'g', 'a', 'c',
       'g', 'g', 'g', 't', 'c', 't', 'g', 'a', 'c', 'c', 'c', 'c', 't',
       'g', 'c', 't', 'g', 'a', 'g', 'a', 't', 'g', 'c', 'a', 't', 'g',
       'g', 't', 't', 't', 'a', 'a', 't', 'c', 'a', 'g', 't', 'g', 'g',
       'g', 'a', 't', 'g', 'a', 't', 'a', 't', 'g', 't', 'g', 'g', 'c',
       'g', 'g', 't', 'a', 'a', 'c', 'g', 'a', 't', 'g', 'a', 'c', 'a',
       'g', 'c', 't', 'c', 'a', 't', 'g', 'g', 'c', 't', 'g', 'c', 'c',
       'g', 'c', 't', 'a', 'c', 't', 't', 'c', 'a', 'c', 'g', 'a', 'c',
       'c', 't', 'g', 'a', 'c', 'g', 'a', 'a', 'c', 'g', 'a', 'a', 'g',
       'g', 'c', 'a', 't', 'g', 'g', 'c', 't', 't', 't', 'c', 'g

# Look Up Gene Names in NCBI 

### Step 1:  Extract your DNA sequence from the DataFrame


In [2]:
gene_id = "00001_FAHFDEJI_02221"
gene_id

'00001_FAHFDEJI_02221'

In [3]:
char_Array = combined_df[gene_id].dropna().iloc[0]

NameError: name 'combined_df' is not defined

In [91]:
char_Array

array(['a', 't', 'g', 't', 'c', 't', 'a', 't', 'a', 'c', 'a', 'g', 'a',
       'a', 'c', 'g', 'a', 'a', 'a', 't', 'g', 'c', 'c', 't', 'g', 'g',
       't', 't', 'a', 'c', 'a', 'a', 'c', 'g', 'a', 'a', 'a', 't', 'g',
       'a', 'a', 'c', 'c', 'a', 'g', 't', 'a', 't', 'c', 't', 'g', 'a',
       'a', 'c', 'c', 'a', 'a', 'c', 'a', 'a', 'g', 'g', 'g', 'a', 'c',
       'g', 'g', 'g', 't', 'c', 't', 'g', 'a', 'c', 'c', 'c', 'c', 't',
       'g', 'c', 't', 'g', 'a', 'g', 'a', 't', 'g', 'c', 'a', 't', 'g',
       'g', 't', 't', 't', 'a', 'a', 't', 'c', 'a', 'g', 't', 'g', 'g',
       'g', 'a', 't', 'g', 'a', 't', 'a', 't', 'g', 't', 'g', 'g', 'c',
       'g', 'g', 't', 'a', 'a', 'c', 'g', 'a', 't', 'g', 'a', 'c', 'a',
       'g', 'c', 't', 'c', 'a', 't', 'g', 'g', 'c', 't', 'g', 'c', 'c',
       'g', 'c', 't', 'a', 'c', 't', 't', 'c', 'a', 'c', 'g', 'a', 'c',
       'c', 't', 'g', 'a', 'c', 'g', 'a', 'a', 'c', 'g', 'a', 'a', 'g',
       'g', 'c', 'a', 't', 'g', 'g', 'c', 't', 't', 't', 'c', 'g

In [92]:
sequence = "".join(char_Array)
sequence

'atgtctatacagaacgaaatgcctggttacaacgaaatgaaccagtatctgaaccaacaagggacgggtctgacccctgctgagatgcatggtttaatcagtgggatgatatgtggcggtaacgatgacagctcatggctgccgctacttcacgacctgacgaacgaaggcatggctttcggtcatgagctggcacaggcactgcgcaaaatgcactctgccaccagcgatgccctgcaggatgacggcttcctttttcagctttatctacctgatggcgatgatgtcagcgttttcgatcgggctgatgcgctggctggttgggtcaatcacttcctgcttggtcttggcgttacgcaaccgaagctggacaaagtgaccggcgaaaccggtgaagccatcgacgatctgcgtaacatcgcgcagttgggttacgacgaagacgaagatcaggaagagcttgaaatgtcgcttgaagagatcatcgagtacgtccgtgttgccgcgctgttatgccacgacacctttactcatccgcaaccgaccgcgccagaagtacaaaaaccgactctacactaa'

In [95]:
# build a Fasta-formatted string 
fasta_query = f">{gene_id}\n{sequence}\n"

In [96]:
fasta_query

'>00001_FAHFDEJI_02221\natgtctatacagaacgaaatgcctggttacaacgaaatgaaccagtatctgaaccaacaagggacgggtctgacccctgctgagatgcatggtttaatcagtgggatgatatgtggcggtaacgatgacagctcatggctgccgctacttcacgacctgacgaacgaaggcatggctttcggtcatgagctggcacaggcactgcgcaaaatgcactctgccaccagcgatgccctgcaggatgacggcttcctttttcagctttatctacctgatggcgatgatgtcagcgttttcgatcgggctgatgcgctggctggttgggtcaatcacttcctgcttggtcttggcgttacgcaaccgaagctggacaaagtgaccggcgaaaccggtgaagccatcgacgatctgcgtaacatcgcgcagttgggttacgacgaagacgaagatcaggaagagcttgaaatgtcgcttgaagagatcatcgagtacgtccgtgttgccgcgctgttatgccacgacacctttactcatccgcaaccgaccgcgccagaagtacaaaaaccgactctacactaa\n'

### Step 2: Submit your FASTA to NCBI’s remote BLAST


In [1]:
from Bio.Blast import NCBIWWW, NCBIXML 

print("Sending to NCBI Blast this can take 10-30 s")
result_handle = NCBIWWW.qblast(
    program = "blastn",
    database = "nt",
    sequence=fasta_query,
    format_type="XML",
    hitlist_size = 5 # return top 5 hits 
)


Sending to NCBI Blast this can take 10-30 s


NameError: name 'fasta_query' is not defined