# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import os
import geopandas as gpd
from Bio import Entrez
import time
from tqdm import tqdm
import requests
from Bio import Entrez
import gzip
import subprocess 
from scipy.spatial.distance import cdist
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Read the data

In [2]:
# Read the data 
df = pd.read_csv("/home/chandru/binp37/results/metasub/processed_metasub.csv")
df.head()

  df = pd.read_csv("/home/chandru/binp37/results/metasub/processed_metasub.csv")


Unnamed: 0,uuid,metasub_name,core_project,project,city,city_code,latitude,longitude,surface_material,control_type,...,cyanobacterium endosymbiont of Epithemia turgida,endosymbiont 'TC1' of Trimyema compressum,endosymbiont of Acanthamoeba sp. UWC8,endosymbiont of unidentified scaly snail isolate Monju,gamma proteobacterium HdN1,halophilic archaeon DL31,halophilic archaeon True-ADL,secondary endosymbiont of Ctenarytaina eucalypti,secondary endosymbiont of Heteropsylla cubana,uncultured crAssphage
0,haib17CEM4890_H75CGCCXY_SL263639,CSD16-HAM-001,core,CSD16,hamilton,HAM,-37.78333,175.28333,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,haib17CEM4890_H75CGCCXY_SL263651,CSD16-HAM-006,core,CSD16,hamilton,HAM,-37.78333,175.28333,,,...,0.0,0.0,0.0,1e-05,0.0,0.0,0.0,0.0,0.0,0.0
2,haib17CEM4890_H75CGCCXY_SL263663,CSD16-HAM-008,core,CSD16,hamilton,HAM,-37.78333,175.28333,,,...,0.0,0.0,0.0,2e-05,0.0,0.0,0.0,0.0,0.0,0.0
3,haib17CEM4890_H75CGCCXY_SL263675,CSD16-HAM-012,core,CSD16,hamilton,HAM,-37.78333,175.28333,,,...,0.0,0.0,0.0,2e-05,0.0,0.0,0.0,0.0,0.0,0.0
4,haib17CEM4890_H75CGCCXY_SL263687,CSD16-HAM-015,core,CSD16,hamilton,HAM,-37.78333,175.28333,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
rfe_df = pd.read_csv("/home/chandru/binp37/results/metasub/metasub_training_testing_data.csv")
rfe_df.head()

Unnamed: 0,Acidovorax ebreus,Acidovorax sp. JS42,Acidovorax sp. KKS102,Acinetobacter baumannii,Acinetobacter haemolyticus,Acinetobacter johnsonii,Acinetobacter junii,Acinetobacter pittii,Acinetobacter schindleri,Acinetobacter sp. LoGeW2-3,...,Variovorax boronicumulans,Variovorax paradoxus,Variovorax sp. PAMC 28711,Veillonella parvula,Weissella cibaria,Xanthomonas campestris,continent,city,latitude,longitude
0,0.0,0.0,0.00023,0.00015,0.0,6e-05,1e-05,7e-05,0.0001,5e-05,...,0.00031,0.00075,0.00021,0.0,0.0,0.0048,oceania,hamilton,-37.78333,175.28333
1,0.0,1e-05,3e-05,0.00028,0.00016,0.00142,0.00017,0.00013,0.00262,0.0014,...,0.00013,0.00024,3e-05,0.0,0.0,0.00091,oceania,hamilton,-37.78333,175.28333
2,3e-05,0.0,0.00011,0.00181,0.0006,0.00274,0.0003,0.0011,0.00191,0.00132,...,0.0001,0.00025,1e-05,0.0,0.0,0.00208,oceania,hamilton,-37.78333,175.28333
3,0.0,0.0,0.0,2e-05,1e-05,3e-05,0.0,0.0,3e-05,1e-05,...,3e-05,2e-05,0.0,0.0,0.0,0.00137,oceania,hamilton,-37.78333,175.28333
4,0.0,0.0,0.0,3e-05,0.0,0.0,0.0,2e-05,9e-05,1e-05,...,4e-05,8e-05,3e-05,0.0,0.0,0.00397,oceania,hamilton,-37.78333,175.28333


# Geograpical features

In [4]:
feature_data = df[['city_total_population','city_population_density',
                  'city_land_area_km2','city_ave_june_temp_c','city_elevation_meters','city_koppen_climate','continent','city','latitude','longitude']]

# Fix city elevation of hanoi, yamaguchi in meters
feature_data.loc[feature_data['city'] == 'hanoi','city_elevation_meters'] = 12
feature_data.loc[feature_data['city'] == 'yamaguchi','city_elevation_meters'] = 23
feature_data.loc[feature_data['city'] == 'marseille','city_elevation_meters'] = 42 # city elevation of marseille on google is 42 m here it is 0

# Get city population density, city ladn ares in km2, city avg temp in june and city elevation in meters of offa 
offa_data = {
    'city_population_density': 2500.0,
    'city_land_area_km2': 74.0,
    'city_ave_june_temp_c': 28.0,
    'city_elevation_meters': 457.0
}

feature_data.loc[feature_data['city'] == 'offa', list(offa_data.keys())] = list(offa_data.values())

# Get city land area in km2 of marseille  
feature_data.loc[feature_data['city'] == 'marseille','city_land_area_km2'] = 240

# Fix all the nan values of london
london_data = {
    'city_total_population': 8787892.0,
    'city_population_density': 5590.0,
    'city_land_area_km2': 1572.0,
    'city_ave_june_temp_c': 14.4,
    'city_elevation_meters': 11.0,
    'city_koppen_climate': 'marine_west_coast_climate'
}
feature_data.loc[feature_data['city'] == 'london', list(london_data.keys())] = list(london_data.values())


feature_data.head()

Unnamed: 0,city_total_population,city_population_density,city_land_area_km2,city_ave_june_temp_c,city_elevation_meters,city_koppen_climate,continent,city,latitude,longitude
0,203100.0,230.0,877.0,17.8,95.0,marine_west_coast_climate,oceania,hamilton,-37.78333,175.28333
1,203100.0,230.0,877.0,17.8,95.0,marine_west_coast_climate,oceania,hamilton,-37.78333,175.28333
2,203100.0,230.0,877.0,17.8,95.0,marine_west_coast_climate,oceania,hamilton,-37.78333,175.28333
3,203100.0,230.0,877.0,17.8,95.0,marine_west_coast_climate,oceania,hamilton,-37.78333,175.28333
4,203100.0,230.0,877.0,17.8,95.0,marine_west_coast_climate,oceania,hamilton,-37.78333,175.28333


## Scaling the features

In [5]:
# Check for skewness in the data before appling long transformer -> 
# Note to self: The city_land_area_km2 is right skewed, so we will go with log scale transformation
#             : The city_elevation_meters is multi modal there we will go with QuantileTransformer


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, QuantileTransformer, OneHotEncoder
import numpy as np
import pandas as pd

# Step 1: Define input columns
#log_cols = ['city_land_area_km2']
#quantile_cols = ['city_elevation_meters']
scale_cols = ['city_total_population', 'city_ave_june_temp_c']
cat_cols = ['city_koppen_climate']

# Step 2: Log-transform function
#def safe_log1p(x):
#    return np.log1p(np.maximum(x, 0))

# Step 3: Create log pipeline
#log_pipeline = Pipeline([
#    ('log', FunctionTransformer(safe_log1p)),
#    ('scale', StandardScaler())
#])

# Step 4: Build the ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
#    ('log', log_pipeline, log_cols),
#    ('quantile', QuantileTransformer(output_distribution='normal'), quantile_cols),
    ('scale', StandardScaler(), scale_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# Step 5: Fit and transform
geo_features_processed = preprocessor.fit_transform(feature_data)

# Step 6: Extract column names correctly
output_feature_names = []

for name, transformer, cols in preprocessor.transformers_:
    if name == 'cat':
        # For OneHotEncoder
        encoder = transformer
        if isinstance(encoder, Pipeline):
            encoder = encoder.named_steps['onehot']
        cats = encoder.categories_[0]
        output_feature_names.extend([f"{cols[0]}_{cat}" for cat in cats])
    else:
        output_feature_names.extend(cols)

# Step 7: Convert to DataFrame
geo_features_df = pd.DataFrame(geo_features_processed)

# Step 8: Merge with main features (RFE-selected ones)
final_df = pd.concat([rfe_df, geo_features_df], axis=1)
final_df.to_csv("/home/chandru/binp37/results/metasub/metasub_geo_training_testing.csv", index=False)

print("Final dataset shape:", final_df.shape)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



Final dataset shape: (4070, 205)


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd

# Step 1: Select your input columns
scale_cols = ['city_ave_june_temp_c']
#cat_cols = ['city_koppen_climate']

# Step 2: Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('scale', StandardScaler(), scale_cols),
#        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

# Step 3: Fit and transform the geo feature data
geo_features_processed = preprocessor.fit_transform(feature_data)

# Step 4: Get feature names
feature_names = []

# Handle scaled columns
feature_names.extend(scale_cols)

# Handle one-hot columns
#ohe = preprocessor.named_transformers_['cat']
#cat_feature_names = ohe.get_feature_names_out(cat_cols)
#feature_names.extend(cat_feature_names)

# Step 5: Convert to DataFrame
geo_features_df = pd.DataFrame(geo_features_processed.toarray() if hasattr(geo_features_processed, 'toarray') else geo_features_processed)

# Step 6: Merge with selected features and save
final_df = pd.concat([rfe_df.reset_index(drop=True), geo_features_df.reset_index(drop=True)], axis=1)
#final_df.to_csv("/home/chandru/binp37/results/metasub/metasub_geo_training_testing.csv", index=False)

print("Final dataset shape:", final_df.shape)


Final dataset shape: (4070, 205)


In [8]:
df = pd.read_csv("/home/chandru/binp37/results/metasub/metasub_geo_training_testing.csv")
df.head()

Unnamed: 0,Acidovorax ebreus,Acidovorax sp. JS42,Acidovorax sp. KKS102,Acinetobacter baumannii,Acinetobacter haemolyticus,Acinetobacter johnsonii,Acinetobacter junii,Acinetobacter pittii,Acinetobacter schindleri,Acinetobacter sp. LoGeW2-3,...,Variovorax paradoxus,Variovorax sp. PAMC 28711,Veillonella parvula,Weissella cibaria,Xanthomonas campestris,continent,city,latitude,longitude,0
0,0.0,0.0,0.00023,0.00015,0.0,6e-05,1e-05,7e-05,0.0001,5e-05,...,0.00075,0.00021,0.0,0.0,0.0048,oceania,hamilton,-37.78333,175.28333,<Compressed Sparse Row sparse matrix of dtype ...
1,0.0,1e-05,3e-05,0.00028,0.00016,0.00142,0.00017,0.00013,0.00262,0.0014,...,0.00024,3e-05,0.0,0.0,0.00091,oceania,hamilton,-37.78333,175.28333,<Compressed Sparse Row sparse matrix of dtype ...
2,3e-05,0.0,0.00011,0.00181,0.0006,0.00274,0.0003,0.0011,0.00191,0.00132,...,0.00025,1e-05,0.0,0.0,0.00208,oceania,hamilton,-37.78333,175.28333,<Compressed Sparse Row sparse matrix of dtype ...
3,0.0,0.0,0.0,2e-05,1e-05,3e-05,0.0,0.0,3e-05,1e-05,...,2e-05,0.0,0.0,0.0,0.00137,oceania,hamilton,-37.78333,175.28333,<Compressed Sparse Row sparse matrix of dtype ...
4,0.0,0.0,0.0,3e-05,0.0,0.0,0.0,2e-05,9e-05,1e-05,...,8e-05,3e-05,0.0,0.0,0.00397,oceania,hamilton,-37.78333,175.28333,<Compressed Sparse Row sparse matrix of dtype ...


In [9]:
df.columns

Index(['Acidovorax ebreus', 'Acidovorax sp. JS42', 'Acidovorax sp. KKS102',
       'Acinetobacter baumannii', 'Acinetobacter haemolyticus',
       'Acinetobacter johnsonii', 'Acinetobacter junii',
       'Acinetobacter pittii', 'Acinetobacter schindleri',
       'Acinetobacter sp. LoGeW2-3',
       ...
       'Variovorax paradoxus', 'Variovorax sp. PAMC 28711',
       'Veillonella parvula', 'Weissella cibaria', 'Xanthomonas campestris',
       'continent', 'city', 'latitude', 'longitude', '0'],
      dtype='object', length=205)

# Microbiome features

In [10]:
# We can get the raw sequence of all these top hundered species and get a phylogenetic tree to determine the relationship between species.
# We can then use the information as well as a feature to predict the lat and long.

microbe_data = rfe_df.iloc[:,:-4]
microbe_data

Unnamed: 0,Acidovorax ebreus,Acidovorax sp. JS42,Acidovorax sp. KKS102,Acinetobacter baumannii,Acinetobacter haemolyticus,Acinetobacter johnsonii,Acinetobacter junii,Acinetobacter pittii,Acinetobacter schindleri,Acinetobacter sp. LoGeW2-3,...,Thermothelomyces thermophila,Thielavia terrestris,Truepera radiovictrix,Tsukamurella sp. MH1,Variovorax boronicumulans,Variovorax paradoxus,Variovorax sp. PAMC 28711,Veillonella parvula,Weissella cibaria,Xanthomonas campestris
0,0.00000,0.00000,0.00023,0.00015,0.00000,0.00006,0.00001,0.00007,0.00010,0.00005,...,0.00000,0.00000,0.00000,0.00000,0.00031,0.00075,0.00021,0.00000,0.00000,0.00480
1,0.00000,0.00001,0.00003,0.00028,0.00016,0.00142,0.00017,0.00013,0.00262,0.00140,...,0.00001,0.00001,0.00000,0.00000,0.00013,0.00024,0.00003,0.00000,0.00000,0.00091
2,0.00003,0.00000,0.00011,0.00181,0.00060,0.00274,0.00030,0.00110,0.00191,0.00132,...,0.00000,0.00002,0.00000,0.00000,0.00010,0.00025,0.00001,0.00000,0.00000,0.00208
3,0.00000,0.00000,0.00000,0.00002,0.00001,0.00003,0.00000,0.00000,0.00003,0.00001,...,0.00001,0.00000,0.00000,0.00000,0.00003,0.00002,0.00000,0.00000,0.00000,0.00137
4,0.00000,0.00000,0.00000,0.00003,0.00000,0.00000,0.00000,0.00002,0.00009,0.00001,...,0.00000,0.00000,0.00000,0.00000,0.00004,0.00008,0.00003,0.00000,0.00000,0.00397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4065,0.00044,0.00052,0.00019,0.00072,0.00015,0.01410,0.00036,0.00046,0.00052,0.00024,...,0.00091,0.00124,0.00009,0.00000,0.00037,0.00107,0.00016,0.00042,0.00013,0.00132
4066,0.00000,0.00000,0.00022,0.00014,0.00000,0.00019,0.00000,0.00000,0.00000,0.00000,...,0.01762,0.00703,0.00000,0.00105,0.00147,0.00350,0.00113,0.00012,0.00002,0.00079
4067,0.00003,0.00002,0.00002,0.00018,0.00009,0.00055,0.00006,0.00019,0.00009,0.00001,...,0.00023,0.00031,0.00002,0.00006,0.00005,0.00016,0.00011,0.00057,0.00001,0.00043
4068,0.00026,0.00038,0.00051,0.00009,0.00000,0.00025,0.00004,0.00010,0.00006,0.00000,...,0.00443,0.00535,0.00000,0.00186,0.00128,0.00304,0.00126,0.00013,0.00008,0.00106


## Phylogenetic Trees

In [11]:
species_list = []
for name in microbe_data.columns:
    species_list.append(name)
    

tax_df = pd.read_csv("/home/chandru/binp37/results/metasub/taxonomic_info.csv")
lin_df = tax_df[tax_df['Species'].isin(species_list)].dropna(axis=1,how='all')
lin_df = lin_df.dropna(subset=lin_df.columns[1:7]).iloc[:,:7]
lin_df.head()

Unnamed: 0,Species,Rank_1,Rank_2,Rank_3,Rank_4,Rank_5,Rank_6
36,Acidovorax ebreus,cellular organisms,Bacteria,Pseudomonadati,Pseudomonadota,Betaproteobacteria,Burkholderiales
37,Acidovorax sp. JS42,cellular organisms,Bacteria,Pseudomonadati,Pseudomonadota,Betaproteobacteria,Burkholderiales
38,Acidovorax sp. KKS102,cellular organisms,Bacteria,Pseudomonadati,Pseudomonadota,Betaproteobacteria,Burkholderiales
44,Acinetobacter baumannii,cellular organisms,Bacteria,Pseudomonadati,Pseudomonadota,Gammaproteobacteria,Moraxellales
48,Acinetobacter haemolyticus,cellular organisms,Bacteria,Pseudomonadati,Pseudomonadota,Gammaproteobacteria,Moraxellales


In [12]:
print(np.unique(lin_df['Rank_1'],return_counts=True)[0],np.unique(lin_df['Rank_1'],return_counts=True)[1])

['Viruses' 'cellular organisms'] [  3 193]


In [20]:
# Print the unique names in each rank
for rank in lin_df.columns[1:]:
    unique_names = lin_df[rank].unique()
    print(f"Unique names in {rank}: {len(unique_names)}")
    print(unique_names)
    print("\n")

Unique names in Rank_1: 2
['cellular organisms' 'Viruses']


Unique names in Rank_2: 5
['Bacteria' 'Eukaryota' 'Duplodnaviria' 'Varidnaviria' 'Archaea']


Unique names in Rank_3: 8
['Pseudomonadati' 'Bacillati' 'Opisthokonta' 'Thermotogati'
 'Heunggongvirae' 'Bamfordvirae' 'Discoba' 'Methanobacteriati']


Unique names in Rank_4: 11
['Pseudomonadota' 'Actinomycetota' 'Bacillota'
 'Cyanobacteriota/Melainabacteria group' 'Fungi' 'FCB group'
 'Deinococcota' 'Uroviricota' 'Preplasmiviricota' 'Euglenozoa'
 'Methanobacteriota']


Unique names in Rank_5: 15
['Betaproteobacteria' 'Gammaproteobacteria' 'Actinomycetes' 'Bacilli'
 'Alphaproteobacteria' 'Cyanobacteriota' 'Dikarya'
 'Bacteroidota/Chlorobiota group' 'Deinococci' 'Caudoviricetes'
 'Tectiliviricetes' 'Kinetoplastea' 'Methanomada group' 'Rubrobacteria'
 'Negativicutes']


Unique names in Rank_6: 33
['Burkholderiales' 'Moraxellales' 'Actinomycetales' 'Aeromonadales'
 'Alteromonadales' 'Micrococcales' 'Bacillales' 'Hyphomicrobiales'
 'Cau

In [None]:

Entrez.email = "1ms19bt011@gmail.com" # Remember to set your actual email

def download_genome(species, output_dir="genomes"):
    """
    Downloads the complete genome for a given species from NCBI RefSeq,
    handling both FTP and HTTP URLs.

    Args:
        species (str): The scientific name of the species (e.g., "Escherichia coli").
        output_dir (str): The directory where the genome file will be saved.

    Returns:
        bool: True if the genome was successfully downloaded and decompressed, False otherwise.
    """
    os.makedirs(output_dir, exist_ok=True)

    search_terms = [
        f'"{species}"[Organism] AND "complete genome"[Assembly Level]',
        f'"{species}"[Organism] AND "reference genome"[Refseq Category]',
        f'"{species}"[Organism] AND latest[filter]',
        f'"{species}"[Organism]' # Broadest term as a last resort
    ]

    for term_index, term in enumerate(search_terms):
        print(f"Searching for '{species}' with term: '{term}'")
        try:
            # Search for latest RefSeq assembly
            handle = Entrez.esearch(db="assembly", term=term, retmax=1)
            record = Entrez.read(handle)
            handle.close() # Always close the handle

            if record["IdList"]:
                assembly_id = record["IdList"][0]
                print(f"Found assembly ID: {assembly_id} for {species}")

                # Fetch summary to get FTP path
                summary_handle = Entrez.esummary(db="assembly", id=assembly_id)
                doc = Entrez.read(summary_handle)
                summary_handle.close() # Always close the handle

                ftp_path = doc["DocumentSummarySet"]["DocumentSummary"][0]["FtpPath_RefSeq"]
                if ftp_path:
                    filename_stem = ftp_path.split("/")[-1]
                    fasta_url = f"{ftp_path}/{filename_stem}_genomic.fna.gz"
                    output_gz_path = os.path.join(output_dir, f"{species.replace(' ', '_')}.fna.gz")
                    output_fna_path = os.path.join(output_dir, f"{species.replace(' ', '_')}.fna")

                    print(f"Attempting to download from: {fasta_url}")

                    try:
                        if fasta_url.startswith("ftp://"):
                            # Use wget for FTP paths
                            print(f"Using wget for FTP download: {fasta_url}")
                            # -q for quiet, -O for output file, --show-progress for progress bar
                            # --no-verbose for cleaner output
                            # Use subprocess.run for better control and error handling than os.system
                            result = subprocess.run(
                                ["wget", "--no-verbose", "--show-progress", "-O", output_gz_path, fasta_url],
                                check=True, # Raise CalledProcessError if wget returns non-zero exit code
                                capture_output=True, # Capture stdout/stderr for debugging if needed
                                text=True # Decode stdout/stderr as text
                            )
                            # print(result.stdout) # Uncomment for detailed wget output
                            # print(result.stderr) # Uncomment for detailed wget output
                            print(f"Downloaded {species} to {output_gz_path} using wget.")
                        else:
                            # Use requests for HTTP/HTTPS paths
                            print(f"Using requests for HTTP/HTTPS download: {fasta_url}")
                            response = requests.get(fasta_url, stream=True)
                            response.raise_for_status() # Raise an exception for HTTP errors

                            total_size_in_bytes = int(response.headers.get('content-length', 0))
                            block_size = 1024 # 1 Kibibyte
                            progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True, desc=f"Downloading {species}")

                            with open(output_gz_path, 'wb') as f:
                                for chunk in response.iter_content(chunk_size=block_size):
                                    progress_bar.update(len(chunk))
                                    f.write(chunk)
                            progress_bar.close()

                            if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
                                print("ERROR, something went wrong during download!")
                                return False
                            print(f"Downloaded {species} to {output_gz_path} using requests.")

                        # Decompress the file, regardless of how it was downloaded
                        print(f"Decompressing {output_gz_path}...")
                        with gzip.open(output_gz_path, 'rb') as f_in:
                            with open(output_fna_path, 'wb') as f_out:
                                f_out.write(f_in.read())
                        os.remove(output_gz_path) # Remove the compressed file
                        print(f"Decompressed to {output_fna_path}")
                        return True
                    except subprocess.CalledProcessError as sub_e:
                        print(f"wget failed for {species} from {fasta_url}: {sub_e}")
                        print(f"wget stdout: {sub_e.stdout}")
                        print(f"wget stderr: {sub_e.stderr}")
                        continue # Try next search term
                    except requests.exceptions.RequestException as req_e:
                        print(f"Download failed for {species} from {fasta_url}: {req_e}")
                        continue # Try next search term
                    except Exception as download_e:
                        print(f"An unexpected error occurred during download/decompression for {species}: {download_e}")
                        continue # Try next search term
                else:
                    print(f"No FTP path found for {species} with term '{term}'. Trying next search term.")
            else:
                print(f"No assembly found for {species} with term '{term}'. Trying next search term.")
            time.sleep(1) # Small delay between Entrez calls to be polite
        except Exception as e:
            print(f"Error during Entrez search for {species} with term '{term}': {e}")
            time.sleep(2) # Longer delay if Entrez call itself fails
    print(f"Failed to download genome for {species} after trying all search terms.")
    return False


output_directory = "genomes"
os.makedirs(output_directory, exist_ok=True)

# Process each species in the list
print("\nStarting genome download process...")
for species in tqdm(filtered_species_list[:], desc="Overall Genome Download Progress"):
    print(f"\nProcessing species: {species}")
    success = download_genome(species, output_directory)
    if not success:
        print(f"Could not download genome for {species}. Please check the species name or try again later.")
    time.sleep(2) # Respect NCBI rate limits between species

## Clutering using K-means

In [None]:
# Elbow method to determine ideal cluster size -> Note I am getting the cut iff to be 15.
# Calculate inertia for k=1 to 50
inertias = []
for k in range(1, 50):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(np.array(microbe_data))
    inertias.append(kmeans.inertia_)

# Plot Elbow Curve
plt.plot(range(1, 50), inertias, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.show()

In [None]:
kmeans = KMeans(n_clusters=15, random_state=42, n_init="auto").fit(np.array(microbe_data))
kmeans.cluster_centers_


centorid_distances = cdist(np.array(microbe_data),kmeans.cluster_centers_,"euclidean")
closet_indices = np.argmin(centorid_distances,axis=0)

augment_data = pd.concat([microbe_data,pd.DataFrame(centorid_distances)],axis=1)
augment_data