# Metasub data mGPS algorithm - 31/03/2025

Here I am just trying to get the data pre-processing steps right. The idea for doing this is to get the dataset in the right format for easier analysis using neural networks.

## Pre-processing the data

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Import datasets
os.chdir("/home/inf-21-2024/binp37/")
# Read the metadata for the metasub data.
complete_meta = pd.read_csv("./data/metasub/complete_metadata.csv")
taxa_abund = pd.read_csv("./data/metasub/metasub_taxa_abundance.csv")
taxa_abund = taxa_abund.drop_duplicates(subset=['uuid'])

In [3]:
# Merge the bacterial and metadata
metasub_data = pd.merge(complete_meta,taxa_abund,on='uuid')
metasub_data.shape

(4288, 3711)

In [4]:
# Remove control samples
control_cities = {'control','other_control','neg_control','other','pos_control'}
control_types = {'ctrl cities','negative_control','positive_control'}

mask = metasub_data['city'].isin(control_cities) | metasub_data['control_type'].isin(control_types)
metasub_data = metasub_data[~mask].copy()
metasub_data.shape

(4157, 3711)

In [5]:
#Re-label london boroughs
metasub_data.loc[metasub_data['city'].isin(['kensington','islington']),'city'] = 'london'
metasub_data.shape

(4157, 3711)

In [6]:
# Remove sparse sample locations and doubtful samples
city_counts = metasub_data['city'].value_counts()
small_cities = city_counts[city_counts<8].index.tolist()
remove_samples = metasub_data['city'].isin(['antarctica']+small_cities)
metasub_data = metasub_data[~remove_samples]
metasub_data.shape

(4070, 3711)

In [7]:
# Correct the identified mislabeling of data
kyiv_filter = metasub_data['city'] == 'kyiv'
metasub_data.loc[kyiv_filter,'latitude'] = metasub_data.loc[kyiv_filter,'city_latitude'] # Set all the latitude to the city_latitude
metasub_data.loc[kyiv_filter,'longitude'] = metasub_data.loc[kyiv_filter,'city_longitude'] # Set all the latitude to the city_longitutde

porto_filter = metasub_data['city'] == 'porto'
metasub_data.loc[porto_filter,'city'] = "europe"
metasub_data.shape

(4070, 3711)

In [8]:
# Fill missing latitude and longitude values with city-level data
missing_lat = metasub_data["latitude"].isna()
missing_lon = metasub_data["longitude"].isna()
metasub_data.loc[missing_lat, "latitude"] = metasub_data.loc[missing_lat, "city_latitude"]
metasub_data.loc[missing_lon, "longitude"] = metasub_data.loc[missing_lon, "city_longitude"]
metasub_data.shape

(4070, 3711)

In [9]:
# Correction for incorrect London co-ordinates
london_filter = metasub_data['city'] == 'london'
metasub_data.loc[london_filter,'city_latitude'] = 51.50853
metasub_data.loc[london_filter,'city_longitude'] = -0.12574
metasub_data.shape

(4070, 3711)

In [96]:
metasub_data.iloc[:,:42].columns

Index(['uuid', 'metasub_name', 'core_project', 'project', 'city', 'city_code',
       'latitude', 'longitude', 'surface_material', 'control_type',
       'elevation', 'line', 'station', 'surface', 'temperature', 'traffic',
       'setting', 'num_reads', 'library_post_PCR_Qubit',
       'library_QC_concentration', 'city_latitude', 'city_longitude',
       'coastal_city', 'city_total_population', 'city_population_density',
       'city_land_area_km2', 'city_ave_june_temp_c', 'city_elevation_meters',
       'continent', 'city_koppen_climate', 'barcode', 'ha_id',
       'hudson_alpha_flowcell', 'hudson_alpha_project', 'index_sequence',
       'location_type', 'hudson_alpha_uid', 'other_project_uid',
       'plate_number', 'plate_pos', 'sample_type', 'sl_name'],
      dtype='object')

## Recursive Feature Elimination

Here we use RFE as a part of a pipeline to get the best set of parameters for fitting the deep learning model. We need a suitable set of parameters for that are infromative enough to apply a deep learning model. These are called as geographically informative taxa (GITs).

In [109]:
# Importing libraries
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from joblib import Parallel, delayed
from sklearn.pipeline import make_pipeline
import multiprocessing
import time
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold

*Explanation of the Code*

This function is designed to perform feature selection by:

1. **Removing Highly Correlated Features:**
   - Computes a correlation matrix of the features.
   - Identifies features with correlation greater than 0.98 and removes them.

2. **Recursive Feature Elimination (RFE) for Feature Selection:**
   - Uses a Random Forest Classifier to determine feature importance.
   - Applies RFE (Recursive Feature Elimination) to iteratively remove the least important features.
   - The number of features to keep is determined based on predefined subset sizes.

3. **Parallel Processing Support:**  
   Uses multiple CPU cores when specified for efficiency.


In [74]:
# Feature selection algorithm
def species_select(X,y,remove_correlated=True,subsets=None,cores=1):
    """
    Feature selection algorithm for species classification.

    Parameters:
    X (pd.DataFrame): Feature matrix
    y (pd.Series): Target variable
    remove_correlated (bool): Whether we need to remoce highly correlated variables. (default is set to True)
    subsets (list): List of feature subset sizes to evaluate. If None, it is determined automatically.
    cores (int): Number of CPU cores to use for parallel computation. (default is set to 1)
    
    Returns:
    RFE object: Trained Recursive Feature Elimination (RFE) model.
    """

    start_time = time.time()  # Track execution time

    # Set parallel processing
    num_cores = multiprocessing.cpu_count() if cores > 1 else 1
    print(f"Using {num_cores} CPU cores for computation.")

    if remove_correlated:
        # Compute correlation matrix
        print("Calculating correlation matrix...")
        corr_matrix = X.corr()
        upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        # Identify correlated features (above 0.98)
        correlated_features = [column for column in upper_tri.columns if any(upper_tri[column] > 0.98)]

        # Drop correlated features
        X = X.drop(columns=correlated_features)
        print(f"Correlated feature removed: {len(correlated_features)}")

    # Determine default subset sizes if not provided
    num_features = X.shape[1]
    if subsets is None:
        subsets = [num_features // 2, num_features // 4, num_features // 8, num_features // 16, num_features // 32, num_features // 64]
        subsets = [s for s in subsets  if s > 0] # Remove non-positive values

    print(f"Feature selection subsets: {subsets}")

    # Define model (Random Forrest for fearure ranking)
    model = RandomForestClassifier(n_jobs=num_cores, random_state=123)
    print("Initialized RandomForestClassifier.")

    # Recursive Feature Elimination (RFE)
    for subset in subsets:
        print(f"\nStarting RFE with {subset} features....")
        start_rfe = time.time()
        rfe = RFE(estimator=model, n_features_to_select=min(subsets),step=20)
        # Fit RFE to the data
        rfe.fit(X,y)
        print(f"Completed RFE with {subset} features in {time.time() - start_rfe:.2f} seconds.")

    total_time = time.time() - start_time
    print(f"\nFeature selection completed in {total_time:.2f} seconds.")
    return rfe

In [None]:
featureElim = species_select(X=metasub_data.iloc[:,42:500],
                             y=metasub_data['city'],
                             remove_correlated=False,
                             subsets=[50,100,200,500],
                             cores=12)

### What You Can Do If It's Still Too Slow

1. **Reduce Feature Set Before Running RFE:**
    - If remove_correlated=False, you still have 3711 features.
    - Try using only top 500-1000 features based on variance or importance.

2. **Increase Step Size in RFE:**
    - Default step=1 removes one feature per iteration, which is slow.
    - Try step=5 or step=10 to remove multiple features at once:
    - rfe = RFE(estimatoe=model, n_features_to_select=subset, step=5)
3. **Use XGBoost Instead of Random Forest:**
    - model = XGBClassifier(n_jobs=num_cores, random_state=123)
4. **Sequential Feature Selector:**
    - from sklearn.feature_selection import SequentialFeatureSelector
    sfs = SequentialFeatureSelector(model, n_features_to_select=100, direction='backward', n_jobs=num_cores)
    sfs.fit(X, y)


In [116]:
# I can use ray for parallel processing. I just need to run this once to get the best number of features to select. So I can think about it whether I really need to use ray.
X = metasub_data.iloc[:,42:]
y= metasub_data['city']
model = RandomForestClassifier(n_jobs=12, random_state=123)
pipe = make_pipeline(RFE(estimator=model,step=20))
parameters= {"rfe__n_features_to_select":[50,100,200,300,500,1500]}
print(f"\nStartinf RFE with subsets features ...")

n_features_options = parameters['rfe__n_features_to_select']
cv = 10
total_iterations = len(n_features_options)*cv

results = []
start_time = time.time()

with tqdm(total=total_iterations, desc='GridSearchCV + RFE') as pbar:
    for n_features in n_features_options:
        pipe.set_params(rfe__n_features_to_select=n_features)
        fold_scores = []
        for fold in range(cv):
            # Manually perfrom cross-validation using StratifiedKFold
            skf = StratifiedKFold(n_splits=cv,shuffle=True,random_state=fold)
            splits = list(skf.split(X,y))
            train_index, test_index = splits[fold]
            X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            pipe.fit(X_train,y_train)
            score = pipe.score(X_test,y_test)
            fold_scores.append(score)
            pbar.update(1)
        
        mean_score = sum(fold_scores)/cv
        results.append((n_features,mean_score))

best_n_features, best_score = max(results, key=lambda x: x[1])

end_time = time.time()
elapsed_time = end_time - start_time

print('Best params: {"rfe__n_features_to_select":', best_n_features, "}")
print('Best accuracy:', best_score)
print(f"Total time taken: {elapsed_time:.2f} seconds")


Startinf RFE with subsets features ...


GridSearchCV + RFE:   0%|          | 0/60 [00:00<?, ?it/s]

GridSearchCV + RFE: 100%|██████████| 60/60 [3:40:13<00:00, 220.23s/it]

Best params: {"rfe__n_features_to_select": 200 }
Best accuracy: 0.8906633906633907
Total time taken: 13213.71 seconds





In [121]:
# Since I have lost the model, but I know that 200 features gives the best accuracy.
model_200 = RandomForestClassifier(n_jobs=24,random_state=123)
rfe = RFE(estimator=model_200,n_features_to_select=200,step=20)
rfe.fit(X,y)

selected_features = X.columns[rfe.support_]
print(selected_features)


# All the accuracy results from the previous runs
results_df = pd.DataFrame(results,columns=['n_vars','accuracy'])
results_df

Index(['Acidovorax ebreus', 'Acidovorax sp. JS42', 'Acidovorax sp. KKS102',
       'Acinetobacter baumannii', 'Acinetobacter haemolyticus',
       'Acinetobacter johnsonii', 'Acinetobacter junii',
       'Acinetobacter pittii', 'Acinetobacter schindleri',
       'Acinetobacter sp. LoGeW2-3',
       ...
       'Thermothelomyces thermophila', 'Thielavia terrestris',
       'Truepera radiovictrix', 'Tsukamurella sp. MH1',
       'Variovorax boronicumulans', 'Variovorax paradoxus',
       'Variovorax sp. PAMC 28711', 'Veillonella parvula', 'Weissella cibaria',
       'Xanthomonas campestris'],
      dtype='object', length=200)


Unnamed: 0,n_vars,accuracy
0,50,0.885504
1,100,0.889681
2,200,0.890663
3,300,0.886486
4,500,0.870762
5,1500,0.84914


In [131]:
nn_data = X.loc[:,selected_features]
nn_data = pd.concat([nn_data,metasub_data[['city','continent','latitude','longitude']]],axis=1)
nn_data

Unnamed: 0,Acidovorax ebreus,Acidovorax sp. JS42,Acidovorax sp. KKS102,Acinetobacter baumannii,Acinetobacter haemolyticus,Acinetobacter johnsonii,Acinetobacter junii,Acinetobacter pittii,Acinetobacter schindleri,Acinetobacter sp. LoGeW2-3,...,Variovorax boronicumulans,Variovorax paradoxus,Variovorax sp. PAMC 28711,Veillonella parvula,Weissella cibaria,Xanthomonas campestris,city,continent,latitude,longitude
0,0.00000,0.00000,0.00023,0.00015,0.00000,0.00006,0.00001,0.00007,0.00010,0.00005,...,0.00031,0.00075,0.00021,0.00000,0.00000,0.00480,hamilton,oceania,-37.78333,175.28333
1,0.00000,0.00001,0.00003,0.00028,0.00016,0.00142,0.00017,0.00013,0.00262,0.00140,...,0.00013,0.00024,0.00003,0.00000,0.00000,0.00091,hamilton,oceania,-37.78333,175.28333
2,0.00003,0.00000,0.00011,0.00181,0.00060,0.00274,0.00030,0.00110,0.00191,0.00132,...,0.00010,0.00025,0.00001,0.00000,0.00000,0.00208,hamilton,oceania,-37.78333,175.28333
3,0.00000,0.00000,0.00000,0.00002,0.00001,0.00003,0.00000,0.00000,0.00003,0.00001,...,0.00003,0.00002,0.00000,0.00000,0.00000,0.00137,hamilton,oceania,-37.78333,175.28333
4,0.00000,0.00000,0.00000,0.00003,0.00000,0.00000,0.00000,0.00002,0.00009,0.00001,...,0.00004,0.00008,0.00003,0.00000,0.00000,0.00397,hamilton,oceania,-37.78333,175.28333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4191,0.00044,0.00052,0.00019,0.00072,0.00015,0.01410,0.00036,0.00046,0.00052,0.00024,...,0.00037,0.00107,0.00016,0.00042,0.00013,0.00132,london,europe,51.50000,-0.20000
4192,0.00000,0.00000,0.00022,0.00014,0.00000,0.00019,0.00000,0.00000,0.00000,0.00000,...,0.00147,0.00350,0.00113,0.00012,0.00002,0.00079,london,europe,51.50000,-0.20000
4193,0.00003,0.00002,0.00002,0.00018,0.00009,0.00055,0.00006,0.00019,0.00009,0.00001,...,0.00005,0.00016,0.00011,0.00057,0.00001,0.00043,london,europe,51.50000,-0.20000
4194,0.00026,0.00038,0.00051,0.00009,0.00000,0.00025,0.00004,0.00010,0.00006,0.00000,...,0.00128,0.00304,0.00126,0.00013,0.00008,0.00106,london,europe,51.50000,-0.20000


## Initial Neural Networks Test - 02/04/2025