# Metasub data mGPS algorithm - 31/03/2025

This notebook works through the data pre-processing steps for the MetaSUB dataset. The goal is to get the dataset into the right format for easier analysis using neural networks.

## Pre-processing the data

In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the input tables; the relative paths below resolve against this working directory.
os.chdir("/home/chandru/binp37/")

# MetaSUB metadata table.
complete_meta = pd.read_csv("./data/metasub/complete_metadata.csv")

# Taxa abundance table, keeping only the first row seen for each uuid.
taxa_abund = pd.read_csv("./data/metasub/metasub_taxa_abundance.csv").drop_duplicates(subset=['uuid'])

In [14]:
# Join the metadata with the taxa abundances on the shared 'uuid' key.
# The default inner join keeps only uuids present in both tables.
metasub_data = complete_meta.merge(taxa_abund, on='uuid')
metasub_data.shape

(4288, 3711)

In [15]:
# Drop control samples: a row is a control if either its city label or its
# control_type marks it as one.
control_cities = {'control','other_control','neg_control','other','pos_control'}
control_types = {'ctrl cities','negative_control','positive_control'}

is_control_city = metasub_data['city'].isin(control_cities)
is_control_type = metasub_data['control_type'].isin(control_types)
mask = is_control_city | is_control_type

# Take an explicit copy so later in-place edits act on an independent frame.
metasub_data = metasub_data.loc[~mask].copy()
metasub_data.shape

(4157, 3711)

In [16]:
# Fold the London boroughs into a single 'london' city label.
london_boroughs = ['kensington','islington']
is_borough = metasub_data['city'].isin(london_boroughs)
metasub_data.loc[is_borough, 'city'] = 'london'
metasub_data.shape

(4157, 3711)

In [17]:
# Remove sparse sample locations and doubtful samples:
# cities with too few samples, plus everything labelled 'antarctica'.
MIN_SAMPLES_PER_CITY = 8  # cities with fewer samples than this are dropped

city_counts = metasub_data['city'].value_counts()
small_cities = city_counts[city_counts < MIN_SAMPLES_PER_CITY].index.tolist()
remove_samples = metasub_data['city'].isin(['antarctica'] + small_cities)

# Take an explicit copy (as the control-removal step does) so that the .loc
# assignments in the following cells write to an independent frame instead of
# a boolean-mask slice, which would raise SettingWithCopyWarning.
metasub_data = metasub_data[~remove_samples].copy()
metasub_data.shape

(4070, 3711)

In [20]:
# Correct the identified mislabelling: for Kyiv samples, overwrite the
# sample-level coordinates with the city-level coordinates.
kyiv_filter = metasub_data['city'] == 'kyiv'
for sample_col, city_col in (('latitude', 'city_latitude'), ('longitude', 'city_longitude')):
    # e.g. latitude <- city_latitude, restricted to the Kyiv rows
    metasub_data.loc[kyiv_filter, sample_col] = metasub_data.loc[kyiv_filter, city_col]

metasub_data.shape

(4070, 3711)

In [21]:
# Where a sample has no latitude/longitude of its own, fall back to the
# city-level coordinate for that row.

# Latitude: rows with a missing value take city_latitude.
missing_lat = metasub_data["latitude"].isna()
metasub_data.loc[missing_lat, "latitude"] = metasub_data["city_latitude"][missing_lat]

# Longitude: rows with a missing value take city_longitude.
missing_lon = metasub_data["longitude"].isna()
metasub_data.loc[missing_lon, "longitude"] = metasub_data["city_longitude"][missing_lon]

metasub_data.shape

(4070, 3711)

In [22]:
# Overwrite the city-level coordinates of every London row with corrected values.
# NOTE(review): this runs after the step that fills missing latitude/longitude
# from city_latitude/city_longitude, so any London rows that were filled there
# still carry the uncorrected values in latitude/longitude - confirm whether
# that is intended.
london_filter = metasub_data['city'] == 'london'
london_coords = {'city_latitude': 51.50853, 'city_longitude': -0.12574}
for coord_col, coord_value in london_coords.items():
    metasub_data.loc[london_filter, coord_col] = coord_value
metasub_data.shape

(4070, 3711)

In [3]:
# Load the two saved training/testing tables (original and "new") so their
# columns can be compared.
df, df_new = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/home/chandru/binp37/results/metasub/metasub_training_testing_data.csv",
        "/home/chandru/binp37/results/metasub/metasub_training_testing_data_new.csv",
    )
)

In [7]:
# Compare the column names of df and df_new.
# Build each column set once instead of re-creating it for every comparison.
df_columns = set(df.columns)
df_new_columns = set(df_new.columns)

# Find the common columns.
common_columns = df_columns & df_new_columns
# Sets have no stable iteration order (it can change between kernel runs), so
# print sorted lists to keep the output reproducible and easy to diff.
print(f"Common columns: {sorted(common_columns)}")

# Find the columns that appear in only one of the two DataFrames.
unique_df_columns = df_columns - df_new_columns
unique_df_new_columns = df_new_columns - df_columns
print(f"Unique columns in df: {sorted(unique_df_columns)}")
print(f"Unique columns in df_new: {sorted(unique_df_new_columns)}")

Common columns: {'Hymenobacter sp. PAMC 26554', 'Streptococcus suis', 'Brochothrix thermosphacta', 'Actinomyces radicidentis', 'Microbacterium sp. TPU 3598', 'Pantoea vagans', 'Pluralibacter gergoviae', 'Truepera radiovictrix', 'Corynebacterium efficiens', 'Stenotrophomonas rhizophila', 'Propionibacterium sp. oral taxon 193', 'Bacillus megaterium', 'Carnobacterium inhibens', 'Rhodococcus sp. B7740', 'Micrococcus luteus', 'Plautia stali symbiont', 'Staphylococcus epidermidis', 'Pseudomonas versuta', 'Acinetobacter pittii', 'Acinetobacter schindleri', 'Kocuria rhizophila', 'Rubrobacter xylanophilus', 'Rothia dentocariosa', 'Jeotgalibacillus malaysiensis', 'Human mastadenovirus C', 'Bradyrhizobium sp. SK17', 'Leclercia sp. LSNIH1', 'Pseudomonas putida', 'Mitsuaria sp. 7', 'Mycobacterium chimaera', 'Acinetobacter junii', 'Polaromonas naphthalenivorans', 'Klebsiella quasivariicola', 'Sphingomonas melonis', 'Enterobacter asburiae', 'Chryseobacterium taklimakanense', 'continent', 'Arsenicicoc