In [1]:
import pandas as pd

In [2]:
# format change taxonomic data (from .group to .csv) 
# for now only mpa4_class.profile is considered
taxonomy_df = pd.read_csv("data/mpa4_class.profile", sep="\t")
print(taxonomy_df.head())

# format change metadata (from .group to .csv)
metadata_df = pd.read_csv("data/sample.group", sep="\t", header=None)
print(metadata_df)
print(metadata_df.info())

                     name    850945    907005    907544    907995    910252  \
0           c__Clostridia  51.07453  19.25384  34.13202  40.45823  63.65824   
1          c__Bacteroidia  42.97830  67.00596  26.10095  49.82172  26.47992   
2        c__Negativicutes   2.08592   2.70182   0.41849   2.10271   3.52240   
3  c__Gammaproteobacteria   1.28249   9.25834  36.41831   0.00000   0.42619   
4   c__Betaproteobacteria   0.93501   0.95815   0.00000   3.79642   0.46030   

     917369    920071     A0002     A0005  ...  self_CRC23  self_CRC24  \
0  43.67910  48.12092  43.68023   6.24566  ...    53.93475    34.40354   
1  44.99695  30.36404  50.71171  88.15693  ...    26.12067     7.49449   
2   2.13157   8.24484   0.47678   0.81488  ...     6.59048     1.46674   
3   5.68301   0.00579   0.00000   0.20769  ...     1.93735     0.20606   
4   0.93574   1.52207   0.34718   1.96452  ...     0.04980     0.08714   

   self_CRC26  self_CRC28  self_CRC30  self_CRC31  self_CRC34  self_CRC35  \
0  

In [3]:
# rename and match metadata and taxonomic data for merging

taxonomy_df.rename(columns={taxonomy_df.columns[0]: "Taxa"}, inplace=True)
print(taxonomy_df.head())

metadata_df.columns = [
    "SampleID", "Group", "Group_1", "Group_2", "LevelB", "LevelA",
    "GroupDetails", "Project", "Project_1", "Project_2", "Unused_1",
    "Unused_2", "Unused_3", "Age", "Gender", "BMI", "Location",
    "LocationDetails", "SequenceType", "DNAExtractType"
]
print(metadata_df.head())

# verify match
print(set(taxonomy_df.columns[1:]).difference(metadata_df["SampleID"]))

                     Taxa    850945    907005    907544    907995    910252  \
0           c__Clostridia  51.07453  19.25384  34.13202  40.45823  63.65824   
1          c__Bacteroidia  42.97830  67.00596  26.10095  49.82172  26.47992   
2        c__Negativicutes   2.08592   2.70182   0.41849   2.10271   3.52240   
3  c__Gammaproteobacteria   1.28249   9.25834  36.41831   0.00000   0.42619   
4   c__Betaproteobacteria   0.93501   0.95815   0.00000   3.79642   0.46030   

     917369    920071     A0002     A0005  ...  self_CRC23  self_CRC24  \
0  43.67910  48.12092  43.68023   6.24566  ...    53.93475    34.40354   
1  44.99695  30.36404  50.71171  88.15693  ...    26.12067     7.49449   
2   2.13157   8.24484   0.47678   0.81488  ...     6.59048     1.46674   
3   5.68301   0.00579   0.00000   0.20769  ...     1.93735     0.20606   
4   0.93574   1.52207   0.34718   1.96452  ...     0.04980     0.08714   

   self_CRC26  self_CRC28  self_CRC30  self_CRC31  self_CRC34  self_CRC35  \
0  

ValueError: Length mismatch: Expected axis has 21 elements, new values have 20 elements

In [4]:
# installing xgboost
!pip install xgboost



In [5]:
# attempt to set up XGBoost without linking taxonomic data and metadata
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the taxonomic dataset (mpa4_class.profile)
data = pd.read_csv("data/mpa4_class.profile", sep="\t")

data.index = data.index.astype(str)

# Transpose the dataset to have samples as rows and features as columns
data_transposed = data.set_index('name').transpose()
print(data_transposed.head())

name    c__Clostridia  c__Bacteroidia  c__Negativicutes  \
850945       51.07453        42.97830           2.08592   
907005       19.25384        67.00596           2.70182   
907544       34.13202        26.10095           0.41849   
907995       40.45823        49.82172           2.10271   
910252       63.65824        26.47992           3.52240   

name    c__Gammaproteobacteria  c__Betaproteobacteria  c__Bacilli  \
850945                 1.28249                0.93501     0.79336   
907005                 9.25834                0.95815     0.00000   
907544                36.41831                0.00000     2.22265   
907995                 0.00000                3.79642     0.10125   
910252                 0.42619                0.46030     3.11786   

name    c__CFGB1340  c__CFGB1424  c__Erysipelotrichia  \
850945      0.23746      0.18811              0.17254   
907005      0.00000      0.00000              0.04610   
907544      0.00000      0.00000              0.32344   
90

In [6]:
# SAME BUT FOR SPECIES
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the taxonomic dataset (mpa4_class.profile)
data_species = pd.read_csv("data/mpa4_species.profile", sep="\t")

data_species.index = data_species.index.astype(str)

# Transpose the dataset to have samples as rows and features as columns
data_species_transposed = data_species.set_index('name').transpose()
print(data_species_transposed.head())

row_sums = data_species_transposed.iloc[:, 1:].sum(axis=1)
#print("Row sums excluding the first column:", row_sums)

#row_sum = data_species_transposed[1, 1:].sum()
print(row_sums)

name    s__Phocaeicola_plebeius  s__Faecalibacterium_prausnitzii  \
850945                 28.82685                         13.15046   
907005                  0.00370                          1.37566   
907544                  0.00000                          0.03030   
907995                  0.00000                          6.24475   
910252                  1.47083                         11.80280   

name    s__Ruminococcus_sp_NSJ_71  s__Eubacterium_rectale  \
850945                    4.62748                 3.26532   
907005                    4.66365                 0.20471   
907544                    0.00000                 0.01361   
907995                    0.00000                 0.00000   
910252                    0.47843                 1.91681   

name    s__Bacteroides_uniformis  s__Clostridium_sp_AF15_49  \
850945                   3.23974                    3.15212   
907005                  13.64774                    0.00000   
907544                   0.02348   

In [7]:
print(data_species_transposed.columns)


Index(['s__Phocaeicola_plebeius', 's__Faecalibacterium_prausnitzii',
       's__Ruminococcus_sp_NSJ_71', 's__Eubacterium_rectale',
       's__Bacteroides_uniformis', 's__Clostridium_sp_AF15_49',
       's__Lachnospira_eligens', 's__Roseburia_sp_AF02_12',
       's__Phocaeicola_vulgatus', 's__Ruminococcus_bicirculans',
       ...
       's__Rodentibacter_myodis', 's__Rhodococcus_hoagii',
       's__Pseudomonas_psychrophila', 's__Pseudomonas_sp_DG56_2',
       's__Providencia_rustigianii', 's__Pseudomonas_vranovensis',
       's__Pseudomonas_taetrolens', 's__Pseudomonas_deceptionensis',
       's__Desulfobulbus_oralis', 's__Bacteroides_reticulotermitis'],
      dtype='object', name='name', length=2557)


In [8]:
print(data_species_transposed.loc['self_CRC35'])

name
s__Phocaeicola_plebeius            0.00000
s__Faecalibacterium_prausnitzii    1.77137
s__Ruminococcus_sp_NSJ_71          0.00000
s__Eubacterium_rectale             0.00000
s__Bacteroides_uniformis           1.30557
                                    ...   
s__Pseudomonas_vranovensis         0.00000
s__Pseudomonas_taetrolens          0.00000
s__Pseudomonas_deceptionensis      0.00000
s__Desulfobulbus_oralis            0.00000
s__Bacteroides_reticulotermitis    0.00000
Name: self_CRC35, Length: 2557, dtype: float64


In [9]:
species_columns = [col for col in data_species_transposed.columns if col.startswith('s__')]
row_sums_corrected = data_species_transposed[species_columns].sum(axis=1)

print(row_sums_corrected['self_CRC35'])  # Should be 100 if normalized correctly

100.0


In [10]:

# Extract labels (y) - ensure you replace 'label_column' with your actual label column
labels = data_species_transposed.index.str.contains('self_CRC') 
y = labels.astype(int)  # Convert boolean to integer (1 for CRC, 0 for others)

# Drop the index to form the feature matrix (X)
X = data_species_transposed.reset_index(drop=True)
display(X)
display(y)

name,s__Phocaeicola_plebeius,s__Faecalibacterium_prausnitzii,s__Ruminococcus_sp_NSJ_71,s__Eubacterium_rectale,s__Bacteroides_uniformis,s__Clostridium_sp_AF15_49,s__Lachnospira_eligens,s__Roseburia_sp_AF02_12,s__Phocaeicola_vulgatus,s__Ruminococcus_bicirculans,...,s__Rodentibacter_myodis,s__Rhodococcus_hoagii,s__Pseudomonas_psychrophila,s__Pseudomonas_sp_DG56_2,s__Providencia_rustigianii,s__Pseudomonas_vranovensis,s__Pseudomonas_taetrolens,s__Pseudomonas_deceptionensis,s__Desulfobulbus_oralis,s__Bacteroides_reticulotermitis
0,28.82685,13.15046,4.62748,3.26532,3.23974,3.15212,3.07619,2.72283,2.56319,2.43162,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
1,0.00370,1.37566,4.66365,0.20471,13.64774,0.00000,0.05207,0.01113,20.61260,0.81157,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
2,0.00000,0.03030,0.00000,0.01361,0.02348,0.00000,0.00000,0.00000,25.96778,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
3,0.00000,6.24475,0.00000,0.00000,0.10474,0.01345,0.00000,0.00000,40.44344,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
4,1.47083,11.80280,0.47843,1.91681,2.90833,5.65565,1.90521,0.00000,0.72137,0.75833,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8621,0.00000,4.64376,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,1.07603,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02565
8622,0.00000,1.06045,0.00000,0.00000,2.01819,0.00000,0.16803,0.00000,0.26589,0.27441,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
8623,0.00000,1.77137,0.00000,0.00000,1.30557,0.00000,0.00000,0.00000,28.26469,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000
8624,17.24679,4.64966,0.00000,0.00000,0.33105,0.00000,0.00000,0.00000,9.23750,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000


array([0, 0, 0, ..., 1, 1, 1], shape=(8626,))

In [11]:
#X = pd.get_dummies(X, drop_first=True)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")

# Convert to DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set XGBoost parameters
params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',       # Evaluation metric
    'max_depth': 6,                 # Maximum depth of trees
    'eta': 0.1,                     # Learning rate
    'verbosity': 1,                 # Verbosity level
    'seed': 42                      # Random seed for reproducibility
}

# Train the XGBoost model
num_round = 100
bst = xgb.train(params, dtrain, num_round)

# Predict on the test set
y_pred_prob = bst.predict(dtest)
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert probabilities to binary predictions

Train size: (6900, 2557), Test size: (1726, 2557)


In [12]:

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report(y_test, y_pred))


Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1721
           1       0.00      0.00      0.00         5

    accuracy                           1.00      1726
   macro avg       0.50      0.50      0.50      1726
weighted avg       0.99      1.00      1.00      1726



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
# what do the results tell us?
# the majority class is almost exclusively predicted, while the minority class is being ignored completely
# this might be due to a mismatch of samples/entries

In [14]:
# potential fix:
# use only species level for profiling 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

# Load the dataset
# Replace 'data/mpa4_class.profile' with your actual file path
data = pd.read_csv('data/mpa4_class.profile', sep="\t")


# 1. Drop the 'name' column and create features (X)
X = data.drop(columns =['name'])


# 2. Create binary target labels (y)
# Mark rows as '1' if 'name' contains 'self_CRC', otherwise '0'
y = data['name'].str.contains('self_CRC').astype(int)

# 3. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")

# 4. Convert to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# 5. Set XGBoost parameters
params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',        # Log-loss as the evaluation metric
    'max_depth': 6,                  # Tree depth
    'eta': 0.1,                      # Learning rate
    'verbosity': 1,                  # Verbose output
    'seed': 42                       # Reproducibility
}

# 6. Train the XGBoost model
num_round = 100  # Number of boosting rounds
bst = xgb.train(params, dtrain, num_round)

# 7. Make predictions on the test set
y_pred_prob = bst.predict(dtest)  # Predict probabilities
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert to binary predictions

# 8. Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))


Train size: (148, 8626), Test size: (37, 8626)
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        37

    accuracy                           1.00        37
   macro avg       1.00      1.00      1.00        37
weighted avg       1.00      1.00      1.00        37

