In [1]:
import pandas as pd

In [19]:
# format change taxonomic data (from .group to .csv) 
# for now only mpa4_class.profile is considered
taxonomy_df = pd.read_csv("data/mpa4_species.profile", sep="\t")
print(taxonomy_df.head())

# format change metadata (from .group to .csv)
metadata_df = pd.read_csv("data/sample.group", sep="\t", header=None)
print(metadata_df)
print(metadata_df.info())

                              name    850945    907005   907544   907995  \
0          s__Phocaeicola_plebeius  28.82685   0.00370  0.00000  0.00000   
1  s__Faecalibacterium_prausnitzii  13.15046   1.37566  0.03030  6.24475   
2        s__Ruminococcus_sp_NSJ_71   4.62748   4.66365  0.00000  0.00000   
3           s__Eubacterium_rectale   3.26532   0.20471  0.01361  0.00000   
4         s__Bacteroides_uniformis   3.23974  13.64774  0.02348  0.10474   

     910252    917369    920071    A0002    A0005  ...  self_CRC23  \
0   1.47083  18.51963   0.70765  0.01673  0.08908  ...     0.25244   
1  11.80280   5.09576  10.95203  3.15125  0.00280  ...     6.35213   
2   0.47843   0.00000   0.00000  0.01362  0.02326  ...     0.00000   
3   1.91681   1.68208   0.02227  2.17987  0.02107  ...     0.17086   
4   2.90833   3.33849   1.41162  3.82952  5.19085  ...     3.64027   

   self_CRC24  self_CRC26  self_CRC28  self_CRC30  self_CRC31  self_CRC34  \
0     0.00000     0.00000     1.63313     0.0

In [20]:
# rename and match metadata and taxonomic data for merging

taxonomy_df.rename(columns={taxonomy_df.columns[0]: "Taxa"}, inplace=True)
print(taxonomy_df.head())

metadata_df.columns = [
    "SampleID", "Group", "Group_1", "Group_2", "LevelB", "LevelA",
    "GroupDetails", "Project", "Project_1", "Project_2", "Unused_1",
    "Unused_2", "Unused_3", "Age", "Gender", "BMI", "Location",
    "LocationDetails", "SequenceType", "DNAExtractType"
]
print(metadata_df.head())

# verify match
print(set(taxonomy_df.columns[1:]).difference(metadata_df["SampleID"]))

                              Taxa    850945    907005   907544   907995  \
0          s__Phocaeicola_plebeius  28.82685   0.00370  0.00000  0.00000   
1  s__Faecalibacterium_prausnitzii  13.15046   1.37566  0.03030  6.24475   
2        s__Ruminococcus_sp_NSJ_71   4.62748   4.66365  0.00000  0.00000   
3           s__Eubacterium_rectale   3.26532   0.20471  0.01361  0.00000   
4         s__Bacteroides_uniformis   3.23974  13.64774  0.02348  0.10474   

     910252    917369    920071    A0002    A0005  ...  self_CRC23  \
0   1.47083  18.51963   0.70765  0.01673  0.08908  ...     0.25244   
1  11.80280   5.09576  10.95203  3.15125  0.00280  ...     6.35213   
2   0.47843   0.00000   0.00000  0.01362  0.02326  ...     0.00000   
3   1.91681   1.68208   0.02227  2.17987  0.02107  ...     0.17086   
4   2.90833   3.33849   1.41162  3.82952  5.19085  ...     3.64027   

   self_CRC24  self_CRC26  self_CRC28  self_CRC30  self_CRC31  self_CRC34  \
0     0.00000     0.00000     1.63313     0.0

ValueError: Length mismatch: Expected axis has 21 elements, new values have 20 elements

In [4]:
# installing xgboost
!pip install xgboost



In [21]:
# attempt to set up XGBoost without linking taxonomic data and metadata
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the taxonomic dataset (mpa4_class.profile)
data = pd.read_csv("data/mpa4_species.profile", sep="\t")

data.index = data.index.astype(str)

# Transpose the dataset to have samples as rows and features as columns
data_transposed = data.set_index('name').transpose()
print(data_transposed.head())

name    s__Phocaeicola_plebeius  s__Faecalibacterium_prausnitzii  \
850945                 28.82685                         13.15046   
907005                  0.00370                          1.37566   
907544                  0.00000                          0.03030   
907995                  0.00000                          6.24475   
910252                  1.47083                         11.80280   

name    s__Ruminococcus_sp_NSJ_71  s__Eubacterium_rectale  \
850945                    4.62748                 3.26532   
907005                    4.66365                 0.20471   
907544                    0.00000                 0.01361   
907995                    0.00000                 0.00000   
910252                    0.47843                 1.91681   

name    s__Bacteroides_uniformis  s__Clostridium_sp_AF15_49  \
850945                   3.23974                    3.15212   
907005                  13.64774                    0.00000   
907544                   0.02348   

In [8]:

# Extract labels (y) - ensure you replace 'label_column' with your actual label column
labels = data_transposed.index.str.contains('self_CRC') 
y = labels.astype(int)  # Convert boolean to integer (1 for CRC, 0 for others)

# Drop the index to form the feature matrix (X)
X = data_transposed.reset_index(drop=True)
display(X)
display(y)

name,c__Clostridia,c__Bacteroidia,c__Negativicutes,c__Gammaproteobacteria,c__Betaproteobacteria,c__Bacilli,c__CFGB1340,c__CFGB1424,c__Erysipelotrichia,c__Firmicutes_unclassified,...,c__CFGB1017,c__CFGB2546,c__Cytophagia,c__CFGB524,c__CFGB4422,c__Candidatus_Gracilibacteria_unclassified,c__CFGB497,c__CFGB1230,c__Acidobacteriia,c__Eurotiomycetes
0,51.07453,42.97830,2.08592,1.28249,0.93501,0.79336,0.23746,0.18811,0.17254,0.10786,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,19.25384,67.00596,2.70182,9.25834,0.95815,0.00000,0.00000,0.00000,0.04610,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,34.13202,26.10095,0.41849,36.41831,0.00000,2.22265,0.00000,0.00000,0.32344,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,40.45823,49.82172,2.10271,0.00000,3.79642,0.10125,0.00000,0.00000,0.47637,0.21059,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,63.65824,26.47992,3.52240,0.42619,0.46030,3.11786,0.82004,0.00000,0.08038,0.31262,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8621,48.55523,23.72413,3.76489,5.16120,0.07380,2.20211,0.00000,0.00000,11.74268,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8622,23.39465,37.40127,1.03932,15.39485,0.04935,20.74146,0.00000,0.00000,0.03647,0.26414,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8623,47.10629,40.39927,1.21714,0.24941,0.00000,3.24044,0.00000,0.00000,1.53178,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8624,47.91039,34.96431,0.95754,0.13445,0.73767,1.37450,0.00000,0.00000,0.47432,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


array([0, 0, 0, ..., 1, 1, 1], shape=(8626,))

In [9]:
#X = pd.get_dummies(X, drop_first=True)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")

# Convert to DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set XGBoost parameters
params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',       # Evaluation metric
    'max_depth': 6,                 # Maximum depth of trees
    'eta': 0.1,                     # Learning rate
    'verbosity': 1,                 # Verbosity level
    'seed': 42                      # Random seed for reproducibility
}

# Train the XGBoost model
num_round = 100
bst = xgb.train(params, dtrain, num_round)

# Predict on the test set
y_pred_prob = bst.predict(dtest)
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert probabilities to binary predictions

Train size: (6900, 185), Test size: (1726, 185)


In [10]:

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report(y_test, y_pred))


Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1721
           1       0.00      0.00      0.00         5

    accuracy                           1.00      1726
   macro avg       0.50      0.50      0.50      1726
weighted avg       0.99      1.00      1.00      1726



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
# what do the results tell us?
# the majority class is almost exclusively predicted, while the minority class is being ignored completely
# this might be due to a mismatch of samples/entries

In [18]:
# potential fix:
# use only species level for profiling 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

# Load the dataset
# Replace 'data/mpa4_class.profile' with your actual file path
data = pd.read_csv('data/mpa4_species.profile', sep="\t")


data_transposed = data.set_index('name').transpose()

# 1. Drop the 'name' column and create features (X)
#X = data_transposed.drop(columns =['name'])
X = data_transposed.reset_index(drop=True)

# 2. Create binary target labels (y)
# Mark rows as '1' if 'name' contains 'self_CRC', otherwise '0'
y = data['name'].str.contains('self_CRC').astype(int)

# 3. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")

# 4. Convert to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# 5. Set XGBoost parameters
params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',        # Log-loss as the evaluation metric
    'max_depth': 6,                  # Tree depth
    'eta': 0.1,                      # Learning rate
    'verbosity': 1,                  # Verbose output
    'seed': 42                       # Reproducibility
}

# 6. Train the XGBoost model
num_round = 100  # Number of boosting rounds
bst = xgb.train(params, dtrain, num_round)

# 7. Make predictions on the test set
y_pred_prob = bst.predict(dtest)  # Predict probabilities
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert to binary predictions

# 8. Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))


ValueError: Found input variables with inconsistent numbers of samples: [8626, 2557]