In [35]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder

def create_cnn_model(input_reshape, num_classes, optimizers_func, lr_rate, lr_decay, decay_steps, decay_rate, loss_func, numfilter=16, filtersize=3, numlayercnn_per_maxpool=1, dropout_cnn=0.2, dropout_fc=0.5, padded=True):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Conv1D(filters=numfilter, kernel_size=filtersize, activation='relu', input_shape=input_reshape))
    if padded:
        model.add(tf.keras.layers.ZeroPadding1D(padding=(filtersize // 2)))
    for _ in range(numlayercnn_per_maxpool):
        model.add(tf.keras.layers.Conv1D(filters=numfilter, kernel_size=filtersize, activation='relu'))
        if padded:
            model.add(tf.keras.layers.ZeroPadding1D(padding=(filtersize // 2)))
    model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
    model.add(tf.keras.layers.Dropout(dropout_cnn))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dropout(dropout_fc))
    model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

    # Using lr_rate for learning rate and lr_decay for learning rate decay
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=lr_rate,
        decay_steps=decay_steps,  # Define the number of steps for decay
        decay_rate=decay_rate,  # Define the decay rate
        staircase=True)  # Optional: whether to decay the learning rate at discrete intervals

    optimizer = optimizers_func(learning_rate=lr_schedule)
    model.compile(optimizer=optimizer, loss=loss_func, metrics=['accuracy'])  # Using loss_func parameter here
    return model



# Load your data
data = pd.read_csv('/mnt/d/20232024-01-CT252-B2003731_final/20232024-01-CT252-B2003731_final/deep-metagnomics-pub-main-final/data/WT2D_x.csv')
data.drop(columns=['Unnamed: 0'], inplace=True)
data_transposed = data.transpose()
label_column = data_transposed.columns[-1]
X = data_transposed.drop(columns=[label_column])
y = data_transposed[label_column]

# Convert labels to strings
y = y.astype(str)

value_counts = y.value_counts()
rare_classes = value_counts[value_counts < 2].index
y = y[~y.isin(rare_classes)]
X = X.loc[y.index]

# Apply LabelEncoder to labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Define your decay_steps and decay_rate based on your dataset and requirements
decay_steps = 1000  # Replace with your desired value
decay_rate = 0.9  # Replace with your desired value

# Reshape X for CNN
X = X.values.reshape(X.shape[0], X.shape[1], 1)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Further split the training set for parameter tuning
X_train_part, _, y_train_part, _ = train_test_split(X_train, y_train, test_size=0.8, random_state=42)

# Define parameter grid for GridSearchCV
param_grid = {
    'input_reshape': [(X_train.shape[1], 1)],
    'num_classes': [len(np.unique(y))],
    'optimizers_func': [tf.keras.optimizers.Adam, tf.keras.optimizers.SGD],  # Pass optimizer classes directly
    'lr_rate': [0.001, 0.01, 0.1],
    'lr_decay': [1e-6, 1e-5, 1e-4],
    'decay_steps': [decay_steps],
    'decay_rate': [decay_rate],
    'numfilter': [16, 32],
    'filtersize': [3, 5],
    'numlayercnn_per_maxpool': [1],
    'dropout_cnn': [0.2],
    'dropout_fc': [0.4],
    'padded': [True],
    'loss_func': ['binary_crossentropy', 'sparse_categorical_crossentropy'],  # Add both loss functions
    # Include other hyperparameters you want to tune
}
# Create CNN model as a KerasClassifier
cnn_model = tf.keras.wrappers.scikit_learn.KerasClassifier(build_fn=create_cnn_model, epochs=10, batch_size=32, verbose=0)

# Perform parameter tuning on the part of the training data
grid_search = GridSearchCV(estimator=cnn_model, param_grid=param_grid, cv=3, verbose=1)
grid_search.fit(X_train_part, y_train_part)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)






Fitting 3 folds for each of 144 candidates, totalling 432 fits


216 fits failed out of a total of 432.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
216 fits failed with the following error:
Traceback (most recent call last):
  File "/home/xuandai/.local/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/xuandai/.local/lib/python3.7/site-packages/keras/wrappers/scikit_learn.py", line 248, in fit
    return super().fit(x, y, **kwargs)
  File "/home/xuandai/.local/lib/python3.7/site-packages/keras/wrappers/scikit_learn.py", line 175, in fit
    history = self.model.fit(x, y, **fit_args)
  File "/home/xuandai/.local/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 70, in error

Best Parameters: {'decay_rate': 0.9, 'decay_steps': 1000, 'dropout_cnn': 0.2, 'dropout_fc': 0.4, 'filtersize': 3, 'input_reshape': (144, 1), 'loss_func': 'sparse_categorical_crossentropy', 'lr_decay': 1e-06, 'lr_rate': 0.001, 'num_classes': 105, 'numfilter': 16, 'numlayercnn_per_maxpool': 1, 'optimizers_func': <class 'keras.optimizers.optimizer_experimental.adam.Adam'>, 'padded': True}


In [36]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder

def create_cnn_model(input_reshape, num_classes, optimizers_func, lr_rate, lr_decay, decay_steps, decay_rate, loss_func, numfilter=16, filtersize=3, numlayercnn_per_maxpool=1, dropout_cnn=0.2, dropout_fc=0.5, padded=True):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Conv1D(filters=numfilter, kernel_size=filtersize, activation='relu', input_shape=input_reshape))
    if padded:
        model.add(tf.keras.layers.ZeroPadding1D(padding=(filtersize // 2)))
    for _ in range(numlayercnn_per_maxpool):
        model.add(tf.keras.layers.Conv1D(filters=numfilter, kernel_size=filtersize, activation='relu'))
        if padded:
            model.add(tf.keras.layers.ZeroPadding1D(padding=(filtersize // 2)))
    model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
    model.add(tf.keras.layers.Dropout(dropout_cnn))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dropout(dropout_fc))
    model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

    # Using lr_rate for learning rate and lr_decay for learning rate decay
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=lr_rate,
        decay_steps=decay_steps,  # Define the number of steps for decay
        decay_rate=decay_rate,  # Define the decay rate
        staircase=True)  # Optional: whether to decay the learning rate at discrete intervals

    optimizer = optimizers_func(learning_rate=lr_schedule)
    model.compile(optimizer=optimizer, loss=loss_func, metrics=['accuracy'])  # Using loss_func parameter here
    return model



# Load your data
data = pd.read_csv('/mnt/d/20232024-01-CT252-B2003731_final/20232024-01-CT252-B2003731_final/deep-metagnomics-pub-main-final/data/Quin_gut_liver_cirrhosis_x.csv')
data.drop(columns=['Unnamed: 0'], inplace=True)
data_transposed = data.transpose()
label_column = data_transposed.columns[-1]
X = data_transposed.drop(columns=[label_column])
y = data_transposed[label_column]

# Convert labels to strings
y = y.astype(str)

value_counts = y.value_counts()
rare_classes = value_counts[value_counts < 2].index
y = y[~y.isin(rare_classes)]
X = X.loc[y.index]

# Apply LabelEncoder to labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Define your decay_steps and decay_rate based on your dataset and requirements
decay_steps = 1000  # Replace with your desired value
decay_rate = 0.9  # Replace with your desired value

# Reshape X for CNN
X = X.values.reshape(X.shape[0], X.shape[1], 1)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Further split the training set for parameter tuning
X_train_part, _, y_train_part, _ = train_test_split(X_train, y_train, test_size=0.8, random_state=42)

# Define parameter grid for GridSearchCV
param_grid = {
    'input_reshape': [(X_train.shape[1], 1)],
    'num_classes': [len(np.unique(y))],
    'optimizers_func': [tf.keras.optimizers.Adam, tf.keras.optimizers.SGD],  # Pass optimizer classes directly
    'lr_rate': [0.001, 0.01, 0.1],
    'lr_decay': [1e-6, 1e-5, 1e-4],
    'decay_steps': [decay_steps],
    'decay_rate': [decay_rate],
    'numfilter': [16, 32],
    'filtersize': [3, 5],
    'numlayercnn_per_maxpool': [1],
    'dropout_cnn': [0.2],
    'dropout_fc': [0.4],
    'padded': [True],
    'loss_func': ['binary_crossentropy', 'sparse_categorical_crossentropy'],  # Add both loss functions
    # Include other hyperparameters you want to tune
}
# Create CNN model as a KerasClassifier
cnn_model = tf.keras.wrappers.scikit_learn.KerasClassifier(build_fn=create_cnn_model, epochs=10, batch_size=32, verbose=0)

# Perform parameter tuning on the part of the training data
grid_search = GridSearchCV(estimator=cnn_model, param_grid=param_grid, cv=3, verbose=1)
grid_search.fit(X_train_part, y_train_part)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)





Fitting 3 folds for each of 144 candidates, totalling 432 fits


216 fits failed out of a total of 432.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
216 fits failed with the following error:
Traceback (most recent call last):
  File "/home/xuandai/.local/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/xuandai/.local/lib/python3.7/site-packages/keras/wrappers/scikit_learn.py", line 248, in fit
    return super().fit(x, y, **kwargs)
  File "/home/xuandai/.local/lib/python3.7/site-packages/keras/wrappers/scikit_learn.py", line 175, in fit
    history = self.model.fit(x, y, **fit_args)
  File "/home/xuandai/.local/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 70, in error

Best Parameters: {'decay_rate': 0.9, 'decay_steps': 1000, 'dropout_cnn': 0.2, 'dropout_fc': 0.4, 'filtersize': 3, 'input_reshape': (231, 1), 'loss_func': 'sparse_categorical_crossentropy', 'lr_decay': 1e-06, 'lr_rate': 0.01, 'num_classes': 102, 'numfilter': 16, 'numlayercnn_per_maxpool': 1, 'optimizers_func': <class 'keras.optimizers.optimizer_experimental.adam.Adam'>, 'padded': True}


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

# Load your data
data = pd.read_csv('/mnt/d/20232024-01-CT252-B2003731_final/20232024-01-CT252-B2003731_final/deep-metagnomics-pub-main-final/data/metahitIBD_x.csv')

data.drop(columns=['Unnamed: 0'], inplace=True)

# Transpose the data
data_transposed = data.transpose()

# Assuming the last column contains labels
label_column = data_transposed.columns[-1]

# Define features and labels
X = data_transposed.drop(columns=[label_column])
y = data_transposed[label_column]

# Convert labels to strings
y = y.astype(str)

# Apply LabelEncoder to labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Thiết lập các siêu tham số để thử nghiệm
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly']
}

# Tạo mô hình SVC
svc = SVC()

# Tìm kiếm siêu tham số tốt nhất
grid_search = GridSearchCV(svc, param_grid, cv=5, verbose=1)
grid_search.fit(X_train, y_train)

# Siêu tham số tốt nhất
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Đánh giá mô hình trên tập kiểm tra
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test Accuracy:", test_score)


Fitting 5 folds for each of 24 candidates, totalling 120 fits




Best Parameters: {'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}
Test Accuracy: 0.9712556732223904


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

# Load your data
data = pd.read_csv('/mnt/d/20232024-01-CT252-B2003731_final/20232024-01-CT252-B2003731_final/deep-metagnomics-pub-main-final/data/Chatelier_gut_obesity_x.csv')

data.drop(columns=['Unnamed: 0'], inplace=True)

# Transpose the data
data_transposed = data.transpose()

# Assuming the last column contains labels
label_column = data_transposed.columns[-1]

# Define features and labels
X = data_transposed.drop(columns=[label_column])
y = data_transposed[label_column]

# Convert labels to strings
y = y.astype(str)

# Apply LabelEncoder to labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Thiết lập các siêu tham số để thử nghiệm
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly']
}

# Tạo mô hình SVC
svc = SVC()

# Tìm kiếm siêu tham số tốt nhất
grid_search = GridSearchCV(svc, param_grid, cv=5, verbose=1)
grid_search.fit(X_train, y_train)

# Siêu tham số tốt nhất
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Đánh giá mô hình trên tập kiểm tra
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test Accuracy:", test_score)


Fitting 5 folds for each of 24 candidates, totalling 120 fits




Best Parameters: {'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}
Test Accuracy: 0.9304084720121029


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

# Load your data
data = pd.read_csv('/mnt/d/20232024-01-CT252-B2003731_final/20232024-01-CT252-B2003731_final/deep-metagnomics-pub-main-final/data/Zeller_fecal_colorectal_cancer_x.csv')

data.drop(columns=['Unnamed: 0'], inplace=True)

# Transpose the data
data_transposed = data.transpose()

# Assuming the last column contains labels
label_column = data_transposed.columns[-1]

# Define features and labels
X = data_transposed.drop(columns=[label_column])
y = data_transposed[label_column]

# Convert labels to strings
y = y.astype(str)

# Apply LabelEncoder to labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Thiết lập các siêu tham số để thử nghiệm
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly']
}

# Tạo mô hình SVC
svc = SVC()

# Tìm kiếm siêu tham số tốt nhất
grid_search = GridSearchCV(svc, param_grid, cv=5, verbose=1)
grid_search.fit(X_train, y_train)

# Siêu tham số tốt nhất
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Đánh giá mô hình trên tập kiểm tra
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test Accuracy:", test_score)


Fitting 5 folds for each of 24 candidates, totalling 120 fits




Best Parameters: {'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}
Test Accuracy: 0.9455370650529501


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

# Load your data
data = pd.read_csv('/mnt/d/20232024-01-CT252-B2003731_final/20232024-01-CT252-B2003731_final/deep-metagnomics-pub-main-final/data/Quin_gut_liver_cirrhosis_x.csv')

data.drop(columns=['Unnamed: 0'], inplace=True)

# Transpose the data
data_transposed = data.transpose()

# Assuming the last column contains labels
label_column = data_transposed.columns[-1]

# Define features and labels
X = data_transposed.drop(columns=[label_column])
y = data_transposed[label_column]

# Convert labels to strings
y = y.astype(str)

# Apply LabelEncoder to labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Thiết lập các siêu tham số để thử nghiệm
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly']
}

# Tạo mô hình SVC
svc = SVC()

# Tìm kiếm siêu tham số tốt nhất
grid_search = GridSearchCV(svc, param_grid, cv=5, verbose=1)
grid_search.fit(X_train, y_train)

# Siêu tham số tốt nhất
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Đánh giá mô hình trên tập kiểm tra
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test Accuracy:", test_score)


Fitting 5 folds for each of 24 candidates, totalling 120 fits




Best Parameters: {'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}
Test Accuracy: 0.9576399394856279


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

# Load your data
data = pd.read_csv('/mnt/d/20232024-01-CT252-B2003731_final/20232024-01-CT252-B2003731_final/deep-metagnomics-pub-main-final/data/WT2D_x.csv')

data.drop(columns=['Unnamed: 0'], inplace=True)

# Transpose the data
data_transposed = data.transpose()

# Assuming the last column contains labels
label_column = data_transposed.columns[-1]

# Define features and labels
X = data_transposed.drop(columns=[label_column])
y = data_transposed[label_column]

# Convert labels to strings
y = y.astype(str)

# Apply LabelEncoder to labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Thiết lập các siêu tham số để thử nghiệm
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly']
}

# Tạo mô hình SVC
svc = SVC()

# Tìm kiếm siêu tham số tốt nhất
grid_search = GridSearchCV(svc, param_grid, cv=5, verbose=1)
grid_search.fit(X_train, y_train)

# Siêu tham số tốt nhất
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Đánh giá mô hình trên tập kiểm tra
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test Accuracy:", test_score)


Fitting 5 folds for each of 24 candidates, totalling 120 fits




Best Parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
Test Accuracy: 0.9409984871406959


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

# Load your data
data = pd.read_csv('/mnt/d/20232024-01-CT252-B2003731_final/20232024-01-CT252-B2003731_final/deep-metagnomics-pub-main-final/data/t2d_x.csv')

data.drop(columns=['Unnamed: 0'], inplace=True)

# Transpose the data
data_transposed = data.transpose()

# Assuming the last column contains labels
label_column = data_transposed.columns[-1]

# Define features and labels
X = data_transposed.drop(columns=[label_column])
y = data_transposed[label_column]

# Convert labels to strings
y = y.astype(str)

# Apply LabelEncoder to labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Thiết lập các siêu tham số để thử nghiệm
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly']
}

# Tạo mô hình SVC
svc = SVC()

# Tìm kiếm siêu tham số tốt nhất
grid_search = GridSearchCV(svc, param_grid, cv=5, verbose=1)
grid_search.fit(X_train, y_train)

# Siêu tham số tốt nhất
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Đánh giá mô hình trên tập kiểm tra
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test Accuracy:", test_score)


Fitting 5 folds for each of 24 candidates, totalling 120 fits




Best Parameters: {'C': 100, 'gamma': 'scale', 'kernel': 'linear'}
Test Accuracy: 0.9273827534039334


In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Load your data
data = pd.read_csv('/mnt/d/20232024-01-CT252-B2003731_final/20232024-01-CT252-B2003731_final/deep-metagnomics-pub-main-final/data/metahitIBD_x.csv')

data.drop(columns=['Unnamed: 0'], inplace=True)

# Transpose the data
data_transposed = data.transpose()

# Assuming the last column contains labels
label_column = data_transposed.columns[-1]

# Define features and labels
X = data_transposed.drop(columns=[label_column])
y = data_transposed[label_column]

# Convert labels to strings
y = y.astype(str)

# Apply LabelEncoder to labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Thiết lập các siêu tham số để thử nghiệm
param_grid = {
    
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'random_state': [None, 42, 100],  # Đặt giá trị random_state tùy ý
    'max_features': ['auto', 'sqrt', 'log2']  # Các cách chọn số features khi tìm split node

}

# Tạo mô hình RandomForestClassifier
rf = RandomForestClassifier()
# Chia dữ liệu thành tập huấn luyện chính và phần dữ liệu nhỏ để tinh chỉnh tham số
X_train_main, X_train_tune, y_train_main, y_train_tune = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Thiết lập grid search trên phần dữ liệu nhỏ để tinh chỉnh tham số
grid_search_tune = GridSearchCV(rf, param_grid, cv=5, verbose=1)
grid_search_tune.fit(X_train_tune, y_train_tune)

# Siêu tham số tốt nhất từ phần dữ liệu nhỏ
best_params_tune = grid_search_tune.best_params_
print("Best Parameters (Tuning):", best_params_tune)

# Sử dụng toàn bộ dữ liệu huấn luyện chính để fit mô hình với siêu tham số tốt nhất từ phần tinh chỉnh
best_model_main = RandomForestClassifier(**best_params_tune)
best_model_main.fit(X_train_main, y_train_main)

# Đánh giá mô hình trên tập kiểm tra
test_score_main = best_model_main.score(X_test, y_test)
print("Test Accuracy (Final Model):", test_score_main)


Fitting 5 folds for each of 972 candidates, totalling 4860 fits




Best Parameters (Tuning): {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': None}
Test Accuracy (Final Model): 0.972768532526475


In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Load your data
data = pd.read_csv('/mnt/d/20232024-01-CT252-B2003731_final/20232024-01-CT252-B2003731_final/deep-metagnomics-pub-main-final/data/Chatelier_gut_obesity_x.csv')

data.drop(columns=['Unnamed: 0'], inplace=True)

# Transpose the data
data_transposed = data.transpose()

# Assuming the last column contains labels
label_column = data_transposed.columns[-1]

# Define features and labels
X = data_transposed.drop(columns=[label_column])
y = data_transposed[label_column]

# Convert labels to strings
y = y.astype(str)

# Apply LabelEncoder to labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Thiết lập các siêu tham số để thử nghiệm
param_grid = {
    
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'random_state': [None, 42, 100],  # Đặt giá trị random_state tùy ý
    'max_features': ['auto', 'sqrt', 'log2']  # Các cách chọn số features khi tìm split node

}

# Tạo mô hình RandomForestClassifier
rf = RandomForestClassifier()
# Chia dữ liệu thành tập huấn luyện chính và phần dữ liệu nhỏ để tinh chỉnh tham số
X_train_main, X_train_tune, y_train_main, y_train_tune = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Thiết lập grid search trên phần dữ liệu nhỏ để tinh chỉnh tham số
grid_search_tune = GridSearchCV(rf, param_grid, cv=5, verbose=1)
grid_search_tune.fit(X_train_tune, y_train_tune)

# Siêu tham số tốt nhất từ phần dữ liệu nhỏ
best_params_tune = grid_search_tune.best_params_
print("Best Parameters (Tuning):", best_params_tune)

# Sử dụng toàn bộ dữ liệu huấn luyện chính để fit mô hình với siêu tham số tốt nhất từ phần tinh chỉnh
best_model_main = RandomForestClassifier(**best_params_tune)
best_model_main.fit(X_train_main, y_train_main)

# Đánh giá mô hình trên tập kiểm tra
test_score_main = best_model_main.score(X_test, y_test)
print("Test Accuracy (Final Model):", test_score_main)


Fitting 5 folds for each of 972 candidates, totalling 4860 fits




Best Parameters (Tuning): {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': None}
Test Accuracy (Final Model): 0.9515885022692889
