In [3]:
import dagshub
dagshub.init(repo_owner='AMR-ITH', repo_name='yt-comment-analyzer', mlflow=True)

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=4f305c81-34a9-4761-842c-41c55d41a4ad&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=3a9f005cba86e8e68c7990144cba795a9b7092449170cbd4f7a65c63a6ad4f97




In [8]:
# Set or create an experiment
mlflow.set_experiment("ML Algos with HP Tuning")

<Experiment: artifact_location='mlflow-artifacts:/6372c8dc1c894e3290997ce5e2b55158', creation_time=1751625225992, experiment_id='4', last_update_time=1751625225992, lifecycle_stage='active', name='ML Algos with HP Tuning', tags={}>

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from lightgbm import LGBMClassifier
import mlflow
import mlflow.sklearn
import optuna
from imblearn.under_sampling import RandomUnderSampler

In [9]:
df = pd.read_csv('/content/reddit_preprocessing.csv').dropna()
df.shape

(36662, 2)

In [11]:
# Step 1: Clean data
df = df.dropna(subset=['category'])
y = df['category'].map({-1: 2, 0: 0, 1: 1})
X_raw = df['clean_comment']

# Step 2: Train-test split BEFORE vectorization or resampling
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, random_state=42, stratify=y)

# Step 3: Vectorization only on training data
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=3000)
X_train_vect = vectorizer.fit_transform(X_train_raw)
X_test_vect = vectorizer.transform(X_test_raw)  # Note: transform only!

# Step 4: Undersampling (only on training data)
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_vect, y_train)

# # Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_train_resampled, y_train_resampled, test_size=0.2, random_state=42, stratify=y_train_resampled)




# Function to log results in MLflow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run():
        # Log model type
        mlflow.set_tag("mlflow.runName", f"{model_name}_RandomUnderSampler_TFIDF_Trigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)

        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)



# Step 6: Optuna objective function for LightGBM
def objective_lightgbm(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
    max_depth = trial.suggest_int('max_depth', 3, 10)

    model = LGBMClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=42)
    return accuracy_score(y_test, model.fit(X_train, y_train).predict(X_test))


# Step 7: Run Optuna for LightGBM, log the best model only
def run_optuna_experiment():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_lightgbm, n_trials=30)

    # Get the best parameters and log only the best model
    best_params = study.best_params
    best_model = LGBMClassifier(n_estimators=best_params['n_estimators'], learning_rate=best_params['learning_rate'], max_depth=best_params['max_depth'], random_state=42)

    # Log the best model with MLflow, passing the algo_name as "LightGBM"
    log_mlflow("LightGBM", best_model, X_train, X_test, y_train, y_test)

# Run the experiment for LightGBM
run_optuna_experiment()


[I 2025-07-11 16:43:57,904] A new study created in memory with name: no-name-2b5c5202-f1b9-4a69-87cb-a5233ab63c8a


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.234656 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:44:04,027] Trial 0 finished with value: 0.7956554685526648 and parameters: {'n_estimators': 269, 'learning_rate': 0.09177880551980942, 'max_depth': 4}. Best is trial 0 with value: 0.7956554685526648.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.396194 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:44:06,923] Trial 1 finished with value: 0.5145238696640566 and parameters: {'n_estimators': 96, 'learning_rate': 0.000630701918262064, 'max_depth': 3}. Best is trial 0 with value: 0.7956554685526648.






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.264798 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:44:11,179] Trial 2 finished with value: 0.6226319777721647 and parameters: {'n_estimators': 61, 'learning_rate': 0.00462295650815927, 'max_depth': 10}. Best is trial 0 with value: 0.7956554685526648.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.254583 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:44:19,888] Trial 3 finished with value: 0.6029300328365749 and parameters: {'n_estimators': 295, 'learning_rate': 0.003949555445037024, 'max_depth': 4}. Best is trial 0 with value: 0.7956554685526648.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.236710 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:44:24,800] Trial 4 finished with value: 0.7405910583480677 and parameters: {'n_estimators': 291, 'learning_rate': 0.045537491593859385, 'max_depth': 3}. Best is trial 0 with value: 0.7956554685526648.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.279998 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:44:31,955] Trial 5 finished with value: 0.5455923212932559 and parameters: {'n_estimators': 224, 'learning_rate': 0.0007734923284578938, 'max_depth': 4}. Best is trial 0 with value: 0.7956554685526648.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.245319 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:44:36,181] Trial 6 finished with value: 0.6019196766860319 and parameters: {'n_estimators': 216, 'learning_rate': 0.007431738601092078, 'max_depth': 3}. Best is trial 0 with value: 0.7956554685526648.






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.242038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:44:38,350] Trial 7 finished with value: 0.577923718110634 and parameters: {'n_estimators': 86, 'learning_rate': 0.010137779892652004, 'max_depth': 3}. Best is trial 0 with value: 0.7956554685526648.






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.255726 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:44:49,354] Trial 8 finished with value: 0.6021722657236676 and parameters: {'n_estimators': 168, 'learning_rate': 0.0015499213995748213, 'max_depth': 9}. Best is trial 0 with value: 0.7956554685526648.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.245228 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:44:58,247] Trial 9 finished with value: 0.5945945945945946 and parameters: {'n_estimators': 140, 'learning_rate': 0.0016696227182034086, 'max_depth': 8}. Best is trial 0 with value: 0.7956554685526648.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.241899 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:45:10,402] Trial 10 finished with value: 0.5764081838848194 and parameters: {'n_estimators': 246, 'learning_rate': 0.00011076977336423887, 'max_depth': 6}. Best is trial 0 with value: 0.7956554685526648.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.234278 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:45:20,172] Trial 11 finished with value: 0.7913614549128568 and parameters: {'n_estimators': 297, 'learning_rate': 0.05466001282184537, 'max_depth': 6}. Best is trial 0 with value: 0.7956554685526648.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.403353 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:45:29,608] Trial 12 finished with value: 0.7906036877999495 and parameters: {'n_estimators': 259, 'learning_rate': 0.06182526344699977, 'max_depth': 6}. Best is trial 0 with value: 0.7956554685526648.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.234718 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:45:38,653] Trial 13 finished with value: 0.7370548118211669 and parameters: {'n_estimators': 271, 'learning_rate': 0.027759156332787166, 'max_depth': 5}. Best is trial 0 with value: 0.7956554685526648.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.235651 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:45:47,106] Trial 14 finished with value: 0.801970194493559 and parameters: {'n_estimators': 203, 'learning_rate': 0.08568023746246958, 'max_depth': 7}. Best is trial 14 with value: 0.801970194493559.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.416814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:45:57,621] Trial 15 finished with value: 0.7173528668855772 and parameters: {'n_estimators': 195, 'learning_rate': 0.01939802448637086, 'max_depth': 8}. Best is trial 14 with value: 0.801970194493559.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.238204 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:46:04,977] Trial 16 finished with value: 0.791108865875221 and parameters: {'n_estimators': 158, 'learning_rate': 0.09081151186162906, 'max_depth': 7}. Best is trial 14 with value: 0.801970194493559.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.272917 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:46:15,286] Trial 17 finished with value: 0.7092700176812327 and parameters: {'n_estimators': 199, 'learning_rate': 0.02008271754700853, 'max_depth': 7}. Best is trial 14 with value: 0.801970194493559.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.254411 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:46:21,615] Trial 18 finished with value: 0.8024753725688305 and parameters: {'n_estimators': 236, 'learning_rate': 0.09946063383230312, 'max_depth': 5}. Best is trial 18 with value: 0.8024753725688305.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.248414 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:46:27,918] Trial 19 finished with value: 0.6375347309926749 and parameters: {'n_estimators': 138, 'learning_rate': 0.011963151355867526, 'max_depth': 5}. Best is trial 18 with value: 0.8024753725688305.






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.234844 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:46:41,915] Trial 20 finished with value: 0.5862591563526143 and parameters: {'n_estimators': 236, 'learning_rate': 0.00015592309510992583, 'max_depth': 8}. Best is trial 18 with value: 0.8024753725688305.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.260146 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:46:47,568] Trial 21 finished with value: 0.7890881535741349 and parameters: {'n_estimators': 199, 'learning_rate': 0.09157655672832819, 'max_depth': 5}. Best is trial 18 with value: 0.8024753725688305.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.230961 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:46:54,913] Trial 22 finished with value: 0.733771154331902 and parameters: {'n_estimators': 271, 'learning_rate': 0.03303909505017109, 'max_depth': 4}. Best is trial 18 with value: 0.8024753725688305.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.237263 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:47:01,492] Trial 23 finished with value: 0.7959080575903006 and parameters: {'n_estimators': 242, 'learning_rate': 0.08317658553640207, 'max_depth': 5}. Best is trial 18 with value: 0.8024753725688305.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.249646 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:47:11,619] Trial 24 finished with value: 0.7678706744127305 and parameters: {'n_estimators': 223, 'learning_rate': 0.042405658315772914, 'max_depth': 7}. Best is trial 18 with value: 0.8024753725688305.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.230766 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:47:19,312] Trial 25 finished with value: 0.6640565799444305 and parameters: {'n_estimators': 191, 'learning_rate': 0.015393179515306145, 'max_depth': 5}. Best is trial 18 with value: 0.8024753725688305.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.242125 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:47:28,352] Trial 26 finished with value: 0.7522101540793129 and parameters: {'n_estimators': 246, 'learning_rate': 0.03384049283902488, 'max_depth': 6}. Best is trial 18 with value: 0.8024753725688305.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.295911 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:47:35,283] Trial 27 finished with value: 0.7976761808537509 and parameters: {'n_estimators': 213, 'learning_rate': 0.09994322435700166, 'max_depth': 5}. Best is trial 18 with value: 0.8024753725688305.






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.258717 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:47:45,828] Trial 28 finished with value: 0.7360444556706239 and parameters: {'n_estimators': 214, 'learning_rate': 0.02544326863340062, 'max_depth': 7}. Best is trial 18 with value: 0.8024753725688305.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.246767 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675


[I 2025-07-11 16:47:54,676] Trial 29 finished with value: 0.7903510987623137 and parameters: {'n_estimators': 177, 'learning_rate': 0.06197482163267644, 'max_depth': 9}. Best is trial 18 with value: 0.8024753725688305.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.448733 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64705
[LightGBM] [Info] Number of data points in the train set: 15835, number of used features: 2385
[LightGBM] [Info] Start training from score -1.098486
[LightGBM] [Info] Start training from score -1.098675
[LightGBM] [Info] Start training from score -1.098675




🏃 View run LightGBM_RandomUnderSampler_TFIDF_Trigrams at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/4/runs/b566b25f0b8f418d8e6ea9bc023b88d5
🧪 View experiment at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/4
