In [1]:
import pandas as pd
import numpy as np
import os
import joblib
import time
import lightgbm as lgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Experiment configuration: the values a reader is most likely to tune.
CATEGORY_NAME: str = "Toys_and_Games"  # dataset category; also used to build file paths
TEST_SIZE: float = 0.2                 # fraction of rows held out by train_test_split
RANDOM_STATE: int = 42                 # seed shared by the split, the sampler and the model
CV_FOLDS: int = 3                      # number of cross-validation folds in the grid search
N_JOBS: int = 4                        # parallel jobs requested from GridSearchCV

In [None]:
# Locate the processed dataset relative to the parent of the current working
# directory (presumably the project root when the notebook runs from a
# subfolder -- confirm if the notebook is launched from elsewhere).
base_path = os.path.dirname(os.getcwd())
data_dir = os.path.join(base_path, "data", "processed", CATEGORY_NAME)
# The parquet file is named after the lower-cased category.
data_path = os.path.join(data_dir, CATEGORY_NAME.lower() + ".parquet")

In [None]:
# Read the processed dataset, then separate the "class" target from the features.
data = pd.read_parquet(data_path)
y = data["class"]
X = data.drop(columns="class")

In [None]:
# Hold out TEST_SIZE of the rows for final evaluation; stratifying on y keeps
# the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [None]:
# Numeric columns: missing values are replaced with the constant 0.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
])

# Feature extraction: TF-IDF on the 'cleaned_text' column plus the imputed
# numeric columns. The text column is given as a plain string (not a list),
# while the numeric columns are selected with a list.
preprocessor = ColumnTransformer([
    ('text', TfidfVectorizer(), 'cleaned_text'),
    ('numeric', numeric_transformer, ['overall', 'helpfulness_ratio']),
])

In [None]:
# End-to-end training pipeline: feature extraction -> random undersampling ->
# LightGBM classifier. The sampler and the model share RANDOM_STATE as seed.
pipeline_lgbm = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('sampler', RandomUnderSampler(random_state=RANDOM_STATE)),
        ('model', lgb.LGBMClassifier(random_state=RANDOM_STATE)),
    ]
)

In [None]:
# Hyperparameter grid for the search; keys use the
# "<pipeline step>__<parameter>" naming expected by GridSearchCV.
param_grid_lgbm = {
    # Value that gave the best result in the previous experiment.
    'preprocessor__text__max_features': [15000],
    # Number of trees.
    'model__n_estimators': [100, 200],
    # Learning rate.
    'model__learning_rate': [0.1, 0.05],
    # Complexity of each tree.
    'model__num_leaves': [31, 50],
}

In [None]:
# Exhaustive search over param_grid_lgbm, scored by weighted F1 using
# CV_FOLDS-fold cross-validation on the training split.
grid_search_lgbm = GridSearchCV(
    estimator=pipeline_lgbm,
    param_grid=param_grid_lgbm,
    scoring='f1_weighted',
    cv=CV_FOLDS,
    n_jobs=N_JOBS,
    verbose=2,
)
print("LightGBM için GridSearchCV başlıyor... (Bu işlem Naive Bayes'ten daha uzun sürecektir)")

# Wall-clock duration of the fit, converted from seconds to minutes.
start_time = time.time()
grid_search_lgbm.fit(X_train, y_train)
end_time = time.time()
duration_minutes = (end_time - start_time) / 60

In [None]:
# Report the search outcome (duration, best CV score, best parameters) in a
# single print call; sep="\n" puts each item on its own line.
print(
    "\n" + "=" * 50,
    "LGBM HİPERPARAMETRE OPTİMİZASYONU SONUÇLARI",
    "=" * 50,
    f"Toplam Süre: {duration_minutes:.2f} dakika",
    f"En iyi F1 Skoru (Çapraz Doğrulama ile): {grid_search_lgbm.best_score_:.4f}",
    "Bulunan En İyi Parametreler:",
    grid_search_lgbm.best_params_,
    "-" * 50,
    sep="\n",
)
# Keep a handle on the best pipeline found by the search for evaluation and saving.
best_lgbm_model = grid_search_lgbm.best_estimator_

In [None]:
# Append this run's metrics to the shared results log (one CSV row per run).
results_file = os.path.join(base_path, "reports", "model_results_log.csv")
# Create the reports directory first: to_csv does not create missing parent
# folders, and failing here would come only after the long grid search.
os.makedirs(os.path.dirname(results_file), exist_ok=True)

# Evaluate the best pipeline on the held-out test split.
report = classification_report(y_test, best_lgbm_model.predict(X_test), output_dict=True)
result_data = {
    'category': f"{CATEGORY_NAME}_LGBM",  # suffix identifies the model
    'best_cv_f1_score': grid_search_lgbm.best_score_,
    'test_accuracy': report['accuracy'],
    # NOTE(review): label '0' is logged as the "real review" class -- confirm
    # this matches the encoding of the "class" column.
    'test_f1_real_review': report['0']['f1-score'],
    'test_precision_real_review': report['0']['precision'],
    # ... (other metrics)
    'best_params': str(grid_search_lgbm.best_params_),
    'training_time_minutes': duration_minutes
}
temp_df = pd.DataFrame([result_data])
# Write the header when the log is new or exists but is still empty, so the
# first data row is never left without column names.
header = not os.path.exists(results_file) or os.path.getsize(results_file) == 0
temp_df.to_csv(results_file, mode='a', header=header, index=False)
print(f"LGBM sonuçları '{results_file}' dosyasına eklendi.")

In [None]:
# Persist the best pipeline under models/<CATEGORY_NAME>/, creating the
# folder if it does not exist yet.
model_filename = "lightgbm_" + CATEGORY_NAME.lower() + ".joblib"
model_dir = os.path.join(base_path, "models", CATEGORY_NAME)
model_path = os.path.join(model_dir, model_filename)
os.makedirs(model_dir, exist_ok=True)
joblib.dump(best_lgbm_model, model_path)
print(f"Eğitilmiş LightGBM modeli '{model_path}' olarak kaydedildi.")