 ## LightGBM and Smoothing Parameter

In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os
import numpy as np

def adjust_prediction_probabilities(y_pred_probs, recent_preds, confidence_threshold=0.9996, max_weight=0.65, min_consecutive=2, weight_per_consecutive=0.17):
    """
    Nudge binary class probabilities toward the class of the current streak.

    The trailing run of identical values at the end of ``recent_preds`` is
    measured. When that run is long enough and the model is not already
    confident, probability mass is moved from the other class to the
    streak's class.

    Args:
        y_pred_probs: numpy array of shape (2,), predicted probs for classes [0,1]
        recent_preds: list of recent predictions (0/1), oldest first
        confidence_threshold: no adjustment if the top probability is >= this value
        max_weight: upper bound on the probability mass that is moved
        min_consecutive: minimum trailing run length that triggers adjustment
        weight_per_consecutive: mass moved per prediction in the run beyond the first

    Returns:
        ``y_pred_probs`` itself (same object, untouched) when no adjustment
        applies; otherwise a new numpy array of adjusted probabilities,
        clipped to [0, 1] and rescaled to sum to 1.
    """
    # Not enough history to form a streak of the required length.
    if not recent_preds or len(recent_preds) < min_consecutive:
        return y_pred_probs

    # Walk backwards from the second-to-last entry while the class repeats.
    streak_class = recent_preds[-1]
    streak_len = 1
    pos = len(recent_preds) - 2
    while pos >= 0 and recent_preds[pos] == streak_class:
        streak_len += 1
        pos -= 1

    if streak_len < min_consecutive:
        return y_pred_probs

    # Leave the model alone when its top class is already confident enough.
    if y_pred_probs[np.argmax(y_pred_probs)] >= confidence_threshold:
        return y_pred_probs

    # The first element of the run carries no weight; the total is capped.
    boost = min((streak_len - 1) * weight_per_consecutive, max_weight)

    # Work on a copy so the caller's array is never modified.
    shifted = y_pred_probs.copy()
    shifted[streak_class] += boost
    shifted[1 - streak_class] -= boost

    # Keep values inside [0, 1] and rescale so they sum to 1.
    shifted = np.clip(shifted, 0, 1)
    shifted /= shifted.sum()

    return shifted

# Identifiers of the Wind Farm B datasets to evaluate; dataset i is read from
# "<i>_WindFarm_B.csv".
dataset_ids = [2,7,19,21,23,27,34,52,53,74,77,82,83,86,87]

save_dir = r"D:\Master Thesis New Data Set\Analysis\Model Selection"
os.makedirs(save_dir, exist_ok=True)

# Number of most recent smoothed predictions kept as context for smoothing.
recent_window = 5

# One metrics dict per dataset, collected for the final summary.
all_results = []

for i in dataset_ids:
    print(f"\n================== Processing Dataset {i} ==================")

    # Add power-curve status-reassigned data here.
    df = pd.read_csv(rf"D:\Master Thesis New Data Set\Status Reassignment Dataset\Wind Farm B\{i}_WindFarm_B.csv", delimiter=',')

    # Normalise the split marker so stray whitespace / casing does not matter.
    df['train_test'] = df['train_test'].astype(str).str.strip().str.lower()
    df['time_stamp'] = pd.to_datetime(df['time_stamp'])
    # Calendar features derived from the timestamp.
    df['hour'] = df['time_stamp'].dt.hour
    df['dayofweek'] = df['time_stamp'].dt.dayofweek
    df['month'] = df['time_stamp'].dt.month

    # Binary target: 0 when status_type_id == 0, otherwise 1 (vectorised
    # instead of a per-row lambda).
    df['status_type_binary'] = (df['status_type_id'] != 0).astype(int)

    train_df = df[df['train_test'] == 'train']
    # Copy up front because prediction columns are added to this frame below.
    predict_df = df[df['train_test'] == 'prediction'].copy()

    # Everything except identifiers, the timestamp, the split marker and the
    # targets is used as a model feature.
    cols_to_remove = ['status_type_id', 'status_type_binary', 'time_stamp', 'id', 'asset_id', 'train_test']
    feature_cols = [col for col in df.columns if col not in cols_to_remove]

    X_train = train_df[feature_cols].copy()
    X_predict = predict_df[feature_cols].copy()
    y_train = train_df['status_type_binary']

    label_encoders = {}
    for col in X_train.select_dtypes(include=['object', 'category']).columns:
        le = LabelEncoder()
        train_values = X_train[col].astype(str)
        predict_values = X_predict[col].astype(str)
        # Fit on the values of both splits: LabelEncoder.transform raises
        # ValueError for labels it has not seen, so a category that occurs
        # only in the prediction rows would otherwise abort the run. When no
        # such category exists the resulting codes are the same as when
        # fitting on the training values alone.
        le.fit(pd.concat([train_values, predict_values]))
        X_train[col] = le.transform(train_values)
        X_predict[col] = le.transform(predict_values)
        label_encoders[col] = le

    lgb_train = lgb.Dataset(X_train, label=y_train)
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'seed': 42
    }
    clf = lgb.train(params, lgb_train, num_boost_round=100)

    # Score all prediction rows in one call. The result (class-1 probability
    # per row) feeds both the smoothing loop and the unsmoothed column, rather
    # than calling the model once per row and then once more for the frame.
    prob_1_all = clf.predict(X_predict)

    # NOTE(review): smoothing follows the row order of the file; this assumes
    # rows are already in chronological order -- confirm.
    recent_preds = []
    y_pred_binary_smooth = []

    for prob_1 in prob_1_all:
        # Probabilities for classes [0, 1].
        probs = np.array([1 - prob_1, prob_1])

        # Adjust probabilities based on the recent smoothed predictions.
        adjusted_probs = adjust_prediction_probabilities(probs, recent_preds)

        # Final prediction
        pred_class = int(np.argmax(adjusted_probs))

        # Keep only the latest `recent_window` predictions as context.
        recent_preds.append(pred_class)
        if len(recent_preds) > recent_window:
            recent_preds.pop(0)

        y_pred_binary_smooth.append(pred_class)

    predict_df['predicted_status_type_binary'] = (prob_1_all >= 0.5).astype(int)
    predict_df['predicted_status_type_binary_smooth'] = y_pred_binary_smooth

    pred_file = os.path.join(save_dir, f"{i}_WindFarm_B_predictions_lgb_smoothed.csv")
    predict_df.to_csv(pred_file, index=False)

    # The true labels already sit in predict_df next to the predictions, so
    # they are read directly. Re-joining the same rows on 'id' would add
    # nothing for unique ids and would duplicate / mis-pair rows otherwise.
    y_true = predict_df['status_type_binary']
    y_pred_final = predict_df['predicted_status_type_binary_smooth']

    accuracy = accuracy_score(y_true, y_pred_final)
    precision = precision_score(y_true, y_pred_final, zero_division=0)
    recall = recall_score(y_true, y_pred_final, zero_division=0)
    f1 = f1_score(y_true, y_pred_final, zero_division=0)

    all_results.append({
        'Dataset': i,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1_Score': f1
    })

    print(f"✅ Accuracy: {accuracy:.4f}")
    print(f"✅ Precision: {precision:.4f}")
    print(f"✅ Recall: {recall:.4f}")
    print(f"✅ F1 Score: {f1:.4f}")

# Aggregate the per-dataset metrics of the smoothed predictions.
results_df = pd.DataFrame(all_results)
mean_results = results_df.mean(numeric_only=True)

print("\n================== Final Summary Across All Datasets ==================")
print(f"Mean Accuracy: {mean_results['Accuracy']:.4f}")
print(f"Mean Precision: {mean_results['Precision']:.4f}")
print(f"Mean Recall: {mean_results['Recall']:.4f}")
print(f"Mean F1 Score: {mean_results['F1_Score']:.4f}")

summary_file = os.path.join(save_dir, "lightgbm_evaluation_summary_smoothed.xlsx")
results_df.to_excel(summary_file, index=False)


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.




✅ Accuracy: 0.8885
✅ Precision: 0.5200
✅ Recall: 0.0524
✅ F1 Score: 0.0952

✅ Accuracy: 0.8531
✅ Precision: 0.8601
✅ Recall: 0.6439
✅ F1 Score: 0.7365

✅ Accuracy: 0.9191
✅ Precision: 0.9277
✅ Recall: 0.7523
✅ F1 Score: 0.8308

✅ Accuracy: 0.8211
✅ Precision: 0.9211
✅ Recall: 0.3889
✅ F1 Score: 0.5469

✅ Accuracy: 0.7786
✅ Precision: 0.8654
✅ Recall: 0.4845
✅ F1 Score: 0.6212

✅ Accuracy: 0.8600
✅ Precision: 0.9376
✅ Recall: 0.7076
✅ F1 Score: 0.8065

✅ Accuracy: 0.8113
✅ Precision: 0.7922
✅ Recall: 0.6198
✅ F1 Score: 0.6955

✅ Accuracy: 0.8173
✅ Precision: 0.7746
✅ Recall: 0.3884
✅ F1 Score: 0.5174

✅ Accuracy: 0.7202
✅ Precision: 0.7733
✅ Recall: 0.8372
✅ F1 Score: 0.8040

✅ Accuracy: 0.8962
✅ Precision: 0.9065
✅ Recall: 0.9071
✅ F1 Score: 0.9068

✅ Accuracy: 0.7185
✅ Precision: 0.6746
✅ Recall: 0.6805
✅ F1 Score: 0.6775

✅ Accuracy: 0.8481
✅ Precision: 0.9330
✅ Recall: 0.7073
✅ F1 Score: 0.8046

✅ Accuracy: 0.7057
✅ Precision: 0.6206
✅ Recall: 0.4233
✅ F1 Score: 0.5033

✅ Accuracy: