In [2]:
import sys
from pathlib import Path
import pandas as pd

# Project root (notebook is inside /notebooks)
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Create results directory
GRAPHS_DIR = PROJECT_ROOT / "results" / "graphs"
GRAPHS_DIR.mkdir(parents=True, exist_ok=True)


from src.preprocess import preprocess
from src.model import train_and_evaluate

# Load data
RAW_DATA_PATH = PROJECT_ROOT / "data"
df = pd.read_csv(RAW_DATA_PATH / "telco_with_engineered_features.csv")


# Preprocessing
X_train, X_test, y_train, y_test, total_median = preprocess(df)

print("raw df:", df.shape[0])
print("X_train:", X_train.shape[0])
print("X_test:", X_test.shape[0])
print("Y_train:", y_train.shape[0])
print("Y_test:", y_test.shape[0])
print("total_median:", total_median, type(total_median))


# Train + evaluate (graphs saved)
model, preds, probs, best_t = train_and_evaluate(
    X_train, X_test, y_train, y_test,
    save_dir=GRAPHS_DIR
)

print("Graphs saved to:", GRAPHS_DIR)

raw df: 7043
X_train: 5634
X_test: 1409
Y_train: 5634
Y_test: 1409
total_median: 1397.475 <class 'float'>

Running Random Forest randomized search...
Fitting 5 folds for each of 20 candidates, totalling 100 fits

Running LightGBM randomized search...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[LightGBM] [Info] Number of positive: 2069, number of negative: 4139
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001498 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333280 -> initscore=-0.693389
[LightGBM] [Info] Start training from score -0.693389

Chosen model: random_forest with CV F1 = 0.6334
Best params: {'smote__sampling_strategy': 0.7, 'clf__n_estimators': 400, 'clf__min_

In [4]:
from src.Optimal_Model import BusinessParams, score_customers_full, export_powerbi_assets


model, test_preds, test_probs, best_t = train_and_evaluate(
    X_train, X_test, y_train, y_test, save_dir=GRAPHS_DIR
)

params = BusinessParams(threshold=best_t)

params = BusinessParams(threshold=best_t)

scored_full, kpi, risk_summary, insight = score_customers_full(
    model=model,
    df_full=df,
    params=params,
    total_median=total_median
)


RESULTS_DIR = PROJECT_ROOT / "results"

export_powerbi_assets(
    results_dir=RESULTS_DIR,
    scored=scored_full,
    kpi=kpi,
    risk_summary=risk_summary,
    insight=insight,
    params=params,
    prefix="full"
)





Running Random Forest randomized search...
Fitting 5 folds for each of 20 candidates, totalling 100 fits

Running LightGBM randomized search...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[LightGBM] [Info] Number of positive: 2069, number of negative: 4139
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005293 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 6208, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333280 -> initscore=-0.693389
[LightGBM] [Info] Start training from score -0.693389

Chosen model: random_forest with CV F1 = 0.6334
Best params: {'smote__sampling_strategy': 0.7, 'clf__n_estimators': 400, 'clf__min_samples_split': 2, 'clf__max_depth': 6}

Model Performance (with tuned threshold)
Chosen model: random_for

WindowsPath('c:/Users/hisuk/03.telco_churn_project/results')