# Pre-training: setup, data loading, and preprocessing

In [None]:
!pip install optuna
!pip install category_encoders



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import time
import lightgbm as lgb

from category_encoders import OneHotEncoder, MEstimateEncoder, CatBoostEncoder, OrdinalEncoder
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import KFold, cross_val_score, cross_validate, cross_val_predict
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import roc_auc_score, roc_curve, make_scorer, f1_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
from sklearn.preprocessing import FunctionTransformer, StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.utils.class_weight import compute_sample_weight

from xgboost import XGBClassifier, XGBRFClassifier
from optuna.samplers import GridSampler, RandomSampler, TPESampler

In [None]:
# Mount Google Drive so the project files are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Change into the project folder on Drive and list its contents; the relative
# CSV paths used by later cells are resolved against this working directory.
!ls /content/drive/
%cd "/content/drive/My Drive/"
%cd "團專"
!ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
MyDrive
/content/drive/My Drive
/content/drive/My Drive/團專
24-0611-RandomForestClassifier推理階段.ipynb  Gradio_報告稿.gdoc
24-0611-RandomForestClassifier訓練階段.ipynb  LGBM_Gradio介面.ipynb
24-0611-團專EDA.ipynb			      test.csv
24_0621_初版Gradio介面.ipynb		      train.csv
24_0624_2版Gradio介面.ipynb		      train_encode.csv
24_0624_3版Gradio介面.ipynb		      train_encoded_oh.csv
24_0626_4版Gradio介面.ipynb		      故事敘述.gdoc
Gradio_口頭報告稿_完整最終版.txt	      補值原因.gsheet


In [None]:
# Load the train/test tables, using the `id` column as the row index.
train = pd.read_csv('train.csv', index_col = 'id')
test = pd.read_csv('test.csv', index_col = 'id')
# Split off the target; `pop` removes 'Exited' from `train` in place.
y = train.pop('Exited')

In [None]:
# Inspect the columns that remain after the target was removed.
train.columns

Index(['CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age',
       'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
       'EstimatedSalary'],
      dtype='object')

In [None]:
def feature_generator(df):
    """Return a copy of ``df`` with the engineered feature columns appended.

    The input frame is not modified. Added columns, in order:

    - ``IsNewCustomer``: 1 where ``Tenure`` equals 0, else 0.
    - ``IsSenior``: 1 where ``Age`` is at least 60, else 0.
    - ``HasBalance``: 1 where ``Balance`` is greater than 0, else 0.
    - ``IsActive_by_CreditCard``: ``HasCrCard * IsActiveMember``.
    - ``Products_Per_Tenure``: ``Tenure / NumOfProducts`` (note that, despite
      the name, this is tenure divided by the number of products).
    - ``HighProductGroup``: 1 where ``NumOfProducts`` is at least 3, else 0.
    - ``AgeCat``: ``Age / 20`` rounded, cast to int, stored as a category.
    """
    return df.assign(
        IsNewCustomer=(df['Tenure'] == 0).astype('int64'),
        IsSenior=(df['Age'] >= 60).astype('int64'),
        HasBalance=(df['Balance'] > 0).astype('int64'),
        IsActive_by_CreditCard=df['HasCrCard'] * df['IsActiveMember'],
        Products_Per_Tenure=df['Tenure'] / df['NumOfProducts'],
        HighProductGroup=(df['NumOfProducts'] >= 3).astype('int64'),
        AgeCat=np.round(df['Age'] / 20).astype('int').astype('category'),
    )

In [None]:
# Column groupings for the raw features.
# cat_cols: columns grouped as categorical / discrete.
cat_cols = ['Geography','Gender','Tenure','NumOfProducts','HasCrCard','IsActiveMember']
# num_cols: continuous columns; these are the ones standardized further below.
num_cols = ['CreditScore', 'Age','Balance','EstimatedSalary']
# Name of the label column (already popped from `train` into `y`).
target = ['Exited']
features = num_cols + cat_cols

In [None]:
# Drop the identifier columns by name (rather than by position) before feature
# generation, so the selection does not silently depend on the CSV column order.
tr_g = feature_generator(train.drop(columns=['CustomerId', 'Surname']))
tr_g.head(10)

Unnamed: 0_level_0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,IsNewCustomer,IsSenior,HasBalance,IsActive_by_CreditCard,Products_Per_Tenure,HighProductGroup,AgeCat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,668,France,Male,33.0,3,0.0,2,1,0,181449.97,0,0,0,0,1.5,0,2
1,627,France,Male,33.0,1,0.0,2,1,1,49503.5,0,0,0,1,0.5,0,2
2,678,France,Male,40.0,10,0.0,2,1,0,184866.69,0,0,0,0,5.0,0,2
3,581,France,Male,34.0,2,148882.54,1,1,1,84560.88,0,0,1,1,2.0,0,2
4,716,Spain,Male,33.0,5,0.0,2,1,1,15068.83,0,0,0,1,2.5,0,2
5,588,Germany,Male,36.0,4,131778.58,1,1,0,136024.31,0,0,1,0,4.0,0,2
6,593,France,Female,30.0,8,144772.69,1,1,0,29792.11,0,0,1,0,8.0,0,2
7,678,Spain,Male,37.0,1,138476.41,1,1,0,106851.6,0,0,1,0,1.0,0,2
8,676,France,Male,43.0,4,0.0,2,1,0,142917.13,0,0,0,0,2.0,0,2
9,583,Germany,Male,40.0,4,81274.33,1,1,1,170843.07,0,0,1,1,4.0,0,2


In [None]:
# Apply the same feature generation to the test set, dropping the identifier
# columns by name so the selection does not depend on column position.
te_g = feature_generator(test.drop(columns=['CustomerId', 'Surname']))
te_g.head(10)

Unnamed: 0_level_0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,IsNewCustomer,IsSenior,HasBalance,IsActive_by_CreditCard,Products_Per_Tenure,HighProductGroup,AgeCat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
165034,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75,0,0,0,0.0,1.0,0,1
165035,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27,0,0,0,0.0,2.0,0,2
165036,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09,0,0,0,0.0,3.5,0,2
165037,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57,0,0,0,0.0,8.0,0,2
165038,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0,0,0,1,0.0,10.0,0,2
165039,593,France,Female,22.0,9,0.0,2,0.0,0.0,51907.72,0,0,0,0.0,4.5,0,1
165040,682,Spain,Male,45.0,4,0.0,2,1.0,1.0,157878.67,0,0,0,1.0,2.0,0,2
165041,539,Spain,Female,47.0,8,0.0,2,1.0,1.0,126784.29,0,0,0,1.0,4.0,0,2
165042,845,France,Female,47.0,3,111096.91,1,1.0,0.0,94978.1,0,0,1,0.0,3.0,0,2
165043,645,Spain,Male,30.0,5,0.0,2,0.0,1.0,149195.44,0,0,0,0.0,2.5,0,2


In [None]:
# Standardize the numeric columns with ONE scaler fitted on the training data
# only and then applied to the test data. StandardScaler standardizes each
# column independently, so fitting all of `num_cols` at once replaces the
# per-column loop while keeping a single fitted `sc` that covers every numeric
# column (the loop kept only the scaler of the last column).
# NOTE: this overwrites the columns in place, so re-running the cell would
# standardize already-standardized values; re-create tr_g / te_g first.
sc = StandardScaler()
tr_g[num_cols] = sc.fit_transform(tr_g[num_cols])
te_g[num_cols] = sc.transform(te_g[num_cols])

In [None]:
# One-hot encode Geography and Gender. The encoder is fitted on the training
# frame and reused unchanged on the test frame so both get identical columns.
# use_cat_names=True names the new columns after the category values
# (e.g. Geography_France, Gender_Male).
encoder = OneHotEncoder(cols=['Geography', 'Gender'], use_cat_names=True)
X_train = encoder.fit_transform(tr_g)
X_test = encoder.transform(te_g)

In [None]:
# Preview the scaled and one-hot-encoded training features.
X_train.head(10)

Unnamed: 0_level_0,CreditScore,Geography_France,Geography_Spain,Geography_Germany,Gender_Male,Gender_Female,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,IsNewCustomer,IsSenior,HasBalance,IsActive_by_CreditCard,Products_Per_Tenure,HighProductGroup,AgeCat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,0.144135,1,0,0,1,0,-0.578074,3,-0.883163,2,1,0,1.369486,0,0,0,0,1.5,0,2
1,-0.367706,1,0,0,1,0,-0.578074,1,-0.883163,2,1,1,-1.254085,0,0,0,1,0.5,0,2
2,0.268974,1,0,0,1,0,0.211354,10,-0.883163,2,1,0,1.437422,0,0,0,0,5.0,0,2
3,-0.941966,1,0,0,1,0,-0.465299,2,1.486918,1,1,1,-0.557018,0,0,1,1,2.0,0,2
4,0.743362,0,1,0,1,0,-0.578074,5,-0.883163,2,1,1,-1.93877,0,0,0,1,2.5,0,2
5,-0.854578,0,0,1,1,0,-0.239748,4,1.214638,1,1,0,0.46626,0,0,1,0,4.0,0,2
6,-0.792159,1,0,0,0,1,-0.916401,8,1.421493,1,1,0,-1.646018,0,0,1,0,8.0,0,2
7,0.268974,0,1,0,1,0,-0.126973,1,1.321262,1,1,0,-0.113798,0,0,1,0,1.0,0,2
8,0.244006,1,0,0,1,0,0.54968,4,-0.883163,2,1,0,0.603314,0,0,0,0,2.0,0,2
9,-0.916998,0,0,1,1,0,0.211354,4,0.410654,1,1,1,1.158582,0,0,1,1,4.0,0,2


In [None]:
# Preview the scaled and one-hot-encoded test features.
X_test.head(10)

Unnamed: 0_level_0,CreditScore,Geography_France,Geography_Spain,Geography_Germany,Gender_Male,Gender_Female,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,IsNewCustomer,IsSenior,HasBalance,IsActive_by_CreditCard,Products_Per_Tenure,HighProductGroup,AgeCat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
165034,-0.879546,1,0,0,0,1,-1.705829,2,-0.883163,2,0.0,1.0,0.962404,0,0,0,0.0,1.0,0,1
165035,0.331393,1,0,0,0,1,0.888007,2,-0.883163,1,1.0,0.0,-0.795852,0,0,0,0.0,2.0,0,2
165036,-0.005672,1,0,0,0,1,-0.465299,7,-0.883163,2,1.0,0.0,0.523083,0,0,0,0.0,3.5,0,2
165037,0.306425,1,0,0,1,0,-0.239748,8,-0.883163,1,1.0,0.0,0.026977,0,0,0,0.0,8.0,0,2
165038,1.192783,0,0,1,1,0,-0.014197,10,1.047249,1,1.0,0.0,0.533997,0,0,1,0.0,10.0,0,2
165039,-0.792159,1,0,0,0,1,-1.818604,9,-0.883163,2,0.0,0.0,-1.20628,0,0,0,0.0,4.5,0,1
165040,0.318909,0,1,0,1,0,0.775231,4,-0.883163,2,1.0,1.0,0.900803,0,0,0,1.0,2.0,0,2
165041,-1.46629,0,1,0,0,1,1.000782,8,-0.883163,2,1.0,1.0,0.282535,0,0,0,1.0,4.0,0,2
165042,2.353787,1,0,0,0,1,1.000782,3,0.885404,1,1.0,0.0,-0.349886,0,0,1,0.0,3.0,0,2
165043,-0.142995,0,1,0,1,0,-0.916401,5,-0.883163,2,0.0,1.0,0.72815,0,0,0,0.0,2.5,0,2


# Train the model

In [None]:
# Fixed LightGBM hyperparameters passed to the final model.
# NOTE(review): presumably the result of a tuning run not shown here -- confirm.
best_para = {'max_depth': 5, 'learning_rate': 0.06577863542148231, 'n_estimators': 250, 'subsample': 1.0, 'colsample_bytree': 0.4}

In [None]:
# Per-sample weights in scikit-learn's "balanced" mode (inversely proportional
# to class frequency), used to offset the class imbalance in `y` during fitting.
sample_weight = compute_sample_weight("balanced", y)

In [None]:
# Train the final LightGBM classifier on the encoded training features,
# weighting each row by the class-balancing sample weights.
final_model = lgb.LGBMClassifier(**best_para, metric='binary_logloss')
final_model.fit(X_train, y, sample_weight=sample_weight)

[LightGBM] [Info] Number of positive: 34921, number of negative: 130113
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.048551 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 907
[LightGBM] [Info] Number of data points in the train set: 165034, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [None]:
import gradio as gr
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from category_encoders import OneHotEncoder
from sklearn.utils.class_weight import compute_sample_weight
import lightgbm as lgb

# Feature engineering shared by training and inference. It is defined BEFORE
# its first use so that this cell does not rely on a definition left behind
# by an earlier cell.
def feature_generator(df):
    """Return a copy of ``df`` with the engineered feature columns appended.

    The input frame is not modified. Added columns, in order:

    - ``IsNewCustomer``: 1 where ``Tenure`` equals 0, else 0.
    - ``IsSenior``: 1 where ``Age`` is at least 60, else 0.
    - ``HasBalance``: 1 where ``Balance`` is greater than 0, else 0.
    - ``IsActive_by_CreditCard``: ``HasCrCard * IsActiveMember``.
    - ``Products_Per_Tenure``: ``Tenure / NumOfProducts`` (note that, despite
      the name, this is tenure divided by the number of products).
    - ``HighProductGroup``: 1 where ``NumOfProducts`` is at least 3, else 0.
    - ``AgeCat``: ``Age / 20`` rounded, cast to int, stored as a category.
    """
    return df.assign(
        IsNewCustomer=(df['Tenure'] == 0).astype('int64'),
        IsSenior=(df['Age'] >= 60).astype('int64'),
        HasBalance=(df['Balance'] > 0).astype('int64'),
        IsActive_by_CreditCard=df['HasCrCard'] * df['IsActiveMember'],
        Products_Per_Tenure=df['Tenure'] / df['NumOfProducts'],
        HighProductGroup=(df['NumOfProducts'] >= 3).astype('int64'),
        AgeCat=np.round(df['Age'] / 20).astype('int').astype('category'),
    )


# Load the training data and split off the target column.
train = pd.read_csv('train.csv', index_col='id')
y = train.pop('Exited')

# Feature generation on the training data without CustomerId and Surname,
# followed by standardization of the numeric columns. The fitted `sc` is kept
# at module level so inference can apply exactly the same scaling.
tr_g = feature_generator(train.drop(columns=['CustomerId', 'Surname']))
num_cols = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']
sc = StandardScaler()
tr_g[num_cols] = sc.fit_transform(tr_g[num_cols])

# One-hot encode Geography and Gender; the fitted `encoder` is likewise kept
# at module level for reuse at inference time.
encoder = OneHotEncoder(cols=['Geography', 'Gender'], use_cat_names=True)
X_train = encoder.fit_transform(tr_g)

# Model hyperparameters.
best_para = {'max_depth': 5, 'learning_rate': 0.06577863542148231, 'n_estimators': 250,
             'subsample': 1.0, 'colsample_bytree': 0.4}

# Class-balancing sample weights.
sample_weight = compute_sample_weight("balanced", y)

# Train the model.
final_model = lgb.LGBMClassifier(**best_para, metric='binary_logloss')
final_model.fit(X_train, y, sample_weight=sample_weight)

# Preprocessing + prediction callback used by the Gradio UI.
def predict_proba(credit_score, geography, gender, age, tenure, balance, num_of_products,
                  has_cr_card, is_active_member, estimated_salary):
    """Predict the churn probability for a single customer.

    The raw inputs go through the same steps as the training data: feature
    generation, standardization of the numeric columns, and one-hot encoding.
    The scaler (``sc``), encoder (``encoder``) and model (``final_model``)
    fitted at module level are reused as-is; nothing is refitted here and the
    training frame is not touched.

    Parameters
    ----------
    credit_score, age, balance, estimated_salary : number
        Numeric inputs on their original (unscaled) scale.
    geography, gender : str
        Category values as they appear in the training data.
    tenure, num_of_products, has_cr_card, is_active_member : int
        Discrete inputs.

    Returns
    -------
    str
        The churn probability formatted as a percentage, or a message asking
        the user to complete the form when any input is ``None`` (e.g. a
        dropdown that was left unselected).
    """
    # Dropdowns without a selection deliver None; report that instead of
    # failing inside the feature computations.
    raw_inputs = (credit_score, geography, gender, age, tenure, balance,
                  num_of_products, has_cr_card, is_active_member, estimated_salary)
    if any(value is None for value in raw_inputs):
        return "請填寫所有欄位後再進行預測。"

    # Build a one-row DataFrame with the same column names and order as the
    # training frame.
    input_data = pd.DataFrame({
        'CreditScore': [credit_score],
        'Geography': [geography],
        'Gender': [gender],
        'Age': [age],
        'Tenure': [tenure],
        'Balance': [balance],
        'NumOfProducts': [num_of_products],
        'HasCrCard': [has_cr_card],
        'IsActiveMember': [is_active_member],
        'EstimatedSalary': [estimated_salary]
    })

    # Feature generation
    input_data = feature_generator(input_data)

    # Standardize with the scaler fitted on the RAW training data. Refitting a
    # scaler here on `tr_g` would be wrong: `tr_g` is already standardized, so
    # such a scaler would have mean ~0 / scale ~1 and leave the input unscaled.
    num_cols = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']
    input_data[num_cols] = sc.transform(input_data[num_cols])

    # One-hot encode with the encoder fitted during training, so the column
    # layout matches what the model was trained on.
    input_data_encoded = encoder.transform(input_data)

    # Probability of the positive class (churn) for the single input row.
    proba = final_model.predict_proba(input_data_encoded)[:, 1][0]

    return f"客戶流失機率: {proba:.2%}"

# Choices offered by the dropdown widgets
geography_options = ['France', 'Germany', 'Spain']
gender_options = ['Male', 'Female']
tenure_options = list(range(11))  # 0 through 10
num_of_products_options = [1, 2, 3, 4]
has_cr_card_options = [0, 1]
is_active_member_options = [0, 1]



[LightGBM] [Info] Number of positive: 34921, number of negative: 130113
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 907
[LightGBM] [Info] Number of data points in the train set: 165034, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [None]:
# 建立預測函數
# def predict_churn(CreditScore, Geography, Gender, Age, Tenure, Balance,
#                   NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary):

#     # 轉換成模型需要的 DataFrame 格式（與訓練欄位一致）
#     input_df = pd.DataFrame([{
#         "CreditScore": CreditScore,
#         "Geography": Geography,
#         "Gender": Gender,
#         "Age": Age,
#         "Tenure": Tenure,
#         "Balance": Balance,
#         "NumOfProducts": int(NumOfProducts),
#         "HasCrCard": "有" if HasCrCard else "無",
#         "IsActiveMember": "是" if IsActiveMember else "否",
#         "EstimatedSalary": EstimatedSalary
#     }])

#     # 預測
#     prediction = final_model.predict(input_df)[0]
#     probability = final_model.predict_proba(input_df)[0][1]*100

#     # 結果轉換
#     result = "流失風險高" if prediction == 1 else "穩定客戶"
#     prob_text = f"流失機率：{probability:.2%}"

#     return f"{result}\n{prob_text}"


# Build the Gradio UI: two columns of input widgets, a predict button, and a
# text box for the result.
with gr.Blocks(title="客戶流失預測系統") as demo:
    gr.Markdown("## 📉 客戶流失預測介面")
    gr.Markdown("請依下列欄位輸入客戶資料，我們將根據機器學習模型進行預測。")

    with gr.Row():
        # Left column of inputs
        with gr.Column():
            credit = gr.Slider(300, 850, value=500, step=1, label="信用評分 (Credit Score)")
            geo = gr.Dropdown(choices=geography_options, label="居住國家 (Geography)")
            gender = gr.Dropdown(choices=gender_options, label="性別 (Gender)")
            age = gr.Slider(18, 100, value=30, step=1, label="年齡 (Age)")
            tenure = gr.Dropdown(choices=tenure_options, label="年資 (Tenure)")

        # Right column of inputs
        with gr.Column():
            products = gr.Dropdown(choices=num_of_products_options, label="產品數量 (Number of Products)")
            card = gr.Dropdown(choices=has_cr_card_options, label="是否有信用卡 (Has Credit Card)")
            active = gr.Dropdown(choices=is_active_member_options, label="是否為活躍會員 (Is Active Member)")
            balance = gr.Slider(0, 250000, value=0, step=1000, label="帳戶餘額 (Balance)")
            salary = gr.Slider(0, 200000, value=5000, step=1000, label="估計薪資 (Estimated Salary)")

    # Predict button and result are laid out in separate rows
    with gr.Row():
        predict_btn = gr.Button("🔍 預測是否流失")

    with gr.Row():
        result = gr.Textbox(label="預測結果", lines=2)

    # Wire the button to the prediction function. The order of `inputs` must
    # match the positional parameter order of predict_proba.
    predict_btn.click(
        fn=predict_proba,
        inputs=[credit, geo, gender, age, tenure, balance, products, card, active, salary],
        outputs=result
    )

# Start the app; the dropdowns above have no default value, so they deliver
# None to predict_proba until the user selects something.
demo.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e2da81fc4d174c4b43.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


