In [1]:
import pandas as pd

In [2]:
df_employee = pd.read_csv("2_training_data_and_feature_engineering_employee.csv")
df_employee

Unnamed: 0,Age,DailyRate,DistanceFromHome,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,OverTime_No,OverTime_Yes,Attrition
0,42.000000,1368.000000,28.000000,4.000000,88.000000,2.000000,2.000000,4.000000,4523.000000,4386.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,1.000000,0.000000,0
1,25.000000,309.000000,2.000000,3.000000,82.000000,3.000000,1.000000,2.000000,2187.000000,19655.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,1.000000,0.000000,0
2,29.000000,1396.000000,10.000000,3.000000,99.000000,3.000000,1.000000,3.000000,2642.000000,2755.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,1.000000,1.000000,0.000000,0
3,29.000000,986.000000,3.000000,2.000000,93.000000,2.000000,3.000000,3.000000,11935.000000,21526.000000,...,0.000000,0.0,0.0,1.0,0.000000,0.000000,0.000000,1.000000,0.000000,0
4,50.000000,854.000000,1.000000,4.000000,68.000000,3.000000,5.000000,4.000000,19517.000000,24118.000000,...,0.000000,1.0,0.0,0.0,0.000000,0.000000,0.000000,1.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
469,26.008398,138.470606,16.173665,3.834733,57.445411,3.165267,1.000000,2.834733,2447.215954,25638.195863,...,0.000000,0.0,0.0,0.0,0.834733,0.000000,0.165267,0.165267,0.834733,1
470,39.000000,1153.087248,2.984364,3.953091,41.203273,2.984364,1.984364,2.968727,5213.106840,17768.665065,...,0.015636,0.0,0.0,0.0,0.000000,0.984364,0.000000,0.000000,1.000000,1
471,35.612788,359.576599,20.882993,3.423401,91.306394,3.423401,1.000000,1.000000,3502.615396,21874.978424,...,0.000000,0.0,0.0,0.0,0.576599,0.000000,0.000000,1.000000,0.000000,1
472,24.318104,531.314693,11.159052,3.394882,81.974408,2.210237,1.000000,2.210237,2063.137252,13912.972892,...,0.605118,0.0,0.0,0.0,0.394882,0.000000,0.000000,0.605118,0.394882,1


In [13]:
print(df_employee["Attrition"].unique())

[0 1]


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [14]:
X = df_employee.drop("Attrition", axis=1)
y = df_employee["Attrition"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

num_cols = X.select_dtypes(include=["int64","float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

logreg_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000, random_state=42))
])

rf_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=300, random_state=42))
])

for name, pipeline in [("Logistic Regression", logreg_pipeline), ("Random Forest", rf_pipeline)]:
    print(f"\n=== {name} ===")
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred, digits=4))


=== Logistic Regression ===
Accuracy: 0.6947368421052632
              precision    recall  f1-score   support

           0     0.6792    0.7500    0.7129        48
           1     0.7143    0.6383    0.6742        47

    accuracy                         0.6947        95
   macro avg     0.6968    0.6941    0.6935        95
weighted avg     0.6966    0.6947    0.6937        95


=== Random Forest ===
Accuracy: 0.7578947368421053
              precision    recall  f1-score   support

           0     0.7273    0.8333    0.7767        48
           1     0.8000    0.6809    0.7356        47

    accuracy                         0.7579        95
   macro avg     0.7636    0.7571    0.7562        95
weighted avg     0.7633    0.7579    0.7564        95



In [17]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

result_log = permutation_importance(log_reg, X_test, y_test, n_repeats=10, random_state=42)

result_log_df = pd.DataFrame({
    "Feature": X_test.columns,
    "Importance": result_log.importances_mean
})

print("\n=== Logistic Regression Feature Importance ===")
print(result_log_df.sort_values("Importance", ascending=False).round(4).to_string(index=False))

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

result_rf = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=42)

result_rf_df = pd.DataFrame({
    "Feature": X_test.columns,
    "Importance": result_rf.importances_mean
})

print("\n=== Random Forest Feature Importance ===")
print(result_rf_df.sort_values("Importance", ascending=False).round(4).to_string(index=False))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



=== Logistic Regression Feature Importance ===
                          Feature  Importance
                   YearsAtCompany      0.0674
               YearsInCurrentRole      0.0526
                    MonthlyIncome      0.0421
          EnvironmentSatisfaction      0.0211
                 DistanceFromHome      0.0200
                 StockOptionLevel      0.0137
                      MonthlyRate      0.0116
                  JobSatisfaction      0.0105
          YearsSinceLastPromotion      0.0095
         RelationshipSatisfaction      0.0053
     EducationField_Life Sciences      0.0021
                     OverTime_Yes      0.0021
                      OverTime_No      0.0021
   EducationField_Human Resources      0.0000
   JobRole_Manufacturing Director      0.0000
JobRole_Healthcare Representative      0.0000
        JobRole_Research Director      0.0000
       JobRole_Research Scientist      0.0000
  EducationField_Technical Degree      0.0000
             EducationField_Othe

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [28]:
X = df_employee.drop("Attrition", axis=1)
y = df_employee["Attrition"]  # 0 = stay, 1 = leave

# แยก numeric / categorical
num_cols = X.select_dtypes(include=["int64","float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

# --------------------------
# 2. สร้าง pipeline model
# --------------------------
logreg_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=2000, random_state=42))
])

rf_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=300, random_state=42))
])

# Train ทั้งสอง model
logreg_pipeline.fit(X, y)
rf_pipeline.fit(X, y)

# --------------------------
# 3. ดึง feature names
# --------------------------
ohe_fitted = logreg_pipeline.named_steps['preprocessor'].named_transformers_['cat']
cat_features = ohe_fitted.get_feature_names_out(cat_cols).tolist()
all_features = num_cols.tolist() + cat_features

# Logistic Regression coefficients
logreg_coef = logreg_pipeline.named_steps['classifier'].coef_[0]

# Random Forest feature importance
rf_importances = rf_pipeline.named_steps['classifier'].feature_importances_

# รวมเป็น DataFrame
df_feat = pd.DataFrame({
    "Feature": all_features,
    "LogReg_Importance": np.abs(logreg_coef),
    "RF_Importance": rf_importances
})

# --------------------------
# 4. เลือกเฉพาะ feature สำคัญ
# --------------------------
threshold = 0.005  # ปรับได้
selected_features = df_feat[
    (df_feat["LogReg_Importance"] >= threshold) |
    (df_feat["RF_Importance"] >= threshold)
]["Feature"].tolist()

print(f"จำนวน feature สำคัญ: {len(selected_features)}")
print(selected_features)

# --------------------------
# 5. สร้าง DataFrame ใหม่เฉพาะ feature สำคัญ
# --------------------------
X_selected = df_employee[selected_features]

# แบ่ง train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)

# --------------------------
# 6. สร้าง pipeline ใหม่และ evaluate
# --------------------------
num_cols_selected = X_selected.select_dtypes(include=["int64","float64"]).columns
cat_cols_selected = X_selected.select_dtypes(include=["object"]).columns

preprocessor_new = ColumnTransformer([
    ("num", StandardScaler(), num_cols_selected),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols_selected)
])

rf_pipeline_new = Pipeline([
    ("preprocessor", preprocessor_new),
    ("classifier", RandomForestClassifier(n_estimators=300, random_state=42))
])

# Train model ใหม่
rf_pipeline_new.fit(X_train, y_train)
y_pred = rf_pipeline_new.predict(X_test)

print("\n=== Random Forest (Selected Features) ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))

# --------------------------
# 7. Plot feature importance
# --------------------------
# ดึง feature names หลัง preprocessing
ohe_fitted_new = preprocessor_new.named_transformers_['cat']
cat_features_new = ohe_fitted_new.get_feature_names_out(cat_cols_selected).tolist()
all_features_new = num_cols_selected.tolist() + cat_features_new

importances = rf_pipeline_new.named_steps['classifier'].feature_importances_
feat_importance_df = pd.DataFrame({
    "Feature": all_features_new,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

plt.figure(figsize=(10,8))
plt.barh(feat_importance_df["Feature"], feat_importance_df["Importance"])
plt.gca().invert_yaxis()
plt.title("Random Forest Feature Importance (Selected Features)")
plt.show()

NotFittedError: This OneHotEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [19]:
df_employee

Unnamed: 0,Age,DailyRate,DistanceFromHome,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,...,BusinessTravel_Travel_Frequently,Department_Sales,EducationField_Life Sciences,EducationField_Marketing,JobRole_Laboratory Technician,JobRole_Sales Executive,JobRole_Sales Representative,OverTime_No,OverTime_Yes,Attrition
0,42.000000,1368.000000,28.000000,4.000000,88.000000,2.000000,2.000000,4.000000,4523.000000,4386.000000,...,1.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,1.000000,0.000000,0
1,25.000000,309.000000,2.000000,3.000000,82.000000,3.000000,1.000000,2.000000,2187.000000,19655.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,1.000000,0.000000,0
2,29.000000,1396.000000,10.000000,3.000000,99.000000,3.000000,1.000000,3.000000,2642.000000,2755.000000,...,0.000000,1.000000,1.000000,0.0,0.000000,0.000000,1.000000,1.000000,0.000000,0
3,29.000000,986.000000,3.000000,2.000000,93.000000,2.000000,3.000000,3.000000,11935.000000,21526.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,1.000000,0.000000,0
4,50.000000,854.000000,1.000000,4.000000,68.000000,3.000000,5.000000,4.000000,19517.000000,24118.000000,...,0.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.000000,1.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
469,26.008398,138.470606,16.173665,3.834733,57.445411,3.165267,1.000000,2.834733,2447.215954,25638.195863,...,0.000000,0.165267,1.000000,0.0,0.000000,0.000000,0.165267,0.165267,0.834733,1
470,39.000000,1153.087248,2.984364,3.953091,41.203273,2.984364,1.984364,2.968727,5213.106840,17768.665065,...,0.000000,0.984364,0.015636,0.0,0.015636,0.984364,0.000000,0.000000,1.000000,1
471,35.612788,359.576599,20.882993,3.423401,91.306394,3.423401,1.000000,1.000000,3502.615396,21874.978424,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,1.000000,0.000000,1
472,24.318104,531.314693,11.159052,3.394882,81.974408,2.210237,1.000000,2.210237,2063.137252,13912.972892,...,0.000000,0.000000,0.000000,0.0,0.605118,0.000000,0.000000,0.605118,0.394882,1


In [20]:
# ดู columns จริงที่มีใน df
print(df_employee.columns.tolist())

# ดูว่า column ไหนตรงกับ selected_columns
cols_to_drop = [col for col in selected_columns if col in df_employee.columns]
print(cols_to_drop)

['Age', 'DailyRate', 'DistanceFromHome', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'PercentSalaryHike', 'RelationshipSatisfaction', 'StockOptionLevel', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently', 'Department_Sales', 'EducationField_Life Sciences', 'EducationField_Marketing', 'JobRole_Laboratory Technician', 'JobRole_Sales Executive', 'JobRole_Sales Representative', 'OverTime_No', 'OverTime_Yes', 'Attrition']
[]


In [15]:
df_employee.to_csv("3_model_development_employee.csv", index= False)