In [35]:
import pandas as pandas
import json as json
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
from sklearn.metrics import classification_report
import joblib as joblib
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

### Load data

In [36]:
data_train_full = pandas.read_csv('../data/salary.train.processed.csv').set_index('id')
data_test = pandas.read_csv('../data/salary.test.processed.csv').set_index('id')
data_train_full.head(5)

Unnamed: 0_level_0,age-group,education-num,native-country,capitalgain,capitalloss,hoursperweek,occupation_prof-specialty,occupation_tech-support,occupation_exec-managerial,occupation_machine-op-inspct,...,relationship_own-child,relationship_unmarried,race_white,race_amer-indian-eskimo,race_black,race_asian-pac-islander,race_other,sex_male,sex_female,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26890,3.0,16.0,5,0.0,0.0,3.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
933,3.0,14.0,5,4.0,0.0,3.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
25596,2.0,10.0,5,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12949,2.0,13.0,5,0.0,0.0,3.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
6681,1.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


- set data

In [37]:
X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test.drop(['label'], axis='columns')
y_test = data_test['label']

### SMOTE

- Original shape

In [38]:
print(f"Original training data shape: {X_full.shape}")
print(f"Original label distribution:\n{y_full.value_counts()}")

Original training data shape: (16720, 48)
Original label distribution:
label
0.0    9719
1.0    7001
Name: count, dtype: int64


In [39]:
smote = SMOTE(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smote.fit_resample(X_full, y_full)



- Resampled

In [40]:
print(f"New resampled training data shape: {X_resampled.shape}")
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

New resampled training data shape: (19438, 48)
New resampled label distribution:
label
1.0    9719
0.0    9719
Name: count, dtype: int64


### Train model

In [41]:
best_params = {
    'n_estimators': 102,
    'learning_rate': 0.2699,
    'num_leaves': 22,
    'max_depth': 5,
    'reg_alpha': 8.7e-06,
    'reg_lambda': 3.58,
    'subsample': 0.986,
    'colsample_bytree': 0.755,
    # ADD THESE 2 LINES ONLY
    'min_child_samples': 1,      # ← was 20 → kills splits
    'min_split_gain': 0.0,       # ← allow any tiny gain
    'verbosity': -1              # ← silence warnings
}

In [42]:
best_threshold = 0.4675
scale_pos_weight =  1.3882


In [43]:
final_model_lgbm = lgb.LGBMClassifier(
    scale_pos_weight=scale_pos_weight, 
    **best_params,
    random_state=42,
)
final_model_lgbm.fit(X_full, y_full)

### Evaluate result

In [44]:
y_pred_probs = final_model_lgbm.predict_proba(X_test)[:, 1]

In [45]:
y_final_pred = (y_pred_probs >= best_threshold).astype(int)

In [46]:
final_f1 = f1_score(y_test, y_final_pred)
final_recall = recall_score(y_test, y_final_pred)
final_precision = precision_score(y_test, y_final_pred)

In [None]:
report = classification_report(
    y_test, y_final_pred, digits=4, output_dict=True
    )

df_score_rf = pandas.DataFrame(report).transpose()
df_score_rf

Unnamed: 0,precision,recall,f1-score,support
0.0,0.894936,0.782699,0.835063,2416.0
1.0,0.746009,0.87415,0.805012,1764.0
accuracy,0.821292,0.821292,0.821292,0.821292
macro avg,0.820472,0.828424,0.820037,4180.0
weighted avg,0.832087,0.821292,0.822381,4180.0


In [50]:
data_test.to_csv('./results/predictions.SMOTE_lgbm_find_tune.csv')
joblib.dump(
    value = final_model_lgbm,
    filename = './model/SMOTE-lgbm&FindTune.joblib'
)
with open('./results/scores.SMOTE-lgbm&FindTune.json','w')as f:
    json.dump(
        obj=report,
        fp=f,
        indent = 4
    )