In [2]:
import pandas as pd
from xgboost import XGBRegressor as xgbr
from xgboost import XGBClassifier as xgbc
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, f1_score, roc_auc_score, log_loss

from datetime import datetime

import warnings
warnings.filterwarnings("ignore")


In [3]:
df = pd.read_csv('./data/epcg23.csv')
df.shape

(94606, 548)

---
## Y label: 
**Probability of recent bachelor graduates getting a paid job within there field after graduation.**
1) **WRKG** Working for pay or profit during reference week
2) **OCEDRLP** Extent that principal job is related to highest degree
3) **DGRDG** Highest degree type
4) **STRTYR** Year principal job started
5) **DGRYR** Year of award of highest degree

```
* If 1, 2 and 3 is check out, and year of 4 is => year of 5 then Y=1
```

### Other relevant features  
* **LFSTAT** Labor force status
* **DGRDG** Highest degree type
* **HDMN** Month of award of highest degree
* **NDGMEMG** Field of study for highest degree (major group)
* **NDGMENG** Field of study for highest degree (minor group)
* **HDPBP21C** Public/private status of school awarding highest degree - 2021 Carnegie code
* **HDRGN** Location of school awarding highest degree (region code)
* **BAYR** Year of award of first bachelors degree
* **CLICWKR** Certificates and licenses: for work-related reasons
* **CLICNOW** Certification or licenses: for principal job
* **CLICISS** Certification or licenses: issuer
* **CLICCODE** Certification/license primary subject or field of study
* **CLICYR** Certification or licenses: year first issued
### **Demograhic Related**
* **AGE** Age
* **SEX_2023** Sex at birth
* **CTZN** Citizenship or visa status
* **CTZFOR** Visa type for non-US citizens
* **FNUSYR6** The year first came to U.S. for 6 months or longer
* **VETSTAT** Veteran status: served on active duty in the US Armed Forces, Reserves, or National Guard
### **Geogrophy Related**
* **RESPLOC** Respondent location
* **RESPLO3_TOGA** 3-Digit Respondent Location (state/country code)
* **RESPLCUS** Respondent location (U.S./Non-U.S.)
* **EMRG** Region code for employer
* **EMST_TOGA** State/country code for employer
### **Finacial Related**
* **UGLOANR** Amount borrowed to finance UNDERGRADUATE degree(s)
* **UGOWER** Amount still owed from financing of UNDERGRADUATE degree(s)
* **GRFLN** Financial support for graduate degree(s): Loans from school, banks, and government
* **SALARY** Salary

---
## Data Cleaning

In [4]:
y_variables = ['DGRDG','WRKG','SALARY','OCEDRLP','DGRYR','STRTYR','STRTMN','HDMN']
df[y_variables].dtypes

DGRDG       int64
WRKG       object
SALARY      int64
OCEDRLP    object
DGRYR       int64
STRTYR      int64
STRTMN      int64
HDMN        int64
dtype: object

In [5]:
# make y label

# DGRDG == 1; highest degree is bachelor
# WRKG == 'Y'; working 
# SALARY >= 1; and getting paid i.e. no internship
# OCEDRLP in {1,2}; works in field
# (DGRYR - STRTYR) < 1; job started within a year after graduation

months = (df['STRTYR'] - df['DGRYR']) * 12 + (df['STRTMN'] - df['HDMN'])

df['y'] = (
    (df['DGRDG'] == 1) &
    (df['WRKG'] == 'Y') &
    (df['SALARY'] >= 1) & (df['SALARY'] < 9999998) &
    (pd.to_numeric(df['OCEDRLP'], errors='coerce').isin([1, 2])) &
    (months.between(0, 12, inclusive='both'))
).astype(np.float32)  # better for this model


df = df.copy()

# select only those with recent bachelors drop the rest
keep = (df['DGRDG'] == 1) & (df['DGRYR'] >= 2021)
df = df.loc[keep].copy()

# drop the cols used to make y
df = df.drop(y_variables, axis=1).copy()

# float32 mapping of objs, drop everything else that cant convert
yn_map = {'Y': 1, 'N': 0, 'y': 1, 'n': 0}
cols_to_drop = []

for col in df.columns:
    if df[col].dtype == 'object':
        s = df[col].replace(yn_map)
        converted = pd.to_numeric(s, errors='coerce') # object to NaN if failed
        # drop only if column is all NaN
        if converted.notna().sum() == 0:
            cols_to_drop.append(col)
        else:
            df[col] = converted

if cols_to_drop:
    df = df.drop(columns=cols_to_drop)

# cast the rest
num_cols = df.select_dtypes(include=['number']).columns
df[num_cols] = df[num_cols].astype('float32')

In [6]:
df.shape

(972, 488)

In [7]:
# check to see of all cols are float32
float32 = all(df.dtypes == 'float32')
print("All float32:", float32)

All float32: True


---
## Gradient Boosted Forest: Classification w/probability

In [8]:
X, y = df.drop(columns=['y']), df['y']

# train test splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# tuning parameters
params = {
    'n_estimators': [120, 280, 487],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3],
    # 'gamma': [], # loss needed to partition further 
    'learning_rate': [0.01, 0.05],
    'subsample': [0.8],
    'colsample_bytree': [0.8], # fraction of cols to use on trees
    'lambda':[1], # regulate w on the conservitive side
    'alpha':[0,0.05] # same as lambda.. but diffrent
}

clas_model = xgbc(
    objective='binary:logistic', 
    n_jobs=1, # to make sure cores are used by sklearn
    tree_method='hist',
    eval_metric='logloss' # train accorting to log loss metric
)

clas_grid_search = GridSearchCV(
    estimator=clas_model,
    param_grid=params,
    cv=5,
    scoring='neg_log_loss',
    n_jobs=11,
    # verbose=2
)

# fit model
clas_grid_search.fit(X_train, y_train)
clas_best = clas_grid_search.best_estimator_

# evaluate the best model
probs = clas_best.predict_proba(X_test)[:, 1] # turns out this can do both!!
preds = clas_best.predict(X_test) # regular 0/1 labels

ll  = log_loss(y_test, probs)
auc = roc_auc_score(y_test, probs)
f1  = f1_score(y_test, preds)

print(
    '-'*30+'Model Performance Metrics'+'-'*30,
    f'\nLog Loss: {ll}',
    f'\nAUC: {auc}',
    f'\nF1 Score: {f1}',
)

------------------------------Model Performance Metrics------------------------------ 
Log Loss: 0.45347975365227755 
AUC: 0.8430507745266782 
F1 Score: 0.7272727272727273


In [17]:
logfile = "runs.vim"
with open(logfile, "a") as f:
    f.write('-'*18 + 'Model Performance Metrics' + '-'*18 + "\n")
    f.write(f"Timestamp: {datetime.now().isoformat(timespec='seconds')}\n")
    f.write(f"Log Loss: {ll}\n")
    f.write(f"AUC: {auc}\n")
    f.write(f"F1 Score: {f1}\n")
    f.write("Best Params:\n")
    for k, v in clas_grid_search.best_params_.items():
        f.write(f"  {k}: {v}\n")
    f.write("\n")

---
## Resources:
* Perameters: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
* Example: xgboosting.com/how-to-use-xgboost-xgbregressor/
* 