# Deterministic Model
This model uses the best features and parameters found in the other notebook in this repository. It is used on page #3 of the Streamlit app.

In [16]:
import pandas as pd
import json
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, log_loss

# NOTE(review): with no category or module given, this suppresses every warning
# for the rest of the session (including deprecation notices from the libraries
# used below); consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

# Load the raw survey extract.
df = pd.read_csv('./data/epcg23.csv')

# Columns the label is derived from.
y_variables = ['DGRDG','WRKG','SALARY','OCEDRLP','DGRYR','STRTYR','STRTMN','HDMN']

# Build the binary target y. A row is positive only if ALL of these hold
# (meanings in parentheses are the notebook author's notes):
#   DGRDG == 1               (highest degree is bachelor)
#   WRKG == 'Y'              (working)
#   1 <= SALARY < 9999998    (getting paid, i.e. no internship; the upper bound
#                             presumably excludes a sentinel code - TODO confirm)
#   OCEDRLP in {1, 2}        (works in field)
#   0 <= months <= 12        (job started within a year after graduation)

# Months elapsed from the degree date (DGRYR/HDMN) to the job start (STRTYR/STRTMN).
year_gap = df['STRTYR'] - df['DGRYR']
month_gap = df['STRTMN'] - df['HDMN']
months = year_gap * 12 + month_gap

is_bachelor = df['DGRDG'].eq(1)
is_working = df['WRKG'].eq('Y')
is_paid = df['SALARY'].ge(1) & df['SALARY'].lt(9999998)
# Entries of OCEDRLP that cannot be parsed as numbers become NaN and fail isin().
in_field = pd.to_numeric(df['OCEDRLP'], errors='coerce').isin([1, 2])
started_soon = (months >= 0) & (months <= 12)

label_mask = is_bachelor & is_working & is_paid & in_field & started_soon
df['y'] = label_mask.astype(np.float32)  # float32 label (author's note: better for this model)


# Keep only rows with DGRDG == 1 and a degree year of 2021 or later
# (the author's "recent bachelors"); every other row is discarded.
keep = (df['DGRDG'] == 1) & (df['DGRYR'] >= 2021)
# One explicit copy here makes the filtered frame independent of the raw frame,
# so later column assignments act on it directly.
df = df.loc[keep].copy()

# Remove the columns that were used to build y so they cannot be used as features.
# drop() already returns a new DataFrame, so no extra .copy() is required.
df = df.drop(columns=y_variables)

# Columns excluded from the feature set because they could leak the target
# (target leakage, not memory leaks). The grouping below is the notebook
# author's; the author notes that AI helped spot most of these.
leak_vars = [
    # 1) Direct label vars
    "DGRDG","DGRYR","HDMN","STRTYR","STRTMN","WRKG","SALARY","OCEDRLP",
    "NRCHG","NRCON","NRFAM","NRLOC","NROCNA","NROT","NRPAY","NRREA","NRSEC",

    # 2) Job status / employment
    "HRSWK","WKSLYR","WKSWK","WKSYR","LFSTAT","LOOKWK","LWMN","LWYR","LWNVR",
    "NWFAM","NWILL","NWLAY","NWNOND","NWOCNA","NWOT","NWRET","NWRTYR","NWSTU",
    "PJFAM","PJHAJ","PJHRS","PJNOND","PJOCNA","PJOT","PJRET","PJRETYR","PJSTU",
    "FTPRET","FTPRTYR","WRKGP","SURV_SE","EDTP",

    # 3) Job satisfaction & benefits
    "JOBSATIS","SATADV","SATBEN","SATCHAL","SATIND","SATLOC","SATRESP","SATSAL","SATSEC","SATSOC",
    "JOBINS","JOBPENS","JOBPROFT","JOBVAC",

    # 4) Work activities
    "ACTCAP","ACTDED","ACTMGT","ACTRD","ACTRD2","ACTRDT","ACTRES","ACTTCH",
    "WAACC","WAAPRSH","WABRSH","WACOM","WADEV","WADSN","WAEMRL","WAMGMT","WAOT",
    "WAPRI","WAPROD","WAPRRD","WAPRSM","WAPRSM2","WAPRSM3","WAQM","WASALE",
    "WASCSM","WASCSM2","WASCSM3","WASVC","WATEA","WASEC",

    # 5) Employer & occupation
    "N2OCPRBG","N2OCPRMG","N3OCPR","N3OCPRNG","N3OCPRX",
    "N2OCBLST","N2OCMLST","N3OCLST","N3OCLSTX","N3OCNLST",
    "INDCODE","EMED","EMTP","EMSECDT","EMSECSM","EMSIZE","EMST_TOGA","EMUS",
    "EMRG","NEDTP","NEWBUS","PBPR21C","CARN21C","MGRNAT","MGROTH","MGRSOC",
    "SUPDIR","SUPIND","SUPWK","TELEC","TELEFR","PJWTFT","PRMBR","PROMTGI",

    # 6) Training & courses after degree
    "WKTRNI","WTRCHOC","WTREASN","WTREM","WTRLIC","WTROPPS","WTROT","WTRPERS","WTRSKL",
    "ACADV","ACCAR","ACCCEP","ACCHG","ACDRG","ACEM","ACFPT","ACGRD","ACINT",
    "ACLIC","ACOT","ACSIN","ACSKL","NACEDMG","NACEDNG",

    # 7) Survey design / admin
    "OBSNUM","SURID","SRVMODE","WTSURVY","COHORT","REFYR","BIRYR",
    # Optional: also drop TCDGCMP if present
    "TCDGCMP"
]

# Names in the list that are not columns of df are skipped rather than
# raising; only the columns actually present are removed.
df = df.drop(columns=leak_vars, errors='ignore')




# Turn object-typed columns into numbers: Y/N flags become 1/0, anything else
# that does not parse as a number becomes NaN. A column that ends up entirely
# NaN is removed; all remaining numeric columns are then stored as float32.
yn_map = {'Y': 1, 'N': 0, 'y': 1, 'n': 0}
cols_to_drop = []

object_cols = [name for name in df.columns if df[name].dtype == 'object']
for name in object_cols:
    # errors='coerce' maps unparseable entries to NaN instead of raising.
    numeric = pd.to_numeric(df[name].replace(yn_map), errors='coerce')
    if numeric.notna().any():
        df[name] = numeric
    else:
        # Nothing in this column could be read as a number.
        cols_to_drop.append(name)

if cols_to_drop:
    df = df.drop(columns=cols_to_drop)

# Cast every numeric column (the label y included) to float32.
num_cols = df.select_dtypes(include=['number']).columns
df[num_cols] = df[num_cols].astype('float32')

In [17]:
def _read_json(path):
    """Parse and return the JSON document stored at ``path``."""
    with open(path) as handle:
        return json.load(handle)


# Hyperparameters and feature names saved as JSON files next to this notebook.
best_params = _read_json("best_params.json")
top_features = _read_json("top_features.json")

# NOTE(review): the printed feature list includes 'EARN', while SALARY is
# treated as a label input earlier in this notebook; confirm EARN does not
# leak the same information.

# Feature matrix restricted to the selected columns, plus an integer target.
X_small = df[top_features].copy()
y = df['y'].astype(int)

# 80/20 train/test split, stratified on y, with a fixed seed.
split_options = dict(test_size=0.2, random_state=67, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_small, y, **split_options)

# Settings fixed by this notebook; the loaded hyperparameters are passed
# alongside them (a key present in both would raise a TypeError).
fixed_settings = {
    'objective': 'binary:logistic',
    'n_jobs': 1,
    'tree_method': 'hist',
    'eval_metric': 'logloss',
    'random_state': 67,
}
final_clf = XGBClassifier(**fixed_settings, **best_params)
final_clf.fit(X_train, y_train)

# Second column of predict_proba is used as the score for class 1.
probs_small = final_clf.predict_proba(X_test)[:, 1]
preds_small = final_clf.predict(X_test)

# Held-out metrics: log loss and AUC use the scores, F1 uses the hard predictions.
test_log_loss = log_loss(y_test, probs_small)
test_auc = roc_auc_score(y_test, probs_small)
test_f1 = f1_score(y_test, preds_small)

banner = "-"*30 + " Reduced Model " + "-"*30
print(
    banner,
    f"\nLog Loss: {test_log_loss}",
    f"\nAUC: {test_auc}",
    f"\nF1 Score: {test_f1}",
)

------------------------------ Reduced Model ------------------------------ 
Log Loss: 0.4867325938664912 
AUC: 0.8514186701321204 
F1 Score: 0.6709677419354839


In [18]:
# Show the hyperparameters that were read from best_params.json.
print(best_params)

{'alpha': 0, 'colsample_bytree': 0.66, 'lambda': 0.1, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 500, 'subsample': 0.8}


In [19]:
# Show the feature names that were read from top_features.json.
print(top_features)

['GOVSUP', 'EARN', 'HDACY3', 'ND2MENG', 'CH6', 'HDGRD', 'MRGRD', 'CCST_TOGA', 'CH1218', 'UGFEM', 'CH25', 'HSYR', 'AGE', 'NBAMEBG', 'CHU2', 'CLICNOW', 'NMRMENG', 'FACSEC', 'FSHHS', 'UGFPLN', 'CLICEM', 'D2PBP21C', 'N2ACED', 'NDGMEMG', 'CCCOLPR', 'CHU2IN', 'N2ACEDX', 'NATIVE', 'CHUN12', 'N2D2MEDX']
