# Constrained Search

In [1]:
import os
import pandas
from typing import List, Dict, Optional, Tuple, Any
from main import analyze
import pandas as pd

In [2]:
analysis:analyze = analyze(run_feature_importance_model=False)
# Gonna take a look at the max/min on these features to help set constraints
important_stats = analysis.data[
    [
        "days_from_registration",
        "income",
        "alt_risk_score",
        "alt_risk_score_2",
        "asset_score",
    ]
].describe()
print(important_stats)

Data shape: (4700, 31)
Data dictionary shape: (31, 11)
       days_from_registration        income  alt_risk_score  alt_risk_score_2  \
count             4656.000000  4.657000e+03     4538.000000       3562.000000   
mean              2488.531787  5.783508e+04      512.856324        529.246491   
std               1321.208541  8.065972e+05       36.593588         41.304003   
min                  1.000000  2.000000e+01      408.000000        391.000000   
25%               1391.750000  3.120000e+04      493.000000        498.000000   
50%               2692.000000  4.030000e+04      515.000000        527.000000   
75%               3584.250000  5.200000e+04      536.000000        559.000000   
max               5916.000000  5.500000e+07      702.000000        687.000000   

       asset_score  
count  4628.000000  
mean   1624.710890  
std    3412.100395  
min       1.000000  
25%      17.000000  
50%      39.000000  
75%     105.000000  
max    9003.000000  


In [3]:
constraints:Dict[str,List[float]] = {
    "days_from_registration":[1,5916],
    "income":[20_000,100_000],
    "alt_risk_score":[408,702],
    "alt_risk_score_2":[391,687],
    "asset_score":[1,9_003],
}

change_cost:Dict[str,float|int] = {
    "days_from_registration":1,
    "income":1,
    "alt_risk_score":1,
    "alt_risk_score_2":1,
    "asset_score":1,
}

data:pd.DataFrame = analysis.data

In [4]:
import numpy as np
from sklearn.model_selection import ParameterGrid

df = analysis.data.copy()  # pandas DataFrame from the analyze class

quantiles = [0.05, 0.25, 0.5, 0.75, 0.95]
grid_dict = {}
for feature, (lo, hi) in constraints.items():
    series = df[feature].dropna()
    lowers = np.clip(np.quantile(series, quantiles[:3]), lo, hi)
    uppers = np.clip(np.quantile(series, quantiles[2:]), lo, hi)
    grid_dict[f"{feature}__min"] = lowers
    grid_dict[f"{feature}__max"] = uppers

def evaluate(bounds):
    mask = np.ones(len(df), dtype=bool)
    for col, (lo, hi) in bounds.items():
        mask &= df[col].between(lo, hi)
    subset = df.loc[mask]
    if subset.empty:
        return 0.0, 0
    return subset["success"].mean(), len(subset)

best = None
for params in ParameterGrid(grid_dict):
    bounds = {f: (params[f"{f}__min"], params[f"{f}__max"]) for f in constraints}
    if any(lo >= hi for lo, hi in bounds.values()):
        continue
    score, rows = evaluate(bounds)
    cost = sum(change_cost[f] * (bounds[f][1] - bounds[f][0]) for f in bounds)
    objective = score - 1e-4 * cost  # tune the penalty factor
    if not best or objective > best["objective"]:
        best = {"bounds": bounds, "score": score, "rows": rows, "cost": cost, "objective": objective}

print(best)


{'bounds': {'days_from_registration': (np.float64(2692.0), np.float64(3584.25)), 'income': (np.float64(31200.0), np.float64(40300.0)), 'alt_risk_score': (np.float64(493.0), np.float64(515.0)), 'alt_risk_score_2': (np.float64(527.0), np.float64(559.0)), 'asset_score': (np.float64(39.0), np.float64(105.0))}, 'score': np.float64(1.0), 'rows': 1, 'cost': np.float64(10112.25), 'objective': np.float64(-0.01122500000000004)}


In [5]:
import pprint
pprint.pprint(best)

{'bounds': {'alt_risk_score': (np.float64(493.0), np.float64(515.0)),
            'alt_risk_score_2': (np.float64(527.0), np.float64(559.0)),
            'asset_score': (np.float64(39.0), np.float64(105.0)),
            'days_from_registration': (np.float64(2692.0), np.float64(3584.25)),
            'income': (np.float64(31200.0), np.float64(40300.0))},
 'cost': np.float64(10112.25),
 'objective': np.float64(-0.01122500000000004),
 'rows': 1,
 'score': np.float64(1.0)}


In [11]:
long_string = ""
for i in best['bounds'].items():
    print(i)
    long_string = long_string + f"| {i[0]} | {i[1][0]} | {i[1][1]} |\n"
print(long_string)

('days_from_registration', (np.float64(2692.0), np.float64(3584.25)))
('income', (np.float64(31200.0), np.float64(40300.0)))
('alt_risk_score', (np.float64(493.0), np.float64(515.0)))
('alt_risk_score_2', (np.float64(527.0), np.float64(559.0)))
('asset_score', (np.float64(39.0), np.float64(105.0)))
| days_from_registration | 2692.0 | 3584.25 |
| income | 31200.0 | 40300.0 |
| alt_risk_score | 493.0 | 515.0 |
| alt_risk_score_2 | 527.0 | 559.0 |
| asset_score | 39.0 | 105.0 |

