# Credit Scoring
## Библиотеки

In [20]:
import pandas as pd
from functools import partial
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier

## 2 Словарь переименований столбцов

In [8]:
rename = {
    "CODE_GENDER": "gender",
    "FLAG_OWN_CAR": "f_car",
    "FLAG_OWN_REALTY": "f_realty",
    "CNT_CHILDREN": "cnt_children",
    "AMT_INCOME_TOTAL": "total_income",
    "NAME_INCOME_TYPE": "income_type",
    "NAME_EDUCATION_TYPE": "education",
    "NAME_FAMILY_STATUS": "family_status",
    "NAME_HOUSING_TYPE": "housing",
    "DAYS_BIRTH": "birthday",
    "DAYS_EMPLOYED": "employment_days",
    "FLAG_MOBIL": "f_mobile",
    "FLAG_WORK_PHONE": "f_work_phone",
    "FLAG_PHONE": "f_phone",
    "FLAG_EMAIL": "f_email",
    "OCCUPATION_TYPE": "occupation",
    "CNT_FAM_MEMBERS": "cnt_fam_members",
}

## 3 Кастомный трансформер `OccupationImputer`

Заполняет NaN в колонке `occupation` безопасно для кросс‑валидации.

In [21]:
class OccupationImputer(BaseEstimator, TransformerMixin):

    def __init__(self, *, n_estimators: int = 300, max_depth: int | None = None,
                 random_state: int = 2005, cv: int = 5):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.cv = cv
        # внутренние
        self._rf_pipe: Pipeline | None = None
        self._cat_cols: list[str] | None = None
        # метрики
        self.accuracy_: float | None = None
        self.f1_macro_: float | None = None

    # ------------------------------------------------------------------
    def fit(self, X: pd.DataFrame, y=None):
        Xc = X.copy()
        known = Xc[Xc["occupation"].notna()]
        if known.empty:
            return self

        X_known = known.drop(columns="occupation")
        y_known = known["occupation"].astype(str)
        self._cat_cols = X_known.select_dtypes("object").columns.tolist()

        preproc = ColumnTransformer([
            ("ohe", OneHotEncoder(handle_unknown="ignore"), self._cat_cols)],
            remainder="passthrough")
        rf = RandomForestClassifier(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            random_state=self.random_state,
            n_jobs=-1,
        )
        self._rf_pipe = Pipeline([("prep", preproc), ("rf", rf)])
        self._rf_pipe.fit(X_known, y_known)


        cv_split = StratifiedKFold(n_splits=self.cv, shuffle=True, random_state=self.random_state)
        acc_scores = cross_val_score(self._rf_pipe, X_known, y_known,
                                         cv=cv_split, scoring="accuracy", n_jobs=-1)
        macro_f1 = make_scorer(partial(f1_score, average="macro", zero_division=0))
        f1_scores = cross_val_score(self._rf_pipe, X_known, y_known,
                                         cv=cv_split, scoring=macro_f1, n_jobs=-1)
        self.accuracy_ = float(acc_scores.mean())
        self.f1_macro_ = float(f1_scores.mean())
        return self


    def transform(self, X: pd.DataFrame):
        X = X.copy()
        if self._rf_pipe is None:
            return X  # модель не обучена
        mask = X["occupation"].isna()
        if mask.any():
            X.loc[mask, "occupation"] = self._rf_pipe.predict(
                X.loc[mask].drop(columns="occupation"))
        return X


    def get_model(self):
        return self._rf_pipe

In [10]:
def build_pipeline(cat_cols: list[str]) -> Pipeline:
    return Pipeline([
        ("occ_imp", OccupationImputer()),
        ("prep", ColumnTransformer([
            ("ohe", OneHotEncoder(handle_unknown="ignore"), cat_cols)], remainder="passthrough")),
        ("clf", LGBMClassifier(n_estimators=800, learning_rate=0.05, class_weight="balanced", random_state=2025)),
    ])

In [11]:
apps = pd.read_csv("datasets/application_record.csv")
apps.rename(columns=rename, inplace=True)
apps.drop_duplicates("ID", inplace=True)

apps["age"] = (-apps["birthday"] / 365).astype(int)
apps["employment_years"] = apps["employment_days"].apply(
    lambda x: float(-x / 365) if pd.notnull(x) and x < 0 else -1)
apps.drop(columns=["birthday", "employment_days"], inplace=True)
apps.head()

Unnamed: 0,ID,gender,f_car,f_realty,cnt_children,total_income,income_type,education,family_status,housing,f_mobile,f_work_phone,f_phone,f_email,occupation,cnt_fam_members,age,employment_years
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,1,1,0,0,,2.0,32,12.443836
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,1,1,0,0,,2.0,32,12.443836
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Security staff,2.0,58,3.106849
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,1.0,52,8.358904
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,1.0,52,8.358904


До импутации пропусков в occupation

In [12]:
print(apps["occupation"].isna().sum())

134193


In [22]:
imp = OccupationImputer()
apps_filled = imp.fit_transform(apps)

print(f"Accuracy:   {imp.accuracy_:.3f}")
print(f"Macro-F1:    {imp.f1_macro_:.3f}")

AttributeError: 'functools.partial' object has no attribute '__name__'

После импутации пропусков в occupation

In [14]:
print(apps_filled["occupation"].isna().sum())

0


In [15]:
cred = pd.read_csv("datasets/credit_record.csv")
cred.rename(columns={"MONTHS_BALANCE": "month", "STATUS": "status"}, inplace=True)
cred["status"] = cred["status"].apply(lambda s: "good" if s in {"X", "C"} else "bad")
cred = (cred.pivot_table(index="ID", columns="status", values="month", aggfunc="size", fill_value=0)
          .reset_index())

cred.head()

status,ID,bad,good
0,5001711,3,1
1,5001712,10,9
2,5001713,0,22
3,5001714,0,15
4,5001715,0,60


In [16]:
df = apps_filled.merge(cred, on="ID", how="left")

df.head()

Unnamed: 0,ID,gender,f_car,f_realty,cnt_children,total_income,income_type,education,family_status,housing,f_mobile,f_work_phone,f_phone,f_email,occupation,cnt_fam_members,age,employment_years,bad,good
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,1,1,0,0,Managers,2.0,32,12.443836,2.0,14.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,1,1,0,0,Managers,2.0,32,12.443836,2.0,13.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Security staff,2.0,58,3.106849,7.0,23.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,1.0,52,8.358904,2.0,3.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,1.0,52,8.358904,0.0,5.0
