In [1]:
from dataclasses import dataclass, field
import pandas as pd
from typing import Dict

In [2]:
@dataclass
class Model:
    '''
    A fitted model recovered from source code.

    Attributes:
        var: name of the variable holding the model (e.g., 'xg_reg')
        fitting_line_n: line number at which the model is fitted
        name: model name as written in the source (e.g., 'xgb.XGBRegressor')
        params: hyperparameters of the model, keyed by parameter name
    '''
    var: str
    fitting_line_n: int
    name: str = None
    params: Dict[str, str] = field(default_factory=dict)

    def to_dict(self):
        '''
        Flatten the model into a single new dict.

        The identifying fields come first, followed by one entry per
        hyperparameter. A hyperparameter whose key equals an identifying
        field replaces that field's value.
        '''
        identity = {
            'var': self.var,
            'fitting_line_n': self.fitting_line_n,
            'name': self.name,
        }
        return {**identity, **self.params}

Read the source code and find where the `fit` or `fit_transform` methods are called. Collect the names of the variables that store the fitted models and the line numbers where the models are fitted (line numbers are 0-based and count non-blank lines only).

In [3]:
# Substrings that mark a call to one of the fitting methods we look for.
# Matching the opening parenthesis keeps `.fit_predict(`, `.fit_intercept`,
# `.fitted_` and similar names from being mistaken for a fit.
FIT_CALLS = ('.fit(', '.fit_transform(')

# Keep only non-blank lines, stripped of surrounding whitespace. Every line
# number used from here on is therefore a 0-based index into `lines`, not a
# line number of models.py itself.
with open('models.py', 'r') as f:
    lines = [line.strip() for line in f if line.strip()]

fitted_models = []
for line_n, line in enumerate(lines):
    if line.startswith('#'):
        continue  # a commented-out fit is not a fit

    call_starts = [line.find(call) for call in FIT_CALLS if call in line]
    if not call_starts:
        continue

    # Everything left of the first fit call is the receiver, possibly preceded
    # by an assignment target (`X_scaled = scaler.fit_transform(X)`); keep only
    # what follows the last `=`.
    receiver = line[:min(call_starts)]
    var = receiver.rsplit('=', 1)[-1].strip()

    # An empty receiver (a chained `.fit(` at the start of a line) names no
    # variable, so there is nothing to look up later.
    if var:
        fitted_models.append(Model(var=var, fitting_line_n=line_n))

Search for the initialization of each model found above and parse its hyperparameters. (This could be optimized slightly by considering only the latest initialization that precedes each fit.)

In [4]:
def _coerce_number(text):
    '''Return `text` as an int or float if it parses as one, else unchanged.'''
    # int is tried first: float() also accepts integer literals, so trying it
    # first would turn `5` into `5.0` and never reach int().
    for cast in (int, float):
        try:
            return cast(text)
        except ValueError:
            pass
    return text


def _split_top_level(arg_text):
    '''Split an argument list on commas that are outside brackets and quotes.'''
    parts, start, depth, quote = [], 0, 0, None
    for pos, char in enumerate(arg_text):
        if quote:
            if char == quote:
                quote = None
        elif char in '\'"':
            quote = char
        elif char in '([{':
            depth += 1
        elif char in ')]}':
            depth -= 1
        elif char == ',' and depth == 0:
            parts.append(arg_text[start:pos])
            start = pos + 1
    parts.append(arg_text[start:])
    return parts


def _parse_init(line, var):
    '''Parse `<var> = <name>(<key>=<value>, ...)`; return (name, params) or None.'''
    prefix = f'{var} = '
    # The line must *start* with the assignment; a substring match would also
    # accept `other_<var> = ...`.
    if not line.startswith(prefix):
        return None

    model_name, open_paren, arg_text = line[len(prefix):].partition('(')
    if not open_paren:
        return None  # plain assignment, not a constructor call
    if ')' in arg_text:
        # Cut at the last closing parenthesis so trailing text is ignored.
        arg_text = arg_text[:arg_text.rindex(')')]

    params = {}
    for arg in _split_top_level(arg_text):
        # Split on the first `=` only, so values containing `=` stay intact.
        param_name, equals, param_val = arg.partition('=')
        param_name = param_name.strip()
        if not equals or not param_name.isidentifier():
            continue  # empty argument list or positional argument
        params[param_name] = _coerce_number(param_val.strip())
    return model_name.strip(), params


for line_n, line in enumerate(lines):
    for model in fitted_models:
        # Only initializations that precede the fit can describe that fit.
        if line_n >= model.fitting_line_n:
            continue

        parsed = _parse_init(line, model.var)
        if parsed is None:
            continue

        # Lines are visited in order, so the latest initialization before the
        # fit wins. Replacing `params` wholesale keeps hyperparameters of an
        # earlier initialization from leaking into a later one.
        model.name, model.params = parsed

Save the results in a DataFrame for more convenient analysis. Then filter the `XGBRegressor` models and output descriptive statistics of their hyperparameters.

In [5]:
# One row per fitted model: the identifying fields plus one column per
# hyperparameter seen on any model (missing where a model lacks it).
models_df = pd.DataFrame.from_records(
    [fitted.to_dict() for fitted in fitted_models]
)
models_df

Unnamed: 0,var,fitting_line_n,name,objective,colsample_bytree,learning_rate,max_depth,alpha,n_estimators,kernel,C,gamma,degree,epsilon,coef0
0,xg_reg,2,xgb.XGBRegressor,'reg:linear',0.3,0.1,5.0,10.0,10.0,,,,,,
1,xg_reg,5,xgb.XGBRegressor,'reg:linear',0.5,0.0001,5.0,10.0,10.0,,,,,,
2,regr,9,svm.SVR,,,,,,,'poly',100.0,'auto',3.0,0.1,1.0


In [6]:
# Columns that identify a model rather than describe a hyperparameter.
ID_COLUMNS = ['var', 'fitting_line_n', 'name']

# `na=False` treats a missing name (no initialization was found for the model)
# as "no match"; without it the mask contains NaN and boolean indexing raises.
is_xgb = models_df['name'].str.contains('XGBRegressor', na=False)

# Dropping columns with missing values removes the hyperparameters that belong
# to other model types.
models_xgb = models_df[is_xgb].dropna(axis=1)
models_xgb = models_xgb.drop(columns=ID_COLUMNS, errors='ignore')

# Aggregate numeric hyperparameters only: 'mean' and 'std' are undefined for
# string-valued ones such as `objective`, and recent pandas raises on them
# instead of silently dropping the column. The final dropna removes columns
# whose statistics are undefined (e.g. std of a single row).
models_xgb.select_dtypes(include='number').agg(['min', 'max', 'mean', 'std']).dropna(axis=1)

Unnamed: 0,colsample_bytree,learning_rate,max_depth,alpha,n_estimators
min,0.3,0.0001,5.0,10.0,10.0
max,0.5,0.1,5.0,10.0,10.0
mean,0.4,0.05005,5.0,10.0,10.0
std,0.141421,0.07064,0.0,0.0,0.0
