# Debug model training pipeline

# 1. Imports

## 1.1 Packages

In [1]:
import pandas as pd

## 1.2 Options

In [2]:
# Name of the estimator and the number of PCA components handed to ModelParams.
model_name = "RandomForestClassifier"
pca_n_components = 10

# Hyperparameter search space, keyed by hyperparameter name. Each entry carries
# either "min"/"max" bounds or a list of "choices", plus a "sampling_type" tag.
# NOTE(review): how these entries are interpreted is decided by ModelParams /
# the training pipeline, which is not visible here — confirm before changing keys.
search_space = {
    "n_estimators": {"min": 10, "max": 1000, "sampling_type": "int"},
    "max_depth": {"min": 2, "max": 10, "sampling_type": "int"},
    "min_samples_split": {"min": 2, "max": 10, "sampling_type": "int"},
    "max_features": {"choices": ["sqrt", "log2"], "sampling_type": "categorical"},
}

## 1.3 Functions

In [3]:
from impostor_hunt_in_texts.pipelines.model_training.model_params import (
    ModelParams,
)

  from .autonotebook import tqdm as notebook_tqdm


## 1.4 Datasets

In [4]:
# Load the training feature table; the path is relative to the notebook's
# working directory.
df_train = pd.read_parquet(
    path="../data/05_model_input/df_train_features.parquet",
)

In [5]:
# Preview the first five rows of the training features.
df_train.head(n=5)

Unnamed: 0,id,token_feat_0,token_feat_1,token_feat_2,token_feat_3,token_feat_4,token_feat_5,token_feat_6,token_feat_7,token_feat_8,...,token_feat_3062,token_feat_3063,token_feat_3064,token_feat_3065,token_feat_3066,token_feat_3067,token_feat_3068,token_feat_3069,token_feat_3070,token_feat_3071
0,0.0,-0.235974,0.215428,0.530858,-0.108299,0.283128,-0.182255,-0.132632,0.25254,-0.070826,...,0.030745,0.001176,0.001116,0.065424,0.035792,0.233748,0.023147,-0.003413,0.009224,0.000235
1,1.0,-0.184129,0.191412,0.287647,-0.010728,0.28183,-0.010582,0.092494,0.235936,-0.020136,...,0.039444,0.001199,-0.000591,0.005231,0.026553,0.174157,-0.010427,0.021792,0.004784,0.001479
2,2.0,-0.163754,0.325184,0.475337,-0.124496,0.336826,-0.098111,0.001948,0.481946,-0.052951,...,0.027463,-0.001525,-0.008738,-0.005016,0.034096,0.20227,0.010722,0.000356,-0.003166,-0.007968
3,3.0,-0.105008,0.288435,0.409958,-0.061246,0.352177,-0.101375,0.046904,0.460946,0.046344,...,0.072438,0.012477,-0.004805,0.020615,0.040837,0.132445,0.043434,0.011009,-0.001763,-0.003195
4,4.0,0.086316,-0.001653,0.401914,0.16326,0.281869,-0.241231,-0.07471,0.896416,-0.319564,...,0.016326,-0.071852,-0.044551,0.003944,0.028465,0.044221,-0.003464,-0.007189,-0.001023,-0.001313


# 2. Debug

In [3]:
# Build ModelParams from the options defined above; as the last expression of
# the cell, the resulting object's repr is displayed for inspection.
ModelParams(
    model_name=model_name, pca_n_components=pca_n_components, search_params=search_space
)

ModelParams(model_name='RandomForestClassifier', pca_n_components=10, search_params={'n_estimators': {'min': 10, 'max': 1000, 'sampling_type': 'int'}, 'max_depth': {'min': 2, 'max': 10, 'sampling_type': 'int'}, 'min_samples_split': {'min': 2, 'max': 10, 'sampling_type': 'int'}, 'max_features': {'choices': ['sqrt', 'log2'], 'sampling_type': 'categorical'}})