In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier


import polars as pl
from polars import selectors as cs

from sklearn.metrics import average_precision_score
from catboost import CatBoostClassifier


In [2]:
# Training split: holds the feature columns together with the `target` label.
train = pl.read_parquet('data/first_100k.parquet')

# Evaluation split: features and labels are stored in two separate files;
# only the `target` column of the label file is kept.
X_test, y_test = (
    pl.read_parquet('data/second_100k_variables.parquet'),
    pl.read_parquet('data/second_100k_target.parquet').select('target'),
)


In [6]:

# Split the columns of `train` by dtype using polars selectors.
# `cs.numeric()` replaces the `pl.NUMERIC_DTYPES` constant, which polars has
# deprecated in favour of the selectors module.
numeric_selector = cs.numeric()

# Every non-numeric column is handled as a categorical feature.
categorical_columns = train.select(~numeric_selector).columns

# `target` is the label; `index` is excluded as well (presumably a row
# identifier — confirm). Neither is used as a numeric feature.
excluded_columns = {'target', 'index'}
numerical_columns = [
    column
    for column in train.select(numeric_selector).columns
    if column not in excluded_columns
]

In [8]:
# Numeric features: replace missing values with the column median.
numerical_pipeline = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median'))],
)

# Non-numeric features: replace missing values with the most frequent value,
# then map each category to an integer code. Categories that were not seen
# during fitting are encoded as -1 instead of raising an error.
categorical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (
            'encoder',
            OrdinalEncoder(
                handle_unknown='use_encoded_value',
                unknown_value=-1,
            ),
        ),
    ],
)

# Route each group of columns through its own sub-pipeline. Columns listed in
# neither group are not passed to any transformer here.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_pipeline', numerical_pipeline, numerical_columns),
        ('categorical_pipeline', categorical_pipeline, categorical_columns),
    ],
)



In [13]:
# Hyper-parameters for the CatBoost classifier, kept in one place so they are
# easy to inspect and tweak.
catboost_params = {
    'iterations': 1_200,     # number of boosting rounds
    'learning_rate': 0.065,
    'depth': 8,              # depth of each tree
}
new_model = CatBoostClassifier(**catboost_params)

In [14]:
# Chain the column preprocessing and the classifier so that fitting and
# predicting apply exactly the same transformations.
pipeline = Pipeline(
    steps=[
        ('processor', preprocessor),
        ('extra', new_model),
    ],
)
# Bare last expression: renders the pipeline representation as the cell output.
pipeline

In [15]:
# Fit preprocessing and model end to end. The features are every column of
# `train` except the label; the label is flattened to a 1-D array.
pipeline.fit(
    X=train.drop('target').to_pandas(),
    y=train.select('target').to_numpy().ravel(),
)

0:	learn: 0.6532748	total: 801ms	remaining: 15m 59s
1:	learn: 0.6181174	total: 1.05s	remaining: 10m 29s
2:	learn: 0.5882123	total: 1.39s	remaining: 9m 13s
3:	learn: 0.5625162	total: 1.66s	remaining: 8m 15s
4:	learn: 0.5405396	total: 1.92s	remaining: 7m 39s
5:	learn: 0.5215491	total: 2.2s	remaining: 7m 17s
6:	learn: 0.5041750	total: 2.46s	remaining: 6m 59s
7:	learn: 0.4899552	total: 2.76s	remaining: 6m 51s
8:	learn: 0.4782613	total: 3.08s	remaining: 6m 47s
9:	learn: 0.4682091	total: 3.49s	remaining: 6m 55s
10:	learn: 0.4602005	total: 3.99s	remaining: 7m 11s
11:	learn: 0.4519023	total: 4.24s	remaining: 6m 59s
12:	learn: 0.4449217	total: 4.49s	remaining: 6m 50s
13:	learn: 0.4395755	total: 4.69s	remaining: 6m 37s
14:	learn: 0.4341681	total: 5.08s	remaining: 6m 41s
15:	learn: 0.4293228	total: 5.32s	remaining: 6m 33s
16:	learn: 0.4252149	total: 5.56s	remaining: 6m 26s
17:	learn: 0.4217430	total: 5.8s	remaining: 6m 20s
18:	learn: 0.4189100	total: 6.02s	remaining: 6m 14s
19:	learn: 0.4163948	t

In [None]:
# Score each test row with the predicted probability of the positive class.
# `average_precision_score` ranks samples by a continuous score; the hard
# class labels returned by `predict` collapse that ranking into two groups
# and distort the metric.
# NOTE(review): column 1 is the positive class only for a binary target whose
# positive label comes second in the fitted `classes_` — confirm.
new_predictions = pipeline.predict_proba(X_test.to_pandas())[:, 1]


In [None]:
# Average precision of the test-set predictions against the true labels.
new_score = average_precision_score(
    y_true=y_test.to_numpy().ravel(),
    y_score=new_predictions,
)
new_score

In [None]:
def get_score(y_true: pl.DataFrame, y_pred: pl.DataFrame, default_score: float) -> float:
    """Return the average precision of `y_pred` relative to a reference score.

    Args:
        y_true: Frame holding the true labels; all of its values are
            flattened into a single 1-D array.
        y_pred: Frame with a `prediction` column holding the model output.
        default_score: Reference average precision the result is divided by.

    Returns:
        The average precision of the predictions divided by `default_score`.
    """
    labels = y_true.to_numpy().ravel()
    scores = y_pred.select('prediction').to_numpy().ravel()
    return average_precision_score(labels, scores) / default_score

In [None]:
# `get_score` reads a `prediction` column from a polars DataFrame, whereas
# `new_predictions` is the raw array returned by the pipeline. Passing the
# array directly fails on `.select`, so wrap it in a DataFrame first.
predictions_frame = pl.DataFrame({'prediction': new_predictions})

# Reference average precision that the new score is divided by.
baseline_score = 0.22236839166738745

print(
    f'Tu nueva nota seria de: {get_score(y_test, predictions_frame, baseline_score)}'
)