In [7]:
import os

import pandas as pd
from scipy.io import arff

pd.set_option("display.float_format", "{:.2f}".format)

# Location of the ARFF file. The original absolute path is kept as the default
# so existing runs behave the same; set the ADULT_CENSUS_ARFF environment
# variable to point at the file on another machine instead of editing this cell.
ARFF_PATH = os.environ.get(
    "ADULT_CENSUS_ARFF",
    r"C:\Users\HP\OneDrive\Escritorio\David Guzzi\Github\inria\phpMawTba.arff",
)

# Load the .arff file: `data` holds the records, `meta` the attribute metadata.
data, meta = arff.loadarff(ARFF_PATH)

# Convert the loaded records to a pandas DataFrame and preview the first row.
adult_census = pd.DataFrame(data)
adult_census.head(1)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25.0,b'Private',226802.0,b'11th',7.0,b'Never-married',b'Machine-op-inspct',b'Own-child',b'Black',b'Male',0.0,0.0,40.0,b'United-States',b'<=50K'


In [8]:
# Label column and the numeric feature columns used for modelling.
target_name = "class"
numerical_columns = ["age", "capital-gain", "capital-loss", "hours-per-week"]

# The label column holds byte strings (e.g. b'<=50K'); decode them to text.
target = adult_census[target_name].str.decode("utf-8")

# Keep only the numeric features as model input.
data = adult_census[numerical_columns]

data.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
0,25.0,0.0,0.0,40.0
1,38.0,0.0,0.0,50.0
2,28.0,0.0,0.0,40.0
3,44.0,7688.0,0.0,40.0
4,18.0,0.0,0.0,30.0


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Two-step pipeline: a StandardScaler step named "preprocessor" followed by a
# LogisticRegression step named "classifier". The step names are the prefixes
# used in `<step>__<hyperparameter>` keys.
model = Pipeline(
    [
        ("preprocessor", StandardScaler()),
        ("classifier", LogisticRegression()),
    ]
)

In [13]:
# Show every hyperparameter of the pipeline and of its steps with the current values.
model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', LogisticRegression())],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': LogisticRegression(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__C': 1.0,
 'classifier__class_weight': None,
 'classifier__dual': False,
 'classifier__fit_intercept': True,
 'classifier__intercept_scaling': 1,
 'classifier__l1_ratio': None,
 'classifier__max_iter': 100,
 'classifier__multi_class': 'deprecated',
 'classifier__n_jobs': None,
 'classifier__penalty': 'l2',
 'classifier__random_state': None,
 'classifier__solver': 'lbfgs',
 'classifier__tol': 0.0001,
 'classifier__verbose': 0,
 'classifier__warm_start': False}

In [11]:
from sklearn.model_selection import cross_validate

# Evaluate the pipeline with cross_validate's default settings and keep the
# array stored under the "test_score" key.
cv_results = cross_validate(model, data, target)
scores = cv_results["test_score"]

# Report the mean and standard deviation of the test scores.
print(
    "Accuracy score via cross-validation: \n"
    f"{scores.mean():.3f} ± {scores.std():.3f}"
)

Acurracy socre via cross-validation: 
0.800 ± 0.003


After a model has been created, we can still change its hyperparameters
with the `set_params` method, which is available for all scikit-learn
estimators. For example, we can set `C=1e-3`, then fit and evaluate the model:

In [14]:
# Set the `C` hyperparameter of the pipeline's "classifier" step on the
# existing model object, then evaluate it again with cross-validation.
model.set_params(classifier__C=1e-3)
cv_results = cross_validate(model, data, target)
scores = cv_results["test_score"]

# Report the mean and standard deviation of the test scores.
print(
    "Accuracy score via cross-validation: \n"
    f"{scores.mean():.3f} ± {scores.std():.3f}"
)

Acurracy socre via cross-validation: 
0.787 ± 0.002


When the model of interest is a `Pipeline`, the hyperparameter names are of
the form `<model_name>__<hyperparameter_name>` (note the double underscore in
the middle). In our case, `classifier` comes from the `Pipeline` definition
and `C` is the hyperparameter name of `LogisticRegression`.

In general, you can use the `get_params` method on scikit-learn models to list
all the hyperparameters with their values. For example, if you want to get all
the hyperparameter names, you can use:

In [15]:
# The keys of the dict returned by `get_params` are the hyperparameter names;
# print one per line.
for parameter in model.get_params().keys():
    print(parameter)

memory
steps
verbose
preprocessor
classifier
preprocessor__copy
preprocessor__with_mean
preprocessor__with_std
classifier__C
classifier__class_weight
classifier__dual
classifier__fit_intercept
classifier__intercept_scaling
classifier__l1_ratio
classifier__max_iter
classifier__multi_class
classifier__n_jobs
classifier__penalty
classifier__random_state
classifier__solver
classifier__tol
classifier__verbose
classifier__warm_start


`.get_params()` returns a `dict` whose keys are the hyperparameter names and
whose values are the hyperparameter values. If you want to get the value of a
single hyperparameter, for example `classifier__C`, you can use:

In [16]:
# Look up the value of a single hyperparameter by its `<step>__<name>` key.
model.get_params()["classifier__C"]

0.001