In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression

from jcopml.feature_importance import mean_score_decrease
from jcopml.pipeline import cat_pipe,num_pipe
from jcopml.plot import plot_missing_value
from jcopml.tuning import grid_search_params as gsp


In [2]:
# Load the Iris CSV, using its 'Id' column as the row index, then preview
# the first rows.
df = pd.read_csv(
    "data/iris.csv",
    index_col="Id",
)
df.head()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Count how many rows belong to each species label (class-balance check).
df["Species"].value_counts()

Iris-versicolor    50
Iris-setosa        50
Iris-virginica     50
Name: Species, dtype: int64

In [4]:
# Features are every column except the label; the label is 'Species'.
X = df.drop(columns="Species")
y = df["Species"]

# Hold out 20% of the rows as a test set. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Show the shape of each split as the cell output.
(
    X_train.shape,
    X_test.shape,
    y_train.shape,
    y_test.shape,
)

((120, 4), (30, 4), (120,), (30,))

In [5]:
# Display jcopml's predefined search grid for logistic regression. Its keys are
# prefixed with 'algo__', which matches the 'algo' step name used in the
# pipeline that is tuned below.
gsp.logreg_params

{'algo__fit_intercept': [True, False],
 'algo__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])}

In [6]:
# Preprocessing: send every feature column through jcopml's numeric pipeline
# (called with its default settings).
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", num_pipe(), X_train.columns),
    ]
)

# Full pipeline: preprocessing followed by a one-vs-rest logistic regression.
# The step is named 'algo' so that the 'algo__*' keys of the search grid
# address the classifier's hyperparameters.
pipeline = Pipeline(
    steps=[
        ("prep", preprocessor),
        (
            "algo",
            LogisticRegression(
                multi_class="ovr",
                solver="lbfgs",
                n_jobs=-1,
                random_state=42,
            ),
        ),
    ]
)

# Exhaustive grid search over gsp.logreg_params with 3-fold cross-validation,
# using all available cores.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=gsp.logreg_params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

# Report the winning hyperparameters, then three scores on one line:
# training-split score, best cross-validation score, test-split score.
print(model.best_params_)
print(
    model.score(X_train, y_train),
    model.best_score_,
    model.score(X_test, y_test),
)

Fitting 3 folds for each of 14 candidates, totalling 42 fits
{'algo__C': 1000.0, 'algo__fit_intercept': False}
0.9583333333333334 0.9500000000000001 0.9666666666666667


In [8]:
# Re-display the same logistic-regression search grid that was shown earlier in
# this notebook, for reference before the multinomial run.
# NOTE(review): this cell's execution count (In [8]) is higher than that of the
# cell below it (In [7]), so the notebook was run out of order; verify with
# Restart Kernel -> Run All.
gsp.logreg_params

{'algo__fit_intercept': [True, False],
 'algo__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])}

In [7]:
# Preprocessing: send every feature column through jcopml's numeric pipeline
# (called with its default settings).
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", num_pipe(), X_train.columns),
    ]
)

# Full pipeline: preprocessing followed by a multinomial logistic regression.
# The step is named 'algo' so that the 'algo__*' keys of the search grid
# address the classifier's hyperparameters.
pipeline = Pipeline(
    steps=[
        ("prep", preprocessor),
        (
            "algo",
            LogisticRegression(
                multi_class="multinomial",
                solver="lbfgs",
                n_jobs=-1,
                random_state=42,
            ),
        ),
    ]
)

# Exhaustive grid search over gsp.logreg_params with 3-fold cross-validation,
# using all available cores.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=gsp.logreg_params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

# Report the winning hyperparameters, then three scores on one line:
# training-split score, best cross-validation score, test-split score.
print(model.best_params_)
print(
    model.score(X_train, y_train),
    model.best_score_,
    model.score(X_test, y_test),
)

Fitting 3 folds for each of 14 candidates, totalling 42 fits
{'algo__C': 10.0, 'algo__fit_intercept': True}
0.9833333333333333 0.9666666666666667 1.0


Logistic regression bagus untuk kasus deteksi seperti cancer detection dan fraud detection, terutama jika parameter `class_weight` dipakai: beri bobot (`class_weight`) yang lebih berat pada kelas yang jumlah datanya lebih sedikit agar model lebih fokus ke kelas tersebut, seperti pada kasus klasifikasi dengan SVM.