# RFECV

In [1]:
import statsmodels.api as sm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
import pandas as pd
import numpy as np

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
X

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [4]:
model = LogisticRegression()
scorer = make_scorer(f1_score, average='weighted')

selector = RFECV(model, step=1, cv=5, scoring=scorer)
selector.fit(X_train, y_train)

selected_features = X_train.columns[selector.support_]
X_train[selected_features]

Unnamed: 0,mean radius,mean texture,mean compactness,mean concavity,mean concave points,mean symmetry,texture error,perimeter error,area error,compactness error,...,concave points error,worst radius,worst texture,worst perimeter,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
68,9.029,17.33,0.14130,0.31300,0.04375,0.2111,1.1940,1.8850,17.67,0.086060,...,0.033220,10.310,22.65,65.50,0.14820,0.43650,1.25200,0.17500,0.4228,0.11750
181,21.090,26.57,0.28320,0.24870,0.14960,0.2395,0.7629,4.4140,81.46,0.047590,...,0.015670,26.680,33.48,176.50,0.14910,0.75840,0.67800,0.29030,0.4098,0.12840
63,9.173,13.86,0.08751,0.05988,0.02180,0.2341,2.2650,2.6080,23.52,0.039380,...,0.015600,10.010,19.23,65.59,0.09836,0.16780,0.13970,0.05087,0.3282,0.08490
248,10.650,25.22,0.07234,0.02379,0.01615,0.1897,1.4930,1.4970,16.64,0.010350,...,0.006245,12.250,35.19,77.98,0.14990,0.13980,0.11250,0.06136,0.3409,0.08147
60,10.170,14.88,0.08061,0.01084,0.01290,0.2743,1.4410,3.3120,34.62,0.010990,...,0.008193,11.020,17.45,69.86,0.12750,0.09866,0.02168,0.02579,0.3557,0.08020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,8.888,14.64,0.15310,0.08606,0.02872,0.1902,0.8522,3.1680,25.44,0.093680,...,0.017660,9.733,15.67,62.56,0.12070,0.24360,0.14340,0.04786,0.2254,0.10840
106,11.640,18.33,0.10170,0.07070,0.03485,0.1801,1.6570,2.1550,20.62,0.023100,...,0.013980,13.140,29.26,85.51,0.16880,0.26600,0.28730,0.12180,0.2806,0.09097
270,14.290,16.82,0.02675,0.00725,0.00625,0.1508,0.7198,0.8439,10.77,0.003710,...,0.003608,14.910,20.65,94.44,0.08567,0.05036,0.03866,0.03333,0.2458,0.06120
435,13.980,19.62,0.11330,0.11260,0.06463,0.1669,0.9533,1.6020,18.85,0.017910,...,0.009567,17.040,30.80,113.90,0.16130,0.35680,0.40690,0.18270,0.3179,0.10550


# SelectKBest

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
X, y = load_breast_cancer(return_X_y=True)
feature_names = load_breast_cancer().feature_names
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [7]:
selector = SelectKBest(score_func=f_classif, k=5)
selector.fit(X_train, y_train)

X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

selected_features = feature_names[selector.get_support()]
print("Обрані фічі:", selected_features)

Обрані фічі: ['mean perimeter' 'mean concave points' 'worst radius' 'worst perimeter'
 'worst concave points']


In [8]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train_selected, y_train)

y_pred = model.predict(X_test_selected)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9473684210526315


In [9]:
f_scores = selector.scores_
p_values = selector.pvalues_

df = pd.DataFrame({
    "Фіча": feature_names,
    "F-скору": f_scores,
    "p-значення": p_values
})

df_sorted = df.sort_values("F-скору", ascending=False)
df_sorted.head(10)

Unnamed: 0,Фіча,F-скору,p-значення
27,worst concave points,746.492117,7.751614e-98
7,mean concave points,695.179785,1.570324e-93
22,worst perimeter,681.263759,2.495901e-92
20,worst radius,645.350668,3.688574e-89
2,mean perimeter,522.489267,1.801971e-77
23,worst area,495.787667,9.798902e-75
0,mean radius,482.233945,2.566812e-73
3,mean area,423.654133,6.111971e-67
6,mean concavity,396.66237,7.411691e-64
26,worst concavity,331.330906,5.775897e-56
