In [1]:
import statsmodels.api as sm
from sklearn.datasets import load_diabetes
import pandas as pd

# regression

цей спосіб вибору змінних релевантний тільки для лінійної та логістичної регресії

In [2]:
def remove_insignificant_features_reg(X_train, y_train, significance_level=0.05):
    X_train_sm = sm.add_constant(X_train)
    while True:
        model = sm.OLS(y_train, X_train_sm).fit()
        p_values = model.pvalues
        max_p_value = p_values.max()
        
        if max_p_value < significance_level:
            break
        
        feature_to_remove = p_values.idxmax()
        X_train_sm = X_train_sm.drop(columns=[feature_to_remove])
        print(f"Видалено фічу: {feature_to_remove} (p-value = {max_p_value:.4f})")
    
    return X_train_sm.drop(columns=['const'])

In [3]:
def check_p_values(X_train, y_train):
    X_train_sm = sm.add_constant(X_train)
    model = sm.OLS(y_train, X_train_sm).fit()
    return model.summary()

In [4]:
diabetes = load_diabetes()
X_train = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y_train = pd.Series(diabetes.target)

In [5]:
check_p_values(X_train, y_train)

0,1,2,3
Dep. Variable:,y,R-squared:,0.518
Model:,OLS,Adj. R-squared:,0.507
Method:,Least Squares,F-statistic:,46.27
Date:,"Tue, 25 Feb 2025",Prob (F-statistic):,3.8299999999999998e-62
Time:,17:51:25,Log-Likelihood:,-2386.0
No. Observations:,442,AIC:,4794.0
Df Residuals:,431,BIC:,4839.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,152.1335,2.576,59.061,0.000,147.071,157.196
age,-10.0099,59.749,-0.168,0.867,-127.446,107.426
sex,-239.8156,61.222,-3.917,0.000,-360.147,-119.484
bmi,519.8459,66.533,7.813,0.000,389.076,650.616
bp,324.3846,65.422,4.958,0.000,195.799,452.970
s1,-792.1756,416.680,-1.901,0.058,-1611.153,26.802
s2,476.7390,339.030,1.406,0.160,-189.620,1143.098
s3,101.0433,212.531,0.475,0.635,-316.684,518.770
s4,177.0632,161.476,1.097,0.273,-140.315,494.441

0,1,2,3
Omnibus:,1.506,Durbin-Watson:,2.029
Prob(Omnibus):,0.471,Jarque-Bera (JB):,1.404
Skew:,0.017,Prob(JB):,0.496
Kurtosis:,2.726,Cond. No.,227.0


Загальні метрики моделі

* Dep. Variable (Залежна змінна): y — це цільова змінна, яку ми намагаємося передбачити.
* R-squared (Коефіцієнт детермінації): 0.518
* Означає, що модель пояснює 51.8% варіації залежної змінної.
* Adj. R-squared (Скоригований R²): 0.507
    Бере до уваги кількість предикторів і штрафує за зайві змінні. Оскільки скоригований R² трохи нижчий за звичайний R², деякі фічі можуть бути зайвими.
* F-statistic: 46.27
    Оцінює, наскільки модель значуща загалом.
* Prob (F-statistic): 3.83e-62
    Дуже мале значення (≈ 0), що означає, що хоча б одна незалежна змінна справді впливає на y.

Інформація про вибірку

* No. Observations (Кількість спостережень): 442
* Df Model (Ступені свободи моделі): 10 (кількість фіч)
* Df Residuals (Ступені свободи залишків): 431 (442 - 10 - 1)
* Log-Likelihood: -2386.0
    Чим більше (менш негативне) значення, тим краще модель підходить до даних.
* AIC (Akaike Information Criterion): 4794
* BIC (Bayesian Information Criterion): 4839
    Чим менше AIC/BIC, тим краще модель. Їх можна використовувати для порівняння моделей.

* coef (β) — коефіцієнти регресії. Покажуть, як зміна фічі на 1 одиницю змінює y.
* Std Err — стандартна помилка коефіцієнта.
* t — t-статистика (чим більше, тим значущіша змінна).
* P>|t| — p-value. Якщо p-value < 0.05, то змінна значуща.
* 95% CI — довірчий інтервал (якщо включає 0, змінна може бути незначущою).

Аналіз змінних
Значущі змінні (p-value < 0.05):
sex, bmi, bp, s5

Майже значуща (p = 0.058):
s1

Незначущі:

age, s2, s3, s4, s6

→ `Їх можна спробувати видалити`

Додаткові діагностичні тести
 - Durbin-Watson: 2.029
Близько до 2 → відсутня серйозна автокореляція.
 - Omnibus / Prob(Omnibus): 1.506 / 0.471
Тест на нормальність залишків (p > 0.05 → нормальний розподіл).
 - Jarque-Bera (JB) / Prob(JB): 1.404 / 0.496
Також перевіряє нормальність (p > 0.05 → залишки нормальні).
 - Skew (Асиметрія): 0.017
Майже симетричний розподіл.
 - Kurtosis (Ексцес): 2.726
Близько до 3 → нормальний розподіл.

In [6]:
X_train_cleaned = remove_insignificant_features_reg(X_train, y_train)

Видалено фічу: age (p-value = 0.8670)
Видалено фічу: s3 (p-value = 0.6386)
Видалено фічу: s6 (p-value = 0.3040)
Видалено фічу: s4 (p-value = 0.2619)


In [7]:
X_train_cleaned

Unnamed: 0,sex,bmi,bp,s1,s2,s5
0,0.050680,0.061696,0.021872,-0.044223,-0.034821,0.019907
1,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,-0.068332
2,0.050680,0.044451,-0.005670,-0.045599,-0.034194,0.002861
3,-0.044642,-0.011595,-0.036656,0.012191,0.024991,0.022688
4,-0.044642,-0.036385,0.021872,0.003935,0.015596,-0.031988
...,...,...,...,...,...,...
437,0.050680,0.019662,0.059744,-0.005697,-0.002566,0.031193
438,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.018114
439,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.046883
440,-0.044642,0.039062,0.001215,0.016318,0.015283,0.044529


# classification

In [8]:
import statsmodels.api as sm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
import pandas as pd
import numpy as np

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [9]:
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
model = LogisticRegression()
scorer = make_scorer(f1_score, average='weighted')

selector = RFECV(model, step=1, cv=5, scoring=scorer)
selector.fit(X_train, y_train)

selected_features = X_train.columns[selector.support_]
X_train[selected_features]

Unnamed: 0,mean radius,mean texture,mean compactness,mean concavity,mean concave points,mean symmetry,texture error,perimeter error,area error,compactness error,...,concave points error,worst radius,worst texture,worst perimeter,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
68,9.029,17.33,0.14130,0.31300,0.04375,0.2111,1.1940,1.8850,17.67,0.086060,...,0.033220,10.310,22.65,65.50,0.14820,0.43650,1.25200,0.17500,0.4228,0.11750
181,21.090,26.57,0.28320,0.24870,0.14960,0.2395,0.7629,4.4140,81.46,0.047590,...,0.015670,26.680,33.48,176.50,0.14910,0.75840,0.67800,0.29030,0.4098,0.12840
63,9.173,13.86,0.08751,0.05988,0.02180,0.2341,2.2650,2.6080,23.52,0.039380,...,0.015600,10.010,19.23,65.59,0.09836,0.16780,0.13970,0.05087,0.3282,0.08490
248,10.650,25.22,0.07234,0.02379,0.01615,0.1897,1.4930,1.4970,16.64,0.010350,...,0.006245,12.250,35.19,77.98,0.14990,0.13980,0.11250,0.06136,0.3409,0.08147
60,10.170,14.88,0.08061,0.01084,0.01290,0.2743,1.4410,3.3120,34.62,0.010990,...,0.008193,11.020,17.45,69.86,0.12750,0.09866,0.02168,0.02579,0.3557,0.08020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,8.888,14.64,0.15310,0.08606,0.02872,0.1902,0.8522,3.1680,25.44,0.093680,...,0.017660,9.733,15.67,62.56,0.12070,0.24360,0.14340,0.04786,0.2254,0.10840
106,11.640,18.33,0.10170,0.07070,0.03485,0.1801,1.6570,2.1550,20.62,0.023100,...,0.013980,13.140,29.26,85.51,0.16880,0.26600,0.28730,0.12180,0.2806,0.09097
270,14.290,16.82,0.02675,0.00725,0.00625,0.1508,0.7198,0.8439,10.77,0.003710,...,0.003608,14.910,20.65,94.44,0.08567,0.05036,0.03866,0.03333,0.2458,0.06120
435,13.980,19.62,0.11330,0.11260,0.06463,0.1669,0.9533,1.6020,18.85,0.017910,...,0.009567,17.040,30.80,113.90,0.16130,0.35680,0.40690,0.18270,0.3179,0.10550
