In [1]:
# Ячейка 1: импорты и общие настройки
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                             confusion_matrix, classification_report,
                             mean_absolute_error, mean_squared_error, r2_score)

# модели
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from imblearn.over_sampling import SMOTE

# полезные
import seaborn as sns
sns.set_style('whitegrid')
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [2]:
df = pd.read_csv('UCI_Credit_Card.csv')
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [3]:
# Ячейка 3: базовые фичи и очистка
df = df.copy()

# переименуем для удобства (если нужно)
# df.rename(columns={'default.payment.next.month':'target'}, inplace=True)

# Пример новых признаков:
# Кол-во просрочек: используем PAY_0..PAY_6 (или PAY_1..PAY_6 в вашем датасете)
pay_cols = [c for c in df.columns if c.startswith('PAY')]
df['n_late'] = (df[pay_cols] > 0).sum(axis=1)
df['max_late'] = df[pay_cols].max(axis=1)

# Отношение сумм к кредитному лимиту (если есть LIMIT_BAL)
if 'LIMIT_BAL' in df.columns:
    bill_cols = [c for c in df.columns if c.startswith('BILL_AMT')]
    pay_amt_cols = [c for c in df.columns if c.startswith('PAY_AMT')]
    df['bill_sum'] = df[bill_cols].sum(axis=1)
    df['pay_sum'] = df[pay_amt_cols].sum(axis=1)
    df['pay_rate'] = df['pay_sum'] / (df['bill_sum'] + 1e-6)
    df['bill_limit_ratio'] = df['bill_sum'] / (df['LIMIT_BAL'] + 1e-6)

# Обработаем категориальные как object, но в этом датасете они часто numeric-coded
cat_cols = ['SEX','EDUCATION','MARRIAGE']
cat_cols = [c for c in cat_cols if c in df.columns]

# Целевая
y = df['default.payment.next.month']
X = df.drop(['default.payment.next.month', 'ID'] if 'ID' in df.columns else ['default.payment.next.month'], axis=1)


In [4]:
from sklearn.model_selection import StratifiedKFold

# разделим данные один раз
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_SEED)

# признаки
num_cols = X_train.select_dtypes(include=np.number).columns.tolist()
# убираем cat_cols из численных, если они кодируются как числа
num_cols = [c for c in num_cols if c not in cat_cols]
cat_cols = [c for c in cat_cols if c in X_train.columns]

# preprocess
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# модель + pipeline
pipe = Pipeline([
    ('pre', preprocessor),
    ('smote', SMOTE(random_state=RANDOM_SEED)),   # Note: SMOTE должен применяться только к X_train; интеграция в Pipeline с ColumnTransformer требует адаптации
    ('clf', RandomForestClassifier(random_state=RANDOM_SEED))
])


In [5]:
# Преобразуем X_train через preprocessor отдельно для SMOTE
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

# Применим SMOTE к подготовленным данным
sm = SMOTE(random_state=RANDOM_SEED)
X_res, y_res = sm.fit_resample(X_train_prep, y_train)

# Теперь подбираем модель на resampled данных
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

rfc = RandomForestClassifier(random_state=RANDOM_SEED)
grid = GridSearchCV(rfc, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid.fit(X_res, y_res)
grid.best_params_, grid.best_score_


({'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200},
 0.8664310072604788)

In [6]:
best = grid.best_estimator_
y_pred = best.predict(X_test_prep)
y_proba = best.predict_proba(X_test_prep)[:,1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))


Accuracy: 0.801
Precision: 0.5603996366939146
Recall: 0.46495855312735496
F1: 0.5082372322899505
ROC AUC: 0.7552831599573687

Confusion matrix:
 [[4189  484]
 [ 710  617]]

Classification report:
               precision    recall  f1-score   support

           0       0.86      0.90      0.88      4673
           1       0.56      0.46      0.51      1327

    accuracy                           0.80      6000
   macro avg       0.71      0.68      0.69      6000
weighted avg       0.79      0.80      0.79      6000



In [15]:
# Загружаем данные
df_reg = pd.read_csv('AirQuality.csv', sep=';', decimal=',')

df_reg = df_reg.loc[:, ~df_reg.columns.str.contains('^Unnamed')]

numeric_cols = df_reg.select_dtypes(include=[np.number]).columns.tolist()
mask = (df_reg[numeric_cols] != -200).all(axis=1)
df_reg = df_reg[mask]

# Удаляем строки с пропусками в любом столбце
df_reg = df_reg.dropna()

# Остальной код остается как был:
df_reg.head()
df_reg.info()
df_reg.describe()

=== AirQuality Dataset Check ===
Shape: (9471, 17)

First few rows:
         Date      Time  CO(GT)  PT08.S1(CO)  NMHC(GT)  C6H6(GT)  \
0  10/03/2004  18.00.00     2.6       1360.0     150.0      11.9   
1  10/03/2004  19.00.00     2.0       1292.0     112.0       9.4   
2  10/03/2004  20.00.00     2.2       1402.0      88.0       9.0   
3  10/03/2004  21.00.00     2.2       1376.0      80.0       9.2   
4  10/03/2004  22.00.00     1.6       1272.0      51.0       6.5   

   PT08.S2(NMHC)  NOx(GT)  PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)  \
0         1046.0    166.0        1056.0    113.0        1692.0       1268.0   
1          955.0    103.0        1174.0     92.0        1559.0        972.0   
2          939.0    131.0        1140.0    114.0        1555.0       1074.0   
3          948.0    172.0        1092.0    122.0        1584.0       1203.0   
4          836.0    131.0        1205.0    116.0        1490.0       1110.0   

      T    RH      AH  Unnamed: 15  Unnamed: 16 

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
count,7674.0,7674.0,7674.0,7674.0,7674.0,7674.0,7674.0,7674.0,7674.0,7674.0,7674.0,7674.0,7674.0
mean,2.15275,1054.2227,-150.867735,1.233385,897.865911,230.809747,782.760099,97.746286,1374.024498,990.038963,8.405786,38.349909,-7.653565
std,1.453252,341.261735,152.553382,43.276414,348.72825,233.316418,326.25123,85.341664,478.335306,470.166864,45.022976,53.334406,40.777732
min,0.1,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
25%,1.1,927.0,-200.0,4.1,716.0,90.0,628.0,73.0,1157.0,704.0,10.3,33.5,0.65
50%,1.8,1062.0,-200.0,8.1,903.0,177.0,782.0,107.0,1425.0,968.0,16.3,48.1,0.9406
75%,2.9,1235.0,-200.0,14.0,1116.75,326.0,949.0,141.0,1659.0,1287.0,23.5,61.8,1.2352
max,11.9,2040.0,1189.0,63.7,2214.0,1479.0,2683.0,340.0,2775.0,2523.0,44.6,88.7,2.1806


In [8]:
# Создадим временные признаки если есть Date/Time
if 'Date' in df_reg.columns and 'Time' in df_reg.columns:
    dt = pd.to_datetime(df_reg['Date'] + ' ' + df_reg['Time'], dayfirst=True, errors='coerce')
    df_reg['hour'] = dt.dt.hour
    df_reg['weekday'] = dt.dt.weekday
    df_reg['month'] = dt.dt.month

# Лаги для CO(GT) и скользящие средние (удаляем первые строки с NaN после лагов)
df_reg = df_reg.sort_values(by=['Date','Time']) if ('Date' in df_reg.columns) else df_reg
df_reg['CO_lag1'] = df_reg['CO(GT)'].shift(1)
df_reg['CO_roll3'] = df_reg['CO(GT)'].rolling(window=3, min_periods=1).mean()
df_reg = df_reg[df_reg['CO_lag1'].notna()]  # удалим первые NaN


  dt = pd.to_datetime(df_reg['Date'] + ' ' + df_reg['Time'], dayfirst=True, errors='coerce')


In [11]:
X = df_reg.drop(columns=['CO(GT)'])
y = df_reg['CO(GT)']

# разделим
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

num_cols = X_train.select_dtypes(include=np.number).columns.tolist()
cat_cols = X_train.select_dtypes(include='object').columns.tolist()

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])


In [12]:
# Пример: RandomForest и GradientBoosting
rf = RandomForestRegressor(random_state=RANDOM_SEED)
gb = GradientBoostingRegressor(random_state=RANDOM_SEED)

pipe_rf = Pipeline([('pre', preprocessor), ('model', rf)])
pipe_gb = Pipeline([('pre', preprocessor), ('model', gb)])
pipe_lr = Pipeline([('pre', preprocessor), ('model', LinearRegression())])

param_grid_rf = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10]
}
param_grid_gb = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.05, 0.1]
}

gs_rf = GridSearchCV(pipe_rf, param_grid_rf, cv=5, scoring='r2', n_jobs=-1)
gs_gb = GridSearchCV(pipe_gb, param_grid_gb, cv=5, scoring='r2', n_jobs=-1)

gs_rf.fit(X_train, y_train)
gs_gb.fit(X_train, y_train)

print("RF best:", gs_rf.best_params_, gs_rf.best_score_)
print("GB best:", gs_gb.best_params_, gs_gb.best_score_)




RF best: {'model__max_depth': None, 'model__n_estimators': 200} 0.9529603036506735
GB best: {'model__learning_rate': 0.1, 'model__n_estimators': 200} 0.9565477866678929


In [13]:
best_rf = gs_rf.best_estimator_
best_gb = gs_gb.best_estimator_
best_lr = pipe_lr.fit(X_train, y_train)

for name, model in [('RF', best_rf), ('GB', best_gb), ('LR', best_lr)]:
    pred = model.predict(X_test)
    print("Model:", name)
    print("MAE:", mean_absolute_error(y_test, pred))
    print("RMSE:", mean_squared_error(y_test, pred, squared=False))
    print("R2:", r2_score(y_test, pred))
    print('---')




Model: RF
MAE: 0.21117524429967427
RMSE: 0.34646753212145603
R2: 0.9444872783776458
---
Model: GB
MAE: 0.21171938975559773
RMSE: 0.31830767683353955
R2: 0.9531443830517367
---
Model: LR
MAE: 7920369.512080577
RMSE: 179159150.2857628
R2: -1.4843811232011816e+16
---




In [14]:
# Предполагаем, что у тебя были старые результаты в словарях results_reg и results_cls
# Здесь пример того, как собрать таблицу сравнений:

old_reg = {
 'Linear': {'MAE': 0.2269, 'MSE': 0.1262, 'RMSE': 0.3552, 'R2': 0.9394},
 'KNN': {'MAE': 0.2254, 'MSE': 0.1231, 'RMSE': 0.3508, 'R2': 0.9409},
 'RandomForest': {'MAE': 0.2163, 'MSE': 0.1140, 'RMSE': 0.3376, 'R2': 0.9453}
}

# допустим, new_reg собираем из текущих моделей:
new_reg = {}
for name, model in [('RF', best_rf), ('GB', best_gb), ('LR', best_lr)]:
    pred = model.predict(X_test)
    new_reg[name] = {
        'MAE': mean_absolute_error(y_test, pred),
        'RMSE': mean_squared_error(y_test, pred, squared=False),
        'R2': r2_score(y_test, pred)
    }

pd.DataFrame(old_reg).T.round(4), pd.DataFrame(new_reg).T.round(4)




(                 MAE     MSE    RMSE      R2
 Linear        0.2269  0.1262  0.3552  0.9394
 KNN           0.2254  0.1231  0.3508  0.9409
 RandomForest  0.2163  0.1140  0.3376  0.9453,
              MAE          RMSE            R2
 RF  2.112000e-01  3.465000e-01  9.445000e-01
 GB  2.117000e-01  3.183000e-01  9.531000e-01
 LR  7.920370e+06  1.791592e+08 -1.484381e+16)