In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [2]:
df_cls = pd.read_csv('UCI_Credit_Card.csv')
df_cls.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [3]:
# Обработка данных
X_cls = df_cls.drop('default.payment.next.month', axis=1)
y_cls = df_cls['default.payment.next.month']

categorical = X_cls.select_dtypes('object').columns.tolist()
numerical = X_cls.select_dtypes(include=np.number).columns.tolist()

preprocess_cls = ColumnTransformer([
    ('num', StandardScaler(), numerical),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
])


In [4]:
models_cls = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(5),
    'RandomForest': RandomForestClassifier()
}

results_cls = {}
for name, model in models_cls.items():
    pipe = Pipeline([('prep', preprocess_cls), ('model', model)])
    X_train, X_test, y_train, y_test = train_test_split(X_cls, y_cls, test_size=0.2, random_state=42)
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    results_cls[name] = {
        'Accuracy': accuracy_score(y_test, pred),
        'Precision': precision_score(y_test, pred),
        'Recall': recall_score(y_test, pred),
        'F1': f1_score(y_test, pred)
    }

results_cls


{'LogisticRegression': {'Accuracy': 0.8098333333333333,
  'Precision': 0.6928251121076233,
  'Recall': 0.23533891850723535,
  'F1': 0.35133598635588403},
 'KNN': {'Accuracy': 0.7953333333333333,
  'Precision': 0.5508982035928144,
  'Recall': 0.3503427265803503,
  'F1': 0.42830540037243947},
 'RandomForest': {'Accuracy': 0.8171666666666667,
  'Precision': 0.647945205479452,
  'Recall': 0.3602437166793602,
  'F1': 0.46304454233969655}}

In [5]:
# --- РЕГРЕССИЯ ---

In [6]:
df_reg = pd.read_csv('AirQuality.csv', sep=';', decimal=',')
df_reg = df_reg.loc[:, ~df_reg.columns.str.contains('^Unnamed')]
df_reg = df_reg[df_reg['CO(GT)'] != -200]
df_reg = df_reg[df_reg['CO(GT)'].notna()]
df_reg.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,10/03/2004,18.00.00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
1,10/03/2004,19.00.00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2,10/03/2004,20.00.00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
3,10/03/2004,21.00.00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
4,10/03/2004,22.00.00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888


In [7]:
X_reg = df_reg.drop('CO(GT)', axis=1)
y_reg = df_reg['CO(GT)']

cat_reg = X_reg.select_dtypes('object').columns.tolist()
num_reg = X_reg.select_dtypes(include=np.number).columns.tolist()

from sklearn.impute import SimpleImputer

preprocess_reg = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]), num_reg),

    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore'))
    ]), cat_reg)
])



In [8]:
models_reg = {
    'Linear': LinearRegression(),
    'KNN': KNeighborsRegressor(5),
    'RandomForest': RandomForestRegressor()
}

results_reg = {}
for name, model in models_reg.items():
    pipe = Pipeline([('prep', preprocess_reg), ('model', model)])
    X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    results_reg[name] = {
        'MAE': mean_absolute_error(y_test, pred),
        'MSE': mean_squared_error(y_test, pred),
        'RMSE': mean_squared_error(y_test, pred)**0.5,
        'R2': r2_score(y_test, pred)
    }

results_reg


{'Linear': {'MAE': 0.22693747618586135,
  'MSE': 0.1261916757467557,
  'RMSE': 0.35523467700487193,
  'R2': 0.939414737396136},
 'KNN': {'MAE': 0.22536807817589577,
  'MSE': 0.12306058631921822,
  'RMSE': 0.350799923488045,
  'R2': 0.9409179892872055},
 'RandomForest': {'MAE': 0.2161693811074919,
  'MSE': 0.11386885602605865,
  'RMSE': 0.33744459697268625,
  'R2': 0.9453309855510204}}

In [9]:
# Ячейка 1: импорты и общие настройки
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                             confusion_matrix, classification_report,
                             mean_absolute_error, mean_squared_error, r2_score)

# модели
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from imblearn.over_sampling import SMOTE

# полезные
import seaborn as sns
sns.set_style('whitegrid')
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [10]:
df = pd.read_csv('UCI_Credit_Card.csv')
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [11]:
# Ячейка 3: базовые фичи и очистка
df = df.copy()

# переименуем для удобства (если нужно)
# df.rename(columns={'default.payment.next.month':'target'}, inplace=True)

# Пример новых признаков:
# Кол-во просрочек: используем PAY_0..PAY_6 (или PAY_1..PAY_6 в вашем датасете)
pay_cols = [c for c in df.columns if c.startswith('PAY')]
df['n_late'] = (df[pay_cols] > 0).sum(axis=1)
df['max_late'] = df[pay_cols].max(axis=1)

# Отношение сумм к кредитному лимиту (если есть LIMIT_BAL)
if 'LIMIT_BAL' in df.columns:
    bill_cols = [c for c in df.columns if c.startswith('BILL_AMT')]
    pay_amt_cols = [c for c in df.columns if c.startswith('PAY_AMT')]
    df['bill_sum'] = df[bill_cols].sum(axis=1)
    df['pay_sum'] = df[pay_amt_cols].sum(axis=1)
    df['pay_rate'] = df['pay_sum'] / (df['bill_sum'] + 1e-6)
    df['bill_limit_ratio'] = df['bill_sum'] / (df['LIMIT_BAL'] + 1e-6)

# Обработаем категориальные как object, но в этом датасете они часто numeric-coded
cat_cols = ['SEX','EDUCATION','MARRIAGE']
cat_cols = [c for c in cat_cols if c in df.columns]

# Целевая
y = df['default.payment.next.month']
X = df.drop(['default.payment.next.month', 'ID'] if 'ID' in df.columns else ['default.payment.next.month'], axis=1)


In [12]:
from sklearn.model_selection import StratifiedKFold

# разделим данные один раз
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_SEED)

# признаки
num_cols = X_train.select_dtypes(include=np.number).columns.tolist()
# убираем cat_cols из численных, если они кодируются как числа
num_cols = [c for c in num_cols if c not in cat_cols]
cat_cols = [c for c in cat_cols if c in X_train.columns]

# preprocess
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# модель + pipeline
pipe = Pipeline([
    ('pre', preprocessor),
    ('smote', SMOTE(random_state=RANDOM_SEED)),   # Note: SMOTE должен применяться только к X_train; интеграция в Pipeline с ColumnTransformer требует адаптации
    ('clf', RandomForestClassifier(random_state=RANDOM_SEED))
])


In [13]:
# Преобразуем X_train через preprocessor отдельно для SMOTE
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

# Применим SMOTE к подготовленным данным
sm = SMOTE(random_state=RANDOM_SEED)
X_res, y_res = sm.fit_resample(X_train_prep, y_train)

# Теперь подбираем модель на resampled данных
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

rfc = RandomForestClassifier(random_state=RANDOM_SEED)
grid = GridSearchCV(rfc, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid.fit(X_res, y_res)
grid.best_params_, grid.best_score_


({'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200},
 0.8664310072604788)

In [14]:
best = grid.best_estimator_
y_pred = best.predict(X_test_prep)
y_proba = best.predict_proba(X_test_prep)[:,1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))


Accuracy: 0.801
Precision: 0.5603996366939146
Recall: 0.46495855312735496
F1: 0.5082372322899505
ROC AUC: 0.7552831599573687

Confusion matrix:
 [[4189  484]
 [ 710  617]]

Classification report:
               precision    recall  f1-score   support

           0       0.86      0.90      0.88      4673
           1       0.56      0.46      0.51      1327

    accuracy                           0.80      6000
   macro avg       0.71      0.68      0.69      6000
weighted avg       0.79      0.80      0.79      6000



In [15]:
# Загружаем данные
df_reg = pd.read_csv('AirQuality.csv', sep=';', decimal=',')

df_reg = df_reg.loc[:, ~df_reg.columns.str.contains('^Unnamed')]

numeric_cols = df_reg.select_dtypes(include=[np.number]).columns.tolist()
mask = (df_reg[numeric_cols] != -200).all(axis=1)
df_reg = df_reg[mask]

# Удаляем строки с пропусками в любом столбце
df_reg = df_reg.dropna()

# Остальной код остается как был:
df_reg.head()
df_reg.info()
df_reg.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 827 entries, 0 to 1230
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           827 non-null    object 
 1   Time           827 non-null    object 
 2   CO(GT)         827 non-null    float64
 3   PT08.S1(CO)    827 non-null    float64
 4   NMHC(GT)       827 non-null    float64
 5   C6H6(GT)       827 non-null    float64
 6   PT08.S2(NMHC)  827 non-null    float64
 7   NOx(GT)        827 non-null    float64
 8   PT08.S3(NOx)   827 non-null    float64
 9   NO2(GT)        827 non-null    float64
 10  PT08.S4(NO2)   827 non-null    float64
 11  PT08.S5(O3)    827 non-null    float64
 12  T              827 non-null    float64
 13  RH             827 non-null    float64
 14  AH             827 non-null    float64
dtypes: float64(13), object(2)
memory usage: 103.4+ KB


Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
count,827.0,827.0,827.0,827.0,827.0,827.0,827.0,827.0,827.0,827.0,827.0,827.0,827.0
mean,2.353567,1207.879081,231.025393,10.7711,966.116082,143.501814,963.297461,100.259976,1600.620314,1045.812576,15.601451,49.050181,0.831853
std,1.409496,241.816997,208.461912,7.418134,266.424557,81.829717,265.914168,31.493823,302.291793,400.134662,4.825304,15.266746,0.178506
min,0.3,753.0,7.0,0.5,448.0,12.0,461.0,19.0,955.0,263.0,6.3,14.9,0.4023
25%,1.3,1017.0,77.0,4.8,754.0,81.0,769.0,78.5,1369.5,760.0,11.9,36.7,0.71895
50%,2.0,1172.0,157.0,9.1,944.0,128.0,920.0,99.0,1556.0,1009.0,15.0,49.6,0.8177
75%,3.1,1380.0,318.5,14.8,1142.5,187.0,1131.0,122.0,1783.5,1320.0,18.3,60.55,0.9275
max,8.1,2040.0,1189.0,39.2,1754.0,478.0,1935.0,196.0,2679.0,2359.0,30.0,83.2,1.4852


In [16]:
if 'Date' in df_reg.columns and 'Time' in df_reg.columns:
    dt = pd.to_datetime(df_reg['Date'] + ' ' + df_reg['Time'], 
                        format='%d/%m/%Y %H.%M.%S',  # ← ВОТ ЭТО ДОБАВЬ
                        dayfirst=True, 
                        errors='coerce')
    df_reg['hour'] = dt.dt.hour
    df_reg['weekday'] = dt.dt.weekday
    df_reg['month'] = dt.dt.month
    
    # ВАЖНО: Удаляем Date и Time чтобы они не попали в категориальные признаки
    df_reg = df_reg.drop(columns=['Date', 'Time'])

# Лаги для CO(GT) и скользящие средние (удаляем первые строки с NaN после лагов)
df_reg = df_reg.sort_values(by=['Date','Time']) if ('Date' in df_reg.columns) else df_reg
df_reg['CO_lag1'] = df_reg['CO(GT)'].shift(1)
df_reg['CO_roll3'] = df_reg['CO(GT)'].rolling(window=3, min_periods=1).mean()
df_reg = df_reg[df_reg['CO_lag1'].notna()]  # удалим первые NaN

df.head()


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month,n_late,max_late,bill_sum,pay_sum,pay_rate,bill_limit_ratio
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,1,3,689.0,7704.0,689.0,0.089434,0.3852
1,2,120000.0,2,2,2,26,-1,2,0,0,...,1000.0,0.0,2000.0,1,6,2000.0,17077.0,5000.0,0.292791,0.142308
2,3,90000.0,2,2,2,34,0,0,0,0,...,1000.0,1000.0,5000.0,0,6,5000.0,101653.0,11018.0,0.108388,1.129478
3,4,50000.0,2,2,1,37,0,0,0,0,...,1100.0,1069.0,1000.0,0,6,2019.0,231334.0,8388.0,0.036259,4.62668
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,9000.0,689.0,679.0,0,6,36681.0,109339.0,59049.0,0.540054,2.18678


In [17]:
X = df_reg.drop(columns=['CO(GT)'])
y = df_reg['CO(GT)']

# разделим
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

num_cols = X_train.select_dtypes(include=np.number).columns.tolist()
cat_cols = X_train.select_dtypes(include='object').columns.tolist()

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])


In [18]:
# Пример: RandomForest и GradientBoosting
rf = RandomForestRegressor(random_state=RANDOM_SEED)
gb = GradientBoostingRegressor(random_state=RANDOM_SEED)

pipe_rf = Pipeline([('pre', preprocessor), ('model', rf)])
pipe_gb = Pipeline([('pre', preprocessor), ('model', gb)])
pipe_lr = Pipeline([('pre', preprocessor), ('model', LinearRegression())])

param_grid_rf = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10]
}
param_grid_gb = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.05, 0.1]
}

gs_rf = GridSearchCV(pipe_rf, param_grid_rf, cv=5, scoring='r2', n_jobs=-1)
gs_gb = GridSearchCV(pipe_gb, param_grid_gb, cv=5, scoring='r2', n_jobs=-1)

gs_rf.fit(X_train, y_train)
gs_gb.fit(X_train, y_train)

print("RF best:", gs_rf.best_params_, gs_rf.best_score_)
print("GB best:", gs_gb.best_params_, gs_gb.best_score_)


RF best: {'model__max_depth': 10, 'model__n_estimators': 100} 0.9708520888276091
GB best: {'model__learning_rate': 0.1, 'model__n_estimators': 200} 0.9780243001822095


In [19]:
best_rf = gs_rf.best_estimator_
best_gb = gs_gb.best_estimator_
best_lr = pipe_lr.fit(X_train, y_train)

for name, model in [('RF', best_rf), ('GB', best_gb), ('LR', best_lr)]:
    pred = model.predict(X_test)
    print("Model:", name)
    print("MAE:", mean_absolute_error(y_test, pred))
    print("RMSE:", mean_squared_error(y_test, pred, squared=False))
    print("R2:", r2_score(y_test, pred))
    print('---')


Model: RF
MAE: 0.1526278201419349
RMSE: 0.21042909631635665
R2: 0.9792012726061632
---
Model: GB
MAE: 0.1251622238539264
RMSE: 0.17263927662301023
R2: 0.9860007619783069
---
Model: LR
MAE: 0.1411448736076749
RMSE: 0.1824414260270462
R2: 0.9843659283187381
---




In [20]:
# Предполагаем, что у тебя были старые результаты в словарях results_reg и results_cls
# Здесь пример того, как собрать таблицу сравнений:

old_reg = {
 'Linear': {'MAE': 0.2269, 'MSE': 0.1262, 'RMSE': 0.3552, 'R2': 0.9394},
 'KNN': {'MAE': 0.2254, 'MSE': 0.1231, 'RMSE': 0.3508, 'R2': 0.9409},
 'RandomForest': {'MAE': 0.2163, 'MSE': 0.1140, 'RMSE': 0.3376, 'R2': 0.9453}
}

# допустим, new_reg собираем из текущих моделей:
new_reg = {}
for name, model in [('RF', best_rf), ('GB', best_gb), ('LR', best_lr)]:
    pred = model.predict(X_test)
    new_reg[name] = {
        'MAE': mean_absolute_error(y_test, pred),
        'RMSE': mean_squared_error(y_test, pred, squared=False),
        'R2': r2_score(y_test, pred)
    }

pd.DataFrame(old_reg).T.round(4), pd.DataFrame(new_reg).T.round(4)




(                 MAE     MSE    RMSE      R2
 Linear        0.2269  0.1262  0.3552  0.9394
 KNN           0.2254  0.1231  0.3508  0.9409
 RandomForest  0.2163  0.1140  0.3376  0.9453,
        MAE    RMSE      R2
 RF  0.1526  0.2104  0.9792
 GB  0.1252  0.1726  0.9860
 LR  0.1411  0.1824  0.9844)

In [21]:
# Ячейка 1: импорты и утилиты
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    mean_absolute_error, mean_squared_error, r2_score, confusion_matrix
)
import time
from pprint import pprint

# helper: RMSE
def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)


In [22]:
# Ячейка 2: реализации на numpy

class LinearRegressionScratch:
    """OLS via normal equation; optionally gradient descent."""
    def __init__(self, fit_intercept=True, reg=0.0):
        self.fit_intercept = fit_intercept
        self.reg = reg
        self.coef_ = None
        self.intercept_ = None

    def _add_intercept(self, X):
        if self.fit_intercept:
            return np.hstack([np.ones((X.shape[0],1)), X])
        return X

    def fit_normal(self, X, y):
        X0 = self._add_intercept(X)
        # Regularization: add reg to diagonal except bias
        n_features = X0.shape[1]
        A = X0.T.dot(X0)
        if self.reg>0:
            A = A + self.reg * np.eye(n_features)
            if self.fit_intercept:
                A[0,0] -= self.reg  # don't regularize intercept
        w = np.linalg.pinv(A).dot(X0.T).dot(y)
        if self.fit_intercept:
            self.intercept_ = w[0]
            self.coef_ = w[1:]
        else:
            self.intercept_ = 0.0
            self.coef_ = w
        return self

    def predict(self, X):
        return X.dot(self.coef_) + self.intercept_


class LogisticRegressionScratch:
    """Binary logistic regression via gradient descent with optional L2."""
    def __init__(self, lr=0.1, n_iters=1000, fit_intercept=True, reg=0.0, verbose=False):
        self.lr = lr
        self.n_iters = n_iters
        self.fit_intercept = fit_intercept
        self.reg = reg
        self.coef_ = None
        self.intercept_ = 0.0
        self.verbose = verbose

    def _sigmoid(self, z):
        return 1.0 / (1.0 + np.exp(-z))

    def _add_intercept(self, X):
        if self.fit_intercept:
            return np.hstack([np.ones((X.shape[0],1)), X])
        return X

    def fit(self, X, y):
        X0 = self._add_intercept(X)
        n_samples, n_features = X0.shape
        w = np.zeros(n_features)
        for i in range(self.n_iters):
            z = X0.dot(w)
            pred = self._sigmoid(z)
            # gradient
            grad = (1/n_samples) * X0.T.dot(pred - y) + self.reg * np.r_[0, w[1:]] / n_samples
            w -= self.lr * grad
            if self.verbose and i % (self.n_iters//5 + 1) == 0:
                loss = -np.mean(y*np.log(pred+1e-12) + (1-y)*np.log(1-pred+1e-12))
                print(f"iter {i}, loss {loss:.4f}")
        if self.fit_intercept:
            self.intercept_ = w[0]
            self.coef_ = w[1:]
        else:
            self.intercept_ = 0.0
            self.coef_ = w
        return self

    def predict_proba(self, X):
        z = X.dot(self.coef_) + self.intercept_
        p = self._sigmoid(z)
        return np.vstack([1-p, p]).T

    def predict(self, X, threshold=0.5):
        return (self.predict_proba(X)[:,1] >= threshold).astype(int)


class KNNClassifierScratch:
    def __init__(self, k=5):
        self.k = k

    def fit(self, X, y):
        self.X_train = X.copy()
        self.y_train = y.copy()
        return self

    def predict(self, X):
        # Compute pairwise distances
        dists = np.sqrt(((X[:,None,:] - self.X_train[None,:,:])**2).sum(axis=2))
        idx = np.argsort(dists, axis=1)[:,:self.k]
        neigh = self.y_train[idx]
        # majority vote
        preds = (neigh.sum(axis=1) >= (self.k/2)).astype(int)
        return preds


class KNNRegressorScratch:
    def __init__(self, k=5):
        self.k = k

    def fit(self, X, y):
        self.X_train = X.copy()
        self.y_train = y.copy()
        return self

    def predict(self, X):
        dists = np.sqrt(((X[:,None,:] - self.X_train[None,:,:])**2).sum(axis=2))
        idx = np.argsort(dists, axis=1)[:,:self.k]
        neigh = self.y_train[idx]
        preds = neigh.mean(axis=1)
        return preds


In [23]:
# Ячейка 3: подготовка данных для классификации (Credit Default)
df = pd.read_csv('UCI_Credit_Card.csv')

# Если имя целевой и ID отличается, поправь
target_col = 'default.payment.next.month'
id_col = 'ID' if 'ID' in df.columns else None

# Feature engineering (как в улучшённом бейзлайне)
df = df.copy()
pay_cols = [c for c in df.columns if c.startswith('PAY')]
bill_cols = [c for c in df.columns if c.startswith('BILL_AMT')]
pay_amt_cols = [c for c in df.columns if c.startswith('PAY_AMT')]

df['n_late'] = (df[pay_cols] > 0).sum(axis=1)
df['max_late'] = df[pay_cols].max(axis=1)
df['bill_sum'] = df[bill_cols].sum(axis=1)
df['pay_sum'] = df[pay_amt_cols].sum(axis=1)
df['pay_rate'] = df['pay_sum'] / (df['bill_sum'] + 1e-9)
if 'LIMIT_BAL' in df.columns:
    df['bill_limit_ratio'] = df['bill_sum'] / (df['LIMIT_BAL'] + 1e-9)

# Drop ID if present
if id_col:
    df = df.drop(columns=[id_col])

# Split X/y
y_cls = df[target_col].astype(int)
X_cls = df.drop(columns=[target_col])

# Define numeric & categorical
cat_cols = [c for c in ['SEX','EDUCATION','MARRIAGE'] if c in X_cls.columns]
num_cols = [c for c in X_cls.columns if c not in cat_cols]

# Preprocessor
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
cat_pipeline = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
preprocessor_cls = ColumnTransformer([('num', num_pipeline, num_cols), ('cat', cat_pipeline, cat_cols)])

# Train/test split
Xc_train, Xc_test, yc_train, yc_test = train_test_split(X_cls, y_cls, test_size=0.2, stratify=y_cls, random_state=42)

# Fit preprocessor and transform for models implemented from scratch
Xc_train_prep = preprocessor_cls.fit_transform(Xc_train)
Xc_test_prep = preprocessor_cls.transform(Xc_test)

print("Shapes (classification):", Xc_train_prep.shape, Xc_test_prep.shape, yc_train.shape, yc_test.shape)


FileNotFoundError: [Errno 2] No such file or directory: 'default_of_credit_card_clients.csv'

In [None]:
# Ячейка 4: классификация — имплементации

results_impl_cls = {}

# 1) LogisticRegressionScratch
lr_s = LogisticRegressionScratch(lr=0.5, n_iters=2000, reg=1.0, verbose=False)
t0 = time.time()
lr_s.fit(Xc_train_prep, yc_train.values)
t1 = time.time()
y_pred_lr = lr_s.predict(Xc_test_prep)
y_proba_lr = lr_s.predict_proba(Xc_test_prep)[:,1]
results_impl_cls['LogisticScratch'] = {
    'fit_time': round(t1-t0,3),
    'Accuracy': accuracy_score(yc_test, y_pred_lr),
    'Precision': precision_score(yc_test, y_pred_lr, zero_division=0),
    'Recall': recall_score(yc_test, y_pred_lr, zero_division=0),
    'F1': f1_score(yc_test, y_pred_lr, zero_division=0),
    'ROC_AUC': roc_auc_score(yc_test, y_proba_lr)
}

# 2) KNN classifier scratch (k=5)
knn_s = KNNClassifierScratch(k=5)
knn_s.fit(Xc_train_prep, yc_train.values)
t0 = time.time()
y_pred_knn = knn_s.predict(Xc_test_prep)
t1 = time.time()
results_impl_cls['KNN_Scratch_k5'] = {
    'fit_time': round(t1-t0,3),
    'Accuracy': accuracy_score(yc_test, y_pred_knn),
    'Precision': precision_score(yc_test, y_pred_knn, zero_division=0),
    'Recall': recall_score(yc_test, y_pred_knn, zero_division=0),
    'F1': f1_score(yc_test, y_pred_knn, zero_division=0)
}

# Show results
pd.DataFrame(results_impl_cls).T.round(4)


In [None]:
# Ячейка 5: reference sklearn models for classification (already used in baseline)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

results_sklearn_cls = {}

# LogisticRegression (sklearn)
sk_lr = LogisticRegression(max_iter=2000)
sk_lr.fit(Xc_train_prep, yc_train)
pred = sk_lr.predict(Xc_test_prep)
proba = sk_lr.predict_proba(Xc_test_prep)[:,1]
results_sklearn_cls['Logistic_sklearn'] = {
    'Accuracy': accuracy_score(yc_test, pred),
    'Precision': precision_score(yc_test, pred, zero_division=0),
    'Recall': recall_score(yc_test, pred, zero_division=0),
    'F1': f1_score(yc_test, pred, zero_division=0),
    'ROC_AUC': roc_auc_score(yc_test, proba)
}

# RandomForest
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(Xc_train_prep, yc_train)
pred = rfc.predict(Xc_test_prep)
proba = rfc.predict_proba(Xc_test_prep)[:,1]
results_sklearn_cls['RandomForest'] = {
    'Accuracy': accuracy_score(yc_test, pred),
    'Precision': precision_score(yc_test, pred, zero_division=0),
    'Recall': recall_score(yc_test, pred, zero_division=0),
    'F1': f1_score(yc_test, pred, zero_division=0),
    'ROC_AUC': roc_auc_score(yc_test, proba)
}

# KNN sklearn
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(Xc_train_prep, yc_train)
pred = knn.predict(Xc_test_prep)
results_sklearn_cls['KNN_sklearn'] = {
    'Accuracy': accuracy_score(yc_test, pred),
    'Precision': precision_score(yc_test, pred, zero_division=0),
    'Recall': recall_score(yc_test, pred, zero_division=0),
    'F1': f1_score(yc_test, pred, zero_division=0)
}

pd.DataFrame(results_impl_cls).T.round(4), pd.DataFrame(results_sklearn_cls).T.round(4)


In [None]:
# Ячейка 6: подготовка AirQuality (регрессия)
df_reg = pd.read_csv('AirQuality.csv', sep=';', decimal=',')
df_reg = df_reg.loc[:, ~df_reg.columns.str.contains('^Unnamed')]
df_reg = df_reg[df_reg['CO(GT)'] != -200]
df_reg = df_reg[df_reg['CO(GT)'].notna()]

# create datetime features if present
if 'Date' in df_reg.columns and 'Time' in df_reg.columns:
    dt = pd.to_datetime(df_reg['Date'] + ' ' + df_reg['Time'], dayfirst=True, errors='coerce')
    df_reg['hour'] = dt.dt.hour
    df_reg['weekday'] = dt.dt.weekday
    df_reg['month'] = dt.dt.month

# add lags and rolling features
df_reg = df_reg.sort_values(by=['Date','Time']) if ('Date' in df_reg.columns and 'Time' in df_reg.columns) else df_reg
df_reg['CO_lag1'] = df_reg['CO(GT)'].shift(1)
df_reg['CO_roll3'] = df_reg['CO(GT)'].rolling(window=3, min_periods=1).mean()
df_reg = df_reg[df_reg['CO_lag1'].notna()].copy()

# drop columns that are non-numeric or not useful
target_col_reg = 'CO(GT)'
X_reg = df_reg.drop(columns=[target_col_reg])
y_reg = df_reg[target_col_reg].values

# specify numeric & categorical features
cat_cols_reg = X_reg.select_dtypes(include='object').columns.tolist()
num_cols_reg = X_reg.select_dtypes(include=np.number).columns.tolist()

# Preprocessor for regression
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())])
cat_pipeline = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
preprocessor_reg = ColumnTransformer([('num', num_pipeline, num_cols_reg), ('cat', cat_pipeline, cat_cols_reg)])

# train/test
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

Xr_train_prep = preprocessor_reg.fit_transform(Xr_train)
Xr_test_prep = preprocessor_reg.transform(Xr_test)

print("Shapes (regression):", Xr_train_prep.shape, Xr_test_prep.shape)


In [None]:
# Ячейка 7: регрессия — имплементации

results_impl_reg = {}

# 1) LinearRegressionScratch (normal eq)
lr_reg = LinearRegressionScratch(reg=1e-6)
t0 = time.time()
lr_reg.fit_normal(Xr_train_prep, yr_train)
t1 = time.time()
pred_lr = lr_reg.predict(Xr_test_prep)
results_impl_reg['LinearScratch'] = {
    'fit_time': round(t1-t0,3),
    'MAE': mean_absolute_error(yr_test, pred_lr),
    'RMSE': rmse(yr_test, pred_lr),
    'R2': r2_score(yr_test, pred_lr)
}

# 2) KNN regressor scratch (k=5)
knn_reg = KNNRegressorScratch(k=5)
t0 = time.time()
knn_reg.fit(Xr_train_prep, yr_train)
pred_knn = knn_reg.predict(Xr_test_prep)
t1 = time.time()
results_impl_reg['KNN_Scratch_k5'] = {
    'fit_time': round(t1-t0,3),
    'MAE': mean_absolute_error(yr_test, pred_knn),
    'RMSE': rmse(yr_test, pred_knn),
    'R2': r2_score(yr_test, pred_knn)
}

pd.DataFrame(results_impl_reg).T.round(6)


In [None]:
# Ячейка 8: sklearn reference for regression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge

results_sklearn_reg = {}

# Ridge (stable linear)
ridge = Ridge(alpha=1.0)
ridge.fit(Xr_train_prep, yr_train)
pred = ridge.predict(Xr_test_prep)
results_sklearn_reg['Ridge'] = {'MAE': mean_absolute_error(yr_test,pred), 'RMSE': rmse(yr_test,pred), 'R2': r2_score(yr_test,pred)}

# RandomForest
rfr = RandomForestRegressor(n_estimators=200, random_state=42)
rfr.fit(Xr_train_prep, yr_train)
pred = rfr.predict(Xr_test_prep)
results_sklearn_reg['RandomForest'] = {'MAE': mean_absolute_error(yr_test,pred), 'RMSE': rmse(yr_test,pred), 'R2': r2_score(yr_test,pred)}

# GradientBoosting
gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, random_state=42)
gbr.fit(Xr_train_prep, yr_train)
pred = gbr.predict(Xr_test_prep)
results_sklearn_reg['GradientBoosting'] = {'MAE': mean_absolute_error(yr_test,pred), 'RMSE': rmse(yr_test,pred), 'R2': r2_score(yr_test,pred)}

pd.DataFrame(results_impl_reg).T.round(6), pd.DataFrame(results_sklearn_reg).T.round(6)


In [None]:
# Ячейка 9: сравнение и выводы
df_impl_cls = pd.DataFrame(results_impl_cls).T
df_sk_cls = pd.DataFrame(results_sklearn_cls).T
print("Classification - implemented models:")
display(df_impl_cls.round(4))
print("\nClassification - sklearn reference:")
display(df_sk_cls.round(4))

df_impl_reg = pd.DataFrame(results_impl_reg).T
df_sk_reg = pd.DataFrame(results_sklearn_reg).T
print("\nRegression - implemented models:")
display(df_impl_reg.round(6))
print("\nRegression - sklearn reference:")
display(df_sk_reg.round(6))

# Примеры выводов (шаблон)
print("\n--- Выводы (шаблон):")
print("1) Для классификации: сравни качества LogisticScratch и KNN_Scratch с sklearn моделями (Logistic_sklearn, RandomForest).")
print("2) Для регрессии: сравни LinearScratch и KNN_Scratch с Ridge/RandomForest/GBR; часто sklearn-ансамбли дают лучший R2, но имплементированные модели показывают, что базовая линейная модель достаточно сильна после FE.")
