# Лабораторная работа: Ансамбли моделей машинного обучения. Часть 1.

**Цель:** Изучение ансамблей моделей машинного обучения.

**Датасет:** Coffee Shop Sales  
**Задача:** Предсказать `Total_Bill` (итог чека) по данным о покупке

**Выполнил:** Андрест Владислав Дмитриевич  
**Группа:** ИУ5-65Б  
**Дата:** 2025

Импорт библиотек
-----

In [24]:
# 📦 Импорт библиотек
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

Загрузка данных
----

In [25]:
# Read the Coffee Shop Sales CSV into a DataFrame; the file is resolved
# relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="coffee_shop_sales.csv")
# Show the first five rows as the cell output.
df.head(5)

Unnamed: 0,transaction_id,transaction_date,transaction_time,store_id,store_location,product_id,transaction_qty,unit_price,Total_Bill,product_category,product_type,product_detail,Size,Month Name,Day Name,Hour,Month,Day of Week
0,114301,01-06-2023,11:33:29,3,Astoria,45,1,3.0,3.0,Tea,Brewed herbal tea,Peppermint,Large,June,Thursday,11,6,3
1,115405,02-06-2023,11:18:24,3,Astoria,45,1,3.0,3.0,Tea,Brewed herbal tea,Peppermint,Large,June,Friday,11,6,4
2,115478,02-06-2023,12:02:45,3,Astoria,45,1,3.0,3.0,Tea,Brewed herbal tea,Peppermint,Large,June,Friday,12,6,4
3,116288,02-06-2023,19:39:47,3,Astoria,45,1,3.0,3.0,Tea,Brewed herbal tea,Peppermint,Large,June,Friday,19,6,4
4,116714,03-06-2023,12:24:57,3,Astoria,45,1,3.0,3.0,Tea,Brewed herbal tea,Peppermint,Large,June,Saturday,12,6,5


In [26]:
# Summarize column names, non-null counts and dtypes.
# DataFrame.info() writes the summary to stdout itself and returns None,
# so wrapping the result in print() would only add a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149116 entries, 0 to 149115
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   transaction_id    149116 non-null  int64  
 1   transaction_date  149116 non-null  object 
 2   transaction_time  149116 non-null  object 
 3   store_id          149116 non-null  int64  
 4   store_location    149116 non-null  object 
 5   product_id        149116 non-null  int64  
 6   transaction_qty   149116 non-null  int64  
 7   unit_price        149116 non-null  float64
 8   Total_Bill        149116 non-null  float64
 9   product_category  149116 non-null  object 
 10  product_type      149116 non-null  object 
 11  product_detail    149116 non-null  object 
 12  Size              149116 non-null  object 
 13  Month Name        149116 non-null  object 
 14  Day Name          149116 non-null  object 
 15  Hour              149116 non-null  int64  
 16  Month             14

Очистка данных
---

In [27]:
# Drop exact duplicate rows, then rows with any missing value (both in place).
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

# Name of the column the models will predict.
target = "Total_Bill"

# Feature frame: a copy of df without the transaction identifiers/timestamps
# and without the target. It is taken BEFORE df is encoded below, so it still
# holds the raw text values.
# NOTE(review): transaction_qty and unit_price stay in the feature set; in the
# previewed rows Total_Bill equals their product - confirm this is intended,
# since the target would then be almost directly recoverable from the features.
df_model = df.drop(
    ["transaction_id", "transaction_date", "transaction_time", target],
    axis="columns",
)

# Label-encode every text (object-dtype) column of df itself. This includes
# transaction_date and transaction_time, and overwrites the original strings.
categorical_cols = df.select_dtypes(include=["object"]).columns
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])

# Preview the encoded frame as the cell output.
df.head()

Unnamed: 0,transaction_id,transaction_date,transaction_time,store_id,store_location,product_id,transaction_qty,unit_price,Total_Bill,product_category,product_type,product_detail,Size,Month Name,Day Name,Hour,Month,Day of Week
0,114301,5,11609,3,0,45,1,3.0,3.0,8,6,37,0,3,4,11,6,3
1,115405,11,11126,3,0,45,1,3.0,3.0,8,6,37,0,3,0,11,6,4
2,115478,11,12451,3,0,45,1,3.0,3.0,8,6,37,0,3,0,12,6,4
3,116288,11,25184,3,0,45,1,3.0,3.0,8,6,37,0,3,0,19,6,4
4,116714,17,13090,3,0,45,1,3.0,3.0,8,6,37,0,3,2,12,6,5


Кодирование признаков и разделение выборки
----

In [28]:
# Label-encode the text (object-dtype) feature columns of df_model,
# fitting a fresh encoder per column.
categorical_cols = df_model.select_dtypes(include=["object"]).columns
for col in categorical_cols:
    le = LabelEncoder()
    encoded_values = le.fit_transform(df_model[col])
    df_model[col] = encoded_values

# Re-attach the target column (aligned on the shared index with df).
df_model[target] = df[target]

# Separate features from the target, then hold out 20% of the rows for
# testing; the fixed seed keeps the split reproducible.
y = df_model[target]
X = df_model.drop(columns=target)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [29]:
info = df.info()
print(info)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149116 entries, 0 to 149115
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   transaction_id    149116 non-null  int64  
 1   transaction_date  149116 non-null  int64  
 2   transaction_time  149116 non-null  int64  
 3   store_id          149116 non-null  int64  
 4   store_location    149116 non-null  int64  
 5   product_id        149116 non-null  int64  
 6   transaction_qty   149116 non-null  int64  
 7   unit_price        149116 non-null  float64
 8   Total_Bill        149116 non-null  float64
 9   product_category  149116 non-null  int64  
 10  product_type      149116 non-null  int64  
 11  product_detail    149116 non-null  int64  
 12  Size              149116 non-null  int64  
 13  Month Name        149116 non-null  int64  
 14  Day Name          149116 non-null  int64  
 15  Hour              149116 non-null  int64  
 16  Month             14

In [30]:
# 🤖 Обучение моделей
models = {
    "Bagging": BaggingRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "AdaBoost": AdaBoostRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

# 📊 Оценка моделей
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {"RMSE": rmse, "R²": r2}

# 📈 Вывод результатов
for model, metrics in results.items():
    print(f"{model}:")
    print(f"  RMSE: {metrics['RMSE']:.4f}")
    print(f"  R²: {metrics['R²']:.5f}")
    print("-" * 30)

Bagging:
  RMSE: 0.0009
  R²: 0.99996
------------------------------
Random Forest:
  RMSE: 0.0024
  R²: 0.99989
------------------------------
AdaBoost:
  RMSE: 1.5701
  R²: 0.92847
------------------------------
Gradient Boosting:
  RMSE: 0.0023
  R²: 0.99989
------------------------------
