In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class TimeFeaturesExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self  # Мы не нуждаемся в обучении для этой трансформации

    def transform(self, X):
        # Извлекаем временные признаки
        X['year'] = X['datetime'].dt.year
        X['month'] = X['datetime'].dt.month
        X['season'] = X['datetime'].dt.quarter
        X['day'] = X['datetime'].dt.day
        X['hour'] = X['datetime'].dt.hour
        X['dayofyear'] = X['datetime'].dt.day_of_year
        X['dayofweek'] = X['datetime'].dt.day_of_week
        X['is_weekend'] = X['dayofweek'].isin([5, 6]).astype(int)

        # Убираем колонку 'datetime' после обработки
        X = X.drop(columns=['datetime'])

        return X

In [2]:
class TrigonometricFeaturesExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Добавляем тригонометрические признаки для временных признаков
        X['sin_month'] = X['month'].apply(math.sin)
        X['cos_month'] = X['month'].apply(math.cos)

        X['sin_hour'] = X['hour'].apply(math.sin)
        X['cos_hour'] = X['hour'].apply(math.cos)

        X['hour_sin'] = (X['hour'] / 23 * 2 * np.pi).apply(math.sin)
        X['hour_cos'] = (X['hour'] / 23 * 2 * np.pi).apply(math.cos)

        X['month_sin'] = ((X['month'] - 1) / 11 * 2 * np.pi).apply(math.sin)
        X['month_cos'] = ((X['month'] - 1) / 11 * 2 * np.pi).apply(math.cos)

        # Преобразуем дату в индекс, если нужно для дальнейшего использования
        day = 24
        year = 365.2425 * day

        X['day_sin'] = (X['hour'] * 2 * np.pi / day).apply(math.sin)
        X['day_cos'] = (X['hour'] * 2 * np.pi / day).apply(math.cos)

        X['year_sin'] = (X['hour'] * 2 * np.pi / year).apply(math.sin)
        X['year_cos'] = (X['hour'] * 2 * np.pi / year).apply(math.cos)

        return X

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

pipeline_preprocessing = Pipeline([
    ('time_features', TimeFeaturesExtractor()), # призанки даты и времени
    ('trigonometric_features', TrigonometricFeaturesExtractor()), # Тригонометрические признаки
    # ('dummies', DummiesCreator()), # кодирование
    # ('scaler', StandardScaler())  # Масштабирование данных
])

In [7]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [8]:
import pandas as pd
import opendatasets as od

od.download("https://www.kaggle.com/competitions/electricity-consumption")
df = pd.read_csv("./electricity-consumption/train.csv")

# просто удаляем пропуски
df = df[~df['total'].isna()]
df['total'] = df['total'].astype('int64')

# Преобразуем строковый столбец 'datetime' в формат datetime
df['datetime'] = pd.to_datetime(df['datetime'], format="%d.%m.%Y %H:%M:%S")
# Разделяем данные на train и test, извлекая год прямо из 'datetime'
train = df[df['datetime'].dt.year != 2008]
test = df[df['datetime'].dt.year == 2008]

# Удалим ненужные колонки (например, целевую переменную) из обучающих данных
X_train = train.drop(columns=['total'])
y_train = train['total']

# Применим пайплайн и обучим модель
pipeline_preprocessing.fit(X_train, y_train)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:Your Kaggle Key:Extracting archive ./electricity-consumption/electricity-consumption.zip to ./electricity-consumption


In [9]:
# Удалим ненужные колонки из тестовых данных
X_test = test.drop(columns=['total'])
y_test = test['total']

# Применим пайплайн к тестовым данным (переход от fit к transform)
X_train_transformed = pipeline_preprocessing.transform(X_train)
X_test_transformed = pipeline_preprocessing.transform(X_test)

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import pandas as pd

# Обучаем модель
model = LinearRegression()
model.fit(X_train_transformed, y_train)

# Прогнозируем на тестовых данных
y_pred = model.predict(X_test_transformed)

# Оценка модели
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error на тестовых данных: {mse}')
print(f'Mean Absolute Error на тестовых данных: {mae}')
print(f'r2_score на тестовых данных: {r2}')

Mean Squared Error на тестовых данных: 325398905.603892
Mean Absolute Error на тестовых данных: 14287.590463849794
r2_score на тестовых данных: 0.8202045832756062


In [12]:
# Обучаем модель на ВСЕМ НАБОРЕ данных

df = pd.read_csv("./electricity-consumption/train.csv")

# просто удаляем пропуски
df = df[~df['total'].isna()]
df['total'] = df['total'].astype('int64')

# Преобразуем строковый столбец 'datetime' в формат datetime
df['datetime'] = pd.to_datetime(df['datetime'], format="%d.%m.%Y %H:%M:%S")

# Удалим ненужные колонки из тестовых данных
X = df.drop(columns=['total'])
y = df['total']

# Применим пайплайн к тестовым данным (переход от fit к transform)
X_transformed = pipeline_preprocessing.transform(X)

model = LinearRegression()
model.fit(X_transformed, y)

In [13]:
test_final = pd.read_csv("./electricity-consumption/sample.csv")

In [14]:
# Преобразуем строковый столбец 'datetime' в формат datetime
test_X = test_final.copy()

test_X['datetime'] = pd.to_datetime(test_X['datetime'], format="%d.%m.%Y %H:%M:%S")

test_X = pipeline_preprocessing.transform(test_final.drop(columns=['total']))

test_X.shape

AttributeError: Can only use .dt accessor with datetimelike values