# Создание признаков

**_______________________________________________________________________________________________________________________________**

**Загрузим всё необходимое: зависимости, сущности и модули:**

In [None]:
# Run the shared preset script. `pd` and the helper callables used in later
# cells are not imported or defined anywhere in this notebook, so they
# presumably come from this script — confirm against preset.py.
%run ../src/parsing/preset.py

___________________________________________________________________________________________________________________________

**Настройка текущей тетрадки (notebook):**

In [None]:
# Установка количества отображаемых строк:
# Notebook display limits for pandas output:
#   display.max_rows     - maximum number of rows shown;
#   display.max_columns  - maximum number of columns shown;
#   display.max_colwidth - maximum number of characters shown per cell.
for _option_name, _option_value in (
    ("display.max_rows", 75),
    ("display.max_columns", 25),
    ("display.max_colwidth", 45),
):
    pd.set_option(_option_name, _option_value)

___________________________________________________________________________________________________________________________

**Импортируемые данные для дальнейшей работы:**

In [None]:
# Load the prepared dataset from a Parquet file into `df`, the frame every later cell works on.
df = pd.read_parquet("../data/prep_df.gzip")

___________________________________________________________________________________________________________________________

**Посмотрим на скачанные данные:**

In [None]:
# Report the number of rows and the number of columns of df.
print(f"Количество строк dataframe-а: {df.shape[0]}\nКоличество столбцов dataframe-а: {df.shape[1]}")

In [None]:
# Print a summary of df (columns, dtypes, non-null counts) right after loading.
df.info()

In [None]:
# Preview the first three rows of the loaded data.
df.head(3)

___________________________________________________________________________________________________________________________

**Создание time-step (которые можно получить непосредственно из даты и времени) признаков:**

**Признак `time_dummie` - номер временного шага:**

In [None]:
def create_time_dummie_feature(feature: pd.DataFrame) -> pd.Series:
    """
    Number the rows of every subject consecutively, starting from 1.

    Rows are numbered in the order in which they appear in ``feature``; the
    counter restarts for each distinct value of ``subject_name``.

    Parameters
    ----------
    feature : pd.DataFrame
        Frame with at least the columns ``subject_name`` and ``datetime``.
        It is not modified.

    Returns
    -------
    pd.Series
        Series named ``time_dummie`` that keeps the index labels of
        ``feature``. Its rows are ordered by ``datetime``, ``subject_name``
        and the step number.
    """

    # Work on the frame passed in by the caller rather than on a notebook
    # global. cumcount() is zero-based, hence the +1.
    step_numbers = feature.groupby("subject_name", sort=False).cumcount() + 1

    # assign() returns a new frame, so the caller's frame stays untouched.
    numbered = feature.assign(time_dummie=step_numbers)
    numbered = numbered.sort_values(by=["datetime", "subject_name", "time_dummie"])

    return numbered["time_dummie"]

In [None]:
# Add the per-subject time-step counter; pandas aligns the returned Series with df by index label.
df["time_dummie"] = create_time_dummie_feature(df)

In [None]:
# Derive "year" by applying get_year to every "datetime" value
# (get_year is not defined in this notebook — presumably provided by the preset script).
df["year"] = df["datetime"].apply(get_year)

In [None]:
# Derive "month" by applying get_month to every "datetime" value
# (get_month is not defined in this notebook — presumably provided by the preset script).
df["month"] = df["datetime"].apply(get_month)

In [None]:
# Derive "day_of_month" by applying get_day_month to every "datetime" value
# (get_day_month is not defined in this notebook — presumably provided by the preset script).
df["day_of_month"] = df["datetime"].apply(get_day_month)

In [None]:
# Derive "hour" by applying get_hour to every "datetime" value
# (get_hour is not defined in this notebook — presumably provided by the preset script).
df["hour"] = df["datetime"].apply(get_hour)

In [None]:
# Derive "day_of_week" by applying get_day_week to every "datetime" value
# (get_day_week is not defined in this notebook — presumably provided by the preset script).
# NOTE(review): a later cell keeps only rows with day_of_week < 6, so the
# numbering convention of get_day_week matters — confirm it.
df["day_of_week"] = df["datetime"].apply(get_day_week)

In [None]:
# Derive "day_of_year" by applying get_day_year to every "datetime" value
# (get_day_year is not defined in this notebook — presumably provided by the preset script).
df["day_of_year"] = df["datetime"].apply(get_day_year)

In [None]:
# Derive "week_of_year" by applying get_week_year to every "datetime" value
# (get_week_year is not defined in this notebook — presumably provided by the preset script).
df["week_of_year"] = df["datetime"].apply(get_week_year)

In [None]:
# Derive "quarter" by applying get_quarter to every "datetime" value
# (get_quarter is not defined in this notebook — presumably provided by the preset script).
df["quarter"] = df["datetime"].apply(get_quarter)

In [None]:
# Derive "holiday" by applying is_holiday to every "datetime" value
# (is_holiday is not defined in this notebook — presumably provided by the preset script;
# which calendar it uses cannot be told from here).
df["holiday"] = df["datetime"].apply(is_holiday)

**Создание lag (отложенных по времени) признаков:**

In [None]:
def create_lag_feature(feature: pd.DataFrame, step_count: int) -> pd.Series:
    """
    Shift ``actual_consumption`` by ``step_count`` rows within every subject.

    The shift is done separately for each distinct value of
    ``subject_name``, so values never leak from one subject into another.
    Positions that have no source value after the shift are missing (NaN).

    Parameters
    ----------
    feature : pd.DataFrame
        Frame with at least the columns ``subject_name``, ``datetime`` and
        ``actual_consumption``. It is not modified.
    step_count : int
        Number of rows to shift by; a positive value moves earlier rows of
        the same subject onto later ones.

    Returns
    -------
    pd.Series
        Series named ``actual_consumption`` holding the shifted values. It
        keeps the index labels of ``feature``; its rows are ordered by
        ``datetime`` and ``subject_name``.
    """

    # Work on the frame passed in by the caller rather than on a notebook
    # global; the grouped shift replaces the per-subject loop.
    lagged_values = feature.groupby("subject_name", sort=False)["actual_consumption"].shift(step_count)

    # assign() returns a new frame, so the caller's frame stays untouched.
    lagged = feature.assign(actual_consumption=lagged_values)
    lagged = lagged.sort_values(by=["datetime", "subject_name"])

    return lagged["actual_consumption"]

In [None]:
# Per-subject lag of actual_consumption by `hour` rows.
# NOTE(review): `hour` is not defined in this notebook — presumably a step-count
# constant from the preset script; confirm, otherwise this raises NameError.
df["lag_hour"] = create_lag_feature(df, hour)

In [None]:
# Per-subject lag of actual_consumption by `day` rows.
# NOTE(review): `day` is not defined in this notebook — presumably a step-count
# constant from the preset script; confirm, otherwise this raises NameError.
df["lag_day"] = create_lag_feature(df, day)

In [None]:
# Per-subject lag of actual_consumption by `week` rows.
# NOTE(review): `week` is not defined in this notebook — presumably a step-count
# constant from the preset script; confirm, otherwise this raises NameError.
df["lag_week"] = create_lag_feature(df, week)

In [None]:
# Per-subject lag of actual_consumption by `month` rows.
# NOTE(review): `month` is not defined in this notebook — presumably a step-count
# constant from the preset script; confirm, otherwise this raises NameError.
df["lag_month"] = create_lag_feature(df, month)

In [None]:
# Per-subject lag of actual_consumption by `year` rows.
# NOTE(review): `year` is not defined in this notebook — presumably a step-count
# constant from the preset script; confirm, otherwise this raises NameError.
df["lag_year"] = create_lag_feature(df, year)

In [None]:
# Preview the first rows after the lag columns were added.
df.head()

In [None]:
# Keep only the rows whose day_of_week is below 6.
# NOTE(review): which weekday(s) this removes depends on the numbering returned
# by get_day_week — confirm. Later cells modify this filtered frame with
# inplace=True; appending .copy() here may be needed to avoid
# SettingWithCopyWarning on pandas versions without copy-on-write — verify.
df = df[df["day_of_week"] < 6]

In [None]:
# One-hot encode subject_name: one indicator column per distinct subject.
subjects = pd.get_dummies(df["subject_name"])

In [None]:
# Remove the original subject_name column; its one-hot encoding is kept in `subjects`.
df.drop(columns=["subject_name"], inplace=True)

In [None]:
# Append the one-hot subject columns to df column-wise (rows are matched by index label).
df = pd.concat([df, subjects], axis=1)

In [None]:
# Preview the first rows after the one-hot subject columns were added.
df.head()

In [None]:
# Remove the datetime column; the calendar features derived from it stay in df.
df.drop(columns=["datetime"], inplace=True)

In [None]:
# Count the rows per remaining day_of_week value (a check of the weekday filter applied earlier).
df["day_of_week"].value_counts()

In [None]:
# Drop every row that contains a missing value in any column — this includes
# the rows for which a lag column has no source value after the shift.
df.dropna(inplace=True)

In [None]:
# Print a summary of the final feature frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Preview the first three rows of the final feature frame.
df.head(3)

In [None]:
# NOTE(review): `dsada` is not assigned anywhere in this notebook, so evaluating
# it raises NameError and halts a "Run All" before the cells below. It may be a
# deliberate stop marker — confirm, then replace it with an explicit stop or remove it.
dsada

In [None]:
# Write the feature frame to a Parquet file.
# NOTE(review): the input was read from "../data/", while this writes to
# "../prepare_data/" — confirm the directory is intended. No `compression`
# argument is passed, so pandas' default codec is used despite the ".gzip" suffix.
df.to_parquet("../prepare_data/feature_data.gzip")

In [None]:
# Contiguous split by row position, without shuffling:
# first 70 % -> train, next 20 % -> validation, last 10 % -> test.
n = len(df)
train_df, val_df, test_df = (
    df[: int(n * 0.7)],
    df[int(n * 0.7) : int(n * 0.9)],
    df[int(n * 0.9) :],
)

# Number of columns and a name -> positional index lookup for them.
num_features = len(df.columns)
column_indices = dict(zip(df.columns, range(num_features)))

In [None]:
# Standardise all three splits with the per-column mean and standard deviation
# of the training split only, so no statistics from validation/test are used.
# NOTE(review): a column that is constant in the training split has std 0 and
# becomes NaN/inf after the division — confirm no such column exists.
train_mean, train_std = train_df.mean(), train_df.std()

train_df, val_df, test_df = (
    (part - train_mean) / train_std for part in (train_df, val_df, test_df)
)

In [None]:
# # Предобработка данных
# scaler = MinMaxScaler()
# df['actual_consumption'] = scaler.fit_transform(df[['actual_consumption']])

# y = df['actual_consumption'].values

# # Подготовка признаков и целевой переменной
# X = df.drop('actual_consumption', axis=1).values
# # y = df['actual_consumption'].values

# # Разделение на обучающую и тестовую выборки
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# # Создание генератора данных
# class TimeSeriesGenerator(Sequence):
#     def __init__(self, X, y, batch_size, sequence_length):
#         self.X = X
#         self.y = y
#         self.batch_size = batch_size
#         self.sequence_length = sequence_length
        
#     def __len__(self):
#         return (len(self.X) - self.sequence_length) // self.batch_size
    
#     def __getitem__(self, idx):
#         start_idx = idx * self.batch_size
#         end_idx = (idx + 1) * self.batch_size
        
#         X_batch = np.array([self.X[i:i+self.sequence_length] for i in range(start_idx, end_idx)])
#         y_batch = self.y[start_idx+self.sequence_length:end_idx+self.sequence_length]
        
#         return X_batch, y_batch

# # Параметры
# sequence_length = 30
# batch_size = 256

# # Создание генераторов
# train_generator = TimeSeriesGenerator(X_train, y_train, batch_size, sequence_length)
# test_generator = TimeSeriesGenerator(X_test, y_test, batch_size, sequence_length)

# # Создание модели LSTM
# model = Sequential([
#     LSTM(64, activation='relu', input_shape=(sequence_length, X.shape[1]), return_sequences=True),
#     LSTM(32, activation='relu'),
#     Dense(16, activation='relu'),
#     Dense(1)
# ])

# # Компиляция модели
# model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# # Обучение модели
# early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# history = model.fit(train_generator, epochs=5, validation_data=test_generator, callbacks=[early_stopping])

# # Прогнозирование
# predictions = model.predict(test_generator)

# # Обратное преобразование
# predictions = scaler.inverse_transform(predictions)
# y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1))

In [None]:
# Residuals between the original-scale targets and the model predictions.
# NOTE(review): within this notebook both names are assigned only inside the
# commented-out cell above, so this raises NameError unless that cell is restored.
y_test_original - predictions

In [None]:
# # Подготовка данных
# def prepare_data(data, target, look_back=1):
#     X, y = [], []

#     for i in range(len(data) - look_back):
#         X.append(data[i:(i + look_back), :])
#         y.append(target[i + look_back])
    
#     return np.array(X), np.array(y)

In [None]:
# # Нормализация данных
# scaler = MinMaxScaler(feature_range=(0, 1))
# scaled_data = scaler.fit_transform(df)

In [None]:
# # Выбор целевой переменной
# target = scaled_data[:, df.columns.get_loc('actual_consumption')]
# # features = scaled_data[:, [i for i in range(scaled_data.shape[1]) if i != df.columns.get_loc('actual_consumption')]]

In [None]:
# # Подготовка данных для LSTM
# look_back = 5  # Количество предыдущих временных шагов для использования в прогнозе
# X, y = prepare_data(features, target, look_back)

# # Разделение на обучающую и тестовую выборки
# train_size = int(len(X) * 0.8)
# X_train, X_test = X[:train_size], X[train_size:]
# y_train, y_test = y[:train_size], y[train_size:]

# # Создание модели LSTM
# model = Sequential([
#     LSTM(50, activation='relu', input_shape=(look_back, features.shape[1]), return_sequences=True),
#     LSTM(50, activation='relu'),
#     Dense(1)
# ])

# # Компиляция модели
# model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# # Обучение модели
# history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=1)

# # Прогнозирование
# predictions = model.predict(X_test)

# # Обратное масштабирование для получения реальных значений
# predictions = scaler.inverse_transform(np.concatenate((X_test[:, -1, :], predictions), axis=1))[:, -1]
# y_test_inv = scaler.inverse_transform(np.concatenate((X_test[:, -1, :], y_test.reshape(-1, 1)), axis=1))[:, -1]

In [None]:
# # Оценка модели
# mse = np.mean((predictions - y_test_inv) ** 2)
# print(f"Mean Squared Error: {mse}")