# Создание признаков

**_______________________________________________________________________________________________________________________________**

**Загрузим всё необходимое: зависимости, сущности и модули:**

In [1]:
%run preset.py

___________________________________________________________________________________________________________________________

**Настройка текущей тетрадки (notebook):**

In [2]:
# Notebook display limits: maximum rows shown, maximum columns shown, and
# maximum number of characters shown per cell value.
for option_name, limit in (
    ("display.max_rows", 75),
    ("display.max_columns", 25),
    ("display.max_colwidth", 45),
):
    pd.set_option(option_name, limit)

___________________________________________________________________________________________________________________________

**Импортируемые данные для дальнейшей работы:**

In [3]:
# Load the prepared dataset into `df`. `prep_data_url` is not defined in this
# notebook — presumably it is provided by preset.py (TODO confirm).
df = pd.read_parquet(prep_data_url)

___________________________________________________________________________________________________________________________

**Посмотрим на скачанные данные:**

In [4]:
print(f"Количество строк dataframe-а: {df.shape[0]}\nКоличество столбцов dataframe-а: {df.shape[1]}")

Количество строк dataframe-а: 8478449
Количество столбцов dataframe-а: 3


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8478449 entries, 2011-01-01 00:00:00 to 2024-07-17 00:00:00
Data columns (total 3 columns):
 #   Column              Dtype  
---  ------              -----  
 0   subject_name        object 
 1   actual_consumption  float64
 2   datetime            object 
dtypes: float64(1), object(2)
memory usage: 258.7+ MB


In [6]:
df.head(3)

Unnamed: 0_level_0,subject_name,actual_consumption,datetime
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011-01-01 00:00:00,Алтайский край,1313.0,2011-01-01 00:00:00
2011-01-01 00:00:00,Амурская область,870.0,2011-01-01 00:00:00
2011-01-01 00:00:00,Архангельская область,970.0,2011-01-01 00:00:00


___________________________________________________________________________________________________________________________

**Создание time-step признаков (тех, которые можно получить непосредственно из даты и времени):**

**Признак `time_dummie` - номер временного шага:**

In [7]:
def create_time_dummie_feature(feature: pd.DataFrame) -> pd.Series:
    """
    Number the observations of every subject consecutively, starting at 1.

    Rows are counted per ``subject_name`` in the order in which they appear
    in ``feature``: the n-th row of a subject receives the value n.

    Args:
        feature: Frame containing a ``subject_name`` column. It is not
            modified.

    Returns:
        Integer Series named ``time_dummie`` that carries the same index, in
        the same row order, as ``feature``, so it can be assigned back to the
        frame as a new column.
    """

    # Operate on the argument instead of a notebook-level frame, and let the
    # grouped counter replace the per-subject copy/append loop
    # (DataFrame.append no longer exists in pandas 2.0+).
    step_number = feature.groupby("subject_name").cumcount() + 1

    return step_number.rename("time_dummie")

In [8]:
# Attach the per-subject step counter to the main frame as `time_dummie`.
df["time_dummie"] = create_time_dummie_feature(df)

In [9]:
# Year feature (2011 for the 2011-01-01 rows in the preview below). get_year is
# not defined in this notebook — presumably it comes from preset.py.
df["year"] = df["datetime"].apply(get_year)

In [10]:
# Month feature (1 for the January rows in the preview below). get_month is
# not defined in this notebook — presumably it comes from preset.py.
df["month"] = df["datetime"].apply(get_month)

In [11]:
# Day-of-month feature (1 for 2011-01-01, 3 for 2011-01-03 in the previews).
# get_day_month is not defined in this notebook — presumably from preset.py.
df["day_of_month"] = df["datetime"].apply(get_day_month)

In [12]:
# Hour feature (0 for the 00:00:00 rows in the preview below). get_hour is
# not defined in this notebook — presumably it comes from preset.py.
df["hour"] = df["datetime"].apply(get_hour)

In [13]:
# Day-of-week feature. The previews show 6 for 2011-01-01 and 1 for 2011-01-03,
# which looks like ISO numbering (Monday = 1) — confirm against get_day_week,
# which is not defined in this notebook (presumably from preset.py).
df["day_of_week"] = df["datetime"].apply(get_day_week)

In [14]:
# Day-of-year feature (1 for 2011-01-01, 51 for 2012-02-20 in the previews).
# get_day_year is not defined in this notebook — presumably from preset.py.
df["day_of_year"] = df["datetime"].apply(get_day_year)

In [15]:
# Week-of-year feature. The previews show 52 for 2011-01-01 and 1 for
# 2011-01-03, which looks like ISO week numbering — confirm against
# get_week_year, which is not defined in this notebook (presumably preset.py).
df["week_of_year"] = df["datetime"].apply(get_week_year)

In [16]:
# Quarter feature; get_quarter is not defined in this notebook (presumably
# from preset.py).
# NOTE(review): the previews below show quarter == 3 for rows dated January
# 2011 and February 2012, which is not the calendar quarter — confirm what
# get_quarter is meant to return.
df["quarter"] = df["datetime"].apply(get_quarter)

In [17]:
# Holiday flag (1 for the 2011-01-01 and 2011-01-03 rows, 0 for 2012-02-20 in
# the previews). is_holiday is not defined in this notebook — presumably it
# comes from preset.py; which calendar it uses cannot be seen from here.
df["holiday"] = df["datetime"].apply(is_holiday)

**Создание lag (отложенных по времени) признаков:**

In [18]:
def create_lag_feature(feature: pd.DataFrame, step_count: int) -> pd.Series:
    """
    Shift ``actual_consumption`` by ``step_count`` rows within each subject.

    The shift is applied separately to the rows of every ``subject_name``, in
    the order in which they appear in ``feature``, so values never cross from
    one subject into another.

    Args:
        feature: Frame containing ``subject_name`` and ``actual_consumption``
            columns. It is not modified.
        step_count: Number of rows to shift by inside each subject.

    Returns:
        Series named ``actual_consumption`` holding the shifted values, with
        the same index and row order as ``feature``. Positions that have no
        source row inside their subject are NaN.
    """

    # Operate on the argument instead of a notebook-level frame, and let the
    # grouped shift replace the per-subject copy/append loop
    # (DataFrame.append no longer exists in pandas 2.0+).
    return feature.groupby("subject_name")["actual_consumption"].shift(step_count)

In [19]:
# Per-subject lag of actual_consumption by `hour` rows. The bare name `hour`
# is not defined in this notebook (it is not the df["hour"] column) —
# presumably a step-count constant from preset.py; TODO confirm its value.
df["lag_hour"] = create_lag_feature(df, hour)

In [20]:
# Per-subject lag of actual_consumption by `day` rows. `day` is not defined in
# this notebook — presumably a step-count constant from preset.py; TODO
# confirm its value.
df["lag_day"] = create_lag_feature(df, day)

In [21]:
# Per-subject lag of actual_consumption by `week` rows. `week` is not defined
# in this notebook — presumably a step-count constant from preset.py; TODO
# confirm its value.
df["lag_week"] = create_lag_feature(df, week)

In [22]:
# Per-subject lag of actual_consumption by `month` rows. The bare name `month`
# is not defined in this notebook (it is not the df["month"] column) —
# presumably a step-count constant from preset.py; TODO confirm its value.
df["lag_month"] = create_lag_feature(df, month)

In [23]:
# Per-subject lag of actual_consumption by `year` rows. The bare name `year`
# is not defined in this notebook (it is not the df["year"] column) —
# presumably a step-count constant from preset.py; TODO confirm its value.
df["lag_year"] = create_lag_feature(df, year)

In [24]:
df.head()

Unnamed: 0_level_0,subject_name,actual_consumption,datetime,time_dummie,year,month,day_of_month,hour,day_of_week,day_of_year,week_of_year,quarter,holiday,lag_hour,lag_day,lag_week,lag_month,lag_year
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2011-01-01 00:00:00,Алтайский край,1313.0,2011-01-01 00:00:00,1,2011,1,1,0,6,1,52,3,1,,,,,
2011-01-01 00:00:00,Амурская область,870.0,2011-01-01 00:00:00,1,2011,1,1,0,6,1,52,3,1,,,,,
2011-01-01 00:00:00,Архангельская область,970.0,2011-01-01 00:00:00,1,2011,1,1,0,6,1,52,3,1,,,,,
2011-01-01 00:00:00,Астраханская область,504.0,2011-01-01 00:00:00,1,2011,1,1,0,6,1,52,3,1,,,,,
2011-01-01 00:00:00,Белгородская область,1592.0,2011-01-01 00:00:00,1,2011,1,1,0,6,1,52,3,1,,,,,


In [25]:
# Keep only rows whose day_of_week is below 6; the value_counts output further
# down shows that only the values 1-5 remain (presumably Monday-Friday — confirm
# the numbering used by get_day_week). The lag columns were computed before
# this filter, i.e. on the unfiltered series.
df = df[df["day_of_week"] < 6]

In [26]:
# One indicator column per distinct subject_name value (one-hot encoding).
subjects = pd.get_dummies(df["subject_name"])

In [27]:
# The text column is dropped here; the indicator columns held in `subjects`
# are joined back in the next cell.
df.drop(columns=["subject_name"], inplace=True)

In [28]:
# Append the subject indicator columns to the right of the existing features.
df = pd.concat([df, subjects], axis=1)

In [29]:
df.head()

Unnamed: 0_level_0,actual_consumption,datetime,time_dummie,year,month,day_of_month,hour,day_of_week,day_of_year,week_of_year,quarter,holiday,...,Тверская область,Томская область,Тульская область,Тюменская область,Удмуртская Республика,Ульяновская область,Центральный энергорайон Якутии,Челябинская область,Чеченская Республика,Чувашская Республика - Чувашия,Южно-Якутский энергорайон,Ярославская область
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
2011-01-03 00:00:00,1230.0,2011-01-03 00:00:00,49,2011,1,3,0,1,3,1,3,1,...,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-03 00:00:00,995.0,2011-01-03 00:00:00,49,2011,1,3,0,1,3,1,3,1,...,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-03 00:00:00,923.0,2011-01-03 00:00:00,49,2011,1,3,0,1,3,1,3,1,...,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-03 00:00:00,451.0,2011-01-03 00:00:00,49,2011,1,3,0,1,3,1,3,1,...,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-03 00:00:00,1481.0,2011-01-03 00:00:00,49,2011,1,3,0,1,3,1,3,1,...,0,0,0,0,0,0,0,0,0,0,0,0


In [30]:
# Remove the datetime column; the frame's index still carries the timestamps
# (see the df.info() output below).
df.drop(columns=["datetime"], inplace=True)

In [31]:
df["day_of_week"].value_counts()

3    1212274
1    1211758
2    1211737
5    1210970
4    1210531
Name: day_of_week, dtype: int64

In [32]:
# Drop every row that has a missing value. The lag columns are NaN at the
# start of the series (see the earlier df.head() output), so those early rows
# are removed here.
df.dropna(inplace=True)

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5588845 entries, 2012-02-20 00:00:00 to 2024-07-17 00:00:00
Data columns (total 91 columns):
 #   Column                             Dtype  
---  ------                             -----  
 0   actual_consumption                 float64
 1   time_dummie                        int64  
 2   year                               int64  
 3   month                              int64  
 4   day_of_month                       int64  
 5   hour                               int64  
 6   day_of_week                        int64  
 7   day_of_year                        int64  
 8   week_of_year                       int64  
 9   quarter                            int64  
 10  holiday                            int64  
 11  lag_hour                           float64
 12  lag_day                            float64
 13  lag_week                           float64
 14  lag_month                          float64
 15  lag_year                           float6

In [34]:
df.head(3)

Unnamed: 0_level_0,actual_consumption,time_dummie,year,month,day_of_month,hour,day_of_week,day_of_year,week_of_year,quarter,holiday,lag_hour,...,Тверская область,Томская область,Тульская область,Тюменская область,Удмуртская Республика,Ульяновская область,Центральный энергорайон Якутии,Челябинская область,Чеченская Республика,Чувашская Республика - Чувашия,Южно-Якутский энергорайон,Ярославская область
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
2012-02-20 00:00:00,1302.0,8783,2012,2,20,0,1,51,8,3,0,1339.0,...,0,0,0,0,0,0,0,0,0,0,0,0
2012-02-20 00:00:00,1091.0,8783,2012,2,20,0,1,51,8,3,0,1034.0,...,0,0,0,0,0,0,0,0,0,0,0,0
2012-02-20 00:00:00,946.0,8783,2012,2,20,0,1,51,8,3,0,982.0,...,0,0,0,0,0,0,0,0,0,0,0,0


In [37]:
# Persist the engineered feature table. Without an explicit `compression`
# argument pandas writes snappy-compressed parquet, which contradicts the
# ".gzip" suffix of the file name; request gzip so the name and the content
# agree. Parquet readers pick the codec up from the file metadata, so code
# that reads this file does not need to change.
df.to_parquet("../prepare_data/feature_data.gzip", compression="gzip")

In [None]:
# Lookup from column name to its position in the feature table.
column_indices = {column: position for position, column in enumerate(df.columns)}

# Split by row order: the first 70 % of rows for training, the next 20 % for
# validation and the final 10 % for testing.
n = len(df)
train_end, val_end = int(n*0.7), int(n*0.9)
train_df = df[:train_end]
val_df = df[train_end:val_end]
test_df = df[val_end:]

num_features = len(df.columns)

In [None]:
# Standardise every column using statistics of the training split only; the
# same mean and standard deviation are then applied to validation and test.
train_mean = train_df.mean()
train_std = train_df.std()

# A column that is constant within the training split (for example an
# indicator column that is 0 on every training row) has a standard deviation
# of 0, and dividing by it would fill the column with NaN/inf. Use 1 for such
# columns so they are only centred.
train_std = train_std.replace(0, 1)

train_df = (train_df - train_mean) / train_std
val_df = (val_df - train_mean) / train_std
test_df = (test_df - train_mean) / train_std

In [35]:
# # Предобработка данных
# scaler = MinMaxScaler()
# df['actual_consumption'] = scaler.fit_transform(df[['actual_consumption']])

# y = df['actual_consumption'].values

# # Подготовка признаков и целевой переменной
# X = df.drop('actual_consumption', axis=1).values
# # y = df['actual_consumption'].values

# # Разделение на обучающую и тестовую выборки
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# # Создание генератора данных
# class TimeSeriesGenerator(Sequence):
#     def __init__(self, X, y, batch_size, sequence_length):
#         self.X = X
#         self.y = y
#         self.batch_size = batch_size
#         self.sequence_length = sequence_length
        
#     def __len__(self):
#         return (len(self.X) - self.sequence_length) // self.batch_size
    
#     def __getitem__(self, idx):
#         start_idx = idx * self.batch_size
#         end_idx = (idx + 1) * self.batch_size
        
#         X_batch = np.array([self.X[i:i+self.sequence_length] for i in range(start_idx, end_idx)])
#         y_batch = self.y[start_idx+self.sequence_length:end_idx+self.sequence_length]
        
#         return X_batch, y_batch

# # Параметры
# sequence_length = 30
# batch_size = 256

# # Создание генераторов
# train_generator = TimeSeriesGenerator(X_train, y_train, batch_size, sequence_length)
# test_generator = TimeSeriesGenerator(X_test, y_test, batch_size, sequence_length)

# # Создание модели LSTM
# model = Sequential([
#     LSTM(64, activation='relu', input_shape=(sequence_length, X.shape[1]), return_sequences=True),
#     LSTM(32, activation='relu'),
#     Dense(16, activation='relu'),
#     Dense(1)
# ])

# # Компиляция модели
# model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# # Обучение модели
# early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# history = model.fit(train_generator, epochs=5, validation_data=test_generator, callbacks=[early_stopping])

# # Прогнозирование
# predictions = model.predict(test_generator)

# # Обратное преобразование
# predictions = scaler.inverse_transform(predictions)
# y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1))

In [36]:
# NOTE(review): `y_test_original` and `predictions` are assigned only inside
# the commented-out model cell above, so this expression raises NameError
# (see the recorded output) unless that cell is restored and executed first.
y_test_original - predictions

NameError: name 'y_test_original' is not defined

In [None]:
# # Подготовка данных
# def prepare_data(data, target, look_back=1):
#     X, y = [], []

#     for i in range(len(data) - look_back):
#         X.append(data[i:(i + look_back), :])
#         y.append(target[i + look_back])
    
#     return np.array(X), np.array(y)

In [None]:
# # Нормализация данных
# scaler = MinMaxScaler(feature_range=(0, 1))
# scaled_data = scaler.fit_transform(df)

In [None]:
# # Выбор целевой переменной
# target = scaled_data[:, df.columns.get_loc('actual_consumption')]
# # features = scaled_data[:, [i for i in range(scaled_data.shape[1]) if i != df.columns.get_loc('actual_consumption')]]

In [None]:
# # Подготовка данных для LSTM
# look_back = 5  # Количество предыдущих временных шагов для использования в прогнозе
# X, y = prepare_data(features, target, look_back)

# # Разделение на обучающую и тестовую выборки
# train_size = int(len(X) * 0.8)
# X_train, X_test = X[:train_size], X[train_size:]
# y_train, y_test = y[:train_size], y[train_size:]

# # Создание модели LSTM
# model = Sequential([
#     LSTM(50, activation='relu', input_shape=(look_back, features.shape[1]), return_sequences=True),
#     LSTM(50, activation='relu'),
#     Dense(1)
# ])

# # Компиляция модели
# model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# # Обучение модели
# history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=1)

# # Прогнозирование
# predictions = model.predict(X_test)

# # Обратное масштабирование для получения реальных значений
# predictions = scaler.inverse_transform(np.concatenate((X_test[:, -1, :], predictions), axis=1))[:, -1]
# y_test_inv = scaler.inverse_transform(np.concatenate((X_test[:, -1, :], y_test.reshape(-1, 1)), axis=1))[:, -1]

In [None]:
# # Оценка модели
# mse = np.mean((predictions - y_test_inv) ** 2)
# print(f"Mean Squared Error: {mse}")