In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.preprocessing import Binarizer, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Seed NumPy's legacy global RNG. Every np.random.* call below draws from this
# global state, so the generated frame is identical on each fresh run.
# (A bare np.random.default_rng(42) call would only build a separate Generator
# and discard it, without seeding anything used here.)
np.random.seed(42)

# Generate the data for each column: 100 rows, two columns per dtype family.
# The dict order fixes both the column order and the order in which random
# draws are consumed, so do not reorder the entries.
data = {
    'temperature_celsius': np.random.uniform(20, 35, size=100),  # temperature in degrees Celsius (float)
    'age_years': np.random.randint(18, 65, size=100),  # age in years (int), upper bound exclusive
    'timestamp_event': [pd.Timestamp('20230101') + timedelta(days=i) for i in range(100)],  # event time (datetime)
    'product_category': np.random.choice(['electronics', 'clothing', 'food'], size=100),  # product category (string)
    'is_purchased': np.random.choice([True, False], size=100),  # purchase flag (bool)
    'humidity_percentage': np.random.uniform(40, 80, size=100),  # humidity in percent (float)
    'income_usd': np.random.randint(20000, 100000, size=100),  # income in US dollars (int), upper bound exclusive
    'last_updated': [pd.Timestamp('20240101') + timedelta(days=i) for i in range(100)],  # last update (datetime)
    'product_name': ['Product_' + str(i) for i in range(100)],  # product name (string)
    'is_subscribed': np.random.choice([True, False], size=100)  # subscription flag (bool)
}

# Build the DataFrame
df = pd.DataFrame(data)

# your code here #

In [2]:
# Inspect the dtype pandas inferred for each generated column.
df.dtypes

temperature_celsius           float64
age_years                       int64
timestamp_event        datetime64[ns]
product_category               object
is_purchased                     bool
humidity_percentage           float64
income_usd                      int64
last_updated           datetime64[ns]
product_name                   object
is_subscribed                    bool
dtype: object

In [3]:
# Split the frame into one sub-frame per dtype family. The selector strings
# are passed to select_dtypes in the same order as the target names.
df_int, df_float, df_bool, df_object, df_date = (
    df.select_dtypes(include=[dtype_family])
    for dtype_family in ('int', 'float', 'bool', 'object', 'datetime')
)


In [4]:
#df['income_usd_binarized'] = Binarizer(threshold=df['income_usd'].mean()).fit_transform(df.iloc[:,6:7])
#df['age_years_standarded'] = StandardScaler().fit_transform(df.iloc[:,1:2])
#df['is_subscribed_encoded'] = LabelEncoder().fit_transform(df.iloc[:,9:10])

In [5]:
# Per-column preprocessing. Columns are addressed by name rather than by
# position so the transformer keeps targeting the right data even if columns
# are added to or reordered in `df`.
# Output layout: [income flag, scaled age, one indicator column per
# is_subscribed value].
preprocessor = ColumnTransformer(
    transformers=[
        # 1.0 where income is strictly above the sample mean, else 0.0
        ('bin_enc', Binarizer(threshold=df['income_usd'].mean()), ['income_usd']),
        # standardise age to zero mean and unit variance
        ('standard_enc', StandardScaler(), ['age_years']),
        # one-hot encode the boolean subscription flag
        ('ohe', OneHotEncoder(), ['is_subscribed']),
    ]
)

In [6]:
# Wrap the column transformer in a single-step pipeline, then fit it on the
# full frame and transform that same frame in one pass.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
])
transformed_data = pipe.fit_transform(df)

In [7]:
# Display the transformed feature matrix.
# NOTE(review): this renders the entire array; consider a slice such as
# transformed_data[:5] to keep the output short.
transformed_data

array([[ 1.        ,  0.6485111 ,  1.        ,  0.        ],
       [ 0.        ,  1.16672773,  0.        ,  1.        ],
       [ 1.        ,  0.6485111 ,  0.        ,  1.        ],
       [ 1.        , -1.42435543,  0.        ,  1.        ],
       [ 0.        ,  0.50044921,  0.        ,  1.        ],
       [ 1.        ,  1.01866584,  1.        ,  0.        ],
       [ 1.        , -0.01776743,  1.        ,  0.        ],
       [ 1.        ,  1.16672773,  0.        ,  1.        ],
       [ 1.        ,  1.61091342,  1.        ,  0.        ],
       [ 1.        , -0.61001501,  0.        ,  1.        ],
       [ 0.        ,  1.46285152,  0.        ,  1.        ],
       [ 1.        ,  0.42641826,  0.        ,  1.        ],
       [ 1.        ,  0.94463489,  1.        ,  0.        ],
       [ 0.        , -0.7580769 ,  1.        ,  0.        ],
       [ 0.        ,  0.6485111 ,  1.        ,  0.        ],
       [ 1.        , -1.20226259,  1.        ,  0.        ],
       [ 0.        , -0.

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Generate random daily temperatures for one year.
# Seed NumPy's legacy global RNG, which np.random.uniform below draws from.
# (A bare np.random.default_rng(42) call would only build a separate Generator
# and discard it, without seeding anything used here.)
np.random.seed(42)
start_date = pd.Timestamp('2023-01-01')
end_date = pd.Timestamp('2023-12-31')
# One timestamp per calendar day, both endpoints included.
dates = pd.date_range(start=start_date, end=end_date)
temperatures = np.random.uniform(low=-10.0, high=30.0, size=len(dates))
temperature_data = pd.DataFrame({'Date': dates, 'Temperature_Celsius': temperatures})


In [9]:

# Preprocessing of the temporal features
# 1. Extract calendar features from the date
temperature_data['Month'] = temperature_data['Date'].dt.month
temperature_data['Weekday'] = temperature_data['Date'].dt.weekday  # Monday=0 ... Sunday=6
# The dates are daily with no time-of-day component, so Hour is 0 on every row.
temperature_data['Hour'] = temperature_data['Date'].dt.hour

# 2. Rolling windows and cumulative statistics
temperature_data['Cumulative_Sum'] = temperature_data['Temperature_Celsius'].cumsum()
# 7-day trailing mean. The first six rows are NaN because the window is not
# yet full. This column appears in the displayed results further down, so it
# must be computed here for a fresh top-to-bottom run to reproduce them.
temperature_data['Rolling_Window'] = temperature_data['Temperature_Celsius'].rolling(window=7).mean()


In [10]:

# 3. Periodicity and trends
# Group the temperatures by calendar month once, then broadcast each monthly
# aggregate back onto every row of that month.
monthly_temps = temperature_data.groupby('Month')['Temperature_Celsius']
temperature_data['Monthly_Sum'] = monthly_temps.transform('sum')  # monthly total
temperature_data['Monthly_Mean'] = monthly_temps.transform('mean')  # monthly average



In [11]:


# Show the first rows of the processed data
print(temperature_data.head())

        Date  Temperature_Celsius  Month  Weekday  Hour  Cumulative_Sum   
0 2023-01-01             4.981605      1        6     0        4.981605  \
1 2023-01-02            28.028572      1        0     0       33.010177   
2 2023-01-03            19.279758      1        1     0       52.289935   
3 2023-01-04            13.946339      1        2     0       66.236274   
4 2023-01-05            -3.759254      1        3     0       62.477020   

   Rolling_Window  Monthly_Sum  Monthly_Mean  
0             NaN   240.618521      7.761888  
1             NaN   240.618521      7.761888  
2             NaN   240.618521      7.761888  
3             NaN   240.618521      7.761888  
4             NaN   240.618521      7.761888  


In [12]:
# Display the processed frame with all engineered columns.
temperature_data

Unnamed: 0,Date,Temperature_Celsius,Month,Weekday,Hour,Cumulative_Sum,Rolling_Window,Monthly_Sum,Monthly_Mean
0,2023-01-01,4.981605,1,6,0,4.981605,,240.618521,7.761888
1,2023-01-02,28.028572,1,0,0,33.010177,,240.618521,7.761888
2,2023-01-03,19.279758,1,1,0,52.289935,,240.618521,7.761888
3,2023-01-04,13.946339,1,2,0,66.236274,,240.618521,7.761888
4,2023-01-05,-3.759254,1,3,0,62.477020,,240.618521,7.761888
...,...,...,...,...,...,...,...,...,...
360,2023-12-27,5.526797,12,2,0,3463.041704,8.452282,269.370008,8.689355
361,2023-12-28,15.731529,12,3,0,3478.773232,11.724964,269.370008,8.689355
362,2023-12-29,8.330116,12,4,0,3487.103348,10.672585,269.370008,8.689355
363,2023-12-30,11.824672,12,5,0,3498.928020,13.638902,269.370008,8.689355
