In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.preprocessing import Binarizer, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Seed NumPy's legacy global generator: every np.random.* call below draws from it,
# which makes the generated columns reproducible. (A bare np.random.default_rng(42)
# call used to follow; its Generator was discarded, so it seeded nothing.)
np.random.seed(42)

# Generate the values for each column. The np.random.* calls execute in dict order,
# so reordering the entries would change the generated data.
data = {
    'temperature_celsius': np.random.uniform(20, 35, size=100),  # temperature in degrees Celsius (float)
    'age_years': np.random.randint(18, 65, size=100),  # age in years (int)
    'timestamp_event': [pd.Timestamp('20230101') + timedelta(days=i) for i in range(100)],  # event time (datetime)
    'product_category': np.random.choice(['electronics', 'clothing', 'food'], size=100),  # product category (string)
    'is_purchased': np.random.choice([True, False], size=100),  # purchase flag (bool)
    'humidity_percentage': np.random.uniform(40, 80, size=100),  # humidity in percent (float)
    'income_usd': np.random.randint(20000, 100000, size=100),  # income in US dollars (int)
    'last_updated': [pd.Timestamp('20240101') + timedelta(days=i) for i in range(100)],  # last update (datetime)
    'product_name': ['Product_' + str(i) for i in range(100)],  # product name (string)
    'is_subscribed': np.random.choice([True, False], size=100)  # subscription flag (bool)
}

# Build the DataFrame
df = pd.DataFrame(data)

# ваш код здесь #

In [2]:
# Show the dtype pandas assigned to each generated column.
df.dtypes

temperature_celsius           float64
age_years                       int64
timestamp_event        datetime64[ns]
product_category               object
is_purchased                     bool
humidity_percentage           float64
income_usd                      int64
last_updated           datetime64[ns]
product_name                   object
is_subscribed                    bool
dtype: object

In [3]:
# Split the frame into one sub-frame per dtype family: each select_dtypes call
# keeps only the columns matching the requested selector.
df_int, df_float, df_bool, df_object, df_date = (
    df.select_dtypes(include=[dtype_selector])
    for dtype_selector in ('int', 'float', 'bool', 'object', 'datetime')
)


In [4]:
#df['income_usd_binarized'] = Binarizer(threshold=df['income_usd'].mean()).fit_transform(df.iloc[:,6:7])
#df['age_years_standarded'] = StandardScaler().fit_transform(df.iloc[:,1:2])
#df['is_subscribed_encoded'] = LabelEncoder().fit_transform(df.iloc[:,9:10])

In [5]:
# Preprocess three columns, selected by name rather than by position so the
# transformer keeps targeting the right data if the column order of df changes:
#   - income_usd    -> 0/1 flag, 1 when the income exceeds the mean income of df
#   - age_years     -> standardized with StandardScaler
#   - is_subscribed -> one-hot encoded
# Columns not listed here are not part of the output (ColumnTransformer's
# default remainder is 'drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('bin_enc', Binarizer(threshold=df['income_usd'].mean()), ['income_usd']),
        ('standard_enc', StandardScaler(), ['age_years']),
        ('ohe', OneHotEncoder(), ['is_subscribed']),
    ]
)

In [6]:
# Wrap the preprocessor in a Pipeline and transform the whole frame.
# The pipeline intentionally has no classifier step: the former final step,
# RandomForestClassifier, was never imported, provides no transform() method,
# and no target `y` exists here, so fit_transform() could not run with it.
# Append an estimator (and call fit / predict with a target) once one is defined.
pipe = Pipeline([('preprocessor', preprocessor)])
transformed_data = pipe.fit_transform(df)

In [7]:
# Display the transformed feature matrix produced by the pipeline.
transformed_data

array([[ 1.        ,  0.6485111 ,  1.        ,  0.        ],
       [ 0.        ,  1.16672773,  0.        ,  1.        ],
       [ 1.        ,  0.6485111 ,  0.        ,  1.        ],
       [ 1.        , -1.42435543,  0.        ,  1.        ],
       [ 0.        ,  0.50044921,  0.        ,  1.        ],
       [ 1.        ,  1.01866584,  1.        ,  0.        ],
       [ 1.        , -0.01776743,  1.        ,  0.        ],
       [ 1.        ,  1.16672773,  0.        ,  1.        ],
       [ 1.        ,  1.61091342,  1.        ,  0.        ],
       [ 1.        , -0.61001501,  0.        ,  1.        ],
       [ 0.        ,  1.46285152,  0.        ,  1.        ],
       [ 1.        ,  0.42641826,  0.        ,  1.        ],
       [ 1.        ,  0.94463489,  1.        ,  0.        ],
       [ 0.        , -0.7580769 ,  1.        ,  0.        ],
       [ 0.        ,  0.6485111 ,  1.        ,  0.        ],
       [ 1.        , -1.20226259,  1.        ,  0.        ],
       [ 0.        , -0.

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Generate random daily temperatures for one year.
# np.random.seed seeds the legacy global generator used by np.random.uniform below.
# (A bare np.random.default_rng(42) call used to follow; its Generator was
# discarded, so it had no effect on the draws.)
np.random.seed(42)
start_date = pd.Timestamp('2023-01-01')
end_date = pd.Timestamp('2023-12-31')
dates = pd.date_range(start=start_date, end=end_date)  # one entry per day, both ends included
temperatures = np.random.uniform(low=-10.0, high=30.0, size=len(dates))
temperature_data = pd.DataFrame({'Date': dates, 'Temperature_Celsius': temperatures})


In [9]:

# Preprocessing of the time-based features
# 1. Extract calendar features from the date
temperature_data['Month'] = temperature_data['Date'].dt.month
temperature_data['Weekday'] = temperature_data['Date'].dt.weekday  # Monday=0 ... Sunday=6
temperature_data['Hour'] = temperature_data['Date'].dt.hour  # 0 for date-only timestamps

# 2. Rolling windows and cumulative statistics
temperature_data['Cumulative_Sum'] = temperature_data['Temperature_Celsius'].cumsum()
# 7-row rolling mean. This line was commented out although the recorded outputs
# contain a Rolling_Window column, so a fresh run could not reproduce them.
# The first 6 rows are NaN because the window is not yet full.
temperature_data['Rolling_Window'] = temperature_data['Temperature_Celsius'].rolling(window=7).mean()


In [10]:

# 3. Seasonality and trends: broadcast each month's aggregate back onto its rows
temperature_by_month = temperature_data.groupby('Month')['Temperature_Celsius']
temperature_data['Monthly_Sum'] = temperature_by_month.transform('sum')  # monthly total
temperature_data['Monthly_Mean'] = temperature_by_month.transform('mean')  # monthly average



In [11]:


# Print the first rows of the processed data
print(temperature_data.head())

        Date  Temperature_Celsius  Month  Weekday  Hour  Cumulative_Sum   
0 2023-01-01             4.981605      1        6     0        4.981605  \
1 2023-01-02            28.028572      1        0     0       33.010177   
2 2023-01-03            19.279758      1        1     0       52.289935   
3 2023-01-04            13.946339      1        2     0       66.236274   
4 2023-01-05            -3.759254      1        3     0       62.477020   

   Rolling_Window  Monthly_Sum  Monthly_Mean  
0             NaN   240.618521      7.761888  
1             NaN   240.618521      7.761888  
2             NaN   240.618521      7.761888  
3             NaN   240.618521      7.761888  
4             NaN   240.618521      7.761888  


In [12]:
# Display the processed frame with all derived columns.
temperature_data

Unnamed: 0,Date,Temperature_Celsius,Month,Weekday,Hour,Cumulative_Sum,Rolling_Window,Monthly_Sum,Monthly_Mean
0,2023-01-01,4.981605,1,6,0,4.981605,,240.618521,7.761888
1,2023-01-02,28.028572,1,0,0,33.010177,,240.618521,7.761888
2,2023-01-03,19.279758,1,1,0,52.289935,,240.618521,7.761888
3,2023-01-04,13.946339,1,2,0,66.236274,,240.618521,7.761888
4,2023-01-05,-3.759254,1,3,0,62.477020,,240.618521,7.761888
...,...,...,...,...,...,...,...,...,...
360,2023-12-27,5.526797,12,2,0,3463.041704,8.452282,269.370008,8.689355
361,2023-12-28,15.731529,12,3,0,3478.773232,11.724964,269.370008,8.689355
362,2023-12-29,8.330116,12,4,0,3487.103348,10.672585,269.370008,8.689355
363,2023-12-30,11.824672,12,5,0,3498.928020,13.638902,269.370008,8.689355


In [27]:
import pandas as pd
import numpy as np
from datetime import datetime

# Generate the employee data: ten sequential ids and a random start year for each
np.random.seed(0)
employee_ids = np.arange(1, 11)
start_years = np.random.randint(2010, 2022, size=10)  # upper bound 2022 is exclusive
employee_info = pd.DataFrame({'employee_id': employee_ids, 'start_year': start_years})

def calculate_employment_duration(start_year, current_year=None):
    """Return the number of calendar years elapsed since ``start_year``.

    Args:
        start_year: Year the employment started. Anything that supports
            subtraction from an int works.
        current_year: Year to measure against. Defaults to the current
            calendar year taken from the system clock; pass an explicit value
            to get results that do not depend on when the code is run.

    Returns:
        ``current_year - start_year``. The value is not clamped, so a start
        year later than ``current_year`` yields a negative duration.
    """
    if current_year is None:
        current_year = datetime.now().year
    return current_year - start_year
# Apply the function to every start_year value and store the result as a new column.
employee_info['employment_duration'] = employee_info['start_year'].apply(calculate_employment_duration)


In [31]:
import pandas as pd
import numpy as np

# Build a DataFrame of ten random integer temperatures in degrees Celsius
np.random.seed(0)
celsius_samples = np.random.randint(-20, 40, size=10)  # upper bound 40 is exclusive
data = {'temperature_Celsius': celsius_samples}
df = pd.DataFrame(data)


In [33]:
def far(temperature_Celsius):
    """Convert a temperature from degrees Celsius to degrees Fahrenheit."""
    scaled = temperature_Celsius * 9 / 5
    return scaled + 32

# Wrap the scalar converter with np.vectorize so it is applied element by element,
# then store the converted values as a new column.
vectorized = np.vectorize(far)
fahrenheit_values = vectorized(df['temperature_Celsius'])
df['temperature_Fahrenheit'] = fahrenheit_values

In [37]:
import pandas as pd
import numpy as np

# Build a DataFrame holding the students' numeric scores
student_scores = np.array([92, 78, 64, 81, 53, 95, 88])
data = {'scores': student_scores}
df = pd.DataFrame(data)

# Function that converts a numeric score into its letter-grade equivalent
def to_grade(score):
    """Return the letter grade for ``score``.

    Cut-offs are inclusive lower bounds: 90 -> 'A', 80 -> 'B', 73 -> 'C',
    61 -> 'D'; anything below 61 is 'F'.
    """
    cutoffs = ((90, 'A'), (80, 'B'), (73, 'C'), (61, 'D'))
    # Cut-offs are listed from highest to lowest, so the first match wins.
    for minimum, letter in cutoffs:
        if score >= minimum:
            return letter
    return 'F'

# Wrap the scalar grader with np.vectorize so it is applied to every score,
# then store the resulting letters as a new column.
vectorized = np.vectorize(to_grade)
letter_grades = vectorized(df['scores'])
df['grades'] = letter_grades

# FeatureTool

In [39]:
import featuretools as ft
from woodwork.logical_types import Categorical, PostalCode

# Load the mock-customer demo tables, then flatten transactions, sessions and
# customers into a single frame; products stays a separate table.
data = ft.demo.load_mock_customer()
sessions_df, customers_df = data["sessions"], data["customers"]
transactions_df = data["transactions"].merge(sessions_df).merge(customers_df)
products_df = data["products"]


In [51]:
import featuretools as ft

# Start from the demo data loaded directly as an EntitySet.
es = ft.demo.load_mock_customer(return_entityset=True)

# Register the merged transactions frame under the name "transactions", indexed by
# transaction_id and time-indexed by transaction_time; product_id and zip_code get
# explicit Woodwork logical types.
# NOTE(review): the printed EntitySet also lists "sessions" and "customers", which
# this cell never adds, so they presumably come from load_mock_customer. If so,
# "transactions" and "products" already existed and are re-added here; confirm
# that replacing them is intended.
es = es.add_dataframe(
    dataframe_name="transactions",
    dataframe=transactions_df,
    index="transaction_id",
    time_index="transaction_time",
    logical_types={
        "product_id": Categorical,
        "zip_code": PostalCode,
    },
)

# Register the products table, indexed by product_id.
es = es.add_dataframe(
    dataframe_name="products", dataframe=products_df, index="product_id"
)
# Link products.product_id (parent) to transactions.product_id (child).
es = es.add_relationship("products", "product_id", "transactions", "product_id") 

  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(


In [53]:
# Print the EntitySet summary: its dataframes and the relationships between them.
print(es)

Entityset: transactions
  DataFrames:
    transactions [Rows: 500, Columns: 11]
    products [Rows: 5, Columns: 2]
    sessions [Rows: 35, Columns: 5]
    customers [Rows: 5, Columns: 5]
  Relationships:
    transactions.product_id -> products.product_id
    transactions.session_id -> sessions.session_id
    sessions.customer_id -> customers.customer_id


In [68]:
import featuretools as ft

# Reload the demo EntitySet; this replaces whatever `es` held before this cell.
es = ft.demo.load_mock_customer(return_entityset=True)

# Run Deep Feature Synthesis for the "customers" dataframe using the listed
# aggregation and transform primitives, limited to depth 1.
# NOTE(review): the recorded output warns that the 'std' aggregation was not used
# and names a too-small max_depth as one possible cause. Either raise max_depth
# or drop 'std' from agg_primitives.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="customers",
    agg_primitives=["count",'std'],
    trans_primitives=["month"],
    max_depth=1,
) 

  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  agg_primitives: ['std']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.


In [69]:
# Display the generated feature matrix (one row per customer_id).
feature_matrix

Unnamed: 0_level_0,zip_code,COUNT(sessions),MONTH(birthday),MONTH(join_date)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,60091,6,7,7
4,60091,8,8,4
1,60091,8,7,4
3,13244,6,11,8
2,13244,7,8,4


In [65]:
# Display the "customers" dataframe stored in the EntitySet.
es['customers']

Unnamed: 0,customer_id,zip_code,join_date,birthday,_ft_last_time
5,5,60091,2010-07-17 05:27:50,1984-07-28,2014-01-01 08:09:40
4,4,60091,2011-04-08 20:08:14,2006-08-15,2014-01-01 05:31:30
1,1,60091,2011-04-17 10:48:33,1994-07-18,2014-01-01 07:26:20
3,3,13244,2011-08-13 15:42:34,2003-11-21,2014-01-01 09:00:35
2,2,13244,2012-04-15 23:31:04,1986-08-18,2014-01-01 08:23:45


In [70]:
# Show the last ten rows of the table of available featuretools primitives.
ft.list_primitives()[-10:]

Unnamed: 0,name,type,dask_compatible,spark_compatible,description,valid_inputs,return_type
193,subtract_numeric_scalar,transform,True,True,Subtracts a scalar from each element in the list.,<ColumnSchema (Semantic Tags = ['numeric'])>,<ColumnSchema (Semantic Tags = ['numeric'])>
194,expanding_min,transform,False,False,Computes the expanding minimum of events over ...,<ColumnSchema (Logical Type = Datetime) (Seman...,<ColumnSchema (Semantic Tags = ['numeric'])>
195,not_equal_scalar,transform,True,True,Determines if values in a list are not equal t...,<ColumnSchema>,<ColumnSchema (Logical Type = BooleanNullable)>
196,less_than,transform,True,True,Determines if values in one list are less than...,"<ColumnSchema (Logical Type = Datetime)>, <Col...",<ColumnSchema (Logical Type = BooleanNullable)>
197,negate,transform,True,True,Negates a numeric value.,<ColumnSchema (Semantic Tags = ['numeric'])>,<ColumnSchema (Semantic Tags = ['numeric'])>
198,url_to_protocol,transform,False,False,Determines the protocol (http or https) of a url.,<ColumnSchema (Logical Type = URL)>,<ColumnSchema (Logical Type = Categorical) (Se...
199,diff,transform,False,False,Computes the difference between the value in a...,<ColumnSchema (Semantic Tags = ['numeric'])>,<ColumnSchema (Semantic Tags = ['numeric'])>
200,numeric_lag,transform,False,False,Shifts an array of values by a specified numbe...,<ColumnSchema (Semantic Tags = ['time_index'])...,<ColumnSchema (Semantic Tags = ['numeric'])>
201,minute,transform,True,True,Determines the minutes value of a datetime.,<ColumnSchema (Logical Type = Datetime)>,"<ColumnSchema (Logical Type = Ordinal: [0, 1, ..."
202,email_address_to_domain,transform,False,False,Determines the domain of an email,<ColumnSchema (Logical Type = EmailAddress)>,<ColumnSchema (Logical Type = Categorical) (Se...


In [None]:
from featuretools.primitives import TransformPrimitive
from woodwork.column_schema import ColumnSchema

class SquareRoot(TransformPrimitive):
    """Custom transform primitive: square root of a numeric column."""

    # Name under which the primitive is identified.
    name = 'square_root'
    # Takes one column carrying the "numeric" semantic tag ...
    input_types = [ColumnSchema(semantic_tags={"numeric"})]
    # ... and returns a column carrying the "numeric" semantic tag.
    return_type = ColumnSchema(semantic_tags={"numeric"})

    def get_function(self):
        """Return the callable applied to the input column (NumPy's sqrt)."""
        return np.sqrt

In [None]:
from featuretools.primitives import AggregationPrimitive
from woodwork.column_schema import ColumnSchema

class Mean(AggregationPrimitive):
    """Custom aggregation primitive: arithmetic mean of a numeric column."""

    # Name under which the primitive is identified.
    # NOTE(review): featuretools presumably ships a built-in primitive also named
    # 'mean'; confirm the two do not clash when primitives are looked up by name.
    name = 'mean'
    # Takes one column carrying the "numeric" semantic tag ...
    input_types = [ColumnSchema(semantic_tags={"numeric"})]
    # ... and returns a value carrying the "numeric" semantic tag.
    return_type = ColumnSchema(semantic_tags={"numeric"})

    def get_function(self):
        """Return the callable applied to the input column (NumPy's mean)."""
        return np.mean