In [None]:
# Install the fastparquet engine that the pd.read_parquet call below requests.
%pip install fastparquet

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
# LabelEncoder's public home is sklearn.preprocessing; importing it from
# sklearn.calibration only works because that module happens to import it
# internally, which is not a supported API.
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [31]:
# Load the training pairs from the snappy-compressed parquet file.
df = pd.read_parquet(
    path="../data/train_final_dataset.snappy.parquet",
    engine="fastparquet",
)

In [32]:
# Missing image similarity is replaced with 0.
df['image_similarity'] = df['image_similarity'].fillna(0)

# Cast the location/region match flags to integers.
flag_cols = ['is_same_location', 'is_same_region']
df[flag_cols] = df[flag_cols].astype(int)

In [33]:
# Category / subcategory match flags: 1 when base and candidate names are equal, else 0.
df['same_category'] = df['base_category_name'].eq(df['cand_category_name']).astype(int)
df['same_subcategory'] = df['base_subcategory_name'].eq(df['cand_subcategory_name']).astype(int)

In [34]:
# Symmetric relative price difference: 2*|a - b| / (a + b); the 1e-6 term
# keeps the denominator non-zero when both prices are 0.
price_gap = (df['base_price'] - df['cand_price']).abs()
price_total = df['base_price'] + df['cand_price']
df['price_diff_pct'] = 2 * price_gap / (price_total + 1e-6)

# Signed difference in image counts (base minus candidate).
df['images_diff'] = df['base_count_images'].sub(df['cand_count_images'])

In [35]:
# Columns that get standardised (zero mean, unit variance) in place.
numeric_cols = [
    'base_price', 'cand_price', 'price_diff_pct',
    'base_count_images', 'cand_count_images', 'images_diff',
    'common_params_count', 'same_values_count',
]

# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split performed later in the notebook, so held-out rows contribute to the
# mean/std statistics. Consider fitting on the training portion only.
scaler = StandardScaler().fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

In [36]:
# Normalise param1: empty strings AND missing values (NaN/None) both map to the
# 'Не указано' ("not specified") placeholder. Without the fillna, two missing
# values would compare as unequal (NaN != NaN) and would be encoded as a
# separate class from the placeholder.
for col in ('base_param1', 'cand_param1'):
    df[col] = df[col].fillna('Не указано').replace('', 'Не указано')

# 1 when base and candidate carry the same param1 value, else 0.
df['same_param1'] = (df['base_param1'] == df['cand_param1']).astype(int)

# Fit a single encoder on the union of base and candidate values so both
# encoded columns share one integer code space.
le_param1 = LabelEncoder()
all_values = pd.concat([df['base_param1'], df['cand_param1']]).unique()
le_param1.fit(all_values)

df['base_param1_encoded'] = le_param1.transform(df['base_param1'])
df['cand_param1_encoded'] = le_param1.transform(df['cand_param1'])


In [37]:
# Normalise param2: empty strings AND missing values (NaN/None) both map to the
# 'Не указано' ("not specified") placeholder. Without the fillna, two missing
# values would compare as unequal (NaN != NaN) and would be encoded as a
# separate class from the placeholder.
for col in ('base_param2', 'cand_param2'):
    df[col] = df[col].fillna('Не указано').replace('', 'Не указано')

# 1 when base and candidate carry the same param2 value, else 0.
df['same_param2'] = (df['base_param2'] == df['cand_param2']).astype(int)

# Fit a single encoder on the union of base and candidate values so both
# encoded columns share one integer code space.
le_param2 = LabelEncoder()
all_values = pd.concat([df['base_param2'], df['cand_param2']]).unique()
le_param2.fit(all_values)

df['base_param2_encoded'] = le_param2.transform(df['base_param2'])
df['cand_param2_encoded'] = le_param2.transform(df['cand_param2'])

In [39]:
# Identifiers, raw text/JSON/image fields and the categorical source columns
# that the preceding cells have already turned into derived features.
columns_to_drop = [
    'base_item_id', 'cand_item_id', 'group_id', 'action_date',
    'base_title', 'cand_title', 'base_description', 'cand_description',
    'base_json_params', 'cand_json_params',
    'base_title_image', 'cand_title_image',
    'base_category_name', 'cand_category_name',
    'base_subcategory_name', 'cand_subcategory_name',
    'base_param1', 'cand_param1',
    'base_param2', 'cand_param2',
]

df = df.drop(columns=columns_to_drop)

# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500000 entries, 0 to 499999
Data columns (total 23 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   base_price              500000 non-null  float64
 1   cand_price              500000 non-null  float64
 2   base_count_images       500000 non-null  float64
 3   cand_count_images       500000 non-null  float64
 4   is_same_location        500000 non-null  int32  
 5   is_same_region          500000 non-null  int32  
 6   is_double               500000 non-null  int64  
 7   common_params_count     500000 non-null  float64
 8   same_values_count       500000 non-null  float64
 9   image_similarity        500000 non-null  float64
 10  basic_image_similarity  500000 non-null  float64
 11  title_similarity        500000 non-null  float64
 12  description_similarity  500000 non-null  float64
 13  same_category           500000 non-null  int32  
 14  same_subcategory        5

In [None]:
# Model inputs, grouped by theme. The order here is the column order of X.
features = [
    # prices
    'base_price',
    'cand_price',
    'price_diff_pct',
    # image counts
    'base_count_images',
    'cand_count_images',
    'images_diff',
    # similarity scores
    'title_similarity',
    'description_similarity',
    'image_similarity',
    'basic_image_similarity',
    # category / parameter overlap
    'same_category',
    'same_subcategory',
    'common_params_count',
    'same_values_count',
    # geography
    'is_same_location',
    'is_same_region',
    # encoded params
    'same_param1',
    'base_param1_encoded',
    'cand_param1_encoded',
    'same_param2',
    'base_param2_encoded',
    'cand_param2_encoded',
]

# TODO: class balancing still needs to be added.

X, y = df[features], df['is_double']

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)