In [48]:
from itertools import count

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# числовые
from sklearn.preprocessing import MinMaxScaler # Импортируем нормализацию от scikit-learn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer

# категориальные
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer

from sklearn.base import BaseEstimator, TransformerMixin

import warnings
warnings.filterwarnings('ignore')

DF = pd.read_csv('https://raw.githubusercontent.com/dayekb/mpti_ml/main/data/cars_moldova_no_dup_no_outliers.csv', delimiter=',')
# DF.info()

# посчитаем число числовых и категориальных колонок
cat_column = []
num_columns = []

for column_name in DF.columns:
    if (DF[column_name].dtype == object):
        cat_column += [column_name]
    else:
        num_columns += [column_name]

# print("Категориальные колонки:\t " , cat_column, " их количество: " , len(cat_column))
# print("Числовые колонки:\t", num_columns, " их количество: " , len(num_columns))

# fig, axs = plt.subplots(1,4,figsize=(20,4))

# DF.hist(column=num_columns, ax=axs)

# DF.describe()

class QuantileReplacer(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.05):
        self.threshold = threshold
        self.quantiles = {}

    def fit(self, X, y=None):
        for col in X.select_dtypes(include='number'):
            low_quantile = X[col].quantile(self.threshold)
            high_quantile = X[col].quantile(1 - self.threshold)
            self.quantiles[col] = (low_quantile, high_quantile)
        return self

    def transform(self, X):
        X_copy = X.copy()
        for col in X.select_dtypes(include='number'):
            low_quantile, high_quantile = self.quantiles[col]
            rare_mask = ((X[col] < low_quantile) | (X[col] > high_quantile))
            if rare_mask.any():
                rare_values = X_copy.loc[rare_mask, col]
                replace_value = np.mean([low_quantile, high_quantile])
                if rare_values.mean() > replace_value:
                    X_copy.loc[rare_mask, col] = high_quantile
                else:
                    X_copy.loc[rare_mask, col] = low_quantile
        return X_copy
    
qr = QuantileReplacer(threshold=0.01)
qr.fit(DF[num_columns])
DF_num_rare = qr.transform(DF[num_columns])

# DF_num_rare.describe()

# обработка категориальных признаков

ordinal = OrdinalEncoder()
ordinal.fit(DF[cat_column])

# прменяем трансформацию, пакуем результат в Dataframe
Ordinal_encoded = ordinal.transform(DF[cat_column])
df_ordinal = pd.DataFrame(Ordinal_encoded, columns = cat_column)

#One-hot кодирование многозначных признаков
ohe = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
ohe.fit(DF[cat_column])

ohe_feat = ohe.transform(DF[cat_column])
df_ohe = pd.DataFrame(ohe_feat, columns = ohe.get_feature_names_out()).astype(int)

class RareGrouper(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.05, other_value='Other'):
        self.threshold = threshold
        self.other_value = other_value
        self.freq_dict = {}

    def fit(self, X, y=None):
        for col in X.select_dtypes(include=['object']):
            freq = X[col].value_counts(normalize=True)
            self.freq_dict[col] = freq[freq >= self.threshold].index.tolist()
        return self

    def transform(self, X, y=None):
        X_copy = X.copy()
        for col in X.select_dtypes(include=['object']):
            X_copy[col] = X_copy[col].apply(lambda x: x if x in self.freq_dict[col] else self.other_value)
        return X_copy

rg = RareGrouper(threshold=0.001, other_value='rare')
DF_cat_rare = rg.fit_transform(DF[cat_column])

# Pipeline

num_pipe_distance = Pipeline([
    ('QuantReplace', QuantileReplacer(threshold=0.01, )),
    ('scaler', StandardScaler())
])

num_distance = ['Distance']

num_pipe_engine = Pipeline([
    ('scaler', StandardScaler())
])

num_engine = ['Engine_capacity(cm3)']

num_pipe_year_price = Pipeline([
    ('power', PowerTransformer())
])

num_year_price = ['Year','Price(euro)']

cat_pipe_transmission = Pipeline([
    ('encoder', OrdinalEncoder())

])

cat_transmission = ['Transmission']

cat_pipe_style_fuel = Pipeline([
    ('encoder', OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False))

])

cat_style_fuel = ['Style', 'Fuel_type']

cat_pipe_make = Pipeline([
    ('replace_rare', RareGrouper(threshold=0.001, other_value='Other')),
    ('encoder', OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False))
])

cat_make = ['Make']

cat_pipe_model = Pipeline([
    ('replace_rare', RareGrouper(threshold=0.0001, other_value='Other')),
    ('encoder', OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False))
])

cat_model = ['Model']

#Объединяем в единый Трансформер Колонок Данных
preprocessors = ColumnTransformer(transformers=[
    ('num_distance', num_pipe_distance, num_distance),
    ('num_engine', num_pipe_engine, num_engine),
    ('num_year_price', num_pipe_year_price, num_year_price),
    ('cat_transmission', cat_pipe_transmission, cat_transmission),
    ('cat_style_fuel', cat_pipe_style_fuel, cat_style_fuel),
    ('cat_make', cat_pipe_make, cat_make),
    ('cat_model', cat_pipe_model, cat_model),
])
# обучаем
preprocessors.fit(DF)

cat_style_fuel_names = preprocessors.transformers_[4][1]['encoder'].get_feature_names_out(cat_style_fuel)
cat_make_names =  preprocessors.transformers_[5][1]['encoder'].get_feature_names_out(cat_make)
cat_model_names =  preprocessors.transformers_[6][1]['encoder'].get_feature_names_out(cat_model)

# объединяем названия колонок в один список (важен порядок как в ColumnTransformer)
columns = np.hstack([num_distance,
                    num_engine,
                    num_year_price,
                    cat_transmission,
                    cat_style_fuel_names,
                    cat_make_names,
                    cat_model_names])


DF_transformed = preprocessors.transform(DF)
pd.DataFrame(DF_transformed, columns=columns)


Unnamed: 0,Distance,Engine_capacity(cm3),Year,Price(euro),Transmission,Style_Cabriolet,Style_Combi,Style_Coupe,Style_Crossover,Style_Hatchback,...,Model_ZOE,Model_Zafira,Model_i10,Model_i20,Model_i3,Model_i30,Model_i40,Model_iQ,Model_ix20,Model_ix35
0,0.132322,-0.055256,0.337261,0.180634,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.522452,-0.535081,0.822301,0.296374,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.795275,-0.535081,0.495283,-0.013909,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.186886,-0.375140,-0.401125,-0.532149,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.278176,0.584512,-1.183001,-0.702638,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32480,-1.035358,-0.055256,1.164504,0.655393,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32481,0.459709,-0.055256,-0.934177,-0.456188,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32482,-1.024445,-0.535081,0.991462,0.062383,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32483,2.042079,0.264628,-0.401125,-0.558533,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
