In [134]:
import pandas as pd
import numpy as np
import re
from dirty_cat import MinHashEncoder
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import json
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

In [135]:
#%%script false
main_data = pd.read_csv('data/data.csv', sep = ',')

In [136]:
data = main_data.copy()
data.shape

(377184, 18)

In [137]:

class PrivatePoolTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X: pd.DataFrame, y=None):
        return self

    def transform(self, X: pd.DataFrame):
        X['PrivatePool'] = X['private pool'].combine_first(X['PrivatePool'])
        X.drop('private pool', axis=1, inplace=True)
        X['PrivatePool'] = X['PrivatePool'].map({'yes': True, 'Yes': True}).fillna(False).astype(bool)
        return X


In [138]:
class FireplaceTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X: pd.DataFrame, y=None):
        return self
    
    def transform(self, X: pd.DataFrame):
        X['count_fireplace'] = X['fireplace'].apply(lambda x: self._handle_missing(x) if isinstance(x, str) else np.nan)
        X['count_fireplace'] = X['count_fireplace'].fillna(X['fireplace'].apply(lambda x: self._extract_number(x) if isinstance(x, str) else np.nan))
        X['count_fireplace'] = X['count_fireplace'].fillna(X['fireplace'].apply(lambda x: self._count_rooms(x) if isinstance(x, str) else np.nan))
        X['count_fireplace'] = X['count_fireplace'].fillna(0)

        X['fireplace_type'] = X['fireplace'].apply(self._identify_fireplace_type)
        X.drop('fireplace', axis=1, inplace=True)
        return X


    def _extract_number(self, value: str):
        value = value.lower()
        text_to_number = {
            'one': 1, 'two': 2, 'three': 3, 'four': 4,
            'five': 5, 'six': 6, 'seven': 7, 'eight': 8,
            'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12
        }
        for text, num in text_to_number.items():
            if text in value:
                return num
        match = re.search(r'(\d+)(?=\+)?', value)
        if match:
            return int(match.group(1))
        elif 'yes' in value.lower():
            return 1
        else:
            return np.nan
    
    def _count_rooms(self, value: str):
        room_types = ['Family', 'Bedroom', 'Living', 'Den', 'Kitchen', 'Dining', 'FAMILYRM',
                      'Library', 'Study', 'Playroom', 'Recreation', 'Sitting', 'Guest', 'Office']
        count = sum(value.count(room) for room in room_types)
        return count if count > 0 else 1
    
    def _handle_missing(self, value: str):
        if isinstance(value, str) and value in ['N/K', 'No', 'Not Applicable', 'None', 'Non-Functional', 'Inoperative', 'Extra Closets']:
            return 0

    def _identify_fireplace_type(self, value: str):
        if isinstance(value, str):
            value = value.lower()
            if 'gas' in value:
                return 1
            if 'wood' in value or 'burning' in value:
                return 2
            if 'electric' in value:
                return 3
        return 0



In [139]:
class StatusToNumberTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.status_groups = {
            'For Sale/Active': [
                'for sale', 'active', 'new construction', 'new', 'active under contract', 
                'active with offer', 'active/contingent', 'active - auction', 'active with contract', 
                'auction active', 'auction', 'back on market', 'listing extended', 'active option', 
                'price change', 'a active', 'temporary active'
            ],
            'Pending/Under Contract': [
                'pending', 'under contract', 'contingent', 'pending continue to show', 'p',
                'option pending', 'pending taking backups', 'under contract show', 'under contract showing', 
                'under contract backups', 'pending backup wanted', 'pending take backups', 
                'pending continue show', 'pending inspection', 'due diligence period', 'p pending sale',
                'active with contingencies', 'pending ab', 'contingent finance and inspection',
                'contingent show', 'contingent take backup', 'pf', 'under contract showing', 
                'c', 'ct', 'pending - continue to show', 'pending (do not show)',
                'pending - backup offer requested', 'pending w/backup wanted', 'option contract',
                'pending - taking backups', 'offer pending signature', 'pending fe',
                'pending w/insp finance', 'uc continue to show', 'contingency contract',
                'under contract - show', 'pending offer approval', 'contingent escape',
                'pending with contingencies', 'contingent - financing', 'contract contingent on buyer sale',
                'pending, continue to show', 'pending bring backup', 'pending w/ escape clause',
                'pending - continue to show', 'pending sh', 'pending w/ cont.', 
                'pending continue to show   financing', 'pending inspection', 
                'under contract taking back up offers', 'backup contract', 'backup',
                'contract p', 'contingency 48 hr (+/ )', 'conting accpt backups',
                'contingent release', 'contingent lien holder release', 'contingent - sale of home',
                'pending sale', 'pending - continue to show', 'contingent foreclosure','under contract   showing'
            ],
            'Foreclosure/Auction': [
                'foreclosure', 'pre-foreclosure', 'foreclosed', 'auction', 'pre-foreclosure / auction', 
                'auction - active', ' / auction', 'pending auction', 'auction - active', 'foreclosure auction'
            ],
        }

    def fit(self, X: pd.DataFrame, y=None):
        return self
    
    def transform(self, X: pd.DataFrame):
        X['status'] = X['status'].apply(self._status_to_number)
        return X

    def _status_to_number(self, value: str):
        if pd.isna(value):
            return 0 # Создадим отдельный статус для пропущенных значений
        
        value = value.lower()
        for index, (key, statuses) in enumerate(self.status_groups.items()):
            statuses_lower = [status.lower() for status in statuses]
            
            if value in statuses_lower:
                return index + 1
        
        return 4  # Возвращаем 4, если статус не найден
    
   


In [140]:
class StoriesTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.word_to_num={
        "one half":0.5, "one": 1, "two": 2, "three": 3, "four": 4, 
        "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
        "duplex":2, "ground":1, "triplex":3, 'tri':3
    }
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X: pd.DataFrame):
        X['stories'] = X['stories'].apply(self._process_stories)
        return X

    def _process_stories(self, story: str):
        if pd.isna(story):
            return np.nan  # Создадим отдельный статус для пропущенных значений


        # Step 2: Process known words
        stories = 0

        # Convert the story to lower case for consistency
        story = story.lower()

        # Replace known words with numbers
        for word, num in self.word_to_num.items():
            story = story.replace(word, str(num))

        # Extract numbers after replacing words
        numbers = re.findall(r'\d+\.?\d*', story)
        if numbers:
            stories += sum([float(num) for num in numbers]) / len(numbers)

        # Add 0.5 if "Basement" or related term is mentioned
        story = story.replace('-', ' ')
        if "basement" in story or "tri level" in story:
            stories += 0.5

        # Mark all unprocessed stories as "Unknown"
        return stories if stories > 0 else np.nan


In [141]:

class PropertyTypeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.replacements = {
            r'single[\s_-]*family[\s_-]*home|single[\s_-]*family[\s_-]*': 'single_family_home',
            r'condo|condo/townhome.*|coop.*': 'condo',
            r'townhouse|row home|townhome': 'townhouse',
            r'multi[\s_-]*family': 'multi_family_home',
            r'lot/land|land|farms/ranches': 'land',
            r'mobile': 'mobile_home',
            r'apartment': 'apartment',
            r'ranch': 'ranch',
            r'contemporary|modern|contemporary/modern': 'contemporary_modern',
            r'colonial': 'colonial',
            r'traditional': 'traditional',
            r'1 story|one story': 'one_story',
            r'2 stories|two story': 'two_story',
            r'other': 'other'
        }

        self.cat_type_encoder = joblib.load('pkl/property_type_encoder.pkl')
    
    def fit(self, X: pd.DataFrame, y=None):
        # В методе fit ничего не делаем, так как энкодер уже обучен
        return self
    
    def transform(self, X: pd.DataFrame):
        X['propertyType'] = X['propertyType'].str.lower().str.strip()
        X['propertyType'] = X['propertyType'].apply(self.clean_property_type)
        
        # Применение замен
        for pattern, replacement in self.replacements.items():
            X['propertyType'] = X['propertyType'].replace(to_replace=pattern, value=replacement, regex=True)
        
        X['propertyType'] = self.cat_type_encoder.transform(X[['propertyType']])
        return X
    
    def clean_property_type(self, pt):
        if pd.isna(pt):
            return 'missing'  # Создадим отдельный статус для пропущенных значений

        pt = re.sub(r'\b(\w+)(_\1\b)+', r'\1', pt)
        pt = re.sub(r'\b(\w+)(/\1\b)+', r'\1', pt)
        
        pt = re.sub(r'\s+/|/\s+', '/', pt)
        pt = re.sub(r'\s+', '_', pt)
        pt = re.sub(r'(_{2,})', '_', pt)
        pt = re.sub(r'(_home)+', '_home', pt)
        pt = re.sub(r'(_story)+', '_story', pt)
        pt = re.sub(r'\bhome_home\b', 'home', pt)
        
        return pt

In [142]:

class HomeFeaturesExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns = ['Cooling', 'Heating', 'Parking', 'Price/sqft', 'Remodeled year', 'Year built', 'lotsize']

    def fit(self, X: pd.DataFrame, y=None):
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        if 'homeFacts' not in X.columns:
            raise ValueError("DataFrame must contain 'homeFacts' column")

        # Применение функции для извлечения признаков
        new_columns = X['homeFacts'].apply(self.extract_home_features)
        new_df = pd.DataFrame(new_columns.tolist())
        X.drop('homeFacts', inplace=True, axis=1)
        
        return X.join(new_df)

    def extract_home_features(self, home_facts_str: str):
        if pd.isna(home_facts_str):
            return {col: np.nan for col in self.columns}
        
        try:
            home_facts_str = home_facts_str.replace("'", '"')
            home_facts_str = home_facts_str.replace('"closet"-Electric', 'closet-Electric')
            home_facts_str = re.sub(r'(?<!")\bNone\b(?!")', '"None"', home_facts_str)
            home_facts_str = re.sub(r'(?<=[a-z-A-Z])"(?=[a-zA-Z])', '/', home_facts_str)

            home_facts_dict = json.loads(home_facts_str)
            facts = home_facts_dict.get('atAGlanceFacts', [])
            features = {fact['factLabel']: fact['factValue'] for fact in facts}
            return {col: features.get(col, np.nan) for col in self.columns}
        
        except json.JSONDecodeError as e:
            print("Ошибка декодирования JSON:")
            print(f"Сообщение об ошибке: {e}")
            print(f"Проблемная строка JSON: {home_facts_str}")
            return {col: np.nan for col in self.columns}

In [143]:

class SchoolFeaturesExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns = ['Average Rating', 'Highest Rating', 'Lowest Rating']

    def fit(self, X: pd.DataFrame, y=None):
        # Никаких операций обучения не требуется
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        # Применение функции для извлечения признаков
        new_columns = X['schools'].apply(self.extract_school_features)
        new_df = pd.DataFrame(new_columns.tolist(), columns=self.columns)
        X.drop('schools', inplace=True, axis=1)


        return X.join(new_df)

    def extract_school_features(self, school_facts_str: str):
        if pd.isna(school_facts_str):
            return {
                'Average Rating': np.nan
          
            }

        try:
        
            school_facts_str = school_facts_str.replace("'", '"')
            school_facts_str = re.sub(r'(?<!")\bNone\b(?!")', '"None"', school_facts_str)
            school_facts_str = re.sub(r'NR', 'None', school_facts_str)
            school_facts_str = re.sub(r'NA', 'None', school_facts_str)
            school_facts_str = re.sub(r'(?<=[a-z-A-Z])"(?=[a-zA-Z])', '/', school_facts_str)
            school_facts_str = re.sub(r', "name": \[[^\]]*\]', '', school_facts_str)

            features = json.loads(school_facts_str)[0]
            #data = features.get('data', []) 

            try: #отвечает за рейтинг школ
                ratings = []
                for rating in features['rating']:
                    if 'None' not in rating:
                        rating_value = rating.split('/')[0]  # Берем значение до '/'
                        ratings.append(int(rating_value))
            except:
                ratings=[]

    

            return {
                'Average Rating': np.mean(ratings) if ratings else np.nan
            }

        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}")
            print(f'Problematic string: {school_facts_str}')
            return {
                'Average Rating': np.nan
            }
        except Exception as err: 
            print(err)

In [144]:
class fill_sqft():
    def __init__(self, cols):
        self.cols_to_check = cols


    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return self.fill_sqft_from_columns(X, self.cols_to_check)

    def extract_sqft(self, text):
        if pd.isna(text):
            return np.nan
        text = str(text).replace(',', '')  # Удаляем запятые
        match = re.search(r'(\d+)\s?sqft', text)  # Ищем число, за которым идет 'sqft'
        if match:
            return float(match.group(1))  # Возвращаем найденное число как float
        return np.nan

    # Заполнение пропущенных значений в столбце sqft
    def fill_sqft_from_columns(self, X, cols_to_check):
        for col in cols_to_check:
            # Ищем информацию о площади в каждом указанном столбце
            X['sqft'] = X.apply(
                lambda row: self.extract_sqft(row[col]) if pd.isna(row['sqft']) else row['sqft'], 
                axis=1
            )
        return X

In [145]:
class NumericExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X: pd.DataFrame, y=None):
        # Никаких операций обучения не требуется
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        # Проверка наличия всех колонок в DataFrame
        for col in self.columns:
            if col not in X.columns:
                raise ValueError(f"DataFrame must contain column '{col}'")
            else:
                X[col] = X[col].apply(self.give_number)
        
       
        return X

    def give_number(self, pt):
        if pd.isna(pt):
            return np.nan
        pt = str(pt).replace(',', '')
        match = re.search(r'\d+\.?\d*', str(pt)) 
        if match:
            return float(match.group())  
        else:
            return np.nan

In [146]:
class ParkingPriceTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X: pd.DataFrame, y=None):
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        # Преобразуем столбец 'Parking'
        X['Parking'] = X['Parking'].apply(self.extract_number_second)
        X['Parking'] = X['Parking'].fillna(0)

        # Преобразуем столбец 'Price/sqft'
        X['Price/sqft'] = X['Price/sqft'].apply(self.extract_number_second).astype(float)

        return X

    @staticmethod
    def extract_number_second(value):
        if pd.isna(value):
            return np.nan  # Для пропущенных значений

        # Удаление пробелов и приведение к нижнему регистру
        value = value.strip().lower()

        # Проверка на отсутствие данных
        if 'no data' in value or 'no parking' in value:
            return 0

        # Поиск числа в строке
        match = re.search(r'\b(\d+)\s*spaces?\b', value)
        if match:
            return int(match.group(1))

        # Поиск явного числа в строке
        match = re.search(r'\b(\d+)\b', value)
        if match:
            return int(match.group(1))

        # Если не удалось найти данные
        return None

In [147]:
class CustomImputerOutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, quantile_low=0.10, quantile_high=0.90, iqr_multiplier=1.5):
        self.quantile_low = quantile_low
        self.quantile_high = quantile_high
        self.iqr_multiplier = iqr_multiplier

    def fit(self, X, y=None):
  
        # Сохраняем медиану и вычисляем границы для выбросов
        self.median_ = X['stories'].median()
        #new_X = X['stories'].fillna(self.median_)
        time_data=X['stories'].fillna(self.median_ )

        Q1 = time_data.quantile(self.quantile_low)
        Q3 = time_data.quantile(self.quantile_high)
        self.IQR_ = Q3 - Q1
        self.lower_bound_ = Q1 - self.iqr_multiplier * self.IQR_
        self.upper_bound_ = Q3 + self.iqr_multiplier * self.IQR_
        return self

    def transform(self, X):
        if 'stories' not in X.columns:
            raise ValueError("Column 'stories' not found in the input DataFrame")
        # Заполняем пропуски медианным значением
        X['stories'] = X['stories'].fillna(self.median_)
        # Удаляем выбросы
        X = X[(X['stories'] >= self.lower_bound_) & (X['stories'] <= self.upper_bound_)]
        #X.drop('Remodeled year', axis=1, inplace=True)
        return X

In [148]:
class ParkingMinHashEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=5, fill_value='No Data'):
        self.n_components = n_components
        self.fill_value = fill_value
        self.minhash_encoder = MinHashEncoder(n_components=self.n_components)

    def fit(self, X: pd.DataFrame, y=None):
        # Заполнение пропусков и обучение MinHashEncoder
        X_filled = X['Parking'].fillna(self.fill_value)
        self.minhash_encoder.fit(X_filled.values.reshape(-1, 1))
        return self

    def transform(self, X: pd.DataFrame):
        # Заполнение пропусков и применение MinHashEncoder
        X_filled = X['Parking'].fillna(self.fill_value)
        encoded_parking = self.minhash_encoder.transform(X_filled.values.reshape(-1, 1))

        # Преобразование результата в DataFrame
        encoded_parking_df = pd.DataFrame(
            encoded_parking, 
            columns=[f'Parking_{i}' for i in range(encoded_parking.shape[1])]
        )

        # Сброс индекса у обоих DataFrame
        X.reset_index(drop=True, inplace=True)
        encoded_parking_df.reset_index(drop=True, inplace=True)

        # Удаление старого столбца Parking и объединение с новым
        X_encoded = pd.concat([X, encoded_parking_df], axis=1)
        X_encoded.drop('Parking', axis=1, inplace=True)
        return X_encoded

In [149]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
import re
from dirty_cat import MinHashEncoder

class CoolingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=3):
        self.n_components = n_components
        self.encoder = MinHashEncoder(n_components=self.n_components)

    def fit(self, X: pd.DataFrame, y=None):
        # Обучаем только MinHashEncoder на столбце 'Cooling'
        X['Cooling_Categorized'] = X['Cooling'].apply(self.process_cooling)
        self.encoder.fit(X[['Cooling_Categorized']])
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        # Преобразуем столбец 'Cooling'
        X['Cooling_Categorized'] = X['Cooling'].apply(self.process_cooling)

        # Заполняем пропущенные значения модой
        X['Cooling_Categorized'] = X['Cooling_Categorized'].fillna(X['Cooling_Categorized'].mode()[0])

        # Применяем MinHashEncoder
        encoded = self.encoder.transform(X[['Cooling_Categorized']])

        # Создаем DataFrame с закодированными признаками
        encoded_df = pd.DataFrame(encoded, columns=[f'Cooling_{i}' for i in range(encoded.shape[1])])

        # Объединяем закодированные признаки с исходным DataFrame
        X.reset_index(drop=True, inplace=True)
        X = pd.concat([X, encoded_df], axis=1)

        # Удаляем исходные столбцы
        X.drop(['Cooling', 'Cooling_Categorized'], axis=1, inplace=True)

        return X

    @staticmethod
    def process_cooling(cooling_str):
        """
        Функция для обработки столбца 'Cooling', которая классифицирует варианты систем охлаждения.
        """
        if pd.isna(cooling_str):
            return np.nan  # Для пропущенных значений

        cooling_str = cooling_str.lower()  # Приведение к нижнему регистру
        
        # Категории и ключевые слова для каждой категории
        if re.search(r'central|zoned', cooling_str):
            return 'Central A/C'
        elif re.search(r'window|wall', cooling_str):
            return 'Window/Wall Unit'
        elif re.search(r'heat pump', cooling_str):
            return 'Heat Pump'
        elif re.search(r'no a/c|no cooling', cooling_str):
            return 'No A/C'
        elif re.search(r'fan', cooling_str):
            return 'Fan'
        elif re.search(r'geothermal', cooling_str):
            return 'Geothermal'
        elif re.search(r'evaporative', cooling_str):
            return 'Evaporative Cooler'
        elif re.search(r'solar', cooling_str):
            return 'Solar A/C'
        else:
            return 'Other'


In [150]:

class HeatingFeaturesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=2):
        self.n_components = n_components
        self.encoder = MinHashEncoder(n_components=self.n_components)
    
    def fit(self, X: pd.DataFrame, y=None):
        # Подготовка данных и обучение энкодера
        if 'Heating' in X.columns:
            X['Heating_Categorized'] = X['Heating'].apply(self.categorize_heating)
            self.encoder.fit(X[['Heating_Categorized']])
        return self
    
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        if 'Heating' in X.columns:
            # Преобразование значений
            X['Heating_Categorized'] = X['Heating'].apply(self.categorize_heating)
            
            # Кодирование категорий
            encoded = self.encoder.transform(X[['Heating_Categorized']])
            encoded_df = pd.DataFrame(encoded, columns=[f'Heating_{i}' for i in range(encoded.shape[1])])
            
            # Объединение с исходным DataFrame
            X = X.drop(['Heating', 'Heating_Categorized'], axis=1, errors='ignore')
            X = pd.concat([X, encoded_df], axis=1)
        return X

    @staticmethod
    def categorize_heating(heating):
        if isinstance(heating, str):
            heating = heating.lower()
            # Газовое отопление
            if any(x in heating for x in ['gas', 'natural gas', 'propane']):
                return 'Gas'
            # Электрическое отопление
            elif any(x in heating for x in ['electric', 'electricity']):
                return 'Electric'
            # Тепловой насос
            elif 'heat pump' in heating:
                return 'Heat Pump'
            # Печное отопление
            elif any(x in heating for x in ['stove', 'wood', 'pellet']):
                return 'Stove'
            # Радиационное отопление
            elif 'radiant' in heating or 'radiator' in heating:
                return 'Radiant'
            # Центральное отопление
            elif 'central' in heating:
                return 'Central'
            # Другие типы
            elif 'solar' in heating:
                return 'Solar'
            elif 'fireplace' in heating:
                return 'Fireplace'
            elif 'baseboard' in heating:
                return 'Baseboard'
            elif 'steam' in heating:
                return 'Steam'
            elif 'wall' in heating:
                return 'Wall'
            elif 'forced air' in heating:
                return 'Forced Air'
            else:
                return 'Other'
        return np.nan

In [151]:

class MinHashFeaturesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns, n_components=2):
        self.columns = columns
        self.n_components = n_components

    def fit(self, X: pd.DataFrame, y=None):
        self.encoders_ = {}
        for col in self.columns:
            encoder = MinHashEncoder(n_components=self.n_components)
            self.encoders_[col] = encoder.fit(X[[col]])
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        X = X.copy()  # Не изменяем оригинальный DataFrame
        for col in self.columns:
            encoder = self.encoders_.get(col)
            if encoder:
                encoded = encoder.transform(X[[col]])
                encoded_df = pd.DataFrame(encoded, columns=[f'{col}_{i}' for i in range(encoded.shape[1])])
                X = pd.concat([X, encoded_df], axis=1)
        X = X.drop(self.columns, axis=1, errors='ignore')
        return X

In [152]:
class YearFeaturesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X: pd.DataFrame, y=None):
        # Никаких операций обучения не требуется
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        # Создаем копию данных, чтобы избежать изменений оригинала
        X = X.copy()
        
        year_mode = X['Year built'].mode()[0]
        X['Year built'] = X['Year built'].fillna(year_mode)
        
        
        X = X.loc[(X['Year built'] >= 1850) & (X['Year built'] <= 2024)].copy()
        X.reset_index(drop=True, inplace=True)

        # Добавляем новые признаки
        X['is_remodeled'] = (X['Remodeled year'] > X['Year built']).fillna(False).astype(bool)
        X['property_age'] = 2024 - X['Year built']
        
        # Удаляем старые столбцы
        X.drop(['Year built', 'Remodeled year'], axis=1, inplace=True)

        return X

In [153]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
import xgboost as xgb
import numpy as np
import pandas as pd


class InplaceColTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, features, target):
        self.features = features
        self.target = target

    def fit(self, X: pd.DataFrame, y=None):
        # Фиттинг модели для заполнения пропущенных значений
        self.model = self._fit_xgboost(X, self.features, self.target)
        self._baseline_model(X, self.features, self.target)
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        # Заполнение пропущенных значений
        X = self._fillna_optimized(X, self.model, self.features, self.target)
        return X

    def _fit_xgboost(self, data: pd.DataFrame, columns: list, target: str):
        df = data[columns + [target]]
        X_missing = df[df[target].isnull()]
        X_missing.reset_index(drop=True, inplace=True)

        Xy = df.dropna(subset=[target])
        Xy.reset_index(drop=True, inplace=True)

        X = Xy[columns]
        y = Xy[target]

        param_grid = {
            'n_estimators': [100, 150],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0]
        }

        model = xgb.XGBRegressor(random_state=42)

        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv=5,
            scoring='neg_mean_absolute_error',
            n_jobs=-1,
            verbose=1
        )

        grid_search.fit(X, y)
        return grid_search.best_estimator_

    def _baseline_model(self, data: pd.DataFrame, features: list, target: str):
        data = data.dropna(subset=[target])
        X = data[features]
        y = data[target]

        baseline = DummyRegressor(strategy='mean')
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        mae_scores = cross_val_score(baseline, X, y, cv=kf, scoring='neg_mean_absolute_error')
        mae_mean = -mae_scores.mean()

        print(f"Baseline MAE (Mean Absolute Error): {mae_mean}", end='\n\n')

    def _fillna_optimized(self, data: pd.DataFrame, model, features: list, target: str) -> pd.DataFrame:
        mask = data[target].isnull()
        features_to_predict = data.loc[mask, features]
        predicted_values = model.predict(features_to_predict)
        data.loc[mask, target] = predicted_values
        return data

class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X: pd.DataFrame, y=None):
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        X = X.copy()
        X = X[(X['baths'] < 10) | (X['baths'].isnull())]
        X = X[(X['target'] <= 40e6) & (X['target'] > 1e3)]
        X = X[(X['beds'] < 20) | (X['beds'].isnull())]
        X = X[(X['Price/sqft'] < 4e3) | (X['Price/sqft'].isnull())]
        return X
    

def round_to_nearest_half(value):
    if pd.isna(value):
            return np.nan
    return round(value * 2) / 2




In [154]:
class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X: pd.DataFrame, y=None):
        # Никаких операций обучения не требуется
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        # Создаем копию данных, чтобы избежать изменений оригинала
        X = X.copy()
        
        # Удаление аномальных данных
        X = X[(X['baths'] < 10) | (X['baths'].isnull())]
        X = X[(X['target'] <= 40e6) & (X['target'] > 1e3)]
        X = X[(X['beds'] < 20) | (X['beds'].isnull())]
        X = X[(X['Price/sqft'] < 4e3) | (X['Price/sqft'].isnull())]

        Q1 = X['sqft'].quantile(0.25)
        Q3 = X['sqft'].quantile(0.75)
        IQR = Q3 - Q1

        # Определение границ
        lower_bound = 300
        upper_bound = Q3 + 1.5 * IQR

        # Фильтрация данных
        X = X[(X['sqft'].isna()) |(X['sqft'] >= lower_bound) & (X['sqft'] <= upper_bound)]
        X.reset_index(drop=True, inplace=True)

        return X


In [155]:

class CustomFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        # Этот метод может быть оставлен пустым, так как мы не обучаем модель здесь
        return self
    
    def transform(self, X, y=None):
        # Убедитесь, что X - это DataFrame
        X = X.copy()
        
        # Обработка признака 'count_fireplace'
        X['has_fireplace'] = X['count_fireplace'].apply(lambda x: 1 if x > 0 else 0)
        X.drop(['count_fireplace'], axis=1, inplace=True)
        
        # Обработка признака 'Parking'
        X['has_parking'] = X['Parking'].apply(lambda x: 1 if x > 0 else 0)
        X.drop('Parking', axis=1, inplace=True)
        
        # Обработка признака 'lowest_price'
        X['lowest_price'] = X['sqft'] * X['Price/sqft']
        X = X[(X['lowest_price'] > 5)]
        X.loc[X['lowest_price'] > 5e5, 'lowest_price'] = None
        
        # Обработка пропущенных значений в 'baths'
        X.loc[X['baths'] == 0, 'baths'] = None
        
        # Создание новых признаков
        X['sqft/baths'] = X['sqft'] / X['baths']
        X['sum_baths_beds'] = X['beds'] + X['baths']
        
        # Удаление старых признаков
        X.drop(['beds', 'baths'], axis=1, inplace=True)
        
        return X


In [156]:
class Other(BaseEstimator, TransformerMixin):
    def fit(self, X: pd.DataFrame, y=None):
        return self
    
    def transform(self, X: pd.DataFrame):
        X.drop(['lotsize', 'stories'], axis=1, inplace=True)
        return X

уходят в drop:  
Parking  
Cooling   
Heating  
lotsize  
Price/sqft  

In [157]:
r = data.copy()

In [158]:
data_pipeline = Pipeline([
    ('private_pool', PrivatePoolTransformer()), #корректно
    ('fireplace_transformer', FireplaceTransformer()), #корректно 
    ('status_transformer', StatusToNumberTransformer()), #коректно 
    ('stories_transform', StoriesTransform()), #коректно 
    ('type_transform', PropertyTypeTransformer()), #коректно 
    ('home_features', HomeFeaturesExtractor()), #корректно
    ('schools', SchoolFeaturesExtractor()),
    ('fillna_sqft', fill_sqft(['baths', 'beds'])),
    ('numbers', NumericExtractor(['baths','sqft', 'beds', 'target', 'lotsize','Year built','Remodeled year'])),
    ('parking_sqft', ParkingPriceTransformer()),
    ('stories', CustomImputerOutlierRemover()),
    ('cooling_cat', CoolingTransformer()),
    ('heating_cat', HeatingFeaturesTransformer()),
    ('other_cat',  MinHashFeaturesTransformer(columns=['city', 'street', 'state'], n_components=2)),
    ('year_features', YearFeaturesTransformer()), #d
    ('outlier_remover', OutlierRemover()),
    ('inplace_col_sqft', InplaceColTransformer(['baths', 'beds', 'PrivatePool', 'Average Rating'], 'sqft')),
    ('round_baths', FunctionTransformer(lambda X: X.assign(baths=X['baths'].apply(round_to_nearest_half)), validate=False)),
    ('inplace_col_baths', InplaceColTransformer(['sqft', 'beds', 'PrivatePool', 'Average Rating', 'property_age'], 'baths')),
    ('round_beds', FunctionTransformer(lambda X: X.assign(beds=X['beds'].apply(round_to_nearest_half)), validate=False)),
    ('inplace_col_beds', InplaceColTransformer(['sqft', 'baths', 'PrivatePool', 'Average Rating'], 'beds')),
    ('inplace_col_price', InplaceColTransformer(['propertyType', 'baths', 'state_0', 'state_1', 'city_0', 'city_1', 'Average Rating','property_age'], 'Price/sqft')),
    ('custom_features', CustomFeatureTransformer()),
    ('other', Other()),

])
data = r.copy()
data.drop_duplicates(inplace=True)
data.dropna(subset=['target', 'street', 'city'], inplace=True)
data.drop(['mls-id', 'MlsId'], axis=1, inplace=True)
data.reset_index(drop=True, inplace=True)

data_pipeline.fit(data)
pp = data_pipeline.transform(data.iloc[0:10])



  X['PrivatePool'] = X['PrivatePool'].map({'yes': True, 'Yes': True}).fillna(False).astype(bool)


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Baseline MAE (Mean Absolute Error): 697.6074497944612

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Baseline MAE (Mean Absolute Error): 0.765832995700708

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Baseline MAE (Mean Absolute Error): 0.8318803837545431

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Baseline MAE (Mean Absolute Error): 123.67270692656794



KeyError: 'private pool'