In [1]:
import json
import re
import warnings  


import numpy as np
import pandas as pd


from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb


from dirty_cat import MinHashEncoder
from joblib import dump
import dill

# Ignore all warnings (installs a blanket 'ignore' filter for the whole session).
warnings.filterwarnings('ignore')

In [2]:
#%%script false
# Load the raw listings table; the path is relative to the notebook's working directory.
main_data = pd.read_csv(
    'data/data.csv',
    sep=',',
)

In [3]:
# Work on an independent copy so the raw frame loaded above stays untouched.
data = main_data.copy(deep=True)
data.shape

(377185, 18)

In [4]:
class drop_cols(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes columns not used downstream."""

    def fit(self, X: pd.DataFrame, y=None):
        """Learn nothing; present only to satisfy the transformer interface."""
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a new frame without the listing-id, zipcode, lotsize and stories columns.

        Raises:
            KeyError: if any of the columns to remove is absent from ``X``.
        """
        unused_columns = ['MlsId', 'mls-id', 'zipcode', 'lotsize', 'stories']
        return X.drop(columns=unused_columns)

In [5]:
class privatpool_features(BaseEstimator, TransformerMixin):
    """Merge the two raw pool columns into a single boolean ``PrivatePool`` flag."""

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer interface."""
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a copy of ``X`` with ``private pool`` folded into ``PrivatePool``.

        ``private pool`` takes precedence; ``PrivatePool`` only fills its gaps.
        The flag is True only for the exact values 'yes' / 'Yes'; anything else
        (including missing values) becomes False.
        """
        merged = X['private pool'].combine_first(X['PrivatePool'])
        has_pool = merged.isin(['yes', 'Yes'])

        result = X.drop(columns='private pool')
        result['PrivatePool'] = has_pool
        return result

In [6]:
class process_fireplace_features(BaseEstimator, TransformerMixin):
    """Derive ``count_fireplace`` and ``fireplace_type`` from the free-text ``fireplace`` column.

    ``count_fireplace`` is resolved per value in this order:
      1. non-string (e.g. missing) values            -> 0
      2. exact "no fireplace" markers                -> 0
      3. a number word, a digit run or the word yes  -> that number (yes -> 1)
      4. otherwise the number of room names mentioned (at least 1)

    ``fireplace_type`` is 1 (gas), 2 (wood / burning), 3 (electric) or 0 (unknown).
    """

    # Raw values that mean "no usable fireplace" (matched exactly, case-sensitive).
    _NO_FIREPLACE_VALUES = frozenset([
        'N/K', 'No', 'Not Applicable', 'None', 'Non-Functional', 'Inoperative', 'Extra Closets',
    ])

    # Number words recognised inside the description, checked in this order.
    _WORD_TO_NUMBER = {
        'one': 1, 'two': 2, 'three': 3, 'four': 4,
        'five': 5, 'six': 6, 'seven': 7, 'eight': 8,
        'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12,
    }

    # Room names counted (case-sensitively) when no explicit number is given.
    _ROOM_TYPES = (
        'Family', 'Bedroom', 'Living', 'Den', 'Kitchen', 'Dining', 'FAMILYRM',
        'Library', 'Study', 'Playroom', 'Recreation', 'Sitting', 'Guest', 'Office',
    )

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer interface."""
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a copy of ``X`` where ``fireplace`` is replaced by the two derived columns."""
        X = X.copy()

        # One pass per derived column instead of three chained apply/fillna passes.
        X['count_fireplace'] = X['fireplace'].apply(self._count_fireplaces).astype(float)
        X['fireplace_type'] = X['fireplace'].apply(self._identify_fireplace_type)
        X.drop('fireplace', axis=1, inplace=True)

        return X

    def _count_fireplaces(self, value):
        """Resolve the fireplace count for one raw value (see the class docstring)."""
        if not isinstance(value, str):
            return 0

        explicit_zero = self._handle_missing(value)
        if explicit_zero is not None:
            return explicit_zero

        number = self._extract_number(value)
        if not pd.isna(number):
            return number

        return self._count_rooms(value)

    def _extract_number(self, value: str):
        """Return the count spelled out or written in ``value``, or NaN if there is none.

        Number words are matched as whole words so that e.g. "kitchen" is not read
        as "ten" and "stone" is not read as "one".
        """
        value = value.lower()
        for word, num in self._WORD_TO_NUMBER.items():
            if re.search(rf'\b{word}\b', value):
                return num

        match = re.search(r'\d+', value)
        if match:
            return int(match.group())
        if 'yes' in value:
            return 1
        return np.nan

    def _count_rooms(self, value: str):
        """Count room-name mentions in ``value``; a description without any counts as 1."""
        count = sum(value.count(room) for room in self._ROOM_TYPES)
        return count if count > 0 else 1

    def _handle_missing(self, value: str):
        """Return 0 for an exact "no fireplace" marker, otherwise None."""
        if isinstance(value, str) and value in self._NO_FIREPLACE_VALUES:
            return 0
        return None

    def _identify_fireplace_type(self, value: str):
        """Map a description to 1 (gas), 2 (wood / burning), 3 (electric) or 0 (unknown)."""
        if isinstance(value, str):
            value = value.lower()
            if 'gas' in value:
                return 1
            if 'wood' in value or 'burning' in value:
                return 2
            if 'electric' in value:
                return 3
        return 0



In [7]:
class status_preprocessing(BaseEstimator, TransformerMixin):
    """Encode the free-text ``status`` column and add a ``for_rent`` flag.

    ``status`` becomes an integer code:
      * 0    - missing value
      * 1..n - 1-based position of the matching group in ``status_groups``
      * -999 - value not listed in any group

    ``for_rent`` is True when the raw status mentions "rent" or "purchase".
    """

    def __init__(self):
        # NOTE: 'auction' is listed both under 'For Sale/Active' and under
        # 'Foreclosure/Auction'; the first group in dict order wins, so it maps to 1.
        self.status_groups = {
            'For Sale/Active': [
                'for sale', 'active', 'new construction', 'new', 'active under contract', 
                'active with offer', 'active/contingent', 'active - auction', 'active with contract', 
                'auction active', 'auction', 'back on market', 'listing extended', 'active option', 
                'price change', 'a active', 'temporary active'
            ],
            'Pending/Under Contract': [
                'pending', 'under contract', 'contingent', 'pending continue to show', 'p',
                'option pending', 'pending taking backups', 'under contract show', 'under contract showing', 
                'under contract backups', 'pending backup wanted', 'pending take backups', 
                'pending continue show', 'pending inspection', 'due diligence period', 'p pending sale',
                'active with contingencies', 'pending ab', 'contingent finance and inspection',
                'contingent show', 'contingent take backup', 'pf', 'under contract showing', 
                'c', 'ct', 'pending - continue to show', 'pending (do not show)',
                'pending - backup offer requested', 'pending w/backup wanted', 'option contract',
                'pending - taking backups', 'offer pending signature', 'pending fe',
                'pending w/insp finance', 'uc continue to show', 'contingency contract',
                'under contract - show', 'pending offer approval', 'contingent escape',
                'pending with contingencies', 'contingent - financing', 'contract contingent on buyer sale',
                'pending, continue to show', 'pending bring backup', 'pending w/ escape clause',
                'pending - continue to show', 'pending sh', 'pending w/ cont.', 
                'pending continue to show   financing', 'pending inspection', 
                'under contract taking back up offers', 'backup contract', 'backup',
                'contract p', 'contingency 48 hr (+/ )', 'conting accpt backups',
                'contingent release', 'contingent lien holder release', 'contingent - sale of home',
                'pending sale', 'pending - continue to show', 'contingent foreclosure','under contract   showing'
            ],
            'Foreclosure/Auction': [
                'foreclosure', 'pre-foreclosure', 'foreclosed', 'auction', 'pre-foreclosure / auction', 
                'auction - active', ' / auction', 'pending auction', 'auction - active', 'foreclosure auction'
            ],
        }

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer interface."""
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a copy of ``X`` with ``for_rent`` added and ``status`` replaced by its code."""
        X = X.copy()

        lowered = X['status'].apply(lambda value: str(value).lower())
        X['for_rent'] = lowered.apply(lambda text: 'rent' in text or 'purchase' in text)

        # Build the lookup once per call instead of re-lowering every list for every row.
        lookup = self._build_status_lookup()
        X['status'] = X['status'].apply(lambda value: self._lookup_status(value, lookup))
        return X

    def _build_status_lookup(self):
        """Flatten ``status_groups`` into ``{lower-cased status: 1-based group code}``.

        The first group (in dict order) listing a status wins, matching a
        sequential scan of the groups.
        """
        lookup = {}
        for code, statuses in enumerate(self.status_groups.values(), start=1):
            for status in statuses:
                lookup.setdefault(status.lower(), code)
        return lookup

    @staticmethod
    def _lookup_status(value, lookup):
        """Return 0 for a missing value, the group code if known, otherwise -999."""
        if pd.isna(value):
            return 0  # dedicated code for missing values
        # str() keeps non-string inputs from raising on .lower().
        return lookup.get(str(value).lower(), -999)

    def _status_to_number(self, value: str):
        """Encode a single raw status value (kept for callers that encode one value at a time)."""
        return self._lookup_status(value, self._build_status_lookup())


In [8]:
class stories_preprocessing(BaseEstimator, TransformerMixin):
    """Convert the free-text ``stories`` column into a numeric number of floors."""

    def __init__(self):
        # Replacements are applied as plain substring substitutions in this order,
        # so "one half" must come before "one" and "triplex" before "tri".
        self.word_to_num={
        "one half":0.5, "one": 1, "two": 2, "three": 3, "four": 4, 
        "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
        "duplex":2, "ground":1, "triplex":3, 'tri':3
        }

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer interface."""
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a copy of ``X`` whose ``stories`` column holds floats (NaN when unknown)."""
        X = X.copy()
        X['stories'] = X['stories'].apply(self._process_stories)
        return X

    def _process_stories(self, story: str):
        """Parse one raw ``stories`` value.

        Number words are replaced by digits, every number found is averaged, and a
        mentioned basement adds half a floor. Returns NaN for missing input or when
        nothing positive could be derived.
        """
        if pd.isna(story):
            return np.nan

        # str() keeps already-numeric cells (e.g. 2.0) from raising on .lower().
        text = str(story).lower()
        for word, num in self.word_to_num.items():
            text = text.replace(word, str(num))

        numbers = [float(token) for token in re.findall(r'\d+\.?\d*', text)]
        stories = sum(numbers) / len(numbers) if numbers else 0

        # A basement counts as half a floor, as it is marked in this data set.
        # The former `"tri level" in story` test was dropped: 'tri' has already been
        # substituted with '3' above, so that condition could never be true.
        if 'basement' in text:
            stories += 0.5

        return stories if stories > 0 else np.nan

In [9]:
class property_type_preprocessing(BaseEstimator, TransformerMixin):
    """Normalise the free-text ``propertyType`` column and min-hash encode it.

    The column is lower-cased, cleaned of duplicated tokens and separators,
    rewritten with the regex ``replacements`` (applied in dict order as substring
    substitutions) and finally replaced by a single min-hash component.
    """

    def __init__(self):
        self.encoder = MinHashEncoder() #MinMinHashEncoder
        self.replacements = {
            r'single[\s_-]*family[\s_-]*home|single[\s_-]*family[\s_-]*': 'single_family_home',
            r'condo|condo/townhome.*|coop.*': 'condo',
            r'townhouse|row home|townhome': 'townhouse',
            r'multi[\s_-]*family': 'multi_family_home',
            r'lot/land|land|farms/ranches': 'land',
            r'mobile': 'mobile_home',
            r'apartment': 'apartment',
            r'ranch': 'ranch',
            r'contemporary|modern|contemporary/modern': 'contemporary_modern',
            r'colonial': 'colonial',
            r'traditional': 'traditional',
            r'1 story|one story': 'one_story',
            r'2 stories|two story': 'two_story',
            r'other': 'other'}

    def fit(self, X: pd.DataFrame, y=None):
        """Fit the min-hash encoder on the normalised ``propertyType`` values."""
        self.encoder.fit(self._normalize(X))
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a copy of ``X`` whose ``propertyType`` holds the first min-hash component.

        The encoder is created with its default number of components, so its output
        has several columns. The previous code assigned that whole 2-D array to the
        single column and relied on pandas keeping only the first component
        (NOTE(review): observed behaviour, confirm against the pandas version in use).
        The selection is now explicit.
        """
        X = X.copy()
        encoded = np.asarray(self.encoder.transform(self._normalize(X)))
        X['propertyType'] = encoded.reshape(len(X), -1)[:, 0]
        return X

    def _normalize(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a one-column frame with the cleaned and regex-normalised ``propertyType``.

        Shared by ``fit`` and ``transform`` so both always see identically prepared text.
        """
        cleaned = X['propertyType'].str.lower().str.strip().apply(self._clean_property_type)

        # Apply the replacements.
        for pattern, replacement in self.replacements.items():
            cleaned = cleaned.replace(to_replace=pattern, value=replacement, regex=True)

        return cleaned.to_frame('propertyType')

    def _clean_property_type(self, pt):
        """Collapse duplicated tokens and separators in one lower-cased property type."""
        if pd.isna(pt):
            return 'missing'  # dedicated category for missing values

        pt = re.sub(r'\b(\w+)(_\1\b)+', r'\1', pt)
        pt = re.sub(r'\b(\w+)(/\1\b)+', r'\1', pt)
        pt = re.sub(r'\s+/|/\s+', '/', pt)
        pt = re.sub(r'\s+', '_', pt)
        pt = re.sub(r'(_{2,})', '_', pt)
        pt = re.sub(r'(_home)+', '_home', pt)
        pt = re.sub(r'(_story)+', '_story', pt)
        pt = re.sub(r'\bhome_home\b', 'home', pt)

        return pt


In [10]:
class home_features(BaseEstimator, TransformerMixin):
    """Expand the serialized ``homeFacts`` column into one column per fact label."""

    def __init__(self):
        self.columns = ['Cooling', 'Heating', 'Parking', 'Price/sqft', 'Remodeled year', 'Year built', 'lotsize']

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer interface."""
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return ``X`` without ``homeFacts`` and with the extracted fact columns appended.

        The result always has a fresh ``RangeIndex``. Blank or whitespace-only fact
        values are turned into NaN.

        Raises:
            ValueError: if ``X`` has no ``homeFacts`` column.
        """
        X = X.copy()

        if 'homeFacts' not in X.columns:
            raise ValueError("DataFrame must contain 'homeFacts' column")

        extracted = X['homeFacts'].apply(self._extract_home_features)
        # Passing `columns` keeps the expected columns even when X has no rows.
        facts_df = pd.DataFrame(extracted.tolist(), columns=self.columns)
        # np.nan instead of None: the meaning of an explicit `value=None` in
        # DataFrame.replace differs between pandas versions.
        facts_df = facts_df.replace(r'^\s*$', np.nan, regex=True)

        X = X.drop(columns='homeFacts').reset_index(drop=True)
        return X.join(facts_df)

    def _extract_home_features(self, home_facts_str: str):
        """Parse one ``homeFacts`` string into ``{column: value}`` for ``self.columns``.

        The text is patched into valid JSON (quotes, bare ``None``, apostrophes inside
        words) before parsing. Missing input, undecodable text or an unexpected
        payload structure yields NaN for every column.
        """
        empty = {col: np.nan for col in self.columns}
        if pd.isna(home_facts_str):
            return empty

        try:
            text = home_facts_str.replace("'", '"')
            text = text.replace('"closet"-Electric', 'closet-Electric')
            text = re.sub(r'(?<!")\bNone\b(?!")', '"None"', text)
            text = re.sub(r'(?<=[a-z-A-Z])"(?=[a-zA-Z])', '/', text)

            facts = json.loads(text).get('atAGlanceFacts', [])
            features = {fact['factLabel']: fact['factValue'] for fact in facts}
        except (json.JSONDecodeError, AttributeError, KeyError, TypeError):
            # Best effort: a malformed row becomes a row of missing values.
            return empty

        return {col: features.get(col, np.nan) for col in self.columns}



In [11]:
class school_features(BaseEstimator, TransformerMixin):
    """Replace the serialized ``schools`` column with the mean school rating."""

    def __init__(self):
        self.columns= ['Average Rating']

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer interface."""
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return ``X`` (index reset) without ``schools`` and with ``Average Rating`` appended."""
        X = X.copy()
        X.reset_index(drop=True, inplace=True)

        extracted = X['schools'].apply(self._extract_school_features)
        ratings_df = pd.DataFrame(extracted.tolist(), columns=self.columns)

        X = X.drop(columns='schools')
        return X.join(ratings_df)

    def _extract_school_features(self, school_facts_str: str):
        """Parse one ``schools`` string and return ``{'Average Rating': mean or NaN}``.

        Every failure path returns the NaN record, so the caller always receives a
        dict; previously an unexpected error made this method return ``None``.
        """
        missing = {'Average Rating': np.nan}
        if pd.isna(school_facts_str):
            return missing

        try:
            school_facts_str = school_facts_str.replace("'", '"')
            school_facts_str = re.sub(r'(?<!")\bNone\b(?!")', '"None"', school_facts_str)
            school_facts_str = re.sub(r'NR', 'None', school_facts_str)
            school_facts_str = re.sub(r'NA', 'None', school_facts_str)
            school_facts_str = re.sub(r'(?<=[a-z-A-Z])"(?=[a-zA-Z])', '/', school_facts_str)
            school_facts_str = re.sub(r', "name": \[[^\]]*\]', '', school_facts_str)

            features = json.loads(school_facts_str)[0]
            ratings = self._parse_ratings(features)
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}")
            print(f'Problematic string: {school_facts_str}')
            return missing
        except Exception as err:
            # e.g. an empty list of schools or a non-string cell
            print(err)
            return missing

        return {'Average Rating': np.mean(ratings) if ratings else np.nan}

    @staticmethod
    def _parse_ratings(features):
        """Return the integer ratings listed in ``features['rating']``.

        Entries containing 'None' are skipped; for the rest the part before '/' is
        used. If any entry cannot be parsed, the whole list is discarded.
        """
        try:
            return [
                int(rating.split('/')[0])  # value before the '/'
                for rating in features['rating']
                if 'None' not in rating
            ]
        except Exception:
            return []


In [12]:
class fill_sqft(BaseEstimator, TransformerMixin):
    """Fill missing ``sqft`` values with an "<n> sqft" figure found in other text columns."""

    def __init__(self):
        self.cols_to_check = ['baths', 'beds']

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer interface."""
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a copy of ``X`` with gaps in ``sqft`` filled where possible."""
        X = X.copy()

        X = self._fill_sqft_from_columns(X, self.cols_to_check)
        return X

    def _fill_sqft_from_columns(self, X: pd.DataFrame, cols_to_check):
        """Fill missing ``X['sqft']`` from ``cols_to_check``, earlier columns first.

        Only rows that are still missing are parsed, instead of running a row-wise
        ``DataFrame.apply`` over the whole frame for each column. ``X`` is modified
        in place and returned.
        """
        values = X['sqft'].to_numpy(dtype=object, copy=True)

        for col in cols_to_check:
            missing = pd.isna(values)
            if not missing.any():
                break
            # Look for the area in the current source column.
            source = X[col].to_numpy(dtype=object)
            values[missing] = [self._extract_sqft(text) for text in source[missing]]

        # infer_objects restores a float dtype when every value ended up numeric.
        X['sqft'] = pd.Series(values, index=X.index).infer_objects()
        return X

    def _extract_sqft(self, text: str):
        """Return the number directly preceding 'sqft' in ``text`` as a float, else NaN."""
        if pd.isna(text):
            return np.nan
        text = str(text).replace(',', '')
        match = re.search(r'(\d+)\s?sqft', text)  # a number followed by 'sqft'
        if match:
            return float(match.group(1))
        return np.nan

In [13]:
class extract_number(BaseEstimator, TransformerMixin):
    """Reduce several text columns to the first number each value contains."""

    def __init__(self):
        self.columns = ['baths','sqft', 'beds', 'lotsize','Year built','Remodeled year']

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer interface."""
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a copy of ``X`` with every configured column converted to floats.

        Raises:
            ValueError: if a configured column is absent from ``X``.
        """
        result = X.copy()

        for name in self.columns:
            if name in result.columns:
                result[name] = result[name].apply(self._give_number)
                continue
            raise ValueError(f"DataFrame must contain column '{name}'")

        return result

    def _give_number(self, pt):
        """Return the first (optionally decimal) number in ``pt`` as a float, else NaN.

        Commas are stripped first so that "1,234" is read as 1234.
        """
        if pd.isna(pt):
            return np.nan

        text = str(pt).replace(',', '')
        if text == '':
            return np.nan

        found = re.search(r'\d+\.?\d*', text)
        return float(found.group()) if found else np.nan
    

In [14]:
class extract_number_second(BaseEstimator, TransformerMixin):
    """Extract integer counts from the ``Parking`` and ``Price/sqft`` text columns."""

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer interface."""
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a copy of ``X`` with numeric ``Parking`` (gaps -> 0) and ``Price/sqft``."""
        X = X.copy()

        X['Parking'] = X['Parking'].apply(self._extract_number_second)
        X['Parking'] = X['Parking'].fillna(0)
        X['Price/sqft'] = X['Price/sqft'].apply(self._extract_number_second).astype(float)

        return X

    def _extract_number_second(self, value: str):
        """Return the integer found in ``value``, 0 for "no data"/"no parking", else NaN.

        A number followed by "space(s)" is preferred over any other standalone
        number. Thousands separators are removed first, so "$1,234" yields 1234
        rather than 1.
        """
        if pd.isna(value):
            return np.nan

        # Normalise: str() tolerates already-numeric cells; trim and lower-case.
        text = str(value).strip().lower()

        # Explicit "nothing here" markers.
        if 'no data' in text or 'no parking' in text:
            return 0

        # Drop only commas that sit between a digit and a group of exactly three
        # digits; other commas (e.g. "garage,2 spaces") must stay as separators.
        text = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', text)

        # Prefer an explicit "<n> spaces", then fall back to any standalone number.
        match = re.search(r'\b(\d+)\s*spaces?\b', text) or re.search(r'\b(\d+)\b', text)
        if match:
            return int(match.group(1))

        # Nothing usable found.
        return np.nan

In [15]:
class cooling_preprocessing(BaseEstimator, TransformerMixin):
    """Bucket the ``Cooling`` text into categories and min-hash encode them."""

    def __init__(self):
        self.encoder = MinHashEncoder(n_components=3)
        self.cooling_patterns = {
            r'central|zoned': 'Central A/C',
            r'window|wall': 'Window/Wall Unit',
            r'heat pump': 'Heat Pump',
            r'no a/c|no cooling': 'No A/C',
            r'fan': 'Fan',
            r'geothermal': 'Geothermal',
            r'evaporative': 'Evaporative Cooler',
            r'solar': 'Solar A/C'
        }

        self.replace_mode = None

    def fit(self, X:pd.DataFrame, y=None):
        """Learn the most frequent category (used to fill gaps) and fit the encoder."""
        categories = X['Cooling'].reset_index(drop=True).apply(self._process_cooling)
        self.replace_mode = categories.mode()[0]

        filled = categories.fillna(self.replace_mode)
        self.encoder.fit(filled.to_frame('Cooling_Categorized'))
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return ``X`` (index reset) with ``Cooling`` replaced by ``Cooling_<i>`` columns."""
        frame = X.copy()

        categories = frame['Cooling'].apply(self._process_cooling)
        frame['Cooling_Categorized'] = categories.fillna(self.replace_mode)

        hashed = self.encoder.transform(frame[['Cooling_Categorized']])
        column_names = [f'Cooling_{i}' for i in range(hashed.shape[1])]
        hashed_df = pd.DataFrame(hashed, columns=column_names)

        frame.reset_index(drop=True, inplace=True)
        hashed_df.reset_index(drop=True, inplace=True)

        frame = frame.join(hashed_df)
        frame.drop(['Cooling','Cooling_Categorized'], axis=1, inplace=True)
        frame.reset_index(drop=True, inplace=True)

        return frame

    def _process_cooling(self, cooling_str: str):
        """Classify one cooling description.

        Returns NaN for missing input, the category of the first pattern (in dict
        order) found case-insensitively in the text, or 'Other' when none matches.
        """
        if pd.isna(cooling_str):
            return np.nan

        return next(
            (
                category
                for pattern, category in self.cooling_patterns.items()
                if re.search(pattern, cooling_str, re.IGNORECASE)
            ),
            'Other',
        )


       
     



In [16]:
class heating_preprocessing(BaseEstimator, TransformerMixin):
    """Bucket the ``Heating`` text into categories and min-hash encode them."""

    def __init__(self):
        self.encoder = MinHashEncoder(n_components=2)
        self.heating_patterns = {
           'gas|natural gas|propane': 'Gas',
           'electric|electricity': 'Electric',
           'heat pump': 'Heat Pump',
           'stove|wood|pellet': 'Stove',
           'radiant|radiator': 'Radiant',
           'central': 'Central',
           'solar': 'Solar',
           'fireplace': 'Fireplace',
           'baseboard': 'Baseboard',
           'steam': 'Steam',
           'wall': 'Wall',
           'forced air': 'Forced Air'
       }

    def fit(self, X:pd.DataFrame, y=None):
        """Fit the encoder on the categorised ``Heating`` values.

        The encoder receives a one-column frame, the same shape ``transform`` passes;
        previously ``fit`` handed it a 1-D Series.
        """
        self.encoder.fit(self._categorize(X))
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return ``X`` (index reset) with ``Heating`` replaced by ``Heating_<i>`` columns."""
        X = X.copy()

        encoded = self.encoder.transform(self._categorize(X))
        encoded_df = pd.DataFrame(encoded, columns=[f'Heating_{i}' for i in range(encoded.shape[1])])
        X.reset_index(drop=True, inplace=True)

        X = X.join(encoded_df)
        X.drop(['Heating'], axis=1, inplace=True)

        return X

    def _categorize(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a one-column frame ``Heating_Categorized`` built from ``X['Heating']``."""
        return X['Heating'].apply(self._process_heating).to_frame('Heating_Categorized')

    def _process_heating(self, heating: str):
        """Classify one heating description.

        Returns NaN for missing input, the category of the first pattern (in dict
        order) found case-insensitively in the text, or 'Other' when none matches.
        """
        if pd.isna(heating):
            return np.nan

        # Try each pattern in turn and return the first matching type.
        for pattern, heating_type in self.heating_patterns.items():
            if re.search(pattern, heating, re.IGNORECASE):
                return heating_type
        return 'Other'
    

In [17]:
class OtherCat(BaseEstimator, TransformerMixin):
    """Min-hash encode the ``city``, ``street`` and ``state`` columns (two components each)."""

    def __init__(self):
        self.encoders = {
            'city': MinHashEncoder(n_components=2),
            'street': MinHashEncoder(n_components=2),
            'state': MinHashEncoder(n_components=2)
        }

    def fit(self, X: pd.DataFrame, y=None):
        """Fit one encoder per column on that column's raw values."""
        for key, encoder in self.encoders.items():
            encoder.fit(X[key].values.reshape(-1, 1))
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a copy of ``X`` where each encoded column is replaced by ``<key>_<i>`` columns.

        Encoded values are written positionally, so rows stay aligned whatever index
        ``X`` carries. The previous join against a fresh ``RangeIndex`` frame only
        lined up when ``X`` itself had a default index.
        """
        X = X.copy()

        for key, encoder in self.encoders.items():
            encoded = np.asarray(encoder.transform(X[key].values.reshape(-1, 1)))
            for i in range(encoded.shape[1]):
                X[f'{key}_{i}'] = encoded[:, i]
            X = X.drop(columns=[key])  # remove the raw column
        return X

In [18]:
class select_year(BaseEstimator, TransformerMixin):
    """Clean ``Year built`` and derive ``is_remodeled`` and ``property_age`` from it.

    Parameters
    ----------
    min_year : int, default=1850
        Build years below this value are treated as missing.
    current_year : int, default=2024
        Build years above this value are treated as missing; it is also the
        reference year for ``property_age``.
    """

    def __init__(self, min_year=1850, current_year=2024):
        self.min_year = min_year
        self.current_year = current_year
        self.year_mode = None

    def fit(self, X: pd.DataFrame, y=None):
        """Learn the most frequent plausible build year, used to fill gaps.

        The mode is taken over in-range years only, so an implausible value can
        never become the fill value. It falls back to the overall mode, then NaN.
        """
        years = X['Year built']
        in_range = years[(years >= self.min_year) & (years <= self.current_year)]

        modes = in_range.mode()
        if modes.empty:
            modes = years.mode()
        self.year_mode = modes.iloc[0] if not modes.empty else np.nan
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a copy of ``X`` with the year columns replaced by the derived features."""
        X = X.copy()

        out_of_range = (X['Year built'] < self.min_year) | (X['Year built'] > self.current_year)
        X['Year built'] = X['Year built'].mask(out_of_range).fillna(self.year_mode)

        # A missing remodel year compares False, i.e. "not remodeled".
        X['is_remodeled'] = (X['Remodeled year'] > X['Year built']).fillna(False).astype(bool)
        X['property_age'] = self.current_year - X['Year built']

        X.drop(['Year built', 'Remodeled year'], axis=1, inplace=True)

        return X
        




In [19]:
class find_anomaly(BaseEstimator, TransformerMixin):
    """Blank out implausible values in ``baths``, ``beds``, ``Price/sqft`` and ``sqft``."""

    def __init__(self):
        self.lower_bound_sqft = 300
        self.upper_bound_sqft = None

    def fit(self, X: pd.DataFrame, y=None):
        """Learn the upper ``sqft`` bound as Q3 + 1.5 * IQR of the training data."""
        sqft = X['sqft']
        first_quartile = sqft.quantile(0.25)
        third_quartile = sqft.quantile(0.75)
        self.upper_bound_sqft = third_quartile + 1.5 * (third_quartile - first_quartile)
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a copy of ``X`` with out-of-range values replaced by missing values."""
        cleaned = X.copy()

        # Values at or above these limits are treated as anomalies.
        upper_limits = {'baths': 10, 'beds': 20, 'Price/sqft': 4e3}
        for column, limit in upper_limits.items():
            cleaned[column] = cleaned[column].mask(cleaned[column] >= limit, None)

        too_small = cleaned['sqft'] < self.lower_bound_sqft
        too_large = cleaned['sqft'] > self.upper_bound_sqft
        cleaned['sqft'] = cleaned['sqft'].mask(too_small | too_large, None)

        return cleaned


In [20]:
class anomaly_replacement(BaseEstimator, TransformerMixin):
    """Impute missing ``sqft``, ``baths``, ``beds`` and ``Price/sqft`` with XGBoost regressors.

    One regressor per target column is trained on the rows where that target is
    present, using the feature columns listed in ``feature_columns``.
    """

    def __init__(self):

        self.target_columns = ['sqft', 'baths', 'beds', 'Price/sqft']

        self.feature_columns = {
            self.target_columns[0]: ['baths', 'beds', 'PrivatePool', 'Average Rating'],
            self.target_columns[1]: ['sqft', 'beds', 'PrivatePool', 'Average Rating', 'property_age'],
            self.target_columns[2]: ['sqft', 'baths', 'PrivatePool', 'Average Rating'],
            self.target_columns[3]: ['propertyType', 'baths', 'state_0', 'state_1', 'city_0', 'city_1', 'Average Rating','property_age']
        }

        params = {
            'colsample_bytree': 1.0,
            'learning_rate': 0.2,
            'max_depth': 7,
            'n_estimators': 150,
            'subsample': 1.0,
            'random_state': 42
        }

        self.models = {col: xgb.XGBRegressor(**params) for col in self.target_columns}

    def fit(self, X: pd.DataFrame, y=None):
        """Train one regressor per target on the rows where that target is known.

        Raises:
            ValueError: if a target column or one of its feature columns is missing.
        """
        for target_col in self.target_columns:
            required_columns = self._required_columns(X, target_col)

            known = X.loc[:, required_columns].dropna(subset=[target_col]).reset_index(drop=True)

            # Train the model that belongs to this target.
            self.models[target_col].fit(known.drop(target_col, axis=1), known[target_col])

        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a copy of ``X`` with missing target values replaced by predictions.

        Targets are imputed in ``target_columns`` order, so values imputed earlier
        are available as features for later targets.

        Raises:
            ValueError: if a target column or one of its feature columns is missing.
        """
        X = X.copy()

        for target_col in self.target_columns:
            self._required_columns(X, target_col)

            mask = X[target_col].isnull()
            if not mask.any():
                # Nothing to impute; avoids calling predict on an empty frame.
                continue

            features_to_predict = X.loc[mask, self.feature_columns[target_col]]
            X.loc[mask, target_col] = self.models[target_col].predict(features_to_predict)

        return X

    def _required_columns(self, X: pd.DataFrame, target_col):
        """Return ``[target] + features`` for ``target_col``; raise if any is absent from ``X``."""
        required_columns = [target_col] + self.feature_columns[target_col]
        # A single missing column is an error.
        if not set(required_columns).issubset(X.columns):
            raise ValueError('Нет столбца для обучения модели')
        return required_columns




      



In [21]:
class other(BaseEstimator, TransformerMixin):
    """Final feature engineering: presence flags, price estimate and bath/bed ratios.

    NOTE: ``transform`` drops rows whose estimated price (``sqft * Price/sqft``) is
    missing or not greater than 5, so the output can have fewer rows than the input.
    """

    def fit(self, X, y = None):
        """Learn nothing; present only to satisfy the transformer interface."""
        return self

    def transform(self, X: pd.DataFrame, y = None):
        """Return a new frame with the engineered features; the input is left untouched."""
        # Copy first: this step used to modify the caller's frame in place.
        X = X.copy()

        X['has_fireplace'] = (X['count_fireplace'] > 0).astype('int64')
        X.drop(['count_fireplace'], axis=1, inplace=True)

        X['has_parking'] = (X['Parking'] > 0).astype('int64')
        X.drop('Parking', axis=1, inplace=True)

        X['lowest_price'] = X['sqft'] * X['Price/sqft']
        # Row filter; .copy() so the assignments below write to an independent frame.
        X = X[X['lowest_price'] > 5].copy()
        X.loc[X['lowest_price'] > 5e5, 'lowest_price'] = np.nan

        X['sum_baths_beds'] = X['beds'] + X['baths']

        # Zero baths would make the ratio infinite; treat it as unknown instead.
        X.loc[X['baths'] == 0, 'baths'] = np.nan
        X['sqft/baths'] = X['sqft'] / X['baths']

        X.drop(['beds', 'baths'], axis=1, inplace = True)

        return X

In [22]:
class PASS(BaseEstimator, TransformerMixin):
    """Identity transformer used as a placeholder final pipeline step."""

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Hand back ``X`` unchanged (the very same object, not a copy)."""
        return X
    

In [23]:
# Pipeline steps grouped by stage; the insertion order of the groups is the execution order.
preprocessing_steps = {
    'features': [
        ('privat_pool', privatpool_features()),
        ('fireplace', process_fireplace_features()),
        ('status', status_preprocessing()),
        ('stories', stories_preprocessing()),
        ('property_type', property_type_preprocessing()),
    ],
    'home_features': [
        ('home', home_features()),
        ('school', school_features()),
    ],
    'number_processing': [
        ('sqft', fill_sqft()),
        ('extract_number', extract_number()),
        ('extract_number_second', extract_number_second()),
    ],
    'climate_features': [
        ('cooling', cooling_preprocessing()),
        ('heating', heating_preprocessing()),
    ],
    'categorical_processing': [
        ('OtherCat', OtherCat()),
    ],
    'anomaly_detection': [
        ('year', select_year()),
        ('anomaly', find_anomaly()),
        ('anomaly_replacement', anomaly_replacement()),
    ],
    'finalization': [
        ('other', other()),
        ('drop_columns', drop_cols()),
        ('final', PASS())  # no-op placeholder step
    ]
}

# Flatten the groups in declaration order into a single list of steps.
pipeline = Pipeline([
    step
    for stage_steps in preprocessing_steps.values()
    for step in stage_steps
])

# Derive the feature frame from the untouched raw frame so that re-running this
# cell works; dropping 'target' from `data` itself raised KeyError the second time.
data = main_data.drop(columns=['target'])
pipeline.fit(data)

In [24]:

# Persist the fitted pipeline with dill; the 'model' directory must already exist.
with open('model/pipeline.dill', 'wb') as pipeline_file:
    dill.dump(pipeline, pipeline_file)

In [25]:
# Smoke test: run the fitted pipeline on a single listing.
# The full-dataset transform is done once, in the next cell; the duplicate call
# that used to sit here ran the whole pipeline over the data a second time.
row = data.iloc[[0]].reset_index(drop=True)
change_row = pipeline.transform(row)

In [26]:
# Transform the full dataset with the fitted pipeline.
change_data = pipeline.transform(data)

In [27]:
# Display the transformed single-row frame.
change_row

Unnamed: 0,status,propertyType,sqft,PrivatePool,fireplace_type,for_rent,Price/sqft,Average Rating,Cooling_0,Cooling_1,...,street_1,state_0,state_1,is_remodeled,property_age,has_fireplace,has_parking,lowest_price,sum_baths_beds,sqft/baths
0,1,-2135328000.0,2900.0,False,1,False,144.0,5.2,-2083815000.0,-1950527000.0,...,-1982907000.0,-1462146000.0,-1966426000.0,False,5.0,1,0,417600.0,7.5,828.571429


In [28]:
# Preview the first two rows of the transformed dataset.
change_data.head(2)

Unnamed: 0,status,propertyType,sqft,PrivatePool,fireplace_type,for_rent,Price/sqft,Average Rating,Cooling_0,Cooling_1,...,street_1,state_0,state_1,is_remodeled,property_age,has_fireplace,has_parking,lowest_price,sum_baths_beds,sqft/baths
0,1,-2135328000.0,2900.0,False,1,False,144.0,5.2,-2083815000.0,-1950527000.0,...,-1982907000.0,-1462146000.0,-1966426000.0,False,5.0,1,0,417600.0,7.5,828.571429
1,1,-2135328000.0,1947.0,False,0,False,159.0,4.0,-2083815000.0,-1950527000.0,...,-2031473000.0,278272600.0,-1820335000.0,False,5.0,0,0,309573.0,6.0,649.0
