In [1]:
import pandas as pd
import numpy as np
from category_encoders import BinaryEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import random
from datetime import datetime
import pgeocode
import cupy as cp

pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
df = pd.read_csv('real_estate_raw.csv')

  df = pd.read_csv('real_estate_raw.csv')


In [3]:
cols = [
    'original_list_price',
    'list_price',
    'close_price',
    'association_fee',
    'tax_annual_amount',
    'days_on_market',
    'cumulative_days_on_market',
    'previous_list_price',
    'living_area',
    'lot_size_acres',

    'rooms_total',
    'bedrooms_total',
    'bathrooms_full',
    'bathrooms_half',
    'garage_spaces',
 
    'year_built',
    
    'postal_code',

    'elementary_school_district',
    'middle_or_junior_school_district',
    'high_school_district',
    
    'accessibility_features',
    'heating',
    'water_source',
    'sewer',
    'lot_features',
    'roof',
    'community_features',
    'laundry_features',
    'cooling',
    'association_fee_includes',
    'mrd_din',
    'mrd_ext',
    'mrd_fireplace_location',
    'ownership',
    'mrd_bas',
    'mrd_pkn',
    
    'waterfront_yn',
    'mrd_disability_access',
    'mrd_garage_onsite',
    'new_construction_yn',
        
    'mrd_rehab_year',
        
    'mrd_tnu',
    'mrd_tpc',
    'mrd_tpe'
]

In [4]:
df = df[cols]

In [5]:
df.info(verbose=True,show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400918 entries, 0 to 400917
Data columns (total 44 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   original_list_price               400918 non-null  float64
 1   list_price                        400918 non-null  float64
 2   close_price                       400918 non-null  float64
 3   association_fee                   400918 non-null  float64
 4   tax_annual_amount                 400918 non-null  float64
 5   days_on_market                    400918 non-null  int64  
 6   cumulative_days_on_market         400918 non-null  int64  
 7   previous_list_price               400918 non-null  int64  
 8   living_area                       400918 non-null  float64
 9   lot_size_acres                    400918 non-null  float64
 10  rooms_total                       400918 non-null  int64  
 11  bedrooms_total                    400918 non-null  i

In [6]:
class DataPreprocess:
    def __init__(self):
        self.all_schools_dict = {}
        self.ohe_encoder_dict = {}
        self.school_binarizer = {}
        self.feature_binarizer = {}
        self.numeric_value_normalizer = None
        self.numeric_target_normalizer = None
        self.feature_list =['accessibility_features',
                                'heating',
                                'water_source',
                                'sewer',
                                'lot_features',
                                'roof',
                                'community_features',
                                'laundry_features',
                                'cooling',
                                'association_fee_includes',
                                'mrd_din',
                                'mrd_ext',
                                'mrd_fireplace_location',
                                'ownership',
                                'mrd_bas',
                                'mrd_pkn',
                                'mrd_tpc',
                                'mrd_tpe']
        self.school_features = ['elementary_school_district','middle_or_junior_school_district','high_school_district']
        self.room_features = ['rooms_total','bedrooms_total','bathrooms_full','bathrooms_half','garage_spaces']
        self.all_multi_binarized_features = []
        self.all_postal_binarized_features = []
        self.all_boolean_features = []
        self.all_continuous_features = ['original_list_price',
                                        'list_price',
                                        'association_fee',
                                        'tax_annual_amount',
                                        'days_on_market',
                                        'cumulative_days_on_market',
                                        'previous_list_price',
                                        'living_area',
                                        'lot_size_acres']
        

    def clean_schools(self, df, dataset='train', schools=None):
        ## find missing school districts by zip code ##
        if schools is None:
            schools = self.school_features
        
        if dataset == 'train':
            before = df.elementary_school_district.isna().sum(),df.middle_or_junior_school_district.isna().sum(),df.high_school_district.isna().sum()
            
            for dist in schools:
                df[dist] = df[dist].apply(lambda x: str(x).lower() if pd.notnull(x) else x)
                
            for dist in schools:
                df_school = df[['postal_code',dist]].copy()
                df_school = df_school.dropna(subset=[dist],how='all')
                
                df_school = df_school.groupby(["postal_code",dist]).size().reset_index().rename(columns={0:'count'})
                school_dict = dict(zip(df_school.postal_code, df_school[dist]))
                self.all_schools_dict[dist] = school_dict
            
                df[dist] = df[dist].fillna(df.postal_code.map(school_dict))
            
            after = df.elementary_school_district.isna().sum(),df.middle_or_junior_school_district.isna().sum(),df.high_school_district.isna().sum()
            print(f'Reduced district nulls from {before} to {after} after.')
            df[schools] = df[schools].fillna(0)
            
        elif dataset == 'predict':
            before = df.elementary_school_district.isna().sum(),df.middle_or_junior_school_district.isna().sum(),df.high_school_district.isna().sum()
            for dist in schools:
                df[dist] = df[dist].apply(lambda x: str(x).lower() if pd.notnull(x) else x)
                
            for dist in schools:
                school_dict = self.all_schools_dict[dist]
                df[dist] = df[dist].fillna(df.postal_code.map(school_dict))

            after = df.elementary_school_district.isna().sum(),df.middle_or_junior_school_district.isna().sum(),df.high_school_district.isna().sum()
            print(f'Reduced district nulls from {before} to {after} after.') 
            
            df[schools] = df[schools].fillna(0)

        return df
            
    def binarize_schools(self, df, dataset='train', schools=None):   
        ## binarize school districts transform ##
        if schools is None:
            schools = self.school_features

        binarized_features = []
        
        if dataset == 'train':
            
            for dist in schools:
                df_dist = df[dist].copy()
                encoder = BinaryEncoder(cols=[dist])
                df_binarize = encoder.fit_transform(df_dist)
                df = pd.concat([df,df_binarize],axis=1)
                df = df.drop(dist,axis=1)
                binarized_features.extend(encoder.feature_names_out_)
                self.school_binarizer[dist] = encoder
                
        elif dataset == 'predict':
            
            for dist in schools:
                df_dist = df[dist].copy()
                df_binarize = self.school_binarizer[dist].transform(df_dist)
                df = pd.concat([df,df_binarize],axis=1)
                df = df.drop(dist,axis=1)

        self.all_postal_binarized_features.extend(binarized_features)
        df[binarized_features] = df[binarized_features].astype('bool')
        
        return df 

    def clean_age(self, df):
        ## building age transform ##
        current_year = datetime.now().year
        
        df['age'] = df['year_built'].apply(lambda x: current_year-int(x) if int(x) > 0 else x)
        bins = [-1, 0, 10, 20, 30, 40, 50, 60, 70, 80, 1000]
        labels = ['0','1-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+']
        df['age_label'] = pd.cut(df.age, bins=bins, labels=labels)
        df = df.drop('age',axis=1)

        df["age_label"] = df["age_label"].astype("category")
        
        return df

    def clean_booleans(self, df):
        ## boolean fields transform ##
        df['waterfront_yn'] = df['waterfront_yn'].fillna(False)
        df['new_construction_yn'] = df['new_construction_yn'].fillna(False)
        df['mrd_disability_access'] = df['mrd_disability_access'].fillna('No')
        df['mrd_disability_access'] = pd.Series(np.where(df.mrd_disability_access.values == 'Yes', True, False), df.index)
        df['mrd_garage_onsite'] = df['mrd_garage_onsite'].fillna('No')
        df['mrd_garage_onsite'] = pd.Series(np.where(df.mrd_garage_onsite.values == 'Yes', True, False), df.index)
        df['mrd_rehab_year'] = df['mrd_rehab_year'].apply(lambda x: True if pd.notnull(x) else False)

        boolean = ['waterfront_yn','new_construction_yn','mrd_disability_access','mrd_garage_onsite','mrd_rehab_year']
        df[boolean] = df[boolean].astype('bool')

        return df

    def clean_room_features(self, df, room_list=None):
        ## rooms binning transform ##
        if room_list is None:
            room_list = self.room_features
        
        for feature in room_list:
            min_val = -1
            zero_val = 0
            first_quant = 1
            second_quant = 2
            third_quant = 3
            last_quant = 4
            extra_quant = 5
            max_val = 1000
            bins = [min_val,zero_val,first_quant,second_quant,third_quant,last_quant,extra_quant,max_val]
            labels = [f'{zero_val}',f'{first_quant}',f'{second_quant}',f'{third_quant}',f'{last_quant}',f'{extra_quant}',f'{extra_quant}+']
            df[feature] = pd.cut(df[feature], bins=bins, labels=labels, duplicates='drop')

        df[room_list] = df[room_list].astype('category')
            
        return df

    def clean_postal_codes(self, df):
        ## postal code to long lat coordinates ##
        geolocator = pgeocode.Nominatim('US')
        geocode_dict = {}
        for postal_code in list(df.postal_code.unique()):
            location = geolocator.query_postal_code(str(postal_code))
            geocode_dict[postal_code] = (location.longitude,location.latitude)
        
        df['postal_long'] = df['postal_code'].apply(lambda x: geocode_dict[x][0] if ((pd.notnull(x)) and (x in geocode_dict)) else np.nan)
        df['postal_lat'] = df['postal_code'].apply(lambda x: geocode_dict[x][1] if ((pd.notnull(x)) and (x in geocode_dict)) else np.nan)
        
        df.postal_long = df.postal_long.fillna(0)
        df.postal_lat = df.postal_lat.fillna(0)

        df[['postal_long','postal_lat']] = df[['postal_long','postal_lat']].astype('float32')
        add_list = ['postal_long','postal_lat']
        for el in add_list:
            if el not in self.all_continuous_features:
                self.all_continuous_features.extend(['postal_long','postal_lat'])

        return df

    def clean_multi_label_features(self, df, feature_list=None):
        ## splitting of features into individual lists per cell for labelizer ##
        if feature_list is None:
            feature_list = self.feature_list
            
        for feature in feature_list:
            df[feature] = df[feature].fillna(f'None_{feature}')
            curr_len=(len(df[feature].str.split(',\s*').explode().unique().tolist()))
            print(f'{feature} has {curr_len} unique categories.')
            df[feature] = df[feature].str.replace('[{}"]','',regex=True).str.split(',\s*')

        return df

    def multi_label_binarizer(self, df, dataset='train', feature_list=None):
        
        if feature_list is None:
            feature_list = self.feature_list

        if dataset == 'train':
            for feature in feature_list:
                mlb = MultiLabelBinarizer()
                df = df.join(
                        pd.DataFrame(
                                mlb.fit_transform(df.pop(feature)),
                                index=df.index,
                                columns=mlb.classes_), lsuffix=f'{feature}_')
                self.feature_binarizer[feature] = mlb
                self.all_multi_binarized_features.extend(list(mlb.classes_))
                
        elif dataset == 'predict':
            for feature in feature_list:
                mlb = self.feature_binarizer[feature]
                df = df.join(
                        pd.DataFrame(
                                mlb.transform(df.pop(feature)),
                                index=df.index,
                                columns=mlb.classes_), lsuffix=f'{feature}_')

        

        for col in self.all_multi_binarized_features:
            df[col] = df[col].astype('bool')
        
        return df

    def normalize_numeric(self, df, dataset='train', numerical = None):
        ## taking numeric continous features and normalizing them using a min max scaler ##
        if numerical is None:
            numerical = self.all_continuous_features
            
        if dataset == 'train':
            print(df[numerical].columns)
            scaler = MinMaxScaler()
            df[numerical] = scaler.fit_transform(df[numerical])
            self.numeric_value_normalizer = scaler
            
        elif dataset == 'predict':
            print(df[numerical].columns)
            scaler = self.numeric_value_normalizer
            df[numerical] = scaler.transform(df[numerical])

        df[numerical] = df[numerical].astype('float32')
            
        return df

    def normalize_target(self, df, dataset='train',target=None):
        ## taking the target feature and normalizing it using a min max scaler ##
        if target is None:
            target = 'close_price'
            
        if dataset == 'train':
            target_scaler = MinMaxScaler()
            df[target] = target_scaler.fit_transform(df[target].to_numpy().reshape(-1, 1))
            self.numeric_target_normalizer = target_scaler
        elif dataset == 'predict':
            target_scaler = self.numeric_target_normalizer
            df[target] = target_scaler.transform(df[target].to_numpy().reshape(-1, 1))

        df[target] = df[target].astype('float32')

        return df

    def ohe_categories(self, df, dataset='train', categorical=None):

        if categorical is None:
            categorical = list(df.select_dtypes(include=['category']).columns)
    
        if dataset == 'train':
        
            for cat in categorical:
                df_ohe = df[cat].values.reshape(-1, 1)
                enc = OneHotEncoder(handle_unknown = "ignore").fit(df_ohe)
                df_ohe = enc.transform(df_ohe).toarray()
                df_ohe = pd.DataFrame(df_ohe, columns=enc.categories_)
                df_ohe = df_ohe.add_prefix(f'{cat}_')
                self.all_boolean_features.extend(list(df_ohe.columns))
                df = pd.concat([df,df_ohe], axis=1)
                df = df.drop(cat,axis=1)
                self.ohe_encoder_dict[cat] = enc

        elif dataset == 'predict':
        
            for cat in categorical:
                df_ohe = df[cat].values.reshape(-1, 1)
                enc = self.ohe_encoder_dict[cat]
                df_ohe = enc.transform(df_ohe).toarray()
                df_ohe = pd.DataFrame(df_ohe, columns=enc.categories_)
                df_ohe = df_ohe.add_prefix(f'{cat}_')
                df = pd.concat([df,df_ohe], axis=1)
                df = df.drop(cat,axis=1)
                self.ohe_encoder_dict['cat'] = enc

        for col in self.all_boolean_features:
            df[col] = df[col].astype('bool')
        
        return df
            
        

In [13]:
y = df[['close_price']]
X = df.drop('close_price',axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [14]:
dp = DataPreprocess()

def preprocess_train(df):
    df = dp.clean_schools(df,dataset='train')
    df = dp.binarize_schools(df,dataset='train')
    df = dp.clean_age(df)
    df = dp.clean_booleans(df)
    df = dp.clean_room_features(df)
    df = dp.clean_postal_codes(df)
    df = dp.clean_multi_label_features(df)
    df = dp.multi_label_binarizer(df,dataset='train')
    # df = dp.normalize_numeric(df,dataset='train')
    # df = dp.ohe_categories(df,dataset='train')
    return df
X_train = preprocess_train(X_train)

Reduced district nulls from (1830, 1857, 1929) to (614, 639, 638) after.
accessibility_features has 228 unique categories.
heating has 85 unique categories.
water_source has 27 unique categories.
sewer has 29 unique categories.
lot_features has 213 unique categories.
roof has 33 unique categories.
community_features has 66 unique categories.
laundry_features has 45 unique categories.
cooling has 57 unique categories.
association_fee_includes has 85 unique categories.
mrd_din has 7 unique categories.
mrd_ext has 35 unique categories.
mrd_fireplace_location has 16 unique categories.
ownership has 6 unique categories.
mrd_bas has 25 unique categories.
mrd_pkn has 4 unique categories.
mrd_tpc has 30 unique categories.
mrd_tpe has 18 unique categories.


In [15]:
X_train.head()

Unnamed: 0,original_list_price,list_price,association_fee,tax_annual_amount,days_on_market,cumulative_days_on_market,previous_list_price,living_area,lot_size_acres,rooms_total,bedrooms_total,bathrooms_full,bathrooms_half,garage_spaces,year_built,postal_code,waterfront_yn,mrd_disability_access,mrd_garage_onsite,new_construction_yn,mrd_rehab_year,mrd_tnu,elementary_school_district_0,elementary_school_district_1,elementary_school_district_2,elementary_school_district_3,elementary_school_district_4,elementary_school_district_5,elementary_school_district_6,elementary_school_district_7,elementary_school_district_8,middle_or_junior_school_district_0,middle_or_junior_school_district_1,middle_or_junior_school_district_2,middle_or_junior_school_district_3,middle_or_junior_school_district_4,middle_or_junior_school_district_5,middle_or_junior_school_district_6,middle_or_junior_school_district_7,middle_or_junior_school_district_8,high_school_district_0,high_school_district_1,high_school_district_2,high_school_district_3,high_school_district_4,high_school_district_5,high_school_district_6,high_school_district_7,high_school_district_8,age_label,postal_long,postal_lat,Accessible Bedroom,Accessible Central Living Area,Accessible Closets,Accessible Common Area,Accessible Doors,Accessible Electrical and Environmental Controls,Accessible Elevator Installed,Accessible Entrance,Accessible Full Bath,Accessible Hallway(s),Accessible Kitchen,Accessible Kitchen Appliances,Accessible Stairway,Accessible Washer/Dryer,Accessible for Hearing-Impairment,Adaptable Bathroom Walls,Adaptable For Elevator,Bath Grab Bars,Ceiling Track,Central Living Area,Common Arealaundry_features_,Customized Wheelchair Accessible,Disabled Parking,Door Width 32 Inches or More,Electronic Environmental Controls,Enhanced Accessible,Entry Slope less than 1 foot,Exterior Wheelchair Lift,Flashing Doorbell,Flooring Modifications,Grab Bars Throughout,Grip-Accessible Features,Hall Width 36 Inches or More,Hearing Modifications,Kitchen Modifications,Lever Door Handles,Low Bathroom Mirrors,Low Closet Rods,Low Pile Carpeting,Lower Fixtures,Lowered Light Switches,Main Level Entry,Modified Wall Outlets,No Interior Steps,None_accessibility_features,Otherheating_,Other Main Level Modifications,Pocket Door(s),Ramp - Main Level,Reinforced Floors,Roll-In Shower,Safe Emergency Egress from Home,Smart Technology,Stair Lift,Standby Generator,Swing In Door(s),Therapeutic Whirlpool,Thresholds less than 5/8 of an inch,Two or More Access Exits,Vehicle Transfer Area,Visitable,Visitor Bathroom,Walker-Accessible Stairs,Wheelchair Accessible,Wheelchair Adaptable,Wheelchair Height Mailbox,Wheelchair Height Shelves,Wheelchair Modifications,Wheelchair Ramp(s),Baseboard,Coal,Electriccooling_,Floor Furnace,Forced Air,Geothermalcooling_,Gravity Air,Heat Pump,Indv Controls,Natural Gas,Nonelot_features_,None_heating,Oil,Otherwater_source_,Propane,Radiant,Radiator(s),Sep Heating Systems - 2+,Solar,Space Heater,Steam,Wood,Zonedcooling_,Community Well,Company Well,Lake Michigan,None_water_source,Othersewer_,Private,Private Well,Public,Shared Well,Holding Tank,None_sewer,Otherlot_features_,Overhead Sewers,Public Sewer,Septic Shared,Septic-Mechanical,Septic-Private,Sewer-Storm,Adjoins Government Land,Backs to Open Grnd,Backs to Public GRND,Backs to Trees/Woods,Beach,Chain Link Fence,Chain of Lakes Frontage,Channel Front,Common Grounds,Corner Lot,Creek,Cul-De-Sac,Dimensions to Center of Road,Dockcommunity_features_,Electric Fence,Fence-Invisible Pet,Fenced Yard,Forest Preserve Adjacent,Garden,Golf Course Lot,Horses Allowed,Infill Lot,Irregular Lot,Lake Access,Lake Front,Landscaped,Legal Non-Conforming,Level,Mature Trees,Nature Preserve Adjacent,Nonelaundry_features_,None_lot_features,Otherroof_,Outdoor Lighting,Paddock,Park Adjacent,Partial Fencing,Pasture,Pie Shaped Lot,Pond(s),Rear of Lot,River Front,Sidewalkscommunity_features_,Sloped,Spring(s),Stream(s),Streetlights,Views,Water Garden,Water Rightscommunity_features_,Water View,Waterfront,Wetlands adjacent,Wood Fence,Wooded,Woven Wire Fence,Asphalt,Flatmrd_tpc_,Metal,None_roof,Othercommunity_features_,Rubber,Shake,Slatemrd_ext_,Tar and Gravel,Tile,Airport/Runway,Clubhouseassociation_fee_includes_,Curbs,Dock,Gated,Horse-Riding Area,Horse-Riding Trails,Lake,None_community_features,Othercooling_,Park,Poolassociation_fee_includes_,Sidewalks,Stable(s),Street Lights,Street Paved,Tennis Court(s),Water Rights,Common Area,Electric Dryer Hookup,Gas Dryer Hookup,In Bathroom,In Garage,In Kitchen,In Unit,Laundry Chute,Laundry Closet,Multiple Locations,Nonecooling_,None_laundry_features,Sink,Central Air,Dual,Electric,Gasassociation_fee_includes_,Geothermal,High Efficiency (SEER 14+),Noneassociation_fee_includes_,None_cooling,Otherassociation_fee_includes_,Partial,Power Roof Vents,Roof Turbine(s),Space Pac,Window/Wall Unit - 1,Window/Wall Units - 2,Window/Wall Units - 3+,Zoned,Air Conditioning,Clubhouse,Doorman,Electricity,Exercise Facilities,Exterior Maintenance,Gas,Heat,Insurance,Internet,Lake Rights,Lawn Care,Nonemrd_din_,None_association_fee_includes,Othermrd_ext_,Parking,Pool,Scavenger,Security,Snow Removal,TV/Cable,Taxes,Water,Combined w/ FamRm,Combined w/ LivRm,Kitchen/Dining Combo,L-shaped,Nonemrd_fireplace_location_,None_mrd_din,Separate,Aluminum Siding,Asbestos Siding,Block,Brick,Brick Veneer,Brick Veneer Decrtv,Brick/Stone Msn Pred,Brk/Stn Veneer Frnt,Cedar,Clad Trim,Combination,Concrete,Conventional,EIFS (e.g. Dryvit).,Fiber Cement,Fir,Fl Brick/Stn Veneer,Frame,Glass,Insulation Brick,Limestone,Log,Marble/Granite,Masonite,None_mrd_ext,Othermrd_fireplace_location_,Shakes,Shingle Siding,Slate,Steel Siding,Stone,Stucco,Tilt Wall,Vinyl Siding,Wood Siding,Basement,Bedroom,Den/Library,Dining Room,Exterior,Family Room,Grand Entry Hall,Great Room,Hearth Room,Kitchen,Living Room,Loft,Master Bedroom,Nonemrd_bas_,None_mrd_fireplace_location,Othermrd_bas_,Co-op,Condomrd_tpc_,Fee Simple,Fee Simple w/ HO Assn.,Leasehold,None_ownership,8 ft + pour,9 ft + pour,Bathroom Rough-In,Cellar,Concrete (Basement),Concrete Block,Crawl,Daylight,Egress Window,Exterior Access,Finished,Lookout,Nonemrd_pkn_,None_mrd_bas,Othermrd_tpc_,Partially Finished,Rec/Family Area,Roughed-In Fireplace,Slab,Sleeping Area,Stone/Rock,Storage Space,Sub-Basement,Unfinished,Walk-Up Access,Garage,None,None_mrd_pkn,Space/s,1/2 Duplex,Cluster,Condo,Condo-Duplex,Condo-Loft,Corridor,Courtyard,Flat,Garden Complex,Garden Unit,Ground Level Ranch,High Rise (7+ Stories),Low Rise (1-3 Stories),Manor Home/Coach House/Villa,Mid Rise (4-6 Stories),None_mrd_tpc,Othermrd_tpe_,Penthouse,Quad-2 Story,Quad-Penthouse,Quad-Ranch,Quad-Split Level,Split Levelmrd_tpe_,Studio,T3-Townhouse 3+ Stories,Townhouse,Townhouse-2 Story,Townhouse-Ranch,Townhouse-TriLevel,Vintage,1 Story,1.5 Story,2 Stories,2.5 Story,3 Stories,4+ Stories,Coach House,Earth,Hillside,Manufactured,Modular,Multi-Level,None_mrd_tpe,Other,Raised Ranch,Split Level,Split Level w/ Sub,Tear Down
0,242500.0,230000.0,0.0,3100.0,12,12,242500,1232.0,0.0861,5+,2,1,0,1,1928,60707,False,False,True,False,False,,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,80+,-87.818497,41.923199,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,0,False,False,False,False,0,False,False,0,False,False,False,False,False,False,True,False,0,False,False,True,False,0,False,False,False,False,False,False,0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,0,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,0,False,False,False,False,False,False,True,0,False,False,0,False,False,0,False,False,False,0,False,False,False,False,False,False,False,0,True,0,True,False,True,True,False,False,False,False,True,False,False,False,False,False,False,False,0,False,False,False,False,False,0,False,False,1,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,False,0,False,False,False,False,False,False,False,False,False,False,False,False,0,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,True,0,False,0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,0,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0,False,False,False,False,False,0,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,325000.0,325000.0,0.0,5191.08,39,93,0,1052.0,0.1823,5+,3,2,0,2,1956,60515,False,False,True,False,False,,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,60-70,-88.013802,41.803398,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,True,0,False,False,False,True,0,False,False,0,False,False,False,False,False,False,False,False,0,False,False,True,False,0,False,False,False,False,False,False,0,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,True,0,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,0,False,False,False,False,False,False,False,0,False,True,0,False,False,0,False,False,False,0,False,False,False,False,False,False,True,0,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,True,False,True,False,False,0,False,False,0,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,False,0,False,False,False,False,False,False,False,False,False,True,False,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,0,True,0,False,0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,0,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0,False,False,False,False,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
2,125000.0,125000.0,0.0,2838.6,9,9,0,0.0,0.0,5+,3,1,0,3,0,60961,False,False,True,False,False,,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,True,0,-88.2089,41.100498,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,True,0,False,False,False,True,0,False,False,0,False,False,False,False,False,False,False,False,0,False,False,False,False,0,False,False,True,False,False,False,0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,True,0,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,0,False,False,False,False,False,False,True,0,False,False,0,False,False,0,False,False,False,0,False,False,False,False,False,False,True,0,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,True,False,True,False,False,0,False,False,0,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,False,0,False,False,False,False,False,False,False,False,False,True,False,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,True,0,False,0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,0,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0,False,False,False,False,False,0,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,343740.0,343740.0,150.0,485.0,71,71,0,2180.0,0.3423,5+,4,3,0,3,2022,61008,False,False,True,True,False,,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,1-10,-88.850899,42.259499,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,True,0,False,False,False,False,0,False,False,0,False,False,False,False,False,False,False,False,0,False,False,False,False,0,False,False,True,False,False,True,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,True,0,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,0,False,False,False,False,False,False,False,0,False,True,0,False,False,0,False,False,False,0,False,False,False,False,False,False,True,0,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,True,False,True,False,False,0,False,False,0,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,False,0,False,False,False,False,False,False,False,False,False,False,False,False,0,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,True,0,False,0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,0,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0,False,False,False,False,False,0,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,390000.0,390000.0,0.0,8339.44,4,4,0,2254.0,0.241,5+,4,2,1,2,1987,60031,False,False,True,False,False,,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,True,30-40,-87.945198,42.366901,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,True,0,False,False,False,True,0,False,False,0,False,False,False,False,False,False,False,False,0,False,False,False,False,0,False,False,True,False,False,False,0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,True,0,False,False,False,False,False,False,False,False,False,0,False,False,False,False,False,False,0,False,False,False,False,False,False,True,0,False,False,0,False,False,0,False,False,False,0,False,False,False,False,False,False,True,0,False,0,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,0,False,True,True,False,False,0,False,False,0,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,False,0,False,False,False,False,False,False,False,False,False,False,False,False,0,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,0,False,0,False,0,True,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,0,False,0,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0,False,False,False,False,False,0,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [26]:
X_train.groupby(X_train.postal_code)[['list_price','tax_annual_amount','living_area']].median().head(15)

Unnamed: 0_level_0,list_price,tax_annual_amount,living_area
postal_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6108,129900.0,1831.66,1496.0
32566,340000.0,1282.75,1468.0
33967,350000.0,2010.0,960.0
34289,489900.0,6341.0,1450.0
46075,324900.0,2998.0,2613.0
46168,335000.0,1928.0,2856.0
46303,379000.0,2634.0,2133.0
46304,334900.0,1897.0,1465.5
46307,369900.0,2270.0,2263.0
46310,205000.0,1330.0,1639.0


In [25]:
X_train.groupby(X_train.postal_code)[['list_price','tax_annual_amount']].mean().head(15)

Unnamed: 0_level_0,list_price,tax_annual_amount
postal_code,Unnamed: 1_level_1,Unnamed: 2_level_1
6108,129900.0,1831.66
32566,340000.0,1282.75
33967,350000.0,2010.0
34289,489900.0,6341.0
46075,324900.0,2998.0
46168,335000.0,1928.0
46303,410331.866667,3100.081333
46304,578400.0,2938.636
46307,388691.568627,2843.067843
46310,205000.0,1330.0


In [43]:
def preprocess_target_train(df):
    df = dp.normalize_target(df,dataset='train')
    return df
    
y_train = preprocess_target_train(y_train)

In [44]:
y_train.head()

Unnamed: 0,close_price
0,0.007833
1,0.003194
2,0.002083
3,0.010072
4,0.005


In [45]:
def preprocess_test(df):
    df = dp.clean_schools(df,dataset='predict')
    df = dp.binarize_schools(df,dataset='predict')
    df = dp.clean_age(df)
    df = dp.clean_booleans(df)
    df = dp.clean_room_features(df)
    df = dp.clean_postal_codes(df)
    df = dp.clean_multi_label_features(df)
    df = dp.multi_label_binarizer(df,dataset='predict')
    df = dp.normalize_numeric(df,dataset='predict')
    df = dp.ohe_categories(df,dataset='predict')
    return df

X_test = preprocess_test(X_test)

Reduced district nulls from (703, 715, 741) to (247, 250, 258) after.
accessibility_features has 208 unique categories.
heating has 82 unique categories.
water_source has 26 unique categories.
sewer has 27 unique categories.
lot_features has 211 unique categories.
roof has 31 unique categories.
community_features has 65 unique categories.
laundry_features has 45 unique categories.
cooling has 52 unique categories.
association_fee_includes has 84 unique categories.
mrd_din has 7 unique categories.
mrd_ext has 34 unique categories.
mrd_fireplace_location has 16 unique categories.
ownership has 6 unique categories.
mrd_bas has 25 unique categories.
mrd_pkn has 4 unique categories.
mrd_tpc has 30 unique categories.
mrd_tpe has 18 unique categories.




Index(['original_list_price', 'list_price', 'association_fee',
       'tax_annual_amount', 'days_on_market', 'cumulative_days_on_market',
       'previous_list_price', 'living_area', 'lot_size_acres', 'postal_long',
       'postal_lat'],
      dtype='object')


In [46]:
def preprocess_target_test(df):
    df = dp.normalize_target(df,dataset='predict')
    return df
    
y_test = preprocess_target_test(y_test)

In [47]:
X_train = X_train.drop(['postal_code'],axis=1)
X_test = X_test.drop(['postal_code'],axis=1)

X_train.mrd_tnu = X_train.mrd_tnu.fillna(0)
X_test.mrd_tnu = X_test.mrd_tnu.fillna(0)

In [48]:
# import xgboost as xgb
# from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
# from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split

# #xgb_model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=80,learning_rate=0.11669835581158701, max_depth=6, subsample=0.6628468243767216)
# xgb_model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=80)

# # X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

# y_pred = xgb_model.predict(X_test)

In [49]:
# Put data into GPU
X_train_cp = cp.array(X_train.to_numpy().astype('float32'))
X_test_cp = cp.array(X_test.to_numpy().astype('float32'))
y_train_cp = cp.array(y_train.to_numpy().astype('float32'))
y_test_cp = cp.array(y_test.to_numpy().astype('float32'))

In [50]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK

# Define the hyperparameter space
space = {
    # 'max_depth': hp.quniform('max_depth', 2, 8, 1),
    'n_estimators': hp.choice('n_estimators', np.arange(10, 120, dtype=int)),
    'max_depth': hp.choice('max_depth', np.arange(1, 14, dtype=int)),
    'max_leaves': hp.choice('max_leaves', np.arange(0, 6, dtype=int)),
    'learning_rate': hp.loguniform('learning_rate', -5, -2),
    'subsample': hp.uniform('subsample', 0.25, 1)
}

# Define the objective function to minimize
def objective(params):
    xgb_model = xgb.XGBRegressor(device = 'cuda',objective="reg:squarederror", **params)
    # xgb_model = xgb.XGBClassifier(**params)
    xgb_model.fit(X_train_cp, y_train_cp)
    y_pred = xgb_model.predict(X_test_cp)
    score = mean_squared_error(y_test_cp.get(), y_pred)
    return {'loss': score, 'status': STATUS_OK}

# Perform the optimization
best_params = fmin(objective, space, algo=tpe.suggest, max_evals=100)
print("Best set of hyperparameters: ", best_params)

100%|██████| 100/100 [02:21<00:00,  1.42s/trial, best loss: 4.5959554881847e-06]
Best set of hyperparameters:  {'learning_rate': 0.10678426900942385, 'max_depth': 11, 'max_leaves': 0, 'n_estimators': 59, 'subsample': 0.7031297379916002}


In [54]:
xgb_model = xgb.XGBRegressor(device = 'cuda',
                             objective="reg:squarederror",
                             learning_rate = best_params['learning_rate'],
                             n_estimators = best_params['n_estimators'],
                             max_depth = best_params['max_depth'],
                             max_leaves = best_params['max_leaves'],
                             subsample = best_params['subsample'],
                            )
xgb_model.fit(X_train_cp, y_train_cp, eval_set=[(X_test_cp, y_test_cp)])
y_pred_np = xgb_model.predict(X_test_cp)

[0]	validation_0-rmse:0.00836
[1]	validation_0-rmse:0.00755
[2]	validation_0-rmse:0.00685
[3]	validation_0-rmse:0.00622
[4]	validation_0-rmse:0.00567
[5]	validation_0-rmse:0.00518
[6]	validation_0-rmse:0.00475
[7]	validation_0-rmse:0.00438
[8]	validation_0-rmse:0.00406
[9]	validation_0-rmse:0.00378
[10]	validation_0-rmse:0.00354
[11]	validation_0-rmse:0.00334
[12]	validation_0-rmse:0.00316
[13]	validation_0-rmse:0.00301
[14]	validation_0-rmse:0.00288
[15]	validation_0-rmse:0.00278
[16]	validation_0-rmse:0.00268
[17]	validation_0-rmse:0.00258
[18]	validation_0-rmse:0.00252
[19]	validation_0-rmse:0.00246
[20]	validation_0-rmse:0.00242
[21]	validation_0-rmse:0.00238
[22]	validation_0-rmse:0.00235
[23]	validation_0-rmse:0.00232
[24]	validation_0-rmse:0.00229
[25]	validation_0-rmse:0.00228
[26]	validation_0-rmse:0.00226
[27]	validation_0-rmse:0.00224
[28]	validation_0-rmse:0.00224
[29]	validation_0-rmse:0.00223
[30]	validation_0-rmse:0.00223
[31]	validation_0-rmse:0.00223
[32]	validation_0-

In [56]:
mse=mean_squared_error(y_test_cp.get(), y_pred_np)
print(np.sqrt(mse),mse)

0.0021736675 4.724831e-06


In [58]:
y_test_raw = dp.numeric_target_normalizer.inverse_transform(y_test_cp.get().reshape(-1, 1))
y_pred_raw = dp.numeric_target_normalizer.inverse_transform(y_pred_np.reshape(-1, 1))

In [59]:
mse=mean_squared_error(y_test_raw, y_pred_raw)
print(np.sqrt(mse))

78252.02


In [63]:
feature_vals = xgb_model.feature_importances_

In [None]:
feature_names = xgb_model.feature_names_in_
df_from_arr = pd.DataFrame(data=[feature_names,feature_vals]).T
df_from_arr.sort_values(by=1,ascending=False).head(30)

In [67]:
X_test.to_records(index=False)[0]

(0.01780892, 0.01780892, 0., 0.00067523, 0.10227829, 0.00086602, 0., 0.04165217, 3.3471074e-06, 1889, False, False, True, False, False, 0., 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0.03634453, 0.9094314, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, 0, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, 0, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, 0, False, True, 0, False, False, False, True, 0, False, False, 0, False, False, False, True, False, False, False, False, 1, False, False, False, False, 0, False, False, True, False, False, False, 0, False, True, False, False, False, False, False, False, False, False, False, 

In [622]:
import numpy as np
from catboost import Pool, CatBoostRegressor
# initialize data
train_data = X_train
train_label = y_train
test_data = X_train
# initialize Pool
train_pool = Pool(train_data, 
                  train_label)
test_pool = Pool(test_data) 

# specify the training parameters 
model = CatBoostRegressor(iterations=100, 
                          depth=15, 
                          learning_rate=.01, 
                          loss_function='RMSE')
#train the model
model.fit(train_pool)
# make the prediction using the resulting model
preds = model.predict(test_pool)

0:	learn: 0.0093422	total: 2.69s	remaining: 4m 26s
1:	learn: 0.0092625	total: 4.82s	remaining: 3m 56s
2:	learn: 0.0091860	total: 7.08s	remaining: 3m 49s
3:	learn: 0.0091071	total: 9.36s	remaining: 3m 44s
4:	learn: 0.0090297	total: 12.1s	remaining: 3m 49s
5:	learn: 0.0089529	total: 14.9s	remaining: 3m 52s
6:	learn: 0.0088756	total: 17.8s	remaining: 3m 56s
7:	learn: 0.0087993	total: 20.3s	remaining: 3m 53s
8:	learn: 0.0087249	total: 20.8s	remaining: 3m 30s
9:	learn: 0.0086506	total: 23.2s	remaining: 3m 28s
10:	learn: 0.0085784	total: 26s	remaining: 3m 30s
11:	learn: 0.0085064	total: 28.8s	remaining: 3m 30s
12:	learn: 0.0084345	total: 31.9s	remaining: 3m 33s
13:	learn: 0.0083635	total: 34.8s	remaining: 3m 34s
14:	learn: 0.0082933	total: 37.5s	remaining: 3m 32s
15:	learn: 0.0082241	total: 40.1s	remaining: 3m 30s
16:	learn: 0.0081551	total: 42.7s	remaining: 3m 28s
17:	learn: 0.0080874	total: 45.3s	remaining: 3m 26s
18:	learn: 0.0080198	total: 47.8s	remaining: 3m 23s
19:	learn: 0.0079544	tot