# STAT 857 - W23 Project 2
## Evan Callaghan | April 17, 2023

### 1. Configuring setup
Installing packages and loading libraries

In [None]:
pip install lightgbm xgboost optuna

In [94]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import math
import optuna
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import RFECV, RFE
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

pd.set_option('display.max_columns', None, 'display.max_rows', None)

### 2. Data Exploration Section

Reading the competition data files, exploring the training data, creating data visualizations 

In [95]:
## Reading the data
train = pd.read_csv('Data/W23P2_train.csv')
test = pd.read_csv('Data/W23P2_test.csv')
sub = pd.read_csv('Data/Sample_Submission.csv')

In [None]:
train.head()

In [None]:
test.head()

In [None]:
print(train.shape)
print(test.shape)

In [None]:
train['interest_level'].value_counts() / train.shape[0]

In [None]:
train.describe()

In [None]:
test.describe()

### 3. Data Cleaning
Removing some outlier observations and cleaning street_address variable

In [96]:
## Transforming interest_level label to numeric
train['interest_level'] = np.where(train['interest_level'] == 'low', 0, 
                                   np.where(train['interest_level'] == 'medium', 1, 2))

## Removing training observations with prices higher than $40,000
train = train[train['price'] < 40000].reset_index(drop = True)

## Removing outlier locations
#train = train[(train['latitude'] < 42) & (train['latitude'] > 40)].reset_index(drop = True)

## Log transformation on the price variable
# train['price'] = np.log(train['price'])
# test['price'] = np.log(test['price'])

## Changing street address variable to all lowercase
train['street_address'] = train['street_address'].str.lower() + ' '
test['street_address'] = test['street_address'].str.lower() + ' '

## Cleaning street address variable more
train['street_address'] = train['street_address'].str.replace(' st ', ' street', regex = True)
train['street_address'] = train['street_address'].str.replace(' st. ', ' street', regex = True)
train['street_address'] = train['street_address'].str.replace(' blvd ', ' boulevard ', regex = True)
train['street_address'] = train['street_address'].str.replace(' ave ', ' avenue ', regex = True)
train['street_address'] = train['street_address'].str.replace(' ave. ', ' avenue ', regex = True)
train['street_address'] = train['street_address'].str.replace(' dr ', ' drive ', regex = True)
train['street_address'] = train['street_address'].str.replace(' pl ', ' place ', regex = True)
train['street_address'] = train['street_address'].str.replace(' e ', ' east ', regex = True)
train['street_address'] = train['street_address'].str.replace(' w ', ' west ', regex = True)
train['street_address'] = train['street_address'].str.replace(' first ', ' 1st ', regex = True)
train['street_address'] = train['street_address'].str.replace(' second ', ' 2nd ', regex = True)
train['street_address'] = train['street_address'].str.replace(' third ', ' 3rd ', regex = True)

test['street_address'] = test['street_address'].str.replace(' st ', ' street', regex = True)
test['street_address'] = test['street_address'].str.replace(' st. ', ' street', regex = True)
test['street_address'] = test['street_address'].str.replace(' blvd ', ' boulevard ', regex = True)
test['street_address'] = test['street_address'].str.replace(' ave ', ' avenue ', regex = True)
test['street_address'] = test['street_address'].str.replace(' ave. ', ' avenue ', regex = True)
test['street_address'] = test['street_address'].str.replace(' dr ', ' drive ', regex = True)
test['street_address'] = test['street_address'].str.replace(' pl ', ' place ', regex = True)
test['street_address'] = test['street_address'].str.replace(' e ', ' east ', regex = True)
test['street_address'] = test['street_address'].str.replace(' w ', ' west ', regex = True)
test['street_address'] = test['street_address'].str.replace(' first ', ' 1st ', regex = True)
test['street_address'] = test['street_address'].str.replace(' second ', ' 2nd ', regex = True)
test['street_address'] = test['street_address'].str.replace(' third ', ' 3rd ', regex = True)

### 4. Variable Engineering
Creating new variables based on provided amenities and apartment pricing

In [97]:
## Combining data frames for feature engineering purposes
train.insert(0, 'train', 1)
test.insert(0, 'train', 0)
full_data = pd.concat([train.drop(columns = ['interest_level']), test.drop(columns = ['ID'])])

In [98]:
full_data['log_price'] = np.log(full_data['price'])

In [99]:
full_data['half_bathrooms'] = full_data['bathrooms'] - full_data['bathrooms'].apply(int)

In [100]:
full_data['pos'] = full_data['longitude'].round(3).astype(str) + '_' + full_data['latitude'].round(3).astype(str)

vals = full_data['pos'].value_counts()
dvals = vals.to_dict()
full_data['density'] = full_data['pos'].apply(lambda x: dvals.get(x, vals.min()))

In [101]:
## Adding geolocation features
full_data['geo_area_50'] = full_data[['latitude', 'longitude']]\
.apply(lambda x:(int(x[0] * 50) % 50) * 50 + (int(-x[1] * 50) % 50), axis = 1)                                         
                         
full_data['geo_area_100'] = full_data[['latitude', 'longitude']]\
.apply(lambda x:(int(x[0] * 100) % 100) * 100 + (int(-x[1] * 100) % 100), axis = 1)                                         
  
full_data['geo_area_200'] = full_data[['latitude', 'longitude']]\
.apply(lambda x:(int(x[0] * 200) % 200) * 200 + (int(-x[1] * 200) % 200), axis = 1)                                         

## Financial district
lat = 40.705628
lon = -74.010278
full_data['distance_to_fin'] = full_data[['latitude', 'longitude']]\
.apply(lambda x:math.sqrt((x[0] - lat)**2 + (x[1] - lon)**2), axis = 1)

## Central park
lat = 40.785091
lon = -73.968285
full_data['distance_to_cp'] = full_data[['latitude', 'longitude']]\
.apply(lambda x:math.sqrt((x[0] - lat)**2 + (x[1] - lon)**2), axis = 1)

In [102]:
## Number of total rooms
full_data['rooms'] = full_data['bedrooms'] + full_data['bathrooms'] 

## Number of categorical features listed 
full_data['num_of_features'] = full_data.iloc[:, 7:67].sum(axis = 1)

## Price per room
full_data['price_per_room'] = full_data[['price','rooms']].apply(lambda x: x[0]/x[1] if x[1]!=0 else 0, axis = 1)

## Price per bedroom
full_data['price_per_bedroom'] = full_data[['price','bedrooms']].apply(lambda x: x[0]/x[1] if x[1]!=0 else 0, axis = 1)

## Price per bathroom
full_data['price_per_bathroom'] = full_data[['price','bathrooms']].apply(lambda x: x[0]/x[1] if x[1]!=0 else 0, axis = 1)

## Price per feature listed
full_data['price_per_feature'] = full_data[['price','num_of_features']].apply(lambda x: x[0]/x[1] if x[1]!=0 else 0, axis = 1)

## Features per room
full_data['features_per_room'] = full_data[['num_of_features','rooms']].apply(lambda x: x[0]/x[1] if x[1]!=0 else 0, axis = 1)

## Features per bedroom
full_data['features_per_bedroom'] = full_data[['num_of_features','bedrooms']].apply(lambda x: x[0]/x[1] if x[1]!=0 else 0, axis = 1)

## Features per bathroom
full_data['features_per_bathroom'] = full_data[['num_of_features','bathrooms']].apply(lambda x: x[0]/x[1] if x[1]!=0 else 0, axis = 1)

In [103]:
## Making sure address variable is a string
full_data['street_address'] = full_data['street_address'].astype(str)

## Getting a count of observations with same address
street = full_data['street_address'].value_counts()

## Getting a count of observations with same number of bedrooms
bedrooms = full_data['bedrooms'].value_counts()

## Getting a count of observations with same number of bathrooms
bathrooms = full_data['bathrooms'].value_counts()

## Adding count information
full_data['street_count'] = full_data['street_address'].apply(lambda x:street[x] if x == 'nan' else street[x])
full_data['bedrooms_count'] = full_data['bedrooms'].apply(lambda x:bedrooms[x])
full_data['bathrooms_count'] = full_data['bathrooms'].apply(lambda x:bathrooms[x])

In [104]:
## Aggregating price variables by street address and computing summary statistics
price_by_address = full_data.groupby('street_address')['price'].agg([np.min, np.max, np.median, np.mean]).reset_index()
price_by_address.columns = ['street_address','min_price_by_address',
                            'max_price_by_address','median_price_by_address','mean_price_by_address']

## Adding aggregated price info
full_data = pd.merge(full_data, price_by_address, how = 'left', on = 'street_address')

## Computing percentile from aggregated price info
full_data['price_percentile_by_address']=full_data[['price','min_price_by_address','max_price_by_address']]\
.apply(lambda x:(x[0] - x[1]) / (x[2] - x[1]) if (x[2] - x[1]) != 0 else 0.5, axis = 1)

In [105]:
## Computing market price for combination of address, bedrooms, and bathrooms
mkt_price = full_data.groupby(['street_address', 'bedrooms', 'bathrooms']).price.mean().reset_index()

## Adding market price information
mkt_price = pd.merge(full_data[['street_address', 'bedrooms', 'bathrooms']], mkt_price, how = 'left', 
                     on = ['street_address', 'bedrooms', 'bathrooms']).price
full_data['mkt_price'] = mkt_price.values

## Computing the difference to market price
full_data['diff_to_mkt_price'] = full_data['price'] - full_data['mkt_price']

## Computing the ratio to market price
full_data['ratio_to_mkt_price'] = full_data['price'] / full_data['mkt_price']

In [106]:
## Label encoding categorical variables
cat_vars = list(full_data.iloc[:, 7:170].columns)
LE_vars = []
for cat_var in cat_vars:
    LE_var = cat_var
    full_data[LE_var] = LabelEncoder().fit_transform(full_data[cat_var])
    LE_vars.append(LE_var)
    
## OneHot Encoding all catategorical variables
oh_encoder = OneHotEncoder(sparse_output = True).fit(full_data[LE_vars])
oh_sparse = oh_encoder.transform(full_data[LE_vars])

In [107]:
## Condensing redundant variables after feature engineering

laundry_vars = ['Laundry.in.Building', 'Laundry.in.Unit', 'Laundry.In.Building', 'Laundry.In.Unit', 'LAUNDRY', 'Washer.in.Unit', 
                'Dryer.in.Unit', 'Laundry.Room', 'Laundry', 'On.site.laundry', 'On.site.Laundry', 'Washer.Dryer', 'Washer.Dryer.in.building', 
                'In.Unit.Washer.Dryer', 'Washer...Dryer', 'Washer.Dryer.in.Unit']
parking_vars = ['Parking.Space', 'Garage', 'Parking', 'On.site.Garage', 'assigned.parking.space', 'Common.parking.Garage', 'Full.Service.Garage', 
               'On.site.Parking.Lot', 'Private.parking']
valet_vars = ['Valet.Parking', 'Valet']
deck_vars = ['Roof.Deck', 'Balcony', 'Terrace', 'Patio', 'Roof.deck', 'balcony', 'terrace', 'patio', 'private.balcony', 'Private.balcony', 
             'Private.Deck', 'Common.roof.deck', 'ROOFDECK']
outdoor_vars = ['Courtyard', 'Outdoor.Entertainment.Space', 'Private.Outdoor.Space', 'private.outdoor.space', 'Private.outdoor.space', 
                'Common.Outdoor.Space', 'PublicOutdoor', 'Outdoor.Space', 'Outdoor.Areas', 'Common.backyard', 'building.common.outdoor.space']
garden_vars = ['Common.garden', 'garden', 'Garden.Patio', 'Garden', 'Residents.Garden']
dishwasher_vars = ['Dishwasher', 'dishwasher']
gym_vars = ['Fitness.Center', 'Gym.Fitness', 'Health.Club', 'Gym', 'gym', 'Gym.In.Building']
pool_vars = ['Swimming.Pool', 'Pool', 'pool', 'Indoor.Pool']
elevator_vars = ['Elevator', 'elevator']
storage_vars = ['Storage', 'storage', 'Basement.Storage']
internet_vars = ['High.Speed.Internet', 'WiFi', 'WiFi.Access']
bike_vars = ['Bike.room', 'Bike.Room']
pet_friendly_vars = ['Dogs.Allowed', 'Cats.Allowed', 'Pet.Friendly', 'Pets.on.approval']
concierge_vars = ['Concierge', 'Concierge.Service', 'X24.7.Concierge']
doorman_vars = ['Doorman', 'Full.time.doorman', 'Virtual.Doorman', 'FT.Doorman', 'doorman']
super_vars = ['LIVE.IN.SUPER', 'Live.in.superintendent', 'Live.In.Superintendent','Live.in.Super', 'Live.In.Super']
hardwood_vars = ['Hardwood.Floors', 'HARDWOOD', 'Hardwood.floors', 'Hardwood']
ceiling_vars = ['High.ceilings', 'High.Ceilings', 'HIGH.CEILINGS', 'High.Ceiling']
brick_vars = ['EXPOSED.BRICK', 'Exposed.Brick']
construction_vars = ['New.Construction', 'Newly.renovated','Renovated', 'renovated', 'New.construction']
photo_vars = ['Actual.Apt..Photos', 'ACTUAL.APT..PHOTOS']
lounge_vars = ['Residents.Lounge', 'Lounge.room', 'Lounge']
playroom_vars = ['Childrens.Playroom', 'Children.s.Playroom']
ac_vars = ['Central.A.C', 'Air.conditioning']
kitchen_vars = ['EAT.IN.KITCHEN','Eat.In.Kitchen']
no_fee_vars = ['No.Fee', 'NO.FEE']
accessibity_vars = ['Wheelchair.Ramp', 'Wheelchair.Access']
multi_level_vars = ['Multi.Level', 'Multi.level']
fireplace_vars = ['Fireplace', 'Decorative.Fireplace']
highrise_vars = ['Hi.Rise', 'HIGHRISE']
marble_bath_vars = ['Marble.Bath', 'Marble.Bathroom']
prewar_vars = ['Pre.War', 'prewar', 'Prewar']

def condense(data):
    
    ## Condensing data
    data['Has_Laundry'] = np.where(np.sum(data[laundry_vars], axis = 1) > 0, 1, 0)
    data['Has_Parking'] = np.where(np.sum(data[parking_vars], axis = 1) > 0, 1, 0)
    data['Has_Valet'] = np.where(np.sum(data[valet_vars], axis = 1) > 0, 1, 0)
    data['Has_Deck'] = np.where(np.sum(data[deck_vars], axis = 1) > 0, 1, 0)
    data['Has_Outdoor_Area'] = np.where(np.sum(data[outdoor_vars], axis = 1) > 0, 1, 0)
    data['Has_Garden'] = np.where(np.sum(data[garden_vars], axis = 1) > 0, 1, 0)
    data['Has_Dishwasher'] = np.where(np.sum(data[dishwasher_vars], axis = 1) > 0, 1, 0)
    data['Has_Gym'] = np.where(np.sum(data[gym_vars], axis = 1) > 0, 1, 0)
    data['Has_Pool'] = np.where(np.sum(data[pool_vars], axis = 1) > 0, 1, 0)
    data['Has_Elevator'] = np.where(np.sum(data[elevator_vars], axis = 1) > 0, 1, 0)
    data['Has_Storage'] = np.where(np.sum(data[storage_vars], axis = 1) > 0, 1, 0)
    data['Has_Wifi'] = np.where(np.sum(data[internet_vars], axis = 1) > 0, 1, 0)
    data['Has_Bike'] = np.where(np.sum(data[bike_vars], axis = 1) > 0, 1, 0)
    data['Pet_Friendly'] = np.where(np.sum(data[pet_friendly_vars], axis = 1) > 0, 1, 0)
    data['Has_Concierge'] = np.where(np.sum(data[concierge_vars], axis = 1) > 0, 1, 0)
    data['Has_Doorman'] = np.where(np.sum(data[doorman_vars], axis = 1) > 0, 1, 0)
    data['Has_Super'] = np.where(np.sum(data[super_vars], axis = 1) > 0, 1, 0)
    data['Has_Hardwood_Floor'] = np.where(np.sum(data[hardwood_vars], axis = 1) > 0, 1, 0)
    data['Has_High_Ceilings'] = np.where(np.sum(data[ceiling_vars], axis = 1) > 0, 1, 0)
    data['Has_Brick'] = np.where(np.sum(data[brick_vars], axis = 1) > 0, 1, 0)
    data['Has_Renovated'] = np.where(np.sum(data[construction_vars], axis = 1) > 0, 1, 0)
    data['Has_Photos'] = np.where(np.sum(data[photo_vars], axis = 1) > 0, 1, 0)
    data['Has_Lounge'] = np.where(np.sum(data[lounge_vars], axis = 1) > 0, 1, 0)
    data['Has_Playroom'] = np.where(np.sum(data[playroom_vars], axis = 1) > 0, 1, 0)
    data['Has_AC'] = np.where(np.sum(data[ac_vars], axis = 1) > 0, 1, 0)
    data['Has_Kitchen'] = np.where(np.sum(data[kitchen_vars], axis = 1) > 0, 1, 0)
    data['No_Fee'] = np.where(np.sum(data[no_fee_vars], axis = 1) > 0, 1, 0)
    data['Accessible'] = np.where(np.sum(data[accessibity_vars], axis = 1) > 0, 1, 0)
    data['Multi_Level'] = np.where(np.sum(data[multi_level_vars], axis = 1) > 0, 1, 0)
    data['Fire'] = np.where(np.sum(data[fireplace_vars], axis = 1) > 0, 1, 0)
    data['Highrise'] = np.where(np.sum(data[highrise_vars], axis = 1) > 0, 1, 0)
    data['Marble_Bath'] = np.where(np.sum(data[marble_bath_vars], axis = 1) > 0, 1, 0)
    data['Pre_War'] = np.where(np.sum(data[prewar_vars], axis = 1) > 0, 1, 0)
    return data
    
## Applying function to full_data
full_data = condense(full_data)

## Dropping unnecessary columns
to_drop = [laundry_vars, parking_vars, valet_vars, deck_vars, outdoor_vars,garden_vars, dishwasher_vars, gym_vars, pool_vars, elevator_vars, 
           storage_vars, internet_vars, bike_vars, pet_friendly_vars, concierge_vars, doorman_vars, super_vars, hardwood_vars, ceiling_vars, 
           brick_vars, construction_vars, photo_vars, lounge_vars, playroom_vars, ac_vars, kitchen_vars, no_fee_vars, accessibity_vars, 
           multi_level_vars, fireplace_vars, highrise_vars, marble_bath_vars, prewar_vars] 

for cols in to_drop:
    full_data = full_data.drop(columns = cols)

In [108]:
full_data.head()

Unnamed: 0,train,bathrooms,bedrooms,latitude,longitude,price,street_address,Dining.Room,Reduced.Fee,Exclusive,No.pets,LOWRISE,SIMPLEX,Furnished,Loft,Stainless.Steel.Appliances,View,Green.Building,Short.Term.Allowed,Subway,Granite.Kitchen,Light,Guarantors.Accepted,Skylight,Sauna,Live.Work,Duplex,Walk.in.Closet.s.,Luxury.building,Post.War,Cable.Satellite.TV,Microwave,Sublet,Shares.OK,log_price,half_bathrooms,pos,density,geo_area_50,geo_area_100,geo_area_200,distance_to_fin,distance_to_cp,rooms,num_of_features,price_per_room,price_per_bedroom,price_per_bathroom,price_per_feature,features_per_room,features_per_bedroom,features_per_bathroom,street_count,bedrooms_count,bathrooms_count,min_price_by_address,max_price_by_address,median_price_by_address,mean_price_by_address,price_percentile_by_address,mkt_price,diff_to_mkt_price,ratio_to_mkt_price,Has_Laundry,Has_Parking,Has_Valet,Has_Deck,Has_Outdoor_Area,Has_Garden,Has_Dishwasher,Has_Gym,Has_Pool,Has_Elevator,Has_Storage,Has_Wifi,Has_Bike,Pet_Friendly,Has_Concierge,Has_Doorman,Has_Super,Has_Hardwood_Floor,Has_High_Ceilings,Has_Brick,Has_Renovated,Has_Photos,Has_Lounge,Has_Playroom,Has_AC,Has_Kitchen,No_Fee,Accessible,Multi_Level,Fire,Highrise,Marble_Bath,Pre_War
0,1,1.0,4,40.7831,-73.9449,3675,1962 1st avenue,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.209308,0.0,-73.945_40.783,1,1997,7894,31388,0.101372,0.02347,5.0,5,735.0,918.75,3675.0,735.0,1.0,1.25,5.0,1,661,13599,3675,3675,3675.0,3675.0,0.5,3675.0,0.0,1.0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,1,1.0,3,40.7542,-73.9724,3600,155 east 47th street,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.188689,0.0,-73.972_40.754,16,1898,7597,30194,0.061595,0.031164,4.0,7,900.0,1200.0,3600.0,514.285714,1.75,2.333333,7.0,4,2538,13599,3600,4200,3600.0,3750.0,0.0,3750.0,-150.0,0.96,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,1,1.0,2,40.7285,-73.979,2900,206 avenue b,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7.972466,0.0,-73.979_40.728,13,1848,7297,29195,0.038748,0.057596,3.0,2,966.666667,1450.0,2900.0,1450.0,0.666667,1.0,2.0,5,5114,13599,2900,3000,2900.0,2940.0,0.0,2940.0,-40.0,0.986395,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,2.0,4,40.7306,-73.9837,7200,352 east 13 street,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.881836,0.0,-73.984_40.731,16,1849,7398,29396,0.036469,0.056629,6.0,6,1200.0,1800.0,3600.0,1200.0,1.0,1.5,3.0,2,661,2332,7200,7200,7200.0,7200.0,0.5,7200.0,0.0,1.0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
4,1,1.0,1,40.7645,-73.984,2500,260 west 54 street,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7.824046,0.0,-73.984_40.764,50,1949,7698,30596,0.064471,0.025903,2.0,10,1250.0,2500.0,2500.0,250.0,5.0,10.0,10.0,16,4815,13599,2500,5300,3962.5,3987.1875,0.0,2500.0,0.0,1.0,1,0,0,0,0,0,1,1,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0


In [109]:
cols1 = list(full_data.iloc[:, 7:34].columns)

In [110]:
full_data = full_data.drop(columns = cols1)

In [111]:
full_data.head()

Unnamed: 0,train,bathrooms,bedrooms,latitude,longitude,price,street_address,log_price,half_bathrooms,pos,density,geo_area_50,geo_area_100,geo_area_200,distance_to_fin,distance_to_cp,rooms,num_of_features,price_per_room,price_per_bedroom,price_per_bathroom,price_per_feature,features_per_room,features_per_bedroom,features_per_bathroom,street_count,bedrooms_count,bathrooms_count,min_price_by_address,max_price_by_address,median_price_by_address,mean_price_by_address,price_percentile_by_address,mkt_price,diff_to_mkt_price,ratio_to_mkt_price,Has_Laundry,Has_Parking,Has_Valet,Has_Deck,Has_Outdoor_Area,Has_Garden,Has_Dishwasher,Has_Gym,Has_Pool,Has_Elevator,Has_Storage,Has_Wifi,Has_Bike,Pet_Friendly,Has_Concierge,Has_Doorman,Has_Super,Has_Hardwood_Floor,Has_High_Ceilings,Has_Brick,Has_Renovated,Has_Photos,Has_Lounge,Has_Playroom,Has_AC,Has_Kitchen,No_Fee,Accessible,Multi_Level,Fire,Highrise,Marble_Bath,Pre_War
0,1,1.0,4,40.7831,-73.9449,3675,1962 1st avenue,8.209308,0.0,-73.945_40.783,1,1997,7894,31388,0.101372,0.02347,5.0,5,735.0,918.75,3675.0,735.0,1.0,1.25,5.0,1,661,13599,3675,3675,3675.0,3675.0,0.5,3675.0,0.0,1.0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,1,1.0,3,40.7542,-73.9724,3600,155 east 47th street,8.188689,0.0,-73.972_40.754,16,1898,7597,30194,0.061595,0.031164,4.0,7,900.0,1200.0,3600.0,514.285714,1.75,2.333333,7.0,4,2538,13599,3600,4200,3600.0,3750.0,0.0,3750.0,-150.0,0.96,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,1,1.0,2,40.7285,-73.979,2900,206 avenue b,7.972466,0.0,-73.979_40.728,13,1848,7297,29195,0.038748,0.057596,3.0,2,966.666667,1450.0,2900.0,1450.0,0.666667,1.0,2.0,5,5114,13599,2900,3000,2900.0,2940.0,0.0,2940.0,-40.0,0.986395,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,2.0,4,40.7306,-73.9837,7200,352 east 13 street,8.881836,0.0,-73.984_40.731,16,1849,7398,29396,0.036469,0.056629,6.0,6,1200.0,1800.0,3600.0,1200.0,1.0,1.5,3.0,2,661,2332,7200,7200,7200.0,7200.0,0.5,7200.0,0.0,1.0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
4,1,1.0,1,40.7645,-73.984,2500,260 west 54 street,7.824046,0.0,-73.984_40.764,50,1949,7698,30596,0.064471,0.025903,2.0,10,1250.0,2500.0,2500.0,250.0,5.0,10.0,10.0,16,4815,13599,2500,5300,3962.5,3987.1875,0.0,2500.0,0.0,1.0,1,0,0,0,0,0,1,1,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0


In [112]:
## Splitting full_data back into training and testing sets
training = full_data[full_data['train'] == 1].drop(columns = ['train', 'street_address', 'pos']).reset_index(drop = True)
training['interest_level'] = train['interest_level']

testing = full_data[full_data['train'] == 0].drop(columns = ['train', 'street_address', 'pos']).reset_index(drop = True)
testing['ID'] = test['ID']

In [113]:
training.head()

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,log_price,half_bathrooms,density,geo_area_50,geo_area_100,geo_area_200,distance_to_fin,distance_to_cp,rooms,num_of_features,price_per_room,price_per_bedroom,price_per_bathroom,price_per_feature,features_per_room,features_per_bedroom,features_per_bathroom,street_count,bedrooms_count,bathrooms_count,min_price_by_address,max_price_by_address,median_price_by_address,mean_price_by_address,price_percentile_by_address,mkt_price,diff_to_mkt_price,ratio_to_mkt_price,Has_Laundry,Has_Parking,Has_Valet,Has_Deck,Has_Outdoor_Area,Has_Garden,Has_Dishwasher,Has_Gym,Has_Pool,Has_Elevator,Has_Storage,Has_Wifi,Has_Bike,Pet_Friendly,Has_Concierge,Has_Doorman,Has_Super,Has_Hardwood_Floor,Has_High_Ceilings,Has_Brick,Has_Renovated,Has_Photos,Has_Lounge,Has_Playroom,Has_AC,Has_Kitchen,No_Fee,Accessible,Multi_Level,Fire,Highrise,Marble_Bath,Pre_War,interest_level
0,1.0,4,40.7831,-73.9449,3675,8.209308,0.0,1,1997,7894,31388,0.101372,0.02347,5.0,5,735.0,918.75,3675.0,735.0,1.0,1.25,5.0,1,661,13599,3675,3675,3675.0,3675.0,0.5,3675.0,0.0,1.0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
1,1.0,3,40.7542,-73.9724,3600,8.188689,0.0,16,1898,7597,30194,0.061595,0.031164,4.0,7,900.0,1200.0,3600.0,514.285714,1.75,2.333333,7.0,4,2538,13599,3600,4200,3600.0,3750.0,0.0,3750.0,-150.0,0.96,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
2,1.0,2,40.7285,-73.979,2900,7.972466,0.0,13,1848,7297,29195,0.038748,0.057596,3.0,2,966.666667,1450.0,2900.0,1450.0,0.666667,1.0,2.0,5,5114,13599,2900,3000,2900.0,2940.0,0.0,2940.0,-40.0,0.986395,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,2.0,4,40.7306,-73.9837,7200,8.881836,0.0,16,1849,7398,29396,0.036469,0.056629,6.0,6,1200.0,1800.0,3600.0,1200.0,1.0,1.5,3.0,2,661,2332,7200,7200,7200.0,7200.0,0.5,7200.0,0.0,1.0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1
4,1.0,1,40.7645,-73.984,2500,7.824046,0.0,50,1949,7698,30596,0.064471,0.025903,2.0,10,1250.0,2500.0,2500.0,250.0,5.0,10.0,10.0,16,4815,13599,2500,5300,3962.5,3987.1875,0.0,2500.0,0.0,1.0,1,0,0,0,0,0,1,1,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,2


In [114]:
testing.head()

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,log_price,half_bathrooms,density,geo_area_50,geo_area_100,geo_area_200,distance_to_fin,distance_to_cp,rooms,num_of_features,price_per_room,price_per_bedroom,price_per_bathroom,price_per_feature,features_per_room,features_per_bedroom,features_per_bathroom,street_count,bedrooms_count,bathrooms_count,min_price_by_address,max_price_by_address,median_price_by_address,mean_price_by_address,price_percentile_by_address,mkt_price,diff_to_mkt_price,ratio_to_mkt_price,Has_Laundry,Has_Parking,Has_Valet,Has_Deck,Has_Outdoor_Area,Has_Garden,Has_Dishwasher,Has_Gym,Has_Pool,Has_Elevator,Has_Storage,Has_Wifi,Has_Bike,Pet_Friendly,Has_Concierge,Has_Doorman,Has_Super,Has_Hardwood_Floor,Has_High_Ceilings,Has_Brick,Has_Renovated,Has_Photos,Has_Lounge,Has_Playroom,Has_AC,Has_Kitchen,No_Fee,Accessible,Multi_Level,Fire,Highrise,Marble_Bath,Pre_War,ID
0,1.0,0,40.7769,-73.9467,1945,7.573017,0.0,18,1947,7794,31189,0.095508,0.023087,1.0,5,1945.0,0.0,1945.0,389.0,5.0,0.0,5.0,8,3226,13599,1900,2650,1950.0,2026.25,0.06,1937.142857,7.857143,1.004056,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
1,1.0,0,40.761,-73.999,2400,7.783224,0.0,72,1949,7699,30599,0.056509,0.039036,1.0,11,2400.0,0.0,2400.0,218.181818,11.0,0.0,11.0,18,3226,13599,2375,5000,2900.0,3267.277778,0.009524,2581.5,-181.5,0.929692,1,0,0,1,0,0,1,1,1,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,2
2,1.0,2,40.7528,-73.9709,3750,8.229511,0.0,14,1898,7597,30194,0.061448,0.032397,3.0,3,1250.0,1875.0,3750.0,1250.0,1.0,1.5,3.0,2,5114,13599,3750,3995,3872.5,3872.5,0.0,3872.5,-122.5,0.968367,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
3,1.0,3,40.736,-73.986,4450,8.400659,0.0,16,1849,7398,29597,0.038883,0.05219,4.0,10,1112.5,1483.333333,4450.0,445.0,2.5,3.333333,10.0,1,2538,13599,4450,4450,4450.0,4450.0,0.5,4450.0,0.0,1.0,1,1,0,0,1,0,1,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,4
4,1.0,2,40.7084,-74.0048,5425,8.598773,0.0,7,1750,7000,28200,0.006139,0.08494,3.0,5,1808.333333,2712.5,5425.0,1085.0,1.666667,2.5,5.0,6,5114,13599,3015,5425,4530.0,4472.5,1.0,5151.666667,273.333333,1.053057,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5


### 4. Feature Selection
Performing recursive feature elimination to determine which variables are the most influential

In [None]:
## Using RFE with RandomForestClassifer to identify most important features
def flat_list(my_list):
    
    ## Defining list to store results
    out_list = list()
    for i in my_list:
        out_list += i
    return out_list

def RF_RFE_rep_cross_val(X, Y, numb_folds, max_features, numb_reps):
    
    ## Defining list to store results
    RFE_rep_results = list()
    for i in range(2, max_features):
        RFE_rep_results.append(RF_rep_cross_val(X, Y, numb_folds, i, numb_reps))
        print('Features -->', i) ## Sanity check
    return RFE_rep_results

def RF_rep_cross_val(X, Y, numb_folds, numb_features, numb_reps):
    
    ## Defining the list to store results
    rep_results = list()
    for i in range(0, numb_reps):
        rep_results.append(RF_cross_val(X, Y, numb_folds, numb_features))
    return flat_list(rep_results)

def RF_cross_val(X, Y, numb_folds, numb_features):
    
    ## Defining list to store results
    results = list()
    
    ## Defining the number of folds
    kf = KFold(n_splits = numb_folds, shuffle = True)
    
    for train_index, test_index in kf.split(X):
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        
        ## Running RFE with i features
        RF_rfe = RFE(estimator = RandomForestClassifier(n_estimators = 100, max_depth = 5), 
                     n_features_to_select = numb_features).fit(X_train, Y_train)
        
        ## Variables to be considered
        to_select = X_train.columns[RF_rfe.support_]
        to_select_list.append(RF_rfe.support_)
        
        ## Building the Random Forest model
        X_train_md = X_train[to_select]
        X_test_md = X_test[to_select]
        
        RF_md = RandomForestClassifier(n_estimators = 100, max_depth = 5).fit(X_train_md, Y_train)
        
        ## Predicting on the test data-frame and storing RMSE
        results.append(log_loss(Y_test, RF_md.predict_proba(X_test_md)))

    return results

## Defining list to store results
to_select_list = list()

## Defining input and target variables
X = training.drop(columns = ['interest_level']); Y = training['interest_level']

## Running RFE to estimate number of features to be selected
RFE_numb_features = RF_RFE_rep_cross_val(X, Y, numb_folds = 5, max_features = 31, numb_reps = 1)

In [None]:
## Identifying features
features = pd.DataFrame(to_select_list)
features.columns = X.columns
feature_selections = 100 * features.apply(np.sum, axis = 0) / features.shape[0]
feature_selections = pd.DataFrame(feature_selections).reset_index(drop = False)

## Model performance given the number of variables
feature_performance = pd.DataFrame(RFE_numb_features)
feature_performance.columns = [['Split_1', 'Split_2', 'Split_3']]
feature_performance['Mean'] = feature_performance.apply(np.mean, axis = 1)
feature_performance['Num_features'] = feature_performance.index + 2

In [None]:
feature_performance

In [None]:
feature_selections.sort_values(0, ascending = False)

### 5. Hyper-Parameter Tuning
Tuning the hyper-parameters for Random Forest, Hist Gradient Boosting, LightGBM, and XGBoost using Optuna framework

In [None]:
## Defining optuna objective functions

class rf_objective:

    def __init__(self, seed):
        self.seed = seed

    def __call__(self, trial):
        
        params = dict(criterion = 'log_loss',
                      n_estimators = trial.suggest_int('n_estimators', 100, 1500, step = 100),
                      max_depth = trial.suggest_int('max_depth', 3, 12, step = 1),
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 100, step = 5),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 100, step = 5))
        scores = []
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        for train_idx, valid_idx in skf.split(X, Y):

            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            Y_train , Y_valid = Y.iloc[train_idx] , Y.iloc[valid_idx]

            model = RandomForestClassifier(**params).fit(X_train, Y_train)

            preds_valid = model.predict_proba(X_valid)
            scores.append(log_loss(Y_valid, preds_valid))
        return np.mean(scores)
                                   
                                   
class xgb_objective:

    def __init__(self, seed):
        self.seed = seed

    def __call__(self, trial):
        
        params = dict(objective = 'multi:softprob',
                      eval_metric = 'mlogloss',
                      n_estimators = trial.suggest_int('n_estimators', 300, 1500, step = 100),
                      learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, step = 0.01),
                      max_depth = trial.suggest_int('max_depth', 3, 12, step = 1),
                      gamma = trial.suggest_float('reg_alpha', 0, 100, step = 10),
                      min_child_weight = trial.suggest_int('min_child_weight', 0, 200, step = 10),
                      subsample = trial.suggest_float('subsample', 0.6, 1, step = 0.05), 
                      colsample_bytree = trial.suggest_float('colsample_bytree', 0.6, 1, step = 0.05))
        scores = []
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        for train_idx, valid_idx in skf.split(X, Y):

            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            Y_train , Y_valid = Y.iloc[train_idx] , Y.iloc[valid_idx]

            model = XGBClassifier(**params).fit(X_train, Y_train)

            preds_valid = model.predict_proba(X_valid)
            scores.append(log_loss(Y_valid, preds_valid))
        return np.mean(scores)
                

class lgbm_objective:

    def __init__(self, seed):
        self.seed = seed

    def __call__(self, trial):
        
        params = dict(objective = 'multiclass',
                      metric = 'multi_logloss',
                      n_estimators = trial.suggest_int('n_estimators', 300, 1500, step = 100),
                      learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, step = 0.01),
                      max_depth = trial.suggest_int('max_depth', 3, 12, step = 1),
                      reg_alpha = trial.suggest_float('reg_alpha', 0.1, 10, log = True),
                      reg_lambda = trial.suggest_float('reg_lambda', 0.1, 10, log = True),
                      num_leaves = trial.suggest_int('num_leaves', 11, 101, step = 5),
                      subsample = trial.suggest_float('subsample', 0.4, 1, step = 0.05),
                      colsample_bytree = trial.suggest_float('colsample_bytree', 0.6, 1, step = 0.05))
        scores = []
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        for train_idx, valid_idx in skf.split(X, Y):

            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            Y_train , Y_valid = Y.iloc[train_idx] , Y.iloc[valid_idx]

            model = LGBMClassifier(**params).fit(X_train, Y_train)

            preds_valid = model.predict_proba(X_valid)
            scores.append(log_loss(Y_valid, preds_valid))
        return np.mean(scores)
                                   
class hist_objective:

    def __init__(self, seed):
        self.seed = seed

    def __call__(self, trial):
        
        params = dict(max_iter = trial.suggest_int('max_iter', 300, 1000, step = 100),
                      learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, step = 0.01),
                      max_depth = trial.suggest_int('max_depth', 3, 12, step = 1),
                      l2_regularization = trial.suggest_float('l2_regularization', 0.1, 10))
        scores = []
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        for train_idx, valid_idx in skf.split(X, Y):

            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            Y_train , Y_valid = Y.iloc[train_idx] , Y.iloc[valid_idx]

            model = HistGradientBoostingClassifier(**params).fit(X_train, Y_train)

            preds_valid = model.predict_proba(X_valid)
            scores.append(log_loss(Y_valid, preds_valid))
        return np.mean(scores)
    
## Defining SEED and Trials
SEED = 42
N_TRIALS = 100

## Defining input and target variables
X = training.drop(columns = ['interest_level'])
Y = training['interest_level']

## Executing the optimization
study_rf = optuna.create_study(direction = 'minimize')
study_rf.optimize(rf_objective(SEED), n_trials = N_TRIALS)

study_xgb = optuna.create_study(direction = 'minimize')
study_xgb.optimize(xgb_objective(SEED), n_trials = N_TRIALS)

study_lgbm = optuna.create_study(direction = 'minimize')
study_lgbm.optimize(lgbm_objective(SEED), n_trials = N_TRIALS)

study_hist = optuna.create_study(direction = 'minimize')
study_hist.optimize(hist_objective(SEED), n_trials = N_TRIALS)

In [None]:
print(study_rf.best_trial.params)
print(study_rf.best_trial.value)

print(study_xgb.best_trial.params)
print(study_xgb.best_trial.value)

print(study_lgbm.best_trial.params)
print(study_lgbm.best_trial.value)

print(study_hist.best_trial.params)
print(study_hist.best_trial.value)

### 6. Modelling
Building models with cross validation using optimized hyper-parameter sets

In [115]:
## Defining input and target variables
X = training.drop(columns = ['interest_level'])
Y = training['interest_level']

X_test = testing.drop(columns = ['ID'])

In [60]:
## RandomForest:
cv_scores, log_loss_scores = list(), list()
preds = list()

skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)
for i, (train_idx, test_idx) in enumerate(skf.split(X, Y)):

    ## Splitting the data 
    X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_val = Y.iloc[train_idx], Y.iloc[test_idx]
                
    ## Building LightGBM model
    model = RandomForestClassifier(n_estimators = 700,
                                   max_depth = 12,
                                   min_samples_split = 5,
                                   min_samples_leaf = 5).fit(X_train, Y_train)   
    
    ## Predicting on X_val and X_test
    model_pred_val = model.predict_proba(X_val)
    model_pred_test = model.predict_proba(X_test)
        
    ## Computing log-loss
    score = log_loss(Y_val, model_pred_val)
    log_loss_scores.append(score)
    preds.append(model_pred_test)
    print('Fold', i, ': log-loss-score ==>', score)

## Appending average cv scores
cv_scores.append(np.mean(log_loss_scores))
rf_cv_score = np.mean(cv_scores)    
print('Average log-loss of the RandomForest model over 10-folds is:', rf_cv_score)

## Averaging RF model preds
rf_preds_test = pd.DataFrame(np.mean(preds, axis = 0))

## Renaming columns by label class
rf_preds_test.columns = model.classes_

## Creating submission file
sub[['high', 'medium', 'low']] = rf_preds_test[[2, 1, 0]]
sub.to_csv('Submissions/rf_submission.csv', index = False)

Fold 0 : log-loss-score ==> 0.9231286682052121
Fold 1 : log-loss-score ==> 0.9243758379722198
Fold 2 : log-loss-score ==> 0.932038802567641
Fold 3 : log-loss-score ==> 0.9069644286912494
Fold 4 : log-loss-score ==> 0.9431972363687662
Fold 5 : log-loss-score ==> 0.9177193247671528
Fold 6 : log-loss-score ==> 0.932386251670474
Fold 7 : log-loss-score ==> 0.9183424873191454
Fold 8 : log-loss-score ==> 0.9277506539219498
Fold 9 : log-loss-score ==> 0.913341151691919
Average log-loss of the RandomForest model over 10-folds is: 0.9239244843175729


In [None]:
## XGBoost:
cv_scores, log_loss_scores = list(), list()
preds = list()

skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)
for i, (train_idx, test_idx) in enumerate(skf.split(X, Y)):

    ## Splitting the data 
    X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_val = Y.iloc[train_idx], Y.iloc[test_idx]
                
    ## Building LightGBM model
    model = XGBClassifier(n_estimators = 1500,
                          max_depth = 10,
                          learning_rate = 0.17,
                          gamma = 10, 
                          min_child_weight = 0, 
                          subsample = 0.85,
                          colsample_bytree = 0.9, 
                          objective = 'multi:softprob', 
                          eval_metric = 'mlogloss').fit(X_train, Y_train)  
    
    ## Predicting on X_val and X_test
    model_pred_val = model.predict_proba(X_val)
    model_pred_test = model.predict_proba(X_test)
        
    ## Computing log-loss
    score = log_loss(Y_val, model_pred_val)
    log_loss_scores.append(score)
    preds.append(model_pred_test)
    print('Fold', i, ': log-loss-score ==>', score)
        
## Appending average cv scores
cv_scores.append(np.mean(log_loss_scores))
xgb_cv_score = np.mean(cv_scores)    
print('Average log-loss of the XGBoost model over 10-folds is:', xgb_cv_score)

## Averaging LGBM model preds
xgb_preds_test = pd.DataFrame(np.mean(preds, axis = 0))

## Renaming columns by label class
xgb_preds_test.columns = model.classes_

## Creating submission file
sub[['high', 'medium', 'low']] = xgb_preds_test[[2, 1, 0]]
sub.to_csv('Submissions/xgb_submission.csv', index = False)

In [116]:
## LightGBM:
cv_scores, log_loss_scores = list(), list()
preds = list()

skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)
for i, (train_idx, test_idx) in enumerate(skf.split(X, Y)):

    ## Splitting the data 
    X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_val = Y.iloc[train_idx], Y.iloc[test_idx]
                
    ## Building LightGBM model
    model = LGBMClassifier(n_estimators = 500,
                           max_depth = 4,
                           learning_rate = 0.06,
                           num_leaves = 56,
                           reg_alpha = 2.41, 
                           reg_lambda = 0.15, 
                           subsample = 0.95,
                           colsample_bytree = 0.6).fit(X_train, Y_train)    
    
    ## Predicting on X_val and X_test
    model_pred_val = model.predict_proba(X_val)
    model_pred_test = model.predict_proba(X_test)
        
    ## Computing log-loss
    score = log_loss(Y_val, model_pred_val)
    log_loss_scores.append(score)
    preds.append(model_pred_test)
    print('Fold', i, ': log-loss-score ==>', score)
        
## Appending average cv scores
cv_scores.append(np.mean(log_loss_scores))
lgbm_cv_score = np.mean(cv_scores)    
print('Average log-loss of the LightGBM model over 10-folds is:', lgbm_cv_score)

## Averaging LGBM model preds
lgbm_preds_test = pd.DataFrame(np.mean(preds, axis = 0))

## Renaming columns by label class
lgbm_preds_test.columns = model.classes_

## Creating submission file
sub[['high', 'medium', 'low']] = lgbm_preds_test[[2, 1, 0]]
sub.to_csv('Submissions/lgbm_submission.csv', index = False)

Fold 0 : log-loss-score ==> 0.8333140782022196
Fold 1 : log-loss-score ==> 0.8548253708988274
Fold 2 : log-loss-score ==> 0.828155422784764
Fold 3 : log-loss-score ==> 0.8298765912047754
Fold 4 : log-loss-score ==> 0.8463260161946797
Fold 5 : log-loss-score ==> 0.7972741876878633
Fold 6 : log-loss-score ==> 0.8358480748997852
Fold 7 : log-loss-score ==> 0.8298949852791104
Fold 8 : log-loss-score ==> 0.8496832769871117
Fold 9 : log-loss-score ==> 0.7988454532024684
Average log-loss of the LightGBM model over 10-folds is: 0.8304043457341604


In [117]:
## HistGradientBoosting:
cv_scores, log_loss_scores = list(), list()
preds = list()

skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)
for i, (train_idx, test_idx) in enumerate(skf.split(X, Y)):

    ## Splitting the data 
    X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_val = Y.iloc[train_idx], Y.iloc[test_idx]
                
    ## Building LightGBM model
    model = HistGradientBoostingClassifier(max_iter = 500,
                                           max_depth = 9,
                                           learning_rate = 0.01,
                                           l2_regularization = 4.88).fit(X_train, Y_train)
    
    ## Predicting on X_val and X_test
    model_pred_val = model.predict_proba(X_val)
    model_pred_test = model.predict_proba(X_test)
        
    ## Computing log-loss
    score = log_loss(Y_val, model_pred_val)
    log_loss_scores.append(score)
    preds.append(model_pred_test)
    print('Fold', i, ': log-loss-score ==>', score)
        
## Appending average cv scores
cv_scores.append(np.mean(log_loss_scores))
hist_cv_score = np.mean(cv_scores)    
print('Average log-loss of the HistGB model over 10-folds is:', hist_cv_score)

## Averaging LGBM model preds
hist_preds_test = pd.DataFrame(np.mean(preds, axis = 0))

## Renaming columns by label class
hist_preds_test.columns = model.classes_

## Creating submission file
sub[['high', 'medium', 'low']] = hist_preds_test[[2, 1, 0]]
sub.to_csv('Submissions/hist_submission.csv', index = False)

Fold 0 : log-loss-score ==> 0.8424893908139456
Fold 1 : log-loss-score ==> 0.8424298589764635
Fold 2 : log-loss-score ==> 0.8251800581850407
Fold 3 : log-loss-score ==> 0.830131766785172
Fold 4 : log-loss-score ==> 0.8473630384624175
Fold 5 : log-loss-score ==> 0.7912545690098577
Fold 6 : log-loss-score ==> 0.829782083207241
Fold 7 : log-loss-score ==> 0.8140710439214494
Fold 8 : log-loss-score ==> 0.8439721214625762
Fold 9 : log-loss-score ==> 0.8041354775831653
Average log-loss of the HistGB model over 10-folds is: 0.8270809408407327


### Ensembling
Averaging predictions from different models using thier optimized hyper-parameter sets

In [118]:
cv_scores, log_loss_scores = list(), list()
preds = list()

skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)
for i, (train_idx, test_idx) in enumerate(skf.split(X, Y)):

    ## Splitting the data 
    X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_val = Y.iloc[train_idx], Y.iloc[test_idx]
    
    ## Initializing val_preds list
    val_preds = list()
                
    ## Building LightGBM model
    model1 = HistGradientBoostingClassifier(max_iter = 500,
                                           max_depth = 9,
                                           learning_rate = 0.01,
                                           l2_regularization = 4.88).fit(X_train, Y_train)   
    
    model2 = LGBMClassifier(n_estimators = 500,
                           max_depth = 3,
                           learning_rate = 0.06,
                           num_leaves = 56,
                           reg_alpha = 2.41, 
                           reg_lambda = 0.15, 
                           subsample = 0.95,
                           colsample_bytree = 0.6).fit(X_train, Y_train)
    
    ## Predicting on X_val and X_test
    model1_pred_val = model1.predict_proba(X_val)
    model2_pred_val = model2.predict_proba(X_val)
    
    model1_pred_test = model1.predict_proba(X_test)
    model2_pred_test = model2.predict_proba(X_test)
    
    ## Averaging val predictions
    val_preds.append(model1_pred_val); val_preds.append(model2_pred_val)
    val_preds = pd.DataFrame(np.mean(val_preds, axis = 0))
    
    ## Computing log-loss
    score = log_loss(Y_val, val_preds)
    log_loss_scores.append(score)
    preds.append(model1_pred_test); preds.append(model2_pred_test)
    print('Fold', i, ': log-loss-score ==>', score)
        
## Appending average cv scores
cv_scores.append(np.mean(log_loss_scores))
ens_cv_score = np.mean(cv_scores)    
print('Average log-loss of the Ensemble model over 10-folds is:', ens_cv_score)

## Averaging LGBM model preds
ens_preds_test = pd.DataFrame(np.mean(preds, axis = 0))

## Renaming columns by label class
ens_preds_test.columns = model1.classes_

## Creating submission file
sub[['high', 'medium', 'low']] = ens_preds_test[[2, 1, 0]]
sub.to_csv('Submissions/ens_submission_day22.csv', index = False)

Fold 0 : log-loss-score ==> 0.8328656506418749
Fold 1 : log-loss-score ==> 0.8425179426385764
Fold 2 : log-loss-score ==> 0.8166961586069671
Fold 3 : log-loss-score ==> 0.8250684930343518
Fold 4 : log-loss-score ==> 0.846191722746808
Fold 5 : log-loss-score ==> 0.7926014928125634
Fold 6 : log-loss-score ==> 0.8304704176310442
Fold 7 : log-loss-score ==> 0.8169346491847175
Fold 8 : log-loss-score ==> 0.8402760108567965
Fold 9 : log-loss-score ==> 0.7992230035649801
Average log-loss of the Ensemble model over 10-folds is: 0.824284554171868


In [23]:
cv_scores, log_loss_scores = list(), list()
preds = list()

skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)
for i, (train_idx, test_idx) in enumerate(skf.split(X, Y)):

    ## Splitting the data 
    X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_val = Y.iloc[train_idx], Y.iloc[test_idx]
    
    ## Initializing val_preds list
    val_preds = list()
                
    ## Building LightGBM model
    model1 = HistGradientBoostingClassifier(max_iter = 500,
                                           max_depth = 9,
                                           learning_rate = 0.01,
                                           l2_regularization = 4.88).fit(X_train, Y_train)   
    
    model2 = LGBMClassifier(n_estimators = 500,
                           max_depth = 3,
                           learning_rate = 0.06,
                           num_leaves = 56,
                           reg_alpha = 2.41, 
                           reg_lambda = 0.15, 
                           subsample = 0.95,
                           colsample_bytree = 0.6).fit(X_train, Y_train)
    
    model3 = XGBClassifier(n_estimators = 1500,
                          max_depth = 10,
                          learning_rate = 0.17,
                          gamma = 10, 
                          min_child_weight = 0, 
                          subsample = 0.85,
                          colsample_bytree = 0.9, 
                          objective = 'multi:softprob', 
                          eval_metric = 'mlogloss').fit(X_train, Y_train) 
    
    ## Predicting on X_val and X_test
    model1_pred_val = model1.predict_proba(X_val)
    model2_pred_val = model2.predict_proba(X_val)
    model3_pred_val = model3.predict_proba(X_val)
    
    model1_pred_test = model1.predict_proba(X_test)
    model2_pred_test = model2.predict_proba(X_test)
    model3_pred_test = model3.predict_proba(X_test)
    
    ## Averaging val predictions
    val_preds.append(model1_pred_val); val_preds.append(model2_pred_val); val_preds.append(model3_pred_val)
    val_preds = pd.DataFrame(np.mean(val_preds, axis = 0))
    
    ## Computing log-loss
    score = log_loss(Y_val, val_preds)
    log_loss_scores.append(score)
    preds.append(model1_pred_test); preds.append(model2_pred_test); preds.append(model3_pred_test)
    print('Fold', i, ': log-loss-score ==>', score)
        
## Appending average cv scores
cv_scores.append(np.mean(log_loss_scores))
ens_cv_score = np.mean(cv_scores)    
print('Average log-loss of the Ensemble model over 10-folds is:', ens_cv_score)

## Averaging LGBM model preds
ens_preds_test = pd.DataFrame(np.mean(preds, axis = 0))

## Renaming columns by label class
ens_preds_test.columns = model1.classes_

## Creating submission file
sub[['high', 'medium', 'low']] = ens_preds_test[[2, 1, 0]]
sub.to_csv('Submissions/ens2_submission_day2.csv', index = False)

Fold 0 : log-loss-score ==> 0.8243316195896278
Fold 1 : log-loss-score ==> 0.8313601521777878
Fold 2 : log-loss-score ==> 0.8126345050021551
Fold 3 : log-loss-score ==> 0.8185154893650244
Fold 4 : log-loss-score ==> 0.8350710542847061
Fold 5 : log-loss-score ==> 0.7876522561687304
Fold 6 : log-loss-score ==> 0.8182068088765823
Fold 7 : log-loss-score ==> 0.8084622346360046
Fold 8 : log-loss-score ==> 0.8291069809873695
Fold 9 : log-loss-score ==> 0.7962472261594239
Average log-loss of the Ensemble model over 10-folds is: 0.8161588327247411


In [None]:
cv_scores, log_loss_scores = list(), list()
preds = list()

skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)
for i, (train_idx, test_idx) in enumerate(skf.split(X, Y)):

    ## Splitting the data 
    X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_val = Y.iloc[train_idx], Y.iloc[test_idx]
    
    ## Initializing val_preds list
    val_preds = list()
                
    ## Building LightGBM model
    model1 = HistGradientBoostingClassifier(max_iter = 300,
                                           max_depth = 12,
                                           learning_rate = 0.02,
                                           l2_regularization = 4.26).fit(X_train, Y_train)    
    
    model2 = LGBMClassifier(n_estimators = 400,
                           max_depth = 3,
                           learning_rate = 0.27,
                           num_leaves = 56,
                           reg_alpha = 9.96, 
                           reg_lambda = 0.85, 
                           subsample = 0.85,
                           colsample_bytree = 0.8).fit(X_train, Y_train) 
    
    model4 = RandomForestClassifier(n_estimators = 800,
                                   max_depth = 11,
                                   min_samples_split = 50,
                                   min_samples_leaf = 5).fit(X_train, Y_train)  
    
    ## Predicting on X_val and X_test
    model1_pred_val = model1.predict_proba(X_val)
    model2_pred_val = model2.predict_proba(X_val)
    #model3_pred_val = model3.predict_proba(X_val)
    model4_pred_val = model4.predict_proba(X_val)
    
    model1_pred_test = model1.predict_proba(X_test)
    model2_pred_test = model2.predict_proba(X_test)
    #model3_pred_test = model3.predict_proba(X_test)
    model4_pred_test = model3.predict_proba(X_test)
    
    ## Averaging val predictions
    val_preds.append(model1_pred_val); val_preds.append(model2_pred_val); val_preds.append(model4_pred_val)
    val_preds = pd.DataFrame(np.mean(val_preds, axis = 0))
    
    ## Computing log-loss
    score = log_loss(Y_val, val_preds)
    log_loss_scores.append(score)
    preds.append(model1_pred_test); preds.append(model2_pred_test); preds.append(model4_pred_test)
    print('Fold', i, ': log-loss-score ==>', score)
        
## Appending average cv scores
cv_scores.append(np.mean(log_loss_scores))
ens_cv_score = np.mean(cv_scores)    
print('Average log-loss of the Ensemble model over 10-folds is:', ens_cv_score)

## Averaging LGBM model preds
ens_preds_test = pd.DataFrame(np.mean(preds, axis = 0))

## Renaming columns by label class
ens_preds_test.columns = model1.classes_

## Creating submission file
sub[['high', 'medium', 'low']] = ens_preds_test[[2, 1, 0]]
sub.to_csv('Submissions/ens3_submission.csv', index = False)