# STAT 857 - W23 Project 2
## Evan Callaghan | April 17, 2023

### 1. Configuring setup
Installing packages and loading libraries

In [None]:
pip install lightgbm xgboost reverse_geocoder haversine optuna

In [32]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import optuna
from haversine import haversine
import reverse_geocoder as revgc
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, KFold
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import RFECV, RFE
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, AdaBoostClassifier, StackingClassifier, VotingClassifier
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

pd.set_option('display.max_columns', None, 'display.max_rows', None)

### 2. Data Exploration Section

enter brief description of this section here...

In [None]:
## Reading the data
train = pd.read_csv('Data/W23P2_train.csv')
test = pd.read_csv('Data/W23P2_test.csv')
sub = pd.read_csv('Data/Sample_Submission.csv')

In [None]:
train.head()

In [None]:
print(train.shape)
print(test.shape)

In [None]:
train['interest_level'].value_counts()

In [None]:
train.describe()

#### Data Cleaning

In [None]:
## Removing training observations with prices higher than $40,000
train = train[train['price'] < 40000].reset_index(drop = True)

## Creating columns for outlier locations
train['location_outlier'] = np.where(train['latitude'] < 35, 1, 0)
test['location_outlier'] = np.where(test['latitude'] < 35, 1, 0)

## Log transformation on the price variable
train['price'] = np.log(train['price'])
test['price'] = np.log(test['price'])

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA().fit(X)

print(pca.explained_variance_ratio_)

In [None]:
X_pca = pca.components_

X_pca.head()

In [None]:
## Condensing redundant variables

laundry_vars = ['Laundry.in.Building', 'Laundry.in.Unit', 'Laundry.In.Building', 'Laundry.In.Unit', 'LAUNDRY', 'Washer.in.Unit', 
                'Dryer.in.Unit', 'Laundry.Room', 'Laundry', 'On.site.laundry', 'On.site.Laundry', 'Washer.Dryer', 'Washer.Dryer.in.building', 
                'In.Unit.Washer.Dryer', 'Washer...Dryer', 'Washer.Dryer.in.Unit']

parking_vars = ['Parking.Space', 'Garage', 'Parking', 'On.site.Garage', 'assigned.parking.space', 'Common.parking.Garage', 'Full.Service.Garage', 
               'On.site.Parking.Lot', 'Private.parking', 'Valet.Parking', 'Valet']

deck_vars = ['Roof.Deck', 'Balcony', 'Terrace', 'Patio', 'Roof.deck', 'balcony', 'terrace', 'patio', 'private.balcony', 'Private.balcony', 
             'Private.Deck', 'Common.roof.deck', 'ROOFDECK']

outdoor_vars = ['Courtyard', 'Outdoor.Entertainment.Space', 'Common.garden', 'Private.Outdoor.Space', 'private.outdoor.space', 'Private.outdoor.space', 
               'garden', 'Common.Outdoor.Space', 'PublicOutdoor', 'Garden.Patio', 'Outdoor.Space', 'Garden', 'Outdoor.Areas', 'Common.backyard', 
                'building.common.outdoor.space', 'Residents.Garden']

dishwasher_vars = ['Dishwasher', 'dishwasher']
gym_vars = ['Fitness.Center', 'Gym.Fitness', 'Health.Club', 'Gym', 'gym', 'Gym.In.Building']
pool_vars = ['Swimming.Pool', 'Pool', 'pool', 'Indoor.Pool']
elevator_vars = ['Elevator', 'elevator']
storage_vars = ['Storage', 'storage', 'Basement.Storage']
internet_vars = ['High.Speed.Internet', 'WiFi', 'WiFi.Access']
bike_vars = ['Bike.room', 'Bike.Room']
pet_friendly_vars = ['Dogs.Allowed', 'Cats.Allowed', 'Pet.Friendly', 'Pets.on.approval']
service_vars = ['Doorman', 'Concierge', 'Concierge.Service', 'Full.time.doorman', 'X24.7.Concierge', 'Virtual.Doorman', 'FT.Doorman', 'doorman']
super_vars = ['LIVE.IN.SUPER', 'Live.in.superintendent', 'Live.In.Superintendent','Live.in.Super', 'Live.In.Super']
hardwood_vars = ['Hardwood.Floors', 'HARDWOOD', 'Hardwood.floors', 'Hardwood']
ceiling_vars = ['High.ceilings', 'High.Ceilings', 'HIGH.CEILINGS', 'High.Ceiling']
brick_vars = ['EXPOSED.BRICK', 'Exposed.Brick']
construction_vars = ['New.Construction', 'Newly.renovated','Renovated', 'renovated', 'New.construction']
photo_vars = ['Actual.Apt..Photos', 'ACTUAL.APT..PHOTOS']
lounge_vars = ['Residents.Lounge', 'Lounge.room', 'Lounge']
playroom_vars = ['Childrens.Playroom', 'Children.s.Playroom']
ac_vars = ['Central.A.C', 'Air.conditioning']
kitchen_vars = ['EAT.IN.KITCHEN','Eat.In.Kitchen', 'Granite.Kitchen']
pricing_vars = ['No.Fee', 'NO.FEE']
accessibity_vars = ['Wheelchair.Ramp', 'Wheelchair.Access']
multi_level_vars = ['Multi.Level', 'Multi.level']
fireplace_vars = ['Fireplace', 'Decorative.Fireplace']
highrise_vars = ['Hi.Rise', 'HIGHRISE']
marble_bath_vars = ['Marble.Bath', 'Marble.Bathroom']
prewar_vars = ['Pre.War', 'prewar', 'Prewar']

## Condensing training data
train['Has_Laundry'] = np.where(np.sum(train[laundry_vars], axis = 1) > 0, 1, 0)
train['Has_Dishwasher'] = np.where(np.sum(train[dishwasher_vars], axis = 1) > 0, 1, 0)
train['Has_Gym'] = np.where(np.sum(train[gym_vars], axis = 1) > 0, 1, 0)
train['Has_pool'] = np.where(np.sum(train[pool_vars], axis = 1) > 0, 1, 0)
train['Has_Elevator'] = np.where(np.sum(train[elevator_vars], axis = 1) > 0, 1, 0)
train['Has_storage'] = np.where(np.sum(train[storage_vars], axis = 1) > 0, 1, 0)
train['Has_Wifi'] = np.where(np.sum(train[internet_vars], axis = 1) > 0, 1, 0)
train['Has_Bike'] = np.where(np.sum(train[bike_vars], axis = 1) > 0, 1, 0)
train['Has_Parking'] = np.where(np.sum(train[parking_vars], axis = 1) > 0, 1, 0)
train['Has_Deck'] = np.where(np.sum(train[deck_vars], axis = 1) > 0, 1, 0)
train['Has_Lounge'] = np.where(np.sum(train[lounge_vars], axis = 1) > 0, 1, 0)
train['Has_Playroom'] = np.where(np.sum(train[playroom_vars], axis = 1) > 0, 1, 0)
train['Has_AC'] = np.where(np.sum(train[ac_vars], axis = 1) > 0, 1, 0)
train['Has_Kitchen'] = np.where(np.sum(train[kitchen_vars], axis = 1) > 0, 1, 0)
train['No_Fee'] = np.where(np.sum(train[pricing_vars], axis = 1) > 0, 1, 0)
train['Outdoor_Area'] = np.where(np.sum(train[outdoor_vars], axis = 1) > 0, 1, 0)
train['Pet_Friendly'] = np.where(np.sum(train[pet_friendly_vars], axis = 1) > 0, 1, 0)
train['Has_Services'] = np.where(np.sum(train[service_vars], axis = 1) > 0, 1, 0)
train['Super.'] = np.where(np.sum(train[super_vars], axis = 1) > 0, 1, 0)
train['Hardwood_Floor'] = np.where(np.sum(train[hardwood_vars], axis = 1) > 0, 1, 0)
train['High_Ceilings'] = np.where(np.sum(train[ceiling_vars], axis = 1) > 0, 1, 0)
train['Brick'] = np.where(np.sum(train[brick_vars], axis = 1) > 0, 1, 0)
train['Newly_Renovated'] = np.where(np.sum(train[construction_vars], axis = 1) > 0, 1, 0)
train['Has_Photos'] = np.where(np.sum(train[photo_vars], axis = 1) > 0, 1, 0)
train['Accessible'] = np.where(np.sum(train[accessibity_vars], axis = 1) > 0, 1, 0)
train['Multi_Level'] = np.where(np.sum(train[multi_level_vars], axis = 1) > 0, 1, 0)
train['Fire'] = np.where(np.sum(train[fireplace_vars], axis = 1) > 0, 1, 0)
train['Highrise'] = np.where(np.sum(train[highrise_vars], axis = 1) > 0, 1, 0)
train['Marble_Bath'] = np.where(np.sum(train[marble_bath_vars], axis = 1) > 0, 1, 0)
train['Pre_War'] = np.where(np.sum(train[prewar_vars], axis = 1) > 0, 1, 0)

## Condensing testing data
test['Has_Laundry'] = np.where(np.sum(test[laundry_vars], axis = 1) > 0, 1, 0)
test['Has_Dishwasher'] = np.where(np.sum(test[dishwasher_vars], axis = 1) > 0, 1, 0)
test['Has_Gym'] = np.where(np.sum(test[gym_vars], axis = 1) > 0, 1, 0)
test['Has_pool'] = np.where(np.sum(test[pool_vars], axis = 1) > 0, 1, 0)
test['Has_Elevator'] = np.where(np.sum(test[elevator_vars], axis = 1) > 0, 1, 0)
test['Has_storage'] = np.where(np.sum(test[storage_vars], axis = 1) > 0, 1, 0)
test['Has_Wifi'] = np.where(np.sum(test[internet_vars], axis = 1) > 0, 1, 0)
test['Has_Bike'] = np.where(np.sum(test[bike_vars], axis = 1) > 0, 1, 0)
test['Has_Parking'] = np.where(np.sum(test[parking_vars], axis = 1) > 0, 1, 0)
test['Has_Deck'] = np.where(np.sum(test[deck_vars], axis = 1) > 0, 1, 0)
test['Has_Lounge'] = np.where(np.sum(test[lounge_vars], axis = 1) > 0, 1, 0)
test['Has_Playroom'] = np.where(np.sum(test[playroom_vars], axis = 1) > 0, 1, 0)
test['Has_AC'] = np.where(np.sum(test[ac_vars], axis = 1) > 0, 1, 0)
test['Has_Kitchen'] = np.where(np.sum(test[kitchen_vars], axis = 1) > 0, 1, 0)
test['No_Fee'] = np.where(np.sum(test[pricing_vars], axis = 1) > 0, 1, 0)
test['Outdoor_Area'] = np.where(np.sum(test[outdoor_vars], axis = 1) > 0, 1, 0)
test['Pet_Friendly'] = np.where(np.sum(test[pet_friendly_vars], axis = 1) > 0, 1, 0)
test['Has_Services'] = np.where(np.sum(test[service_vars], axis = 1) > 0, 1, 0)
test['Super.'] = np.where(np.sum(test[super_vars], axis = 1) > 0, 1, 0)
test['Hardwood_Floor'] = np.where(np.sum(test[hardwood_vars], axis = 1) > 0, 1, 0)
test['High_Ceilings'] = np.where(np.sum(test[ceiling_vars], axis = 1) > 0, 1, 0)
test['Brick'] = np.where(np.sum(test[brick_vars], axis = 1) > 0, 1, 0)
test['Newly_Renovated'] = np.where(np.sum(test[construction_vars], axis = 1) > 0, 1, 0)
test['Has_Photos'] = np.where(np.sum(test[photo_vars], axis = 1) > 0, 1, 0)
test['Accessible'] = np.where(np.sum(test[accessibity_vars], axis = 1) > 0, 1, 0)
test['Multi_Level'] = np.where(np.sum(test[multi_level_vars], axis = 1) > 0, 1, 0)
test['Fire'] = np.where(np.sum(test[fireplace_vars], axis = 1) > 0, 1, 0)
test['Highrise'] = np.where(np.sum(test[highrise_vars], axis = 1) > 0, 1, 0)
test['Marble_Bath'] = np.where(np.sum(test[marble_bath], axis = 1) > 0, 1, 0)
test['Pre_War'] = np.where(np.sum(test[prewar_vars], axis = 1) > 0, 1, 0)

to_drop = [laundry_vars, dishwasher_vars, gym_vars, pool_vars, elevator_vars, storage_vars, internet_vars, bike_vars, 
           parking_vars, deck_vars, lounge_vars, playroom_vars, ac_vars, kitchen_vars, pricing_vars, outdoor_vars, 
           pet_friendly_vars, service_vars, super_vars, hardwood_vars, ceiling_vars, brick_vars, construction_vars, photo_vars, 
           accessibity_vars, multi_level_vars, fireplace_vars, highrise_vars, marble_bath, prewar_vars] 

for cols in to_drop:
    train = train.drop(columns = cols)
    test = test.drop(columns = cols)

#### Variable Engineering

In [None]:
amenities = ['Has_Laundry', 'Has_Dishwasher', 'Has_Gym', 'Has_pool', 'Has_Elevator', 'Has_storage', 'Has_Wifi', 'Has_Bike', 
             'Has_Parking', 'Has_Deck', 'Has_Lounge', 'Has_Playroom', 'Has_AC', 'Has_Kitchen', 'Outdoor_Area', 'Furnished', 
             'Stainless.Steel.Appliances', 'Cable.Satellite.TV', 'Microwave', 'Sauna']

X_train_amenities = train[amenities]
Y_train_amenities = train['interest_level']

X_test_amenities = test[amenities]

## Building the K-NN amentity model
knn_amentity_md = KNeighborsClassifier(n_neighbors = 15).fit(X_train_amenities, Y_train_amenities)

## Predicting on the training and testing sets
train_preds = knn_amentity_md.predict_proba(X_train_amenities)
test_preds = knn_amentity_md.predict_proba(X_test_amenities)

## Extracting the likelihood of high interest
train['Amenity_Score'] = pd.DataFrame(train_preds)[0]
test['Amenity_Score'] = pd.DataFrame(test_preds)[0]

In [None]:
qualities = ['Exclusive', 'View', 'Green.Building', 'Light', 'Skylight', 'Walk.in.Closet.s.', 'Luxury.building', 
             'Hardwood_Floor', 'High_Ceilings', 'Brick', 'Newly_Renovated', 'Multi_Level', 'Highrise', 'Marble_Bath', 'Fire']

X_train_qualities = train[qualities]
Y_train_qualities = train['interest_level']

X_test_qualities = test[qualities]

## Building the K-NN amentity model
knn_quality_md = KNeighborsClassifier(n_neighbors = 15).fit(X_train_qualities, Y_train_qualities)

## Predicting on the training and testing sets
train_preds = knn_quality_md.predict_proba(X_train_qualities)
test_preds = knn_quality_md.predict_proba(X_test_qualities)

## Extracting the likelihood of high interest
train['Quality_Score'] = pd.DataFrame(train_preds)[0]
test['Quality_Score'] = pd.DataFrame(test_preds)[0]

In [None]:
## Computing distance from unit to Central Park
def central_park(data):
    data['distance'] = np.nan
    for i in range(data.shape[0]):
        lat = data.at[i, 'latitude']; long = data.at[i, 'longitude']
        data.at[i, 'distance'] = haversine((lat, long), (40.781179, -73.966098), unit = 'mi')
    return data

## Applying function
train = central_park(train)
test = central_park(test)

In [None]:
## Price per bedroom
rooms_train = train['bedrooms'].apply(lambda x: max(x, 0.5))
train['price_per_bedroom'] = train['price'] / rooms_train

rooms_test = test['bedrooms'].apply(lambda x: max(x, 0.5))
test['price_per_bedroom'] = test['price'] / rooms_test

In [None]:
## Standardizing numeric variables
def min_max_scaler(data, minimun, maximum):
    data_scaled = (data - minimun) / (maximum - minimun)
    return data_scaled

train['price'] = min_max_scaler(train['price'], np.min(train['price']), np.max(train['price']))
train['bathrooms'] = min_max_scaler(train['bathrooms'], np.min(train['bathrooms']), np.max(train['bathrooms']))
train['bedrooms'] = min_max_scaler(train['bedrooms'], np.min(train['bedrooms']), np.max(train['bedrooms']))
train['distance'] = min_max_scaler(train['distance'], np.min(train['distance']), np.max(train['distance']))
train['price_per_bedroom'] = min_max_scaler(train['price_per_bedroom'], np.min(train['price_per_bedroom']), np.max(train['price_per_bedroom']))
train['latitude'] = min_max_scaler(train['latitude'], np.min(train['latitude']), np.max(train['latitude']))
train['longitude'] = min_max_scaler(train['longitude'], np.min(train['longitude']), np.max(train['longitude']))


test['price'] = min_max_scaler(test['price'], np.min(train['price']), np.max(train['price']))
test['bathrooms'] = min_max_scaler(test['bathrooms'], np.min(train['bathrooms']), np.max(train['bathrooms']))
test['bedrooms'] = min_max_scaler(test['bedrooms'], np.min(train['bedrooms']), np.max(train['bedrooms']))
test['distance'] = min_max_scaler(test['distance'], np.min(train['distance']), np.max(train['distance']))
test['price_per_bedroom'] = min_max_scaler(test['price_per_bedroom'], np.min(train['price_per_bedroom']), np.max(train['price_per_bedroom']))
test['latitude'] = min_max_scaler(test['latitude'], np.min(train['latitude']), np.max(train['latitude']))
test['longitude'] = min_max_scaler(test['longitude'], np.min(train['longitude']), np.max(train['longitude']))

In [None]:
train = train[['interest_level', 'bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 'Dining.Room', 'Reduced.Fee', 'Exclusive', 
               'LOWRISE', 'SIMPLEX', 'Furnished', 'Loft', 'Stainless.Steel.Appliances', 'View', 'Green.Building', 'Short.Term.Allowed', 
               'Subway', 'Light', 'Guarantors.Accepted', 'Skylight', 'Sauna', 'Live.Work', 'Duplex', 'Walk.in.Closet.s.', 'Luxury.building', 
               'Post.War', 'Cable.Satellite.TV', 'Microwave', 'Sublet', 'Shares.OK', 'Has_Laundry', 'Has_Dishwasher', 'Has_Gym', 'Has_pool', 
               'Has_Elevator', 'Has_storage', 'Has_Wifi', 'Has_Bike', 'Has_Parking', 'Has_Deck', 'Has_Lounge', 'Has_Playroom', 'Has_AC', 
               'Has_Kitchen', 'No_Fee', 'Outdoor_Area', 'Pet_Friendly', 'Has_Services', 'Super.', 'Hardwood_Floor', 'High_Ceilings', 'Brick', 
               'Newly_Renovated', 'Has_Photos', 'Accessible', 'Multi_Level', 'Fire', 'Highrise', 'Marble_Bath', 'Pre_War', 'Amenity_Score', 
               'Quality_Score', 'distance', 'price_per_bedroom', 'location_outlier']]

test = test[['ID', 'bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 'Dining.Room', 'Reduced.Fee', 'Exclusive', 
             'LOWRISE', 'SIMPLEX', 'Furnished', 'Loft', 'Stainless.Steel.Appliances', 'View', 'Green.Building', 'Short.Term.Allowed', 
             'Subway', 'Light', 'Guarantors.Accepted', 'Skylight', 'Sauna', 'Live.Work', 'Duplex', 'Walk.in.Closet.s.', 'Luxury.building', 
             'Post.War', 'Cable.Satellite.TV', 'Microwave', 'Sublet', 'Shares.OK', 'Has_Laundry', 'Has_Dishwasher', 'Has_Gym', 'Has_pool', 
             'Has_Elevator', 'Has_storage', 'Has_Wifi', 'Has_Bike', 'Has_Parking', 'Has_Deck', 'Has_Lounge', 'Has_Playroom', 'Has_AC', 
             'Has_Kitchen', 'No_Fee', 'Outdoor_Area', 'Pet_Friendly', 'Has_Services', 'Super.', 'Hardwood_Floor', 'High_Ceilings', 'Brick', 
             'Newly_Renovated', 'Has_Photos', 'Accessible', 'Multi_Level', 'Fire', 'Highrise', 'Marble_Bath', 'Pre_War', 'Amenity_Score', 
             'Quality_Score', 'distance', 'price_per_bedroom', 'location_outlier']]

In [None]:
train.head()

#### Splitting the training data into train and validation sets

In [None]:
## Defining input and target variables
X = train.drop(columns = ['interest_level'])
Y = train['interest_level']

## Splitting the data
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 42)

## Saving new csv files
pd.concat([X_train, Y_train], axis = 1).to_csv('Data/Training.csv', index = False)
pd.concat([X_validation, Y_validation], axis = 1).to_csv('Data/Validation.csv', index = False)
test.to_csv('Data/Testing.csv', index = False)

#### Feature Selection

In [3]:
## Using RFE with RandomForestClassifer to identify most important features
def flat_list(my_list):
    
    ## Defining list to store results
    out_list = list()
    for i in my_list:
        out_list += i
    return out_list

def RF_RFE_rep_cross_val(X, Y, numb_folds, max_features, numb_reps):
    
    ## Defining list to store results
    RFE_rep_results = list()
    for i in range(2, max_features):
        RFE_rep_results.append(RF_rep_cross_val(X, Y, numb_folds, i, numb_reps))
        print('Features -->', i) ## Sanity check
    return RFE_rep_results

def RF_rep_cross_val(X, Y, numb_folds, numb_features, numb_reps):
    
    ## Defining the list to store results
    rep_results = list()
    for i in range(0, numb_reps):
        rep_results.append(RF_cross_val(X, Y, numb_folds, numb_features))
    return flat_list(rep_results)

def RF_cross_val(X, Y, numb_folds, numb_features):
    
    ## Defining list to store results
    results = list()
    
    ## Defining the number of folds
    kf = KFold(n_splits = numb_folds, shuffle = True)
    
    for train_index, test_index in kf.split(X):
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        
        ## Running RFE with i features
        RF_rfe = RFE(estimator = RandomForestClassifier(n_estimators = 100, max_depth = 5), 
                     n_features_to_select = numb_features).fit(X_train, Y_train)
        
        ## Variables to be considered
        to_select = X_train.columns[RF_rfe.support_]
        to_select_list.append(RF_rfe.support_)
        
        ## Building the Random Forest model
        X_train_md = X_train[to_select]
        X_test_md = X_test[to_select]
        
        RF_md = RandomForestClassifier(n_estimators = 100, max_depth = 5).fit(X_train_md, Y_train)
        
        ## Predicting on the test data-frame and storing RMSE
        results.append(log_loss(Y_test, RF_md.predict_proba(X_test_md)))

    return results

## Defining list to store results
to_select_list = list()

## Defining input and target variables
data = pd.read_csv('Data/Training.csv')
X = data.drop(columns = ['interest_level']); Y = data['interest_level']

## Running RFE to estimate number of features to be selected
RFE_numb_features = RF_RFE_rep_cross_val(X, Y, numb_folds = 3, max_features = 21, numb_reps = 1)

Features --> 2
Features --> 3
Features --> 4
Features --> 5
Features --> 6
Features --> 7
Features --> 8
Features --> 9
Features --> 10
Features --> 11
Features --> 12
Features --> 13
Features --> 14
Features --> 15
Features --> 16
Features --> 17
Features --> 18
Features --> 19
Features --> 20


In [4]:
## Identifying features
features = pd.DataFrame(to_select_list)
features.columns = X.columns
feature_selections = 100 * features.apply(np.sum, axis = 0) / features.shape[0]
feature_selections = pd.DataFrame(feature_selections).reset_index(drop = False)

## Model performance given the number of variables
feature_performance = pd.DataFrame(RFE_numb_features)
feature_performance.columns = [['Split_1', 'Split_2', 'Split_3']]
feature_performance['Mean'] = feature_performance.apply(np.mean, axis = 1)
feature_performance['Num_features'] = feature_performance.index + 2

In [5]:
feature_performance

Unnamed: 0,Split_1,Split_2,Split_3,Mean,Num_features
0,0.949179,0.955135,0.934292,0.946202,2
1,0.928257,0.918513,0.950921,0.932563,3
2,0.926365,0.933963,0.919515,0.926614,4
3,0.926321,0.925702,0.939017,0.930347,5
4,0.930575,0.927016,0.930899,0.929497,6
5,0.920819,0.940219,0.948744,0.936594,7
6,0.93895,0.92948,0.948464,0.938965,8
7,0.930296,0.922026,0.931304,0.927876,9
8,0.927591,0.936611,0.922728,0.928977,10
9,0.927442,0.938074,0.930882,0.932133,11


In [6]:
feature_selections.sort_values(0, ascending = False).head(10)

Unnamed: 0,index,0
4,price,100.0
63,price_per_bedroom,100.0
44,No_Fee,89.473684
62,distance,89.473684
3,longitude,80.701754
61,Quality_Score,77.192982
2,latitude,71.929825
49,Hardwood_Floor,71.929825
60,Amenity_Score,70.175439
1,bedrooms,57.894737


The model which used the top nine features had the lowest cross-validation score, so we will consider the top nine features for the final modelling processes.

#### Hyper-Parameter Tuning

In [25]:
## Reading the data
train = pd.read_csv('Data/Training.csv')
val = pd.read_csv('Data/Validation.csv')

## Defining the input and target variables
X_train = train[['price', 'price_per_bedroom', 'No_Fee', 'distance', 'longitude', 'Quality_Score', 'latitude', 'Hardwood_Floor', 'Amenity_Score']]
X_validation = val[['price', 'price_per_bedroom', 'No_Fee', 'distance', 'longitude', 'Quality_Score', 'latitude', 'Hardwood_Floor', 'Amenity_Score']]

Y_train = train['interest_level']
Y_validation = val['interest_level']

## Adjusting target variable
Y_train = np.where(Y_train == 'low', 0, np.where(Y_train == 'medium', 1, 2))
Y_validation = np.where(Y_validation == 'low', 0, np.where(Y_validation == 'medium', 1, 2))

In [26]:
def rf_objective(trial):

    ## Defining the XGBoost hyper-parameter grid
    rf_param_grid = {'n_estimators': trial.suggest_int('n_estimators', 100, 1000, 50),
                     'max_depth': trial.suggest_int('max_depth', 3, 12), 
                     'min_samples_split': trial.suggest_int('min_samples_split', 2, 20), 
                     'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20), 
                     'random_state': trial.suggest_int('random_state', 1, 500),
                     'max_features': trial.suggest_categorical('max_features', ['sqrt', None])
                    }
    
    ## Building the model
    rf_md = RandomForestClassifier(**rf_param_grid, criterion = 'log_loss', n_jobs = -1).fit(X_train, Y_train)
    
    ## Predicting on the test data-frame
    rf_md_preds = rf_md.predict_proba(X_validation)
    
    ## Evaluating model performance on the test set
    rf_md_error = log_loss(Y_validation, rf_md_preds)
    
    return rf_md_error

def xgb_objective(trial):

    ## Defining the XGBoost hyper-parameter grid
    xgboost_param_grid = {'n_estimators': trial.suggest_int('n_estimators', 100, 1000, 50), 
                          'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, step = 0.01), 
                          'max_depth': trial.suggest_int('max_depth', 3, 15), 
                          'gamma': trial.suggest_float('gamma', 0, 100, step = 5), 
                          'min_child_weight': trial.suggest_int('min_child_weight', 0, 300, 20), 
                          'subsample': trial.suggest_float('subsample', 0.6, 1, step = 0.05), 
                          'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1, step = 0.05),
                          'seed': trial.suggest_int('seed', 1, 1000)
                         }
    
    ## Building the model
    xgb_md = XGBClassifier(**xgboost_param_grid, n_jobs = -1, booster = 'gbtree', tree_method = 'hist').fit(X_train, Y_train)
    
    ## Predicting on the test data-frame
    xgb_md_preds = xgb_md.predict_proba(X_validation)
    
    ## Evaluating model performance on the test set
    xgb_md_error = log_loss(Y_validation, xgb_md_preds)
    
    return xgb_md_error

def lgbm_objective(trial):
    
    ## Defining the LGB hyper-parameter grid
    LGB_param_grid = {'boosting_type': 'gbdt',
                      'n_estimators': trial.suggest_int('n_estimators', 100, 1500, 100),
                      'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, step = 0.01),
                      'num_leaves': trial.suggest_int('num_leaves', 5, 40, step = 1),
                      'max_depth': trial.suggest_int('max_depth', 3, 12),
                      'subsample': trial.suggest_float('subsample', 0.7, 1, step = 0.01), 
                      'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1, step = 0.01),
                      'random_state': trial.suggest_int('random_state', 1, 1000),
                      'reg_alpha': trial.suggest_float('reg_alpha', 0, 0.5, step = 0.1),
                      'reg_lambda': trial.suggest_float('reg_lambda', 0, 0.5, step = 0.1), 
                      'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 0, 300, step = 20),
                      'objective': 'log_loss', 
                      'verbosity': -1
                     }
                     
    ## Building the LightGBM model
    model = LGBMClassifier(**LGB_param_grid, n_jobs = -1).fit(X_train, Y_train)
        
    ## Predicting on the test data-frame
    lgbm_md_preds = model.predict_proba(X_validation)
    
    ## Evaluating model performance on the test set
    lgbm_md_error = log_loss(Y_validation, lgbm_md_preds)
    
    return lgbm_md_error

def hist_objective(trial):

    ## Defining the XGBoost hyper-parameter grid
    hist_param_grid = {'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, step = 0.01),
                       'max_iter': trial.suggest_int('max_iter', 100, 1000, 50),
                       'max_depth': trial.suggest_int('max_depth', 3, 15), 
                       'l2_regularization': trial.suggest_float('l2_regularization', 0, 0.1, step = 0.002),
                       'random_state': trial.suggest_int('random_state', 1, 500),
                      }
    
    ## Building the model
    hist_md = HistGradientBoostingClassifier(**hist_param_grid, loss = 'log_loss', early_stopping = True).fit(X_train, Y_train)
    
    ## Predicting on the test data-frame
    hist_md_preds = hist_md.predict_proba(X_validation)
    
    ## Evaluating model performance on the test set
    hist_md_error = log_loss(Y_validation, hist_md_preds)
    
    return hist_md_error

In [27]:
## Starting RandomForest
## ----
## Creating a study object and to optimize the home objective function
study_rf = optuna.create_study(direction = 'minimize')
study_rf.optimize(rf_objective, n_trials = 100)

## Starting XGBoost
## ----
## Creating a study object and to optimize the home objective function
study_xgb = optuna.create_study(direction = 'minimize')
study_xgb.optimize(xgb_objective, n_trials = 100)

## Starting LightGBM
## ----
## Creating a study object and to optimize the home objective function
study_lgbm = optuna.create_study(direction = 'minimize')
study_lgbm.optimize(lgbm_objective, n_trials = 100)

## Starting HistGradientBoosting
## ----
## Creating a study object and to optimize the home objective function
study_hist = optuna.create_study(direction = 'minimize')
study_hist.optimize(hist_objective, n_trials = 100)

[32m[I 2023-04-13 01:56:51,511][0m A new study created in memory with name: no-name-0477dde0-e57f-49a8-ae3f-799231af470a[0m
[32m[I 2023-04-13 01:56:55,456][0m Trial 0 finished with value: 0.9301509652490536 and parameters: {'n_estimators': 850, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 3, 'random_state': 484, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.9301509652490536.[0m
[32m[I 2023-04-13 01:57:08,422][0m Trial 1 finished with value: 0.8761740989830079 and parameters: {'n_estimators': 1000, 'max_depth': 11, 'min_samples_split': 3, 'min_samples_leaf': 20, 'random_state': 307, 'max_features': None}. Best is trial 1 with value: 0.8761740989830079.[0m
[32m[I 2023-04-13 01:57:11,479][0m Trial 2 finished with value: 0.879035776394069 and parameters: {'n_estimators': 200, 'max_depth': 11, 'min_samples_split': 17, 'min_samples_leaf': 8, 'random_state': 452, 'max_features': None}. Best is trial 1 with value: 0.8761740989830079.[0m
[32m[I 2023-04-13 01



[32m[I 2023-04-13 02:06:34,386][0m Trial 0 finished with value: 0.8728745609302276 and parameters: {'n_estimators': 1500, 'learning_rate': 0.08, 'num_leaves': 19, 'max_depth': 4, 'subsample': 0.85, 'colsample_bytree': 0.77, 'random_state': 153, 'reg_alpha': 0.5, 'reg_lambda': 0.1, 'min_data_in_leaf': 240}. Best is trial 0 with value: 0.8728745609302276.[0m




[32m[I 2023-04-13 02:06:36,937][0m Trial 1 finished with value: 0.9905444721047474 and parameters: {'n_estimators': 1400, 'learning_rate': 0.11, 'num_leaves': 15, 'max_depth': 10, 'subsample': 0.7, 'colsample_bytree': 0.75, 'random_state': 284, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'min_data_in_leaf': 80}. Best is trial 0 with value: 0.8728745609302276.[0m




[32m[I 2023-04-13 02:06:43,208][0m Trial 2 finished with value: 0.9750480959918832 and parameters: {'n_estimators': 1500, 'learning_rate': 0.14, 'num_leaves': 16, 'max_depth': 12, 'subsample': 0.77, 'colsample_bytree': 0.86, 'random_state': 715, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.30000000000000004, 'min_data_in_leaf': 300}. Best is trial 0 with value: 0.8728745609302276.[0m




[32m[I 2023-04-13 02:06:48,284][0m Trial 3 finished with value: 0.8632449240451265 and parameters: {'n_estimators': 1300, 'learning_rate': 0.02, 'num_leaves': 20, 'max_depth': 9, 'subsample': 0.98, 'colsample_bytree': 0.95, 'random_state': 644, 'reg_alpha': 0.0, 'reg_lambda': 0.1, 'min_data_in_leaf': 260}. Best is trial 3 with value: 0.8632449240451265.[0m




[32m[I 2023-04-13 02:06:48,560][0m Trial 4 finished with value: 0.8591791049858457 and parameters: {'n_estimators': 100, 'learning_rate': 0.13, 'num_leaves': 40, 'max_depth': 9, 'subsample': 0.72, 'colsample_bytree': 0.9199999999999999, 'random_state': 478, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.4, 'min_data_in_leaf': 200}. Best is trial 4 with value: 0.8591791049858457.[0m




[32m[I 2023-04-13 02:06:48,800][0m Trial 5 finished with value: 0.8578949042343931 and parameters: {'n_estimators': 200, 'learning_rate': 0.24000000000000002, 'num_leaves': 16, 'max_depth': 3, 'subsample': 0.7999999999999999, 'colsample_bytree': 0.99, 'random_state': 417, 'reg_alpha': 0.0, 'reg_lambda': 0.5, 'min_data_in_leaf': 260}. Best is trial 5 with value: 0.8578949042343931.[0m
[32m[I 2023-04-13 02:06:48,981][0m Trial 6 finished with value: 0.8646811545308712 and parameters: {'n_estimators': 100, 'learning_rate': 0.29000000000000004, 'num_leaves': 12, 'max_depth': 6, 'subsample': 0.8799999999999999, 'colsample_bytree': 0.8999999999999999, 'random_state': 422, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.4, 'min_data_in_leaf': 220}. Best is trial 5 with value: 0.8578949042343931.[0m




[32m[I 2023-04-13 02:06:49,632][0m Trial 7 finished with value: 0.8565190562602114 and parameters: {'n_estimators': 300, 'learning_rate': 0.06999999999999999, 'num_leaves': 32, 'max_depth': 7, 'subsample': 0.7999999999999999, 'colsample_bytree': 0.98, 'random_state': 579, 'reg_alpha': 0.5, 'reg_lambda': 0.2, 'min_data_in_leaf': 280}. Best is trial 7 with value: 0.8565190562602114.[0m




[32m[I 2023-04-13 02:06:50,094][0m Trial 8 finished with value: 0.8580803687106049 and parameters: {'n_estimators': 400, 'learning_rate': 0.15000000000000002, 'num_leaves': 5, 'max_depth': 12, 'subsample': 0.96, 'colsample_bytree': 0.98, 'random_state': 377, 'reg_alpha': 0.2, 'reg_lambda': 0.0, 'min_data_in_leaf': 40}. Best is trial 7 with value: 0.8565190562602114.[0m




[32m[I 2023-04-13 02:06:52,913][0m Trial 9 finished with value: 0.8610605824302529 and parameters: {'n_estimators': 1100, 'learning_rate': 0.02, 'num_leaves': 27, 'max_depth': 11, 'subsample': 0.72, 'colsample_bytree': 0.73, 'random_state': 865, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'min_data_in_leaf': 240}. Best is trial 7 with value: 0.8565190562602114.[0m




[32m[I 2023-04-13 02:06:54,195][0m Trial 10 finished with value: 0.9531769769636003 and parameters: {'n_estimators': 600, 'learning_rate': 0.23, 'num_leaves': 31, 'max_depth': 6, 'subsample': 0.8999999999999999, 'colsample_bytree': 0.82, 'random_state': 940, 'reg_alpha': 0.5, 'reg_lambda': 0.2, 'min_data_in_leaf': 140}. Best is trial 7 with value: 0.8565190562602114.[0m




[32m[I 2023-04-13 02:06:54,736][0m Trial 11 finished with value: 0.8621729122386842 and parameters: {'n_estimators': 400, 'learning_rate': 0.22, 'num_leaves': 35, 'max_depth': 3, 'subsample': 0.7899999999999999, 'colsample_bytree': 0.99, 'random_state': 22, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 300}. Best is trial 7 with value: 0.8565190562602114.[0m




[32m[I 2023-04-13 02:06:56,438][0m Trial 12 finished with value: 0.9568629672146622 and parameters: {'n_estimators': 800, 'learning_rate': 0.21000000000000002, 'num_leaves': 26, 'max_depth': 6, 'subsample': 0.7999999999999999, 'colsample_bytree': 0.94, 'random_state': 643, 'reg_alpha': 0.1, 'reg_lambda': 0.30000000000000004, 'min_data_in_leaf': 160}. Best is trial 7 with value: 0.8565190562602114.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:06:56,947][0m Trial 13 finished with value: 0.8695815949584084 and parameters: {'n_estimators': 300, 'learning_rate': 0.3, 'num_leaves': 8, 'max_depth': 4, 'subsample': 0.82, 'colsample_bytree': 1.0, 'random_state': 570, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 160}. Best is trial 7 with value: 0.8565190562602114.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:06:58,388][0m Trial 14 finished with value: 0.8673367760739051 and parameters: {'n_estimators': 700, 'learning_rate': 0.06999999999999999, 'num_leaves': 25, 'max_depth': 7, 'subsample': 0.76, 'colsample_bytree': 0.8899999999999999, 'random_state': 251, 'reg_alpha': 0.2, 'reg_lambda': 0.2, 'min_data_in_leaf': 300}. Best is trial 7 with value: 0.8565190562602114.[0m




[32m[I 2023-04-13 02:06:58,819][0m Trial 15 finished with value: 0.8545181260698982 and parameters: {'n_estimators': 300, 'learning_rate': 0.18000000000000002, 'num_leaves': 33, 'max_depth': 3, 'subsample': 0.84, 'colsample_bytree': 0.8099999999999999, 'random_state': 778, 'reg_alpha': 0.1, 'reg_lambda': 0.4, 'min_data_in_leaf': 100}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:01,374][0m Trial 16 finished with value: 1.0873931191496056 and parameters: {'n_estimators': 1000, 'learning_rate': 0.18000000000000002, 'num_leaves': 34, 'max_depth': 8, 'subsample': 0.9199999999999999, 'colsample_bytree': 0.7999999999999999, 'random_state': 816, 'reg_alpha': 0.1, 'reg_lambda': 0.4, 'min_data_in_leaf': 100}. Best is trial 15 with value: 0.8545181260698982.[0m




[32m[I 2023-04-13 02:07:02,525][0m Trial 17 finished with value: 0.8858022819266831 and parameters: {'n_estimators': 500, 'learning_rate': 0.08, 'num_leaves': 39, 'max_depth': 5, 'subsample': 0.86, 'colsample_bytree': 0.7, 'random_state': 992, 'reg_alpha': 0.4, 'reg_lambda': 0.30000000000000004, 'min_data_in_leaf': 20}. Best is trial 15 with value: 0.8545181260698982.[0m




[32m[I 2023-04-13 02:07:04,899][0m Trial 18 finished with value: 1.0877291434713594 and parameters: {'n_estimators': 900, 'learning_rate': 0.19, 'num_leaves': 31, 'max_depth': 8, 'subsample': 0.83, 'colsample_bytree': 0.82, 'random_state': 784, 'reg_alpha': 0.2, 'reg_lambda': 0.2, 'min_data_in_leaf': 100}. Best is trial 15 with value: 0.8545181260698982.[0m




[32m[I 2023-04-13 02:07:06,512][0m Trial 19 finished with value: 0.8827540740281509 and parameters: {'n_estimators': 600, 'learning_rate': 0.05, 'num_leaves': 30, 'max_depth': 5, 'subsample': 0.9299999999999999, 'colsample_bytree': 0.87, 'random_state': 574, 'reg_alpha': 0.5, 'reg_lambda': 0.4, 'min_data_in_leaf': 0}. Best is trial 15 with value: 0.8545181260698982.[0m




[32m[I 2023-04-13 02:07:07,258][0m Trial 20 finished with value: 0.8650043356487022 and parameters: {'n_estimators': 300, 'learning_rate': 0.11, 'num_leaves': 36, 'max_depth': 7, 'subsample': 0.75, 'colsample_bytree': 0.84, 'random_state': 735, 'reg_alpha': 0.1, 'reg_lambda': 0.30000000000000004, 'min_data_in_leaf': 180}. Best is trial 15 with value: 0.8545181260698982.[0m




[32m[I 2023-04-13 02:07:07,582][0m Trial 21 finished with value: 0.8612120322041307 and parameters: {'n_estimators': 200, 'learning_rate': 0.26, 'num_leaves': 21, 'max_depth': 3, 'subsample': 0.8099999999999999, 'colsample_bytree': 0.96, 'random_state': 526, 'reg_alpha': 0.0, 'reg_lambda': 0.5, 'min_data_in_leaf': 60}. Best is trial 15 with value: 0.8545181260698982.[0m




[32m[I 2023-04-13 02:07:07,939][0m Trial 22 finished with value: 0.8610520395670522 and parameters: {'n_estimators': 200, 'learning_rate': 0.25, 'num_leaves': 24, 'max_depth': 4, 'subsample': 0.84, 'colsample_bytree': 0.7799999999999999, 'random_state': 346, 'reg_alpha': 0.0, 'reg_lambda': 0.5, 'min_data_in_leaf': 260}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:08,510][0m Trial 23 finished with value: 0.8559585137651126 and parameters: {'n_estimators': 400, 'learning_rate': 0.19, 'num_leaves': 29, 'max_depth': 3, 'subsample': 0.7799999999999999, 'colsample_bytree': 0.97, 'random_state': 648, 'reg_alpha': 0.1, 'reg_lambda': 0.4, 'min_data_in_leaf': 120}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:09,480][0m Trial 24 finished with value: 0.890023701841338 and parameters: {'n_estimators': 500, 'learning_rate': 0.18000000000000002, 'num_leaves': 29, 'max_depth': 5, 'subsample': 0.7799999999999999, 'colsample_bytree': 0.9199999999999999, 'random_state': 655, 'reg_alpha': 0.2, 'reg_lambda': 0.4, 'min_data_in_leaf': 120}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:10,194][0m Trial 25 finished with value: 0.8650940751837275 and parameters: {'n_estimators': 400, 'learning_rate': 0.17, 'num_leaves': 34, 'max_depth': 4, 'subsample': 0.74, 'colsample_bytree': 0.97, 'random_state': 864, 'reg_alpha': 0.1, 'reg_lambda': 0.2, 'min_data_in_leaf': 120}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:10,651][0m Trial 26 finished with value: 0.8649369003392485 and parameters: {'n_estimators': 300, 'learning_rate': 0.2, 'num_leaves': 38, 'max_depth': 3, 'subsample': 0.8799999999999999, 'colsample_bytree': 0.9299999999999999, 'random_state': 715, 'reg_alpha': 0.1, 'reg_lambda': 0.30000000000000004, 'min_data_in_leaf': 60}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:11,760][0m Trial 27 finished with value: 0.8720881025923501 and parameters: {'n_estimators': 600, 'learning_rate': 0.11, 'num_leaves': 33, 'max_depth': 5, 'subsample': 0.87, 'colsample_bytree': 0.8899999999999999, 'random_state': 601, 'reg_alpha': 0.2, 'reg_lambda': 0.4, 'min_data_in_leaf': 180}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:13,242][0m Trial 28 finished with value: 1.031550331229777 and parameters: {'n_estimators': 500, 'learning_rate': 0.27, 'num_leaves': 28, 'max_depth': 10, 'subsample': 0.82, 'colsample_bytree': 0.84, 'random_state': 495, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.30000000000000004, 'min_data_in_leaf': 140}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:14,395][0m Trial 29 finished with value: 0.8813507463798294 and parameters: {'n_estimators': 700, 'learning_rate': 0.16, 'num_leaves': 23, 'max_depth': 4, 'subsample': 0.85, 'colsample_bytree': 0.8099999999999999, 'random_state': 786, 'reg_alpha': 0.5, 'reg_lambda': 0.1, 'min_data_in_leaf': 100}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)
[32m[I 2023-04-13 02:07:14,672][0m Trial 30 finished with value: 0.883221736747091 and parameters: {'n_estimators': 100, 'learning_rate': 0.04, 'num_leaves': 32, 'max_depth': 4, 'subsample': 0.77, 'colsample_bytree': 0.7799999999999999, 'random_state': 921, 'reg_alpha': 0.4, 'reg_lambda': 0.2, 'min_data_in_leaf': 60}. Best is trial 15 with value: 0.8545181260698982.[0m




  s = one / (x * x)




[32m[I 2023-04-13 02:07:15,000][0m Trial 31 finished with value: 0.860299303223608 and parameters: {'n_estimators': 200, 'learning_rate': 0.23, 'num_leaves': 15, 'max_depth': 3, 'subsample': 0.7999999999999999, 'colsample_bytree': 0.97, 'random_state': 442, 'reg_alpha': 0.0, 'reg_lambda': 0.5, 'min_data_in_leaf': 260}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:15,438][0m Trial 32 finished with value: 0.8653168111904908 and parameters: {'n_estimators': 300, 'learning_rate': 0.25, 'num_leaves': 17, 'max_depth': 3, 'subsample': 0.84, 'colsample_bytree': 1.0, 'random_state': 257, 'reg_alpha': 0.0, 'reg_lambda': 0.5, 'min_data_in_leaf': 280}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:15,780][0m Trial 33 finished with value: 0.8609177785322943 and parameters: {'n_estimators': 200, 'learning_rate': 0.13, 'num_leaves': 11, 'max_depth': 3, 'subsample': 0.7899999999999999, 'colsample_bytree': 0.95, 'random_state': 529, 'reg_alpha': 0.1, 'reg_lambda': 0.4, 'min_data_in_leaf': 220}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:16,527][0m Trial 34 finished with value: 0.8790673821755033 and parameters: {'n_estimators': 400, 'learning_rate': 0.2, 'num_leaves': 19, 'max_depth': 5, 'subsample': 0.8099999999999999, 'colsample_bytree': 0.98, 'random_state': 327, 'reg_alpha': 0.0, 'reg_lambda': 0.5, 'min_data_in_leaf': 280}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)
[32m[I 2023-04-13 02:07:16,758][0m Trial 35 finished with value: 0.8608330498831406 and parameters: {'n_estimators': 100, 'learning_rate': 0.15000000000000002, 'num_leaves': 22, 'max_depth': 3, 'subsample': 0.75, 'colsample_bytree': 0.74, 'random_state': 175, 'reg_alpha': 0.0, 'reg_lambda': 0.4, 'min_data_in_leaf': 80}. Best is trial 15 with value: 0.8545181260698982.[0m




  s = one / (x * x)




[32m[I 2023-04-13 02:07:20,054][0m Trial 36 finished with value: 0.9824244519036736 and parameters: {'n_estimators': 1300, 'learning_rate': 0.13, 'num_leaves': 37, 'max_depth': 9, 'subsample': 0.73, 'colsample_bytree': 0.95, 'random_state': 695, 'reg_alpha': 0.1, 'reg_lambda': 0.0, 'min_data_in_leaf': 220}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:20,652][0m Trial 37 finished with value: 0.8729898047264175 and parameters: {'n_estimators': 300, 'learning_rate': 0.23, 'num_leaves': 12, 'max_depth': 6, 'subsample': 0.7, 'colsample_bytree': 0.9199999999999999, 'random_state': 457, 'reg_alpha': 0.0, 'reg_lambda': 0.4, 'min_data_in_leaf': 240}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:20,995][0m Trial 38 finished with value: 0.8561953656869773 and parameters: {'n_estimators': 100, 'learning_rate': 0.09, 'num_leaves': 18, 'max_depth': 10, 'subsample': 0.77, 'colsample_bytree': 0.86, 'random_state': 414, 'reg_alpha': 0.2, 'reg_lambda': 0.30000000000000004, 'min_data_in_leaf': 200}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:21,398][0m Trial 39 finished with value: 0.8571436113429948 and parameters: {'n_estimators': 100, 'learning_rate': 0.09, 'num_leaves': 29, 'max_depth': 11, 'subsample': 0.77, 'colsample_bytree': 0.86, 'random_state': 627, 'reg_alpha': 0.2, 'reg_lambda': 0.30000000000000004, 'min_data_in_leaf': 200}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:25,285][0m Trial 40 finished with value: 0.8981215509480328 and parameters: {'n_estimators': 1500, 'learning_rate': 0.04, 'num_leaves': 27, 'max_depth': 10, 'subsample': 0.7799999999999999, 'colsample_bytree': 0.76, 'random_state': 692, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.30000000000000004, 'min_data_in_leaf': 180}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:25,703][0m Trial 41 finished with value: 0.8571436113429948 and parameters: {'n_estimators': 100, 'learning_rate': 0.09, 'num_leaves': 29, 'max_depth': 11, 'subsample': 0.77, 'colsample_bytree': 0.86, 'random_state': 627, 'reg_alpha': 0.2, 'reg_lambda': 0.30000000000000004, 'min_data_in_leaf': 200}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:26,138][0m Trial 42 finished with value: 0.8566844893986635 and parameters: {'n_estimators': 100, 'learning_rate': 0.09, 'num_leaves': 33, 'max_depth': 12, 'subsample': 0.76, 'colsample_bytree': 0.87, 'random_state': 560, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.2, 'min_data_in_leaf': 120}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:26,930][0m Trial 43 finished with value: 0.8575048936905744 and parameters: {'n_estimators': 200, 'learning_rate': 0.060000000000000005, 'num_leaves': 35, 'max_depth': 12, 'subsample': 0.71, 'colsample_bytree': 0.8899999999999999, 'random_state': 546, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.1, 'min_data_in_leaf': 120}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:27,728][0m Trial 44 finished with value: 0.8883487198314105 and parameters: {'n_estimators': 200, 'learning_rate': 0.01, 'num_leaves': 32, 'max_depth': 12, 'subsample': 0.73, 'colsample_bytree': 0.83, 'random_state': 374, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.2, 'min_data_in_leaf': 140}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:28,952][0m Trial 45 finished with value: 0.8891305756318396 and parameters: {'n_estimators': 400, 'learning_rate': 0.11, 'num_leaves': 40, 'max_depth': 9, 'subsample': 0.75, 'colsample_bytree': 0.8999999999999999, 'random_state': 753, 'reg_alpha': 0.5, 'reg_lambda': 0.2, 'min_data_in_leaf': 120}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:30,000][0m Trial 46 finished with value: 0.9209991180178793 and parameters: {'n_estimators': 300, 'learning_rate': 0.14, 'num_leaves': 37, 'max_depth': 11, 'subsample': 0.7899999999999999, 'colsample_bytree': 0.7899999999999999, 'random_state': 489, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'min_data_in_leaf': 80}. Best is trial 15 with value: 0.8545181260698982.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:30,359][0m Trial 47 finished with value: 0.853277117157562 and parameters: {'n_estimators': 100, 'learning_rate': 0.09999999999999999, 'num_leaves': 18, 'max_depth': 10, 'subsample': 0.83, 'colsample_bytree': 0.85, 'random_state': 674, 'reg_alpha': 0.4, 'reg_lambda': 0.2, 'min_data_in_leaf': 160}. Best is trial 47 with value: 0.853277117157562.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:31,340][0m Trial 48 finished with value: 0.8607876564272559 and parameters: {'n_estimators': 400, 'learning_rate': 0.06999999999999999, 'num_leaves': 17, 'max_depth': 9, 'subsample': 0.83, 'colsample_bytree': 0.8099999999999999, 'random_state': 841, 'reg_alpha': 0.4, 'reg_lambda': 0.30000000000000004, 'min_data_in_leaf': 160}. Best is trial 47 with value: 0.853277117157562.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:32,514][0m Trial 49 finished with value: 0.8789831131789271 and parameters: {'n_estimators': 500, 'learning_rate': 0.09999999999999999, 'num_leaves': 19, 'max_depth': 8, 'subsample': 0.86, 'colsample_bytree': 0.72, 'random_state': 658, 'reg_alpha': 0.4, 'reg_lambda': 0.2, 'min_data_in_leaf': 160}. Best is trial 47 with value: 0.853277117157562.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:32,900][0m Trial 50 finished with value: 0.856489307218293 and parameters: {'n_estimators': 100, 'learning_rate': 0.16, 'num_leaves': 25, 'max_depth': 10, 'subsample': 0.8899999999999999, 'colsample_bytree': 0.85, 'random_state': 408, 'reg_alpha': 0.5, 'reg_lambda': 0.4, 'min_data_in_leaf': 140}. Best is trial 47 with value: 0.853277117157562.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:33,284][0m Trial 51 finished with value: 0.8572710324546572 and parameters: {'n_estimators': 100, 'learning_rate': 0.16, 'num_leaves': 25, 'max_depth': 10, 'subsample': 0.8999999999999999, 'colsample_bytree': 0.85, 'random_state': 413, 'reg_alpha': 0.5, 'reg_lambda': 0.4, 'min_data_in_leaf': 140}. Best is trial 47 with value: 0.853277117157562.[0m
  s = one / (x * x)




[32m[I 2023-04-13 02:07:34,150][0m Trial 52 finished with value: 0.8895711905044973 and parameters: {'n_estimators': 300, 'learning_rate': 0.17, 'num_leaves': 21, 'max_depth': 10, 'subsample': 0.8799999999999999, 'colsample_bytree': 0.87, 'random_state': 309, 'reg_alpha': 0.5, 'reg_lambda': 0.4, 'min_data_in_leaf': 180}. Best is trial 47 with value: 0.853277117157562.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:34,811][0m Trial 53 finished with value: 0.8716007905515658 and parameters: {'n_estimators': 200, 'learning_rate': 0.14, 'num_leaves': 27, 'max_depth': 9, 'subsample': 0.9099999999999999, 'colsample_bytree': 0.83, 'random_state': 392, 'reg_alpha': 0.5, 'reg_lambda': 0.30000000000000004, 'min_data_in_leaf': 100}. Best is trial 47 with value: 0.853277117157562.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:35,320][0m Trial 54 finished with value: 0.8533628902571125 and parameters: {'n_estimators': 200, 'learning_rate': 0.12, 'num_leaves': 14, 'max_depth': 7, 'subsample': 1.0, 'colsample_bytree': 0.7999999999999999, 'random_state': 593, 'reg_alpha': 0.4, 'reg_lambda': 0.4, 'min_data_in_leaf': 80}. Best is trial 47 with value: 0.853277117157562.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:35,664][0m Trial 55 finished with value: 0.8538689998564257 and parameters: {'n_estimators': 100, 'learning_rate': 0.12, 'num_leaves': 14, 'max_depth': 10, 'subsample': 0.99, 'colsample_bytree': 0.7999999999999999, 'random_state': 603, 'reg_alpha': 0.4, 'reg_lambda': 0.4, 'min_data_in_leaf': 40}. Best is trial 47 with value: 0.853277117157562.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:36,131][0m Trial 56 finished with value: 0.8576164488091745 and parameters: {'n_estimators': 200, 'learning_rate': 0.13, 'num_leaves': 13, 'max_depth': 8, 'subsample': 1.0, 'colsample_bytree': 0.76, 'random_state': 768, 'reg_alpha': 0.4, 'reg_lambda': 0.4, 'min_data_in_leaf': 40}. Best is trial 47 with value: 0.853277117157562.[0m
  s = one / (x * x)
  s = one / (x * x)
[32m[I 2023-04-13 02:07:36,401][0m Trial 57 finished with value: 0.8599110873943607 and parameters: {'n_estimators': 100, 'learning_rate': 0.12, 'num_leaves': 8, 'max_depth': 11, 'subsample': 0.95, 'colsample_bytree': 0.7899999999999999, 'random_state': 592, 'reg_alpha': 0.4, 'reg_lambda': 0.4, 'min_data_in_leaf': 40}. Best is trial 47 with value: 0.853277117157562.[0m




  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:36,894][0m Trial 58 finished with value: 0.8526264933912842 and parameters: {'n_estimators': 200, 'learning_rate': 0.09999999999999999, 'num_leaves': 14, 'max_depth': 7, 'subsample': 0.98, 'colsample_bytree': 0.8099999999999999, 'random_state': 705, 'reg_alpha': 0.4, 'reg_lambda': 0.4, 'min_data_in_leaf': 20}. Best is trial 58 with value: 0.8526264933912842.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:39,093][0m Trial 59 finished with value: 1.0986353298389544 and parameters: {'n_estimators': 1200, 'learning_rate': 0.21000000000000002, 'num_leaves': 13, 'max_depth': 7, 'subsample': 0.98, 'colsample_bytree': 0.77, 'random_state': 685, 'reg_alpha': 0.4, 'reg_lambda': 0.4, 'min_data_in_leaf': 0}. Best is trial 58 with value: 0.8526264933912842.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:39,667][0m Trial 60 finished with value: 0.8556819315419725 and parameters: {'n_estimators': 300, 'learning_rate': 0.12, 'num_leaves': 9, 'max_depth': 6, 'subsample': 0.99, 'colsample_bytree': 0.7999999999999999, 'random_state': 809, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 80}. Best is trial 58 with value: 0.8526264933912842.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:40,217][0m Trial 61 finished with value: 0.8547676279412738 and parameters: {'n_estimators': 300, 'learning_rate': 0.12, 'num_leaves': 9, 'max_depth': 6, 'subsample': 1.0, 'colsample_bytree': 0.7999999999999999, 'random_state': 812, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 80}. Best is trial 58 with value: 0.8526264933912842.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:40,758][0m Trial 62 finished with value: 0.8501529945902075 and parameters: {'n_estimators': 300, 'learning_rate': 0.12, 'num_leaves': 8, 'max_depth': 6, 'subsample': 1.0, 'colsample_bytree': 0.7999999999999999, 'random_state': 908, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:41,111][0m Trial 63 finished with value: 0.86124488895902 and parameters: {'n_estimators': 200, 'learning_rate': 0.09999999999999999, 'num_leaves': 5, 'max_depth': 6, 'subsample': 0.97, 'colsample_bytree': 0.82, 'random_state': 965, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 40}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:41,818][0m Trial 64 finished with value: 0.8712490226992651 and parameters: {'n_estimators': 300, 'learning_rate': 0.12, 'num_leaves': 15, 'max_depth': 7, 'subsample': 1.0, 'colsample_bytree': 0.7999999999999999, 'random_state': 887, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:42,258][0m Trial 65 finished with value: 0.8511378047550536 and parameters: {'n_estimators': 200, 'learning_rate': 0.09999999999999999, 'num_leaves': 10, 'max_depth': 7, 'subsample': 0.95, 'colsample_bytree': 0.8099999999999999, 'random_state': 721, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:42,787][0m Trial 66 finished with value: 0.8540378765285697 and parameters: {'n_estimators': 200, 'learning_rate': 0.08, 'num_leaves': 14, 'max_depth': 7, 'subsample': 0.95, 'colsample_bytree': 0.83, 'random_state': 732, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:43,287][0m Trial 67 finished with value: 0.8527142227634966 and parameters: {'n_estimators': 200, 'learning_rate': 0.08, 'num_leaves': 14, 'max_depth': 8, 'subsample': 0.95, 'colsample_bytree': 0.83, 'random_state': 602, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:43,768][0m Trial 68 finished with value: 0.8521113693097899 and parameters: {'n_estimators': 200, 'learning_rate': 0.09999999999999999, 'num_leaves': 11, 'max_depth': 8, 'subsample': 0.97, 'colsample_bytree': 0.7799999999999999, 'random_state': 669, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 0}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:45,082][0m Trial 69 finished with value: 0.8690137583416636 and parameters: {'n_estimators': 900, 'learning_rate': 0.09999999999999999, 'num_leaves': 7, 'max_depth': 8, 'subsample': 0.94, 'colsample_bytree': 0.7799999999999999, 'random_state': 661, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 0}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:45,555][0m Trial 70 finished with value: 0.8539839868256924 and parameters: {'n_estimators': 200, 'learning_rate': 0.060000000000000005, 'num_leaves': 11, 'max_depth': 7, 'subsample': 0.97, 'colsample_bytree': 0.82, 'random_state': 719, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:46,031][0m Trial 71 finished with value: 0.8544399407793513 and parameters: {'n_estimators': 200, 'learning_rate': 0.08, 'num_leaves': 11, 'max_depth': 8, 'subsample': 0.98, 'colsample_bytree': 0.7899999999999999, 'random_state': 615, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 0}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)
[32m[I 2023-04-13 02:07:46,325][0m Trial 72 finished with value: 0.8587010929244581 and parameters: {'n_estimators': 100, 'learning_rate': 0.11, 'num_leaves': 10, 'max_depth': 7, 'subsample': 0.96, 'colsample_bytree': 0.8099999999999999, 'random_state': 601, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 62 with value: 0.8501529945902075.[0m




  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:46,846][0m Trial 73 finished with value: 0.8567838284626182 and parameters: {'n_estimators': 200, 'learning_rate': 0.11, 'num_leaves': 15, 'max_depth': 6, 'subsample': 0.99, 'colsample_bytree': 0.84, 'random_state': 516, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 40}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)
[32m[I 2023-04-13 02:07:47,109][0m Trial 74 finished with value: 0.8821174018978833 and parameters: {'n_estimators': 100, 'learning_rate': 0.06999999999999999, 'num_leaves': 6, 'max_depth': 8, 'subsample': 0.97, 'colsample_bytree': 0.77, 'random_state': 678, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 60}. Best is trial 62 with value: 0.8501529945902075.[0m




  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:47,604][0m Trial 75 finished with value: 0.8559602881597542 and parameters: {'n_estimators': 200, 'learning_rate': 0.09999999999999999, 'num_leaves': 13, 'max_depth': 7, 'subsample': 0.99, 'colsample_bytree': 0.75, 'random_state': 577, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.5, 'min_data_in_leaf': 0}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:48,512][0m Trial 76 finished with value: 0.9085894167359614 and parameters: {'n_estimators': 400, 'learning_rate': 0.14, 'num_leaves': 16, 'max_depth': 9, 'subsample': 0.9299999999999999, 'colsample_bytree': 0.82, 'random_state': 705, 'reg_alpha': 0.4, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:48,849][0m Trial 77 finished with value: 0.8580371621160717 and parameters: {'n_estimators': 100, 'learning_rate': 0.08, 'num_leaves': 12, 'max_depth': 8, 'subsample': 0.96, 'colsample_bytree': 0.7799999999999999, 'random_state': 745, 'reg_alpha': 0.4, 'reg_lambda': 0.4, 'min_data_in_leaf': 0}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:49,544][0m Trial 78 finished with value: 0.852610323432328 and parameters: {'n_estimators': 300, 'learning_rate': 0.060000000000000005, 'num_leaves': 14, 'max_depth': 6, 'subsample': 0.98, 'colsample_bytree': 0.7899999999999999, 'random_state': 630, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:50,508][0m Trial 79 finished with value: 0.8536962316837828 and parameters: {'n_estimators': 400, 'learning_rate': 0.04, 'num_leaves': 17, 'max_depth': 6, 'subsample': 0.95, 'colsample_bytree': 0.7899999999999999, 'random_state': 542, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:51,124][0m Trial 80 finished with value: 0.8537468044376844 and parameters: {'n_estimators': 300, 'learning_rate': 0.060000000000000005, 'num_leaves': 10, 'max_depth': 5, 'subsample': 0.98, 'colsample_bytree': 0.77, 'random_state': 639, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 60}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:52,293][0m Trial 81 finished with value: 0.8528319822641252 and parameters: {'n_estimators': 500, 'learning_rate': 0.03, 'num_leaves': 16, 'max_depth': 6, 'subsample': 0.95, 'colsample_bytree': 0.7899999999999999, 'random_state': 549, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:53,800][0m Trial 82 finished with value: 0.8513566960125318 and parameters: {'n_estimators': 700, 'learning_rate': 0.02, 'num_leaves': 14, 'max_depth': 7, 'subsample': 0.9299999999999999, 'colsample_bytree': 0.8099999999999999, 'random_state': 53, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:55,219][0m Trial 83 finished with value: 0.8509262189181415 and parameters: {'n_estimators': 700, 'learning_rate': 0.02, 'num_leaves': 12, 'max_depth': 6, 'subsample': 0.94, 'colsample_bytree': 0.8099999999999999, 'random_state': 468, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:57,169][0m Trial 84 finished with value: 0.8509519392877821 and parameters: {'n_estimators': 700, 'learning_rate': 0.02, 'num_leaves': 12, 'max_depth': 6, 'subsample': 0.9299999999999999, 'colsample_bytree': 0.8099999999999999, 'random_state': 121, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:58,568][0m Trial 85 finished with value: 0.8673079736987509 and parameters: {'n_estimators': 700, 'learning_rate': 0.01, 'num_leaves': 10, 'max_depth': 6, 'subsample': 0.9299999999999999, 'colsample_bytree': 0.8099999999999999, 'random_state': 55, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 0}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:07:59,825][0m Trial 86 finished with value: 0.8562447399422519 and parameters: {'n_estimators': 800, 'learning_rate': 0.02, 'num_leaves': 7, 'max_depth': 7, 'subsample': 0.9199999999999999, 'colsample_bytree': 0.83, 'random_state': 84, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 40}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:08:01,480][0m Trial 87 finished with value: 0.8547323614949777 and parameters: {'n_estimators': 900, 'learning_rate': 0.03, 'num_leaves': 12, 'max_depth': 5, 'subsample': 0.94, 'colsample_bytree': 0.8099999999999999, 'random_state': 2, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:08:02,563][0m Trial 88 finished with value: 0.857441577245121 and parameters: {'n_estimators': 600, 'learning_rate': 0.02, 'num_leaves': 8, 'max_depth': 7, 'subsample': 0.94, 'colsample_bytree': 0.84, 'random_state': 167, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 0}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:08:03,879][0m Trial 89 finished with value: 0.8516963129326904 and parameters: {'n_estimators': 700, 'learning_rate': 0.03, 'num_leaves': 11, 'max_depth': 6, 'subsample': 0.96, 'colsample_bytree': 0.76, 'random_state': 85, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 40}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:08:05,251][0m Trial 90 finished with value: 0.8585380346319353 and parameters: {'n_estimators': 800, 'learning_rate': 0.05, 'num_leaves': 9, 'max_depth': 6, 'subsample': 0.97, 'colsample_bytree': 0.76, 'random_state': 111, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 40}. Best is trial 62 with value: 0.8501529945902075.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:08:06,576][0m Trial 91 finished with value: 0.8495905006582661 and parameters: {'n_estimators': 700, 'learning_rate': 0.03, 'num_leaves': 11, 'max_depth': 6, 'subsample': 0.9199999999999999, 'colsample_bytree': 0.7799999999999999, 'random_state': 205, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 91 with value: 0.8495905006582661.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:08:07,885][0m Trial 92 finished with value: 0.8524174714281167 and parameters: {'n_estimators': 700, 'learning_rate': 0.03, 'num_leaves': 11, 'max_depth': 6, 'subsample': 0.9199999999999999, 'colsample_bytree': 0.73, 'random_state': 193, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}. Best is trial 91 with value: 0.8495905006582661.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:08:09,175][0m Trial 93 finished with value: 0.850803931095965 and parameters: {'n_estimators': 700, 'learning_rate': 0.03, 'num_leaves': 11, 'max_depth': 5, 'subsample': 0.9099999999999999, 'colsample_bytree': 0.72, 'random_state': 203, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 0}. Best is trial 91 with value: 0.8495905006582661.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:08:10,501][0m Trial 94 finished with value: 0.8520708794455057 and parameters: {'n_estimators': 700, 'learning_rate': 0.03, 'num_leaves': 11, 'max_depth': 5, 'subsample': 0.9199999999999999, 'colsample_bytree': 0.73, 'random_state': 225, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 0}. Best is trial 91 with value: 0.8495905006582661.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:08:11,836][0m Trial 95 finished with value: 0.8696723923104122 and parameters: {'n_estimators': 700, 'learning_rate': 0.01, 'num_leaves': 10, 'max_depth': 5, 'subsample': 0.9099999999999999, 'colsample_bytree': 0.7, 'random_state': 223, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 0}. Best is trial 91 with value: 0.8495905006582661.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:08:13,021][0m Trial 96 finished with value: 0.8515515084454872 and parameters: {'n_estimators': 600, 'learning_rate': 0.03, 'num_leaves': 12, 'max_depth': 5, 'subsample': 0.9099999999999999, 'colsample_bytree': 0.71, 'random_state': 129, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 0}. Best is trial 91 with value: 0.8495905006582661.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:08:14,021][0m Trial 97 finished with value: 0.8548608825436765 and parameters: {'n_estimators': 600, 'learning_rate': 0.03, 'num_leaves': 8, 'max_depth': 5, 'subsample': 0.9099999999999999, 'colsample_bytree': 0.71, 'random_state': 114, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 40}. Best is trial 91 with value: 0.8495905006582661.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:08:15,358][0m Trial 98 finished with value: 0.8576937137415095 and parameters: {'n_estimators': 700, 'learning_rate': 0.05, 'num_leaves': 12, 'max_depth': 5, 'subsample': 0.8999999999999999, 'colsample_bytree': 0.74, 'random_state': 131, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 0}. Best is trial 91 with value: 0.8495905006582661.[0m
  s = one / (x * x)
  s = one / (x * x)




[32m[I 2023-04-13 02:08:16,715][0m Trial 99 finished with value: 0.8570205136639122 and parameters: {'n_estimators': 800, 'learning_rate': 0.02, 'num_leaves': 9, 'max_depth': 5, 'subsample': 0.8899999999999999, 'colsample_bytree': 0.71, 'random_state': 213, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 60}. Best is trial 91 with value: 0.8495905006582661.[0m
[32m[I 2023-04-13 02:08:16,720][0m A new study created in memory with name: no-name-d2be2169-68ce-48b2-a008-0d9826e635dd[0m
[32m[I 2023-04-13 02:08:17,099][0m Trial 0 finished with value: 0.8643694054613087 and parameters: {'learning_rate': 0.17, 'n_estimators': 400, 'max_depth': 7, 'l2_regularization': 0.012, 'random_state': 70}. Best is trial 0 with value: 0.8643694054613087.[0m
[32m[I 2023-04-13 02:08:17,887][0m Trial 1 finished with value: 0.8565903914101798 and parameters: {'learning_rate': 0.08, 'n_estimators': 450, 'max_depth': 12, 'l2_regularization': 0.022, 'random_state': 4}. Best is trial 1 with val

In [28]:
## Printing best hyper-parameter set
print('Random Forest: \n', study_rf.best_trial.params)
print(study_rf.best_trial.value)

## Printing best hyper-parameter set
print('\nHistGB: \n', study_hist.best_trial.params)
print(study_hist.best_trial.value)

## Printing best hyper-parameter set
print('\nXGBoost: \n', study_xgb.best_trial.params)
print(study_xgb.best_trial.value)

## Printing best hyper-parameter set
print('\nLightGBM: \n', study_lgbm.best_trial.params)
print(study_lgbm.best_trial.value)

Random Forest: 
 {'n_estimators': 750, 'max_depth': 12, 'min_samples_split': 15, 'min_samples_leaf': 3, 'random_state': 37, 'max_features': 'sqrt'}
0.8694903998084856

HistGB: 
 {'learning_rate': 0.08, 'n_estimators': 400, 'max_depth': 15, 'l2_regularization': 0.024, 'random_state': 120}
0.8547290647668944

XGBoost: 
 {'n_estimators': 550, 'learning_rate': 0.04, 'max_depth': 7, 'gamma': 0.0, 'min_child_weight': 100, 'subsample': 0.7, 'colsample_bytree': 0.8, 'seed': 889}
0.8579201047288536

LightGBM: 
 {'n_estimators': 700, 'learning_rate': 0.03, 'num_leaves': 11, 'max_depth': 6, 'subsample': 0.9199999999999999, 'colsample_bytree': 0.7799999999999999, 'random_state': 205, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'min_data_in_leaf': 20}
0.8495905006582661


In [38]:
## Reading the data
train = pd.read_csv('Data/Training.csv')
val = pd.read_csv('Data/Validation.csv')
test = pd.read_csv('Data/Testing.csv')
sub = pd.read_csv('Data/Sample_Submission.csv')

## Defining the input and target variables
X_train = train[['price', 'price_per_bedroom', 'No_Fee', 'distance', 'longitude', 'Quality_Score', 'latitude', 'Hardwood_Floor', 'Amenity_Score']]
Y_train = train['interest_level']

X_validation = val[['price', 'price_per_bedroom', 'No_Fee', 'distance', 'longitude', 'Quality_Score', 'latitude', 'Hardwood_Floor', 'Amenity_Score']] 
Y_validation = val['interest_level']

X_test = test[['price', 'price_per_bedroom', 'No_Fee', 'distance', 'longitude', 'Quality_Score', 'latitude', 'Hardwood_Floor', 'Amenity_Score']] 

In [44]:
rf_md = RandomForestClassifier(n_estimators=750, max_depth=12, min_samples_split=15, min_samples_leaf=3, 
                               random_state=37, max_features='sqrt').fit(X_train, Y_train)

hist_md = HistGradientBoostingClassifier(learning_rate=0.08, max_iter=400, max_depth=15, l2_regularization=0.024, 
                                         random_state=120).fit(X_train, Y_train)

lgbm_md = LGBMClassifier(n_estimators=700, learning_rate=0.03, num_leaves=11, max_depth=6, subsample=0.92, colsample_bytree=0.78, 
                         random_state=205, reg_alpha=0.5, reg_lambda=0.5, min_data_in_leaf=20).fit(X_train, Y_train)

In [45]:
rf_train_preds = rf_md.predict_proba(X_train)
rf_val_preds = rf_md.predict_proba(X_validation)
rf_test_preds = pd.DataFrame(rf_md.predict_proba(X_test), columns = ['high', 'low', 'medium'])

hist_train_preds = hist_md.predict_proba(X_train)
hist_val_preds = hist_md.predict_proba(X_validation)
hist_test_preds = pd.DataFrame(hist_md.predict_proba(X_test), columns = ['high', 'low', 'medium'])

lgbm_train_preds = lgbm_md.predict_proba(X_train)
lgbm_val_preds = lgbm_md.predict_proba(X_validation)
lgbm_test_preds = pd.DataFrame(lgbm_md.predict_proba(X_test), columns = ['high', 'low', 'medium'])

In [46]:
## Configuring submission
sub[['high', 'medium', 'low']] = rf_test_preds[['high', 'medium', 'low']]
sub.to_csv('Submissions/rf.csv', index = False)

sub[['high', 'medium', 'low']] = hist_test_preds[['high', 'medium', 'low']]
sub.to_csv('Submissions/hist.csv', index = False)

sub[['high', 'medium', 'low']] = lgbm_test_preds[['high', 'medium', 'low']]
sub.to_csv('Submissions/lgbm.csv', index = False)

In [40]:
stacker = VotingClassifier(estimators = 
                        [('rf', RandomForestClassifier(n_estimators=750, max_depth=12, min_samples_split=15, min_samples_leaf=3, 
                                                       random_state=37, max_features='sqrt')), 
                         ('hist', HistGradientBoostingClassifier(learning_rate=0.08, max_iter=400, max_depth=15, 
                                                                 l2_regularization=0.024, random_state=120)), 
                         ('lgbm', LGBMClassifier(n_estimators=700, learning_rate=0.03, num_leaves=11, max_depth=6, subsample=0.92, 
                                                 colsample_bytree=0.78, random_state=205, reg_alpha=0.5, reg_lambda=0.5, 
                                                 min_data_in_leaf=20)), 
                         ('xgb', XGBClassifier(n_estimators=550, learning_rate=0.04, max_depth=7, gamma=0, min_child_weight=100, 
                                               subsample=0.7, colsample_bytree=0.8, seed=889))], 
                        voting = 'soft', weights = [1, 1, 2, 1]).fit(X_train, Y_train)

train_preds = stacker.predict_proba(X_train)
val_preds = stacker.predict_proba(X_validation)
test_preds = pd.DataFrame(stacker.predict_proba(X_test), columns = ['high', 'low', 'medium'])

train_error = log_loss(Y_train, train_preds)
val_error = log_loss(Y_validation, val_preds)

## Configuring submission
sub[['high', 'medium', 'low']] = test_preds[['high', 'medium', 'low']]
sub.to_csv('Submissions/voting_classifier.csv', index = False)



In [41]:
sub

Unnamed: 0,ID,high,medium,low
0,1,0.019303,0.110887,0.86981
1,2,0.018101,0.10705,0.874849
2,3,0.026863,0.104617,0.86852
3,4,0.028673,0.085454,0.885873
4,5,0.019121,0.075232,0.905647
5,6,0.020466,0.076489,0.903045
6,7,0.021172,0.08282,0.896008
7,8,0.028548,0.11036,0.861092
8,9,0.019121,0.075232,0.905647
9,10,0.026248,0.086978,0.886774


#### Modelling

In [None]:
## Reading the data
train = pd.read_csv('Data/Training.csv')
val = pd.read_csv('Data/Validation.csv')
test = pd.read_csv('Data/Testing.csv')

## Defining the input and target variables
X_train = train.drop(columns = ['interest_level']); Y_train = train['interest_level']
X_validation = val.drop(columns = ['interest_level']); Y_validation = val['interest_level']
X_test = test.drop(columns = ['ID'])

#### Modelling: One vs. Rest Classifier

Three-Layer Stacking Approach: 
- Building a series of models
- Building a few meta learners
- Averaging the meta learner predictions

In [None]:
## Building the multi-classifier (using LGBM)
one_vs_all_lgbm = OneVsRestClassifier(estimator = LGBMClassifier()).fit(X_train, Y_train)

## Predicting on the test
train_preds1 = one_vs_all_lgbm.predict_proba(X_train)
val_preds1 = one_vs_all_lgbm.predict_proba(X_validation)
test_preds1 = one_vs_all_lgbm.predict_proba(X_test)

## Predictions to data-frames
train_preds1 = pd.DataFrame(train_preds1, columns = ['high', 'low', 'medium'])
val_preds1 = pd.DataFrame(val_preds1, columns = ['high', 'low', 'medium'])
test_preds1 = pd.DataFrame(test_preds1, columns = ['high', 'low', 'medium'])

## Printing performance on the training set
train_error = log_loss(Y_train, train_preds1)
val_error = log_loss(Y_validation, val_preds1)
print(f'Training Error: {train_error}')
print(f'Validation Error: {val_error}')

In [None]:
## Building the multi-classifier (using XGB)
one_vs_all_xgb = OneVsRestClassifier(estimator = XGBClassifier()).fit(X_train, Y_train)

## Predicting on the test
train_preds2 = one_vs_all_xgb.predict_proba(X_train)
val_preds2 = one_vs_all_xgb.predict_proba(X_validation)
test_preds2 = one_vs_all_xgb.predict_proba(X_test)

## Predictions to data-frames
train_preds2 = pd.DataFrame(train_preds2, columns = ['high', 'low', 'medium'])
val_preds2 = pd.DataFrame(val_preds2, columns = ['high', 'low', 'medium'])
test_preds2 = pd.DataFrame(test_preds2, columns = ['high', 'low', 'medium'])

## Printing performance on the training set
train_error = log_loss(Y_train, train_preds2)
val_error = log_loss(Y_validation, val_preds2)
print(f'Training Error: {train_error}')
print(f'Validation Error: {val_error}')

In [None]:
## Building the multi-classifier (using RF)
one_vs_all_rf = OneVsRestClassifier(estimator = RandomForestClassifier(max_depth = 6, n_estimators = 500)).fit(X_train, Y_train)

## Predicting on the test
train_preds3 = one_vs_all_rf.predict_proba(X_train)
val_preds3 = one_vs_all_rf.predict_proba(X_validation)
test_preds3 = one_vs_all_rf.predict_proba(X_test)

## Predictions to data-frames
train_preds3 = pd.DataFrame(train_preds3, columns = ['high', 'low', 'medium'])
val_preds3 = pd.DataFrame(val_preds3, columns = ['high', 'low', 'medium'])
test_preds3 = pd.DataFrame(test_preds3, columns = ['high', 'low', 'medium'])

## Printing performance on the training set
train_error = log_loss(Y_train, train_preds3)
val_error = log_loss(Y_validation, val_preds3)
print(f'Training Error: {train_error}')
print(f'Validation Error: {val_error}')

In [None]:
## Building the multi-classifier (using Hist)
one_vs_all_hist = OneVsRestClassifier(estimator = HistGradientBoostingClassifier()).fit(X_train, Y_train)

## Predicting on the test
train_preds4 = one_vs_all_hist.predict_proba(X_train)
val_preds4 = one_vs_all_hist.predict_proba(X_validation)
test_preds4 = one_vs_all_hist.predict_proba(X_test)

## Predictions to data-frames
train_preds4 = pd.DataFrame(train_preds4, columns = ['high', 'low', 'medium'])
val_preds4 = pd.DataFrame(val_preds4, columns = ['high', 'low', 'medium'])
test_preds4 = pd.DataFrame(test_preds4, columns = ['high', 'low', 'medium'])

## Printing performance on the training set
train_error = log_loss(Y_train, train_preds4)
val_error = log_loss(Y_validation, val_preds4)
print(f'Training Error: {train_error}')
print(f'Validation Error: {val_error}')

In [None]:
## Building the multi-classifier (using Ada)
one_vs_all_ada = OneVsRestClassifier(estimator = AdaBoostClassifier()).fit(X_train, Y_train)

## Predicting on the test
train_preds5 = one_vs_all_ada.predict_proba(X_train)
val_preds5 = one_vs_all_ada.predict_proba(X_validation)
test_preds5 = one_vs_all_ada.predict_proba(X_test)

## Predictions to data-frames
train_preds5 = pd.DataFrame(train_preds5, columns = ['high', 'low', 'medium'])
val_preds5 = pd.DataFrame(val_preds5, columns = ['high', 'low', 'medium'])
test_preds5 = pd.DataFrame(test_preds5, columns = ['high', 'low', 'medium'])

## Printing performance on the training set
train_error = log_loss(Y_train, train_preds5)
val_error = log_loss(Y_validation, val_preds5)
print(f'Training Error: {train_error}')
print(f'Validation Error: {val_error}')

In [None]:
train_preds1

In [None]:
estimators = [('rf', RandomForestClassifier(max_depth = 6, n_estimators = 500)), ('lgbm', LGBMClassifier()), 
              ('xgb', XGBClassifier()), ('hist', HistGradientBoostingClassifier()), ('ada', AdaBoostClassifier())]

stacker1 = StackingClassifier(estimators = estimators, final_estimator = RandomForestClassifier()).fit(X_train, Y_train)
stacker2 = StackingClassifier(estimators = estimators, final_estimator = LGBMClassifier()).fit(X_train, Y_train)
stacker3 = StackingClassifier(estimators = estimators, final_estimator = HistGradientBoostingClassifier()).fit(X_train, Y_train)

print(stacker1.score(X_train, Y_train))
print(stacker1.score(X_validation, Y_validation))

print(stacker2.score(X_train, Y_train))
print(stacker2.score(X_validation, Y_validation))

print(stacker3.score(X_train, Y_train))
print(stacker3.score(X_validation, Y_validation))

In [None]:
preds1 = pd.DataFrame(stacker1.predict_proba(X_test), columns = ['high', 'low', 'medium'])
preds2 = pd.DataFrame(stacker2.predict_proba(X_test), columns = ['high', 'low', 'medium'])
preds3 = pd.DataFrame(stacker3.predict_proba(X_test), columns = ['high', 'low', 'medium'])

In [None]:
## Configuring submission
sub[['high', 'medium', 'low']] = preds1[['high', 'medium', 'low']]
sub.to_csv('Submissions/stacker1.csv', index = False)

sub[['high', 'medium', 'low']] = preds2[['high', 'medium', 'low']]
sub.to_csv('Submissions/stacker2.csv', index = False)

sub[['high', 'medium', 'low']] = preds3[['high', 'medium', 'low']]
sub.to_csv('Submissions/stacker3.csv', index = False)

In [None]:
avg_preds = (preds2 + preds3) / 2
sub[['high', 'medium', 'low']] = avg_preds[['high', 'medium', 'low']]
sub.to_csv('Submissions/avg_stacker.csv', index = False)

#### Modelling: One vs. One Classifier

In [None]:
## Building the multi-classifier (using RF)
one_vs_one_RF = OneVsOneClassifier(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3)).fit(X_train, Y_train)

## Predicting on the test
one_vs_one_RF_pred = one_vs_one_RF.predict(X_test)



## Configuring submission
sub[['high', 'medium', 'low']] = test_preds[['high', 'medium', 'low']]
sub.to_csv('Submissions/LGBM_2.csv', index = False)

#### Principal Component Analysis Work

#### Constructing Neural Networks