# 1 Preliminaries

## 1.0 Context

* About Seattle dataframe 
* Small demo of gargaml usage

## 1.1 System

In [1]:
!pwd

/home/alex/gargaml/examples


In [2]:
cd ..

/home/alex/gargaml


In [3]:
!pwd

/home/alex/gargaml


In [4]:
!ls

CHANGELOG.md  examples	pyproject.toml	  results   utils
docs	      gargaml	README.md	  setup.py
env	      LICENSE	requirements.txt  tests


In [5]:
!uname -a

Linux asus2023 6.2.0-20-generic #20-Ubuntu SMP PREEMPT_DYNAMIC Thu Apr  6 07:48:48 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux


In [6]:
!which python

/home/alex/gargaml/env/bin/python


## 1.2 Install

In [7]:
# if needed :

# !pip install -r requirements.txt

In [8]:
# if needed :

# !pip freeze > requirements.freeze

## 1.3 Import

In [9]:
# in one line :

from gargaml import *



In [10]:
# or manually :

# import random, os, sys, warnings, datetime, time, logging
# from IPython.display import display
# # pandarallel

# import pandas as pd
# import numpy as np

# import scipy as sp
# import scipy.stats as st

# import matplotlib.pyplot as plt
# import seaborn as sns
# import plotly.express as px
# from plotly.subplots import make_subplots
# import plotly.graph_objects as go
# import plotly.io as pio
# import missingno as msno


# from sklearn.base import *

# from sklearn.feature_selection import *
# from sklearn.feature_extraction import *
# from sklearn.preprocessing import *
# from sklearn.pipeline import *
# from sklearn.covariance import *
# from sklearn.decomposition import *
# from sklearn.model_selection import *
# from sklearn.impute import *
# from sklearn.metrics import *
# from sklearn.cluster import *
# from sklearn.compose import *

# from sklearn.dummy import *
# from sklearn.linear_model import *
# from sklearn.svm import *
# from sklearn.neighbors import *
# from sklearn.ensemble import *


# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.pipeline import Pipeline

# from xgboost import XGBRegressor, XGBRFRegressor

# ...

## 1.4 Data

In [13]:
# all available datasets :

Data.load.list_all

['ames',
 'boston',
 'fashion',
 'food',
 'house',
 'hr',
 'iris',
 'mnist',
 'seattle',
 'titanic',
 'wine']

In [15]:
# seattle 2016 :

df = Data.load.seattle(year="2016")
df.head(2)

Unnamed: 0,OSEBuildingID,DataYear,BuildingType,PrimaryPropertyType,PropertyName,Address,City,State,ZipCode,TaxParcelIdentificationNumber,...,Electricity(kWh),Electricity(kBtu),NaturalGas(therms),NaturalGas(kBtu),DefaultData,Comments,ComplianceStatus,Outlier,TotalGHGEmissions,GHGEmissionsIntensity
0,1,2016,NonResidential,Hotel,Mayflower park hotel,405 Olive way,Seattle,WA,98101.0,659000030,...,1156514.0,3946027.0,12764.5293,1276453.0,False,,Compliant,,249.98,2.83
1,2,2016,NonResidential,Hotel,Paramount Hotel,724 Pine street,Seattle,WA,98101.0,659000220,...,950425.2,3242851.0,51450.81641,5145082.0,False,,Compliant,,295.86,2.86


## 1.5 Options and Graphics 

In [16]:
# boot seaborn  :

sns.set()

In [17]:
# useful constants :

DISPLAY = True
FRAC = 1.0
LAZY = False

In [18]:
# warning messages :

warnings.filterwarnings(action="once")

# or :
# warnings.filterwarnings('ignore')

In [19]:
# uncomment to render plotly figures as static png instead of interactive graphs :

# pio.renderers.default = "png"

## 1.6 Third parties and utils

In [20]:
# placeholder

In [21]:
results = ML.results("test")

## 1.7 Functions and classes

In [22]:
# placeholder

# 2 First Tour

## 2.0 Pre-cleaning

In [23]:
# about columns quality :

df.columns

Index(['OSEBuildingID', 'DataYear', 'BuildingType', 'PrimaryPropertyType',
       'PropertyName', 'Address', 'City', 'State', 'ZipCode',
       'TaxParcelIdentificationNumber', 'CouncilDistrictCode', 'Neighborhood',
       'Latitude', 'Longitude', 'YearBuilt', 'NumberofBuildings',
       'NumberofFloors', 'PropertyGFATotal', 'PropertyGFAParking',
       'PropertyGFABuilding(s)', 'ListOfAllPropertyUseTypes',
       'LargestPropertyUseType', 'LargestPropertyUseTypeGFA',
       'SecondLargestPropertyUseType', 'SecondLargestPropertyUseTypeGFA',
       'ThirdLargestPropertyUseType', 'ThirdLargestPropertyUseTypeGFA',
       'YearsENERGYSTARCertified', 'ENERGYSTARScore', 'SiteEUI(kBtu/sf)',
       'SiteEUIWN(kBtu/sf)', 'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)',
       'SiteEnergyUse(kBtu)', 'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)',
       'Electricity(kWh)', 'Electricity(kBtu)', 'NaturalGas(therms)',
       'NaturalGas(kBtu)', 'DefaultData', 'Comments', 'ComplianceStatus',
       'Outlier

In [24]:
# clean columns :


def clean(txt):
    """Normalize a raw column label into a lowercase, underscore-separated name.

    The label is lower-cased and stripped, the plural marker ``(s)`` is
    removed, an opening parenthesis or a slash becomes ``_``, a closing
    parenthesis is dropped, and any run of underscores is collapsed to a
    single one.

    Parameters
    ----------
    txt : str
        Raw column label, e.g. ``"SiteEUI(kBtu/sf)"``.

    Returns
    -------
    str
        Cleaned label, e.g. ``"siteeui_kbtu_sf"``.
    """
    txt = txt.lower().strip()

    # Order matters: "(s)" must be removed before the bare parentheses are handled.
    replace = [
        ("(s)", ""),
        ("(", "_"),
        (")", ""),
        ("/", "_"),
    ]

    for k, v in replace:
        txt = txt.replace(k, v)

    # A single "__" -> "_" pass leaves "__" behind for runs of three or more
    # underscores, so repeat until no doubled underscore remains.
    while "__" in txt:
        txt = txt.replace("__", "_")

    # Removing "(s)" can expose trailing whitespace (e.g. "name (s)").
    return txt.strip()


# rename every column with its cleaned label, then show the new labels :
df.columns = [clean(label) for label in df.columns]
df.columns

Index(['osebuildingid', 'datayear', 'buildingtype', 'primarypropertytype',
       'propertyname', 'address', 'city', 'state', 'zipcode',
       'taxparcelidentificationnumber', 'councildistrictcode', 'neighborhood',
       'latitude', 'longitude', 'yearbuilt', 'numberofbuildings',
       'numberoffloors', 'propertygfatotal', 'propertygfaparking',
       'propertygfabuilding', 'listofallpropertyusetypes',
       'largestpropertyusetype', 'largestpropertyusetypegfa',
       'secondlargestpropertyusetype', 'secondlargestpropertyusetypegfa',
       'thirdlargestpropertyusetype', 'thirdlargestpropertyusetypegfa',
       'yearsenergystarcertified', 'energystarscore', 'siteeui_kbtu_sf',
       'siteeuiwn_kbtu_sf', 'sourceeui_kbtu_sf', 'sourceeuiwn_kbtu_sf',
       'siteenergyuse_kbtu', 'siteenergyusewn_kbtu', 'steamuse_kbtu',
       'electricity_kwh', 'electricity_kbtu', 'naturalgas_therms',
       'naturalgas_kbtu', 'defaultdata', 'comments', 'compliancestatus',
       'outlier', 'totalghgem

In [25]:
# the building id is an identifier, not a quantity : store it as text

df["osebuildingid"] = df["osebuildingid"].astype(str)
df.head(2)

Unnamed: 0,osebuildingid,datayear,buildingtype,primarypropertytype,propertyname,address,city,state,zipcode,taxparcelidentificationnumber,...,electricity_kwh,electricity_kbtu,naturalgas_therms,naturalgas_kbtu,defaultdata,comments,compliancestatus,outlier,totalghgemissions,ghgemissionsintensity
0,1,2016,NonResidential,Hotel,Mayflower park hotel,405 Olive way,Seattle,WA,98101.0,659000030,...,1156514.0,3946027.0,12764.5293,1276453.0,False,,Compliant,,249.98,2.83
1,2,2016,NonResidential,Hotel,Paramount Hotel,724 Pine street,Seattle,WA,98101.0,659000220,...,950425.2,3242851.0,51450.81641,5145082.0,False,,Compliant,,295.86,2.86


In [26]:
# turn each energy-source consumption into a 0/1 "is used" flag ;
# NaN > 0 evaluates to False, so a missing consumption becomes 0 :

for source_col in ("steamuse_kbtu", "electricity_kbtu", "naturalgas_kbtu"):
    # vectorized comparison instead of a Python-level lambda per row
    df[source_col] = df[source_col].gt(0).astype(int)

In [27]:
# select columns for faster and better analysis
# (identifiers, address fields and the other consumption / emission measures are dropped)


cols = [
    # text descriptors
    "primarypropertytype",
    "neighborhood",
    # location
    "latitude",
    "longitude",
    # building characteristics
    "yearbuilt",
    "numberofbuildings",
    "numberoffloors",
    "propertygfaparking",
    "propertygfabuilding",
    "energystarscore",
    # total site energy use : passed as the target column when the data is split later on
    "siteenergyuse_kbtu",
    # energy sources, already converted to 0/1 flags in the previous cell
    "steamuse_kbtu",
    "electricity_kbtu",
    "naturalgas_kbtu",
]


# keep every row, restrict to the selected columns (in the order listed above)
df = df.loc[:, cols]
df.head(3)

Unnamed: 0,primarypropertytype,neighborhood,latitude,longitude,yearbuilt,numberofbuildings,numberoffloors,propertygfaparking,propertygfabuilding,energystarscore,siteenergyuse_kbtu,steamuse_kbtu,electricity_kbtu,naturalgas_kbtu
0,Hotel,DOWNTOWN,47.6122,-122.33799,1927,1.0,12,0,88434,60.0,7226362.5,1,1,1
1,Hotel,DOWNTOWN,47.61317,-122.33393,1996,1.0,11,15064,88502,61.0,8387933.0,0,1,1
2,Hotel,DOWNTOWN,47.61393,-122.3381,1969,1.0,41,196718,759392,43.0,72587024.0,1,1,1


In [28]:
# lower-case every text (object) column so that labels differing only by case match :
for text_col in df.select_dtypes(include=object).columns:
    df[text_col] = df[text_col].str.lower()

In [29]:
# keep only the rows with a usable building area and a usable energy consumption ;
# NaN > 1 evaluates to False, so missing values are removed by the same comparison.
# .copy() returns an independent frame, so the column assignments made in the
# following cells do not raise SettingWithCopyWarning :

has_area = df["propertygfabuilding"].gt(1)
has_energy = df["siteenergyuse_kbtu"].gt(1)
df = df.loc[has_area & has_energy].copy()

df

Unnamed: 0,primarypropertytype,neighborhood,latitude,longitude,yearbuilt,numberofbuildings,numberoffloors,propertygfaparking,propertygfabuilding,energystarscore,siteenergyuse_kbtu,steamuse_kbtu,electricity_kbtu,naturalgas_kbtu
0,hotel,downtown,47.61220,-122.33799,1927,1.0,12,0,88434,60.0,7.226362e+06,1,1,1
1,hotel,downtown,47.61317,-122.33393,1996,1.0,11,15064,88502,61.0,8.387933e+06,0,1,1
2,hotel,downtown,47.61393,-122.33810,1969,1.0,41,196718,759392,43.0,7.258702e+07,1,1,1
3,hotel,downtown,47.61412,-122.33664,1926,1.0,10,0,61320,56.0,6.794584e+06,1,1,1
4,hotel,downtown,47.61375,-122.34047,1980,1.0,18,62000,113580,75.0,1.417261e+07,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3371,office,greater duwamish,47.56722,-122.31154,1990,1.0,1,0,12294,46.0,8.497457e+05,0,1,1
3372,other,downtown,47.59625,-122.32283,2004,1.0,1,0,16000,,9.502762e+05,0,1,1
3373,other,magnolia / queen anne,47.63644,-122.35784,1974,1.0,1,0,13157,,5.765898e+06,0,1,1
3374,mixed use property,greater duwamish,47.52832,-122.32431,1989,1.0,1,0,14101,,7.194712e+05,0,1,1


In [30]:
# a property has at least one building and one floor :
# missing counts and zero counts are both replaced by 1
for count_col in ("numberofbuildings", "numberoffloors"):
    df[count_col] = df[count_col].fillna(1).replace({0: 1})

# parking area -> 0/1 "has parking" flag (vectorized comparison)
df["propertygfaparking"] = df["propertygfaparking"].gt(0).astype(int)

# WARNING : this flag is 1 when the energy star score is MISSING, 0 when it is present
df["bool_energystarscore"] = df["energystarscore"].isna().astype(int)

# building area per floor and per building ; the denominators are >= 1 after
# the replacement above, so no division by zero
df["gfaperfloor"] = df["propertygfabuilding"] / df["numberoffloors"]
df["gfaperbuilding"] = df["propertygfabuilding"] / df["numberofbuildings"]

df.describe()

Unnamed: 0,latitude,longitude,yearbuilt,numberofbuildings,numberoffloors,propertygfaparking,propertygfabuilding,energystarscore,siteenergyuse_kbtu,steamuse_kbtu,electricity_kbtu,naturalgas_kbtu,bool_energystarscore,gfaperfloor,gfaperbuilding
count,3353.0,3353.0,3353.0,3353.0,3353.0,3353.0,3353.0,2532.0,3353.0,3353.0,3353.0,3353.0,3353.0,3353.0,3353.0
mean,47.623913,-122.334796,1968.569937,1.134805,4.729198,0.150015,86605.34,67.906003,5432676.0,0.038771,0.997614,0.627796,0.244855,26236.25,79907.18
std,0.047676,0.027158,33.135785,2.105019,5.505425,0.357139,207382.7,26.871011,21664940.0,0.193078,0.048795,0.483464,0.430066,166585.3,124694.7
min,47.49917,-122.41425,1900.0,1.0,1.0,0.0,3636.0,1.0,13409.0,0.0,0.0,0.0,0.0,221.697,3636.0
25%,47.59985,-122.35056,1948.0,1.0,2.0,0.0,27690.0,53.0,935396.7,0.0,1.0,0.0,0.0,8334.833,27140.0
50%,47.61861,-122.33249,1975.0,1.0,4.0,0.0,43162.0,75.0,1813404.0,0.0,1.0,1.0,0.0,13255.0,41834.0
75%,47.65701,-122.3195,1997.0,1.0,5.0,0.0,84385.0,90.0,4233753.0,0.0,1.0,1.0,0.0,24750.5,79957.0
max,47.73387,-122.220966,2015.0,111.0,99.0,1.0,9320156.0,100.0,873923700.0,1.0,1.0,1.0,1.0,9320156.0,2200000.0


In [31]:
# fold the bare "office" label into "small- and mid-sized office",
# then show how many rows each property type has :
df["primarypropertytype"] = df["primarypropertytype"].replace(
    "office", "small- and mid-sized office"
)
df["primarypropertytype"].value_counts()

primarypropertytype
low-rise multifamily           983
mid-rise multifamily           564
small- and mid-sized office    295
other                          253
warehouse                      187
large office                   173
mixed use property             132
k-12 school                    125
high-rise multifamily          105
retail store                    91
hotel                           77
worship facility                71
distribution center             53
senior care community           45
supermarket / grocery store     40
medical office                  39
self-storage facility           28
university                      25
residence hall                  23
refrigerated warehouse          12
restaurant                      12
hospital                        10
laboratory                      10
Name: count, dtype: int64

## 2.1 Display

In [32]:
# display df :

EDA.first_tour.display(df)

----HEAD ----,primarypropertytype,neighborhood,latitude,longitude,yearbuilt,numberofbuildings,numberoffloors,propertygfaparking,propertygfabuilding,energystarscore,siteenergyuse_kbtu,steamuse_kbtu,electricity_kbtu,naturalgas_kbtu,bool_energystarscore,gfaperfloor,gfaperbuilding
0,hotel,downtown,47.6122,-122.33799,1927,1.0,12,0,88434,60.0,7226362.5,1,1,1,0,7369.5,88434.0
1,hotel,downtown,47.61317,-122.33393,1996,1.0,11,1,88502,61.0,8387933.0,0,1,1,0,8045.636364,88502.0
2,hotel,downtown,47.61393,-122.3381,1969,1.0,41,1,759392,43.0,72587024.0,1,1,1,0,18521.756098,759392.0
3,hotel,downtown,47.61412,-122.33664,1926,1.0,10,0,61320,56.0,6794584.0,1,1,1,0,6132.0,61320.0
4,hotel,downtown,47.61375,-122.34047,1980,1.0,18,1,113580,75.0,14172606.0,0,1,1,0,6310.0,113580.0






----SAMP ----,primarypropertytype,neighborhood,latitude,longitude,yearbuilt,numberofbuildings,numberoffloors,propertygfaparking,propertygfabuilding,energystarscore,siteenergyuse_kbtu,steamuse_kbtu,electricity_kbtu,naturalgas_kbtu,bool_energystarscore,gfaperfloor,gfaperbuilding
948,low-rise multifamily,ballard,47.66965,-122.39489,1975,1.0,4,0,30948,91.0,623322.3,0,1,0,0,7737.0,30948.0
3013,mid-rise multifamily,east,47.61621,-122.32241,1940,1.0,5,0,50356,99.0,2503804.0,0,1,1,0,10071.2,50356.0
2250,mid-rise multifamily,ballard,47.67624,-122.40974,1968,1.0,7,0,239054,48.0,12213420.0,0,1,1,0,34150.571429,239054.0
3132,small- and mid-sized office,northeast,47.68254,-122.26299,1960,1.0,1,0,21931,32.0,3947209.0,0,1,1,0,21931.0,21931.0
515,small- and mid-sized office,magnolia / queen anne,47.63156,-122.37551,1937,1.0,2,0,61929,59.0,2203026.0,0,1,1,0,30964.5,61929.0
1999,warehouse,greater duwamish,47.57833,-122.32563,1945,1.0,1,0,22972,12.0,495481.0,0,1,0,0,22972.0,22972.0
2859,low-rise multifamily,central,47.60136,-122.31248,2002,1.0,4,0,29808,80.0,975327.0,0,1,1,0,7452.0,29808.0
611,large office,downtown,47.60214,-122.32726,2009,1.0,17,1,275166,91.0,13631140.0,0,1,0,0,16186.235294,275166.0
1852,low-rise multifamily,magnolia / queen anne,47.62782,-122.36583,1907,1.0,4,0,38407,57.0,1208169.0,0,1,0,0,9601.75,38407.0
1854,low-rise multifamily,magnolia / queen anne,47.62862,-122.36827,1907,1.0,4,0,34498,100.0,910639.2,0,1,1,0,8624.5,34498.0






----TAIL ----,primarypropertytype,neighborhood,latitude,longitude,yearbuilt,numberofbuildings,numberoffloors,propertygfaparking,propertygfabuilding,energystarscore,siteenergyuse_kbtu,steamuse_kbtu,electricity_kbtu,naturalgas_kbtu,bool_energystarscore,gfaperfloor,gfaperbuilding
3371,small- and mid-sized office,greater duwamish,47.56722,-122.31154,1990,1.0,1,0,12294,46.0,849745.7,0,1,1,0,12294.0,12294.0
3372,other,downtown,47.59625,-122.32283,2004,1.0,1,0,16000,,950276.2,0,1,1,1,16000.0,16000.0
3373,other,magnolia / queen anne,47.63644,-122.35784,1974,1.0,1,0,13157,,5765898.0,0,1,1,1,13157.0,13157.0
3374,mixed use property,greater duwamish,47.52832,-122.32431,1989,1.0,1,0,14101,,719471.2,0,1,1,1,14101.0,14101.0
3375,mixed use property,greater duwamish,47.53939,-122.29536,1938,1.0,1,0,18258,,1152896.0,0,1,1,1,18258.0,18258.0






## 2.2 Structure

In [33]:
# info :

EDA.first_tour.info(df)

'shape (3353, 17), memory 0.48MB'





'---- FLO ----'

Unnamed: 0,cols,types,nan_sum,nan_mean,uniq_sum,uniq_rate,is_sku
0,latitude,float64,0,0.0,2858,0.85,False
1,longitude,float64,0,0.0,2637,0.79,False
2,numberofbuildings,float64,0,0.0,16,0.0,False
3,energystarscore,float64,821,0.24,100,0.03,False
4,siteenergyuse_kbtu,float64,0,0.0,3353,1.0,True
5,gfaperfloor,float64,0,0.0,3259,0.97,False
6,gfaperbuilding,float64,0,0.0,3177,0.95,False





'---- INT ----'

Unnamed: 0,cols,types,nan_sum,nan_mean,uniq_sum,uniq_rate,is_sku
0,yearbuilt,int64,0,0.0,113,0.03,False
1,numberoffloors,int64,0,0.0,49,0.01,False
2,propertygfaparking,int64,0,0.0,2,0.0,False
3,propertygfabuilding,int64,0,0.0,3171,0.95,False
4,steamuse_kbtu,int64,0,0.0,2,0.0,False
5,electricity_kbtu,int64,0,0.0,2,0.0,False
6,naturalgas_kbtu,int64,0,0.0,2,0.0,False
7,bool_energystarscore,int64,0,0.0,2,0.0,False





'---- OBJ ----'

Unnamed: 0,cols,types,nan_sum,nan_mean,uniq_sum,uniq_rate,is_sku
0,primarypropertytype,object,0,0.0,23,0.01,False
1,neighborhood,object,0,0.0,14,0.0,False


In [34]:
# useful dtype info :

df.dtypes.value_counts()

int64      8
float64    7
object     2
Name: count, dtype: int64

## 2.3 NaN and duplicated

In [35]:
# # msno (missingno) :

# EDA.nan.viz(df)

In [36]:
# nan rate by columns :

EDA.nan.rate(df)

energystarscore         0.245
primarypropertytype     0.000
gfaperfloor             0.000
bool_energystarscore    0.000
naturalgas_kbtu         0.000
electricity_kbtu        0.000
steamuse_kbtu           0.000
siteenergyuse_kbtu      0.000
propertygfabuilding     0.000
neighborhood            0.000
propertygfaparking      0.000
numberoffloors          0.000
numberofbuildings       0.000
yearbuilt               0.000
longitude               0.000
latitude                0.000
gfaperbuilding          0.000
dtype: float64

In [37]:
# filter on poor columns :

EDA.nan.rate(df, threshold=0.75)

Series([], dtype: float64)

In [38]:
# same for rows :

EDA.nan.rate(df, axis=1, threshold=0.5)

Series([], dtype: float64)

In [39]:
# about nan distribution by row :

EDA.nan.rate(df, axis=1, threshold=0.0).value_counts(normalize=True).round(2)

0.000    0.76
0.059    0.24
Name: proportion, dtype: float64

In [40]:
# duplicated :

df.duplicated().sum()

0

## 2.4 Data Inspection

In [41]:
# describe per type :

EDA.first_tour.describe(df)




---- FLO ----,latitude,longitude,numberofbuildings,energystarscore,siteenergyuse_kbtu,gfaperfloor,gfaperbuilding
count,3353.0,3353.0,3353.0,2532.0,3353.0,3353.0,3353.0
mean,47.62,-122.33,1.13,67.91,5432676.0,26236.25,79907.18
std,0.05,0.03,2.11,26.87,21664940.0,166585.34,124694.68
min,47.5,-122.41,1.0,1.0,13409.0,221.7,3636.0
25%,47.6,-122.35,1.0,53.0,935396.7,8334.83,27140.0
50%,47.62,-122.33,1.0,75.0,1813404.0,13255.0,41834.0
75%,47.66,-122.32,1.0,90.0,4233753.0,24750.5,79957.0
max,47.73,-122.22,111.0,100.0,873923700.0,9320156.0,2200000.0





---- INT ----,yearbuilt,numberoffloors,propertygfaparking,propertygfabuilding,steamuse_kbtu,electricity_kbtu,naturalgas_kbtu,bool_energystarscore
count,3353.0,3353.0,3353.0,3353.0,3353.0,3353.0,3353.0,3353.0
mean,1968.57,4.73,0.15,86605.34,0.04,1.0,0.63,0.24
std,33.14,5.51,0.36,207382.71,0.19,0.05,0.48,0.43
min,1900.0,1.0,0.0,3636.0,0.0,0.0,0.0,0.0
25%,1948.0,2.0,0.0,27690.0,0.0,1.0,0.0,0.0
50%,1975.0,4.0,0.0,43162.0,0.0,1.0,1.0,0.0
75%,1997.0,5.0,0.0,84385.0,0.0,1.0,1.0,0.0
max,2015.0,99.0,1.0,9320156.0,1.0,1.0,1.0,1.0





---- OBJ ----,primarypropertytype,neighborhood
count,3353,3353
unique,23,14
top,low-rise multifamily,downtown
freq,983,570


In [42]:
# # global correlation :

# _ = EDA.study.corr(df)

In [43]:
# # about distribution :

# EDA.study.skew(df)

In [44]:
# # pairplot :
# # WARNING : very long computation, avoid if possible else uncomment

# frac = 0.05
# sns.pairplot(df.sample(frac=frac), corner=True)

In [45]:
# # about outliers :

# EDA.study.outlier(df)

In [46]:
# # multi colinearity :

# EDA.study.vif(df, scale=True)

## 2.5 PCA

In [47]:
# _df = EDA.study.outlier(df, display_=False)
# _df = _df.loc[_df._outlier<0.5]
# _df.shape

In [48]:
# X = _df.drop(columns="siteenergyuse_kbtu")
# y = _df.siteenergyuse_kbtu

In [49]:
# pca = EDA.pca(X)

In [50]:
# _ = pca.variance()

In [51]:
# _ = pca.pcs()

In [52]:
# pca.correlation_graph([0, 1])

In [53]:
# pca.factorial_planes([0, 1])

In [54]:
# pca.factorial_planes([0, 1], clusters="primarypropertytype")

In [55]:
# pca.factorial_planes([0, 1, 2])

In [56]:
# pca.factorial_planes([0, 1, 2], clusters="primarypropertytype")

In [57]:
# pca.factorial_planes([0, 1, 2], clusters="neighborhood")

# 3 Modeling

In [58]:
## data split

In [59]:
df

Unnamed: 0,primarypropertytype,neighborhood,latitude,longitude,yearbuilt,numberofbuildings,numberoffloors,propertygfaparking,propertygfabuilding,energystarscore,siteenergyuse_kbtu,steamuse_kbtu,electricity_kbtu,naturalgas_kbtu,bool_energystarscore,gfaperfloor,gfaperbuilding
0,hotel,downtown,47.61220,-122.33799,1927,1.0,12,0,88434,60.0,7.226362e+06,1,1,1,0,7369.500000,88434.0
1,hotel,downtown,47.61317,-122.33393,1996,1.0,11,1,88502,61.0,8.387933e+06,0,1,1,0,8045.636364,88502.0
2,hotel,downtown,47.61393,-122.33810,1969,1.0,41,1,759392,43.0,7.258702e+07,1,1,1,0,18521.756098,759392.0
3,hotel,downtown,47.61412,-122.33664,1926,1.0,10,0,61320,56.0,6.794584e+06,1,1,1,0,6132.000000,61320.0
4,hotel,downtown,47.61375,-122.34047,1980,1.0,18,1,113580,75.0,1.417261e+07,0,1,1,0,6310.000000,113580.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3371,small- and mid-sized office,greater duwamish,47.56722,-122.31154,1990,1.0,1,0,12294,46.0,8.497457e+05,0,1,1,0,12294.000000,12294.0
3372,other,downtown,47.59625,-122.32283,2004,1.0,1,0,16000,,9.502762e+05,0,1,1,1,16000.000000,16000.0
3373,other,magnolia / queen anne,47.63644,-122.35784,1974,1.0,1,0,13157,,5.765898e+06,0,1,1,1,13157.000000,13157.0
3374,mixed use property,greater duwamish,47.52832,-122.32431,1989,1.0,1,0,14101,,7.194712e+05,0,1,1,1,14101.000000,14101.0


In [60]:
data = Data.dataclass(df, "siteenergyuse_kbtu", test_size=0.33)
data

(0, 2246)
(2246, None)
(-1, -1)
(0, 2246)
(2246, None)
(-1, -1)
(0, 2246)
(2246, None)
(-1, -1)
(0, 2246)
(2246, None)
(-1, -1)
(0, 2246)
(2246, None)
(-1, -1)


  self._train_tuple = train_tuple



DataClass(X=3353, train.X=2246, X.train=2246
X_train_size=2246, X_test_size=1107, X_val_size=0
y_train_size=2246, y_test_size=1107, y_val_size=0 )

In [61]:
data.X

(0, 2246)
(2246, None)
(-1, -1)


Unnamed: 0,primarypropertytype,neighborhood,latitude,longitude,yearbuilt,numberofbuildings,numberoffloors,propertygfaparking,propertygfabuilding,energystarscore,steamuse_kbtu,electricity_kbtu,naturalgas_kbtu,bool_energystarscore,gfaperfloor,gfaperbuilding
1945,small- and mid-sized office,central,47.59973,-122.31331,1947,1.0,1,0,45068,100.0,0,1,1,0,45068.000000,45068.0
2252,other,greater duwamish,47.54089,-122.32323,1962,1.0,3,0,36140,,0,1,0,1,12046.666667,36140.0
2243,low-rise multifamily,north,47.69834,-122.32788,1995,1.0,2,0,20261,96.0,0,1,0,0,10130.500000,20261.0
714,low-rise multifamily,magnolia / queen anne,47.62563,-122.35596,2000,1.0,3,0,37619,62.0,0,1,1,0,12539.666667,37619.0
1980,low-rise multifamily,southeast,47.52971,-122.26960,2010,1.0,4,0,88043,82.0,0,1,1,0,22010.750000,88043.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3083,low-rise multifamily,north,47.71025,-122.32020,1970,1.0,2,0,30984,56.0,0,1,0,0,15492.000000,30984.0
1112,low-rise multifamily,southwest,47.56584,-122.38655,2008,1.0,4,0,40326,57.0,0,1,1,0,10081.500000,40326.0
3226,low-rise multifamily,magnolia / queen anne,47.62365,-122.35832,2013,1.0,4,0,29158,100.0,0,1,1,0,7289.500000,29158.0
652,k-12 school,northeast,47.69054,-122.29705,1955,1.0,1,0,46732,87.0,0,1,1,0,46732.000000,46732.0


In [62]:
data.y

YY()

In [63]:
data.train.X

Unnamed: 0,primarypropertytype,neighborhood,latitude,longitude,yearbuilt,numberofbuildings,numberoffloors,propertygfaparking,propertygfabuilding,energystarscore,steamuse_kbtu,electricity_kbtu,naturalgas_kbtu,bool_energystarscore,gfaperfloor,gfaperbuilding
1945,small- and mid-sized office,central,47.59973,-122.31331,1947,1.0,1,0,45068,100.0,0,1,1,0,45068.000000,45068.0
2252,other,greater duwamish,47.54089,-122.32323,1962,1.0,3,0,36140,,0,1,0,1,12046.666667,36140.0
2243,low-rise multifamily,north,47.69834,-122.32788,1995,1.0,2,0,20261,96.0,0,1,0,0,10130.500000,20261.0
714,low-rise multifamily,magnolia / queen anne,47.62563,-122.35596,2000,1.0,3,0,37619,62.0,0,1,1,0,12539.666667,37619.0
1980,low-rise multifamily,southeast,47.52971,-122.26960,2010,1.0,4,0,88043,82.0,0,1,1,0,22010.750000,88043.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,large office,lake union,47.64882,-122.34905,1999,1.0,4,1,140302,89.0,0,1,1,0,35075.500000,140302.0
2489,warehouse,greater duwamish,47.55317,-122.33289,1964,1.0,1,0,25000,,0,1,1,1,25000.000000,25000.0
546,large office,lake union,47.64912,-122.34823,2008,1.0,4,1,113768,73.0,0,1,0,0,28442.000000,113768.0
1947,other,downtown,47.60791,-122.34140,1982,1.0,5,1,42360,,0,1,0,1,8472.000000,42360.0


In [64]:
data.train.y

1945    2.848573e+05
2252    4.653535e+06
2243    4.889915e+05
714     1.119640e+06
1980    2.468322e+06
            ...     
372     8.651835e+06
2489    1.989104e+06
546     8.873485e+06
1947    1.527696e+06
1767    1.045838e+06
Name: siteenergyuse_kbtu, Length: 2246, dtype: float64

In [65]:
# random tag passed to results.update() together with this search
token = secrets.token_hex(4)
display(token)

# The "scaler" and "estimator" steps are placeholders : every candidate of
# param_grid overrides both of them. The estimator placeholder is a regressor
# (not a classifier) because the target is a continuous energy consumption ;
# a classifier would fail if the grid ever stopped overriding this step.
pipe = Pipeline(
    [
        ("imputer", KNNImputer()),
        ("scaler", StandardScaler()),
        ("estimator", DummyRegressor()),
    ]
)

# small search space for a quick first pass : the commented candidates are
# the ones explored by the full search further down
param_grid = {
    "scaler": [
        StandardScaler(),
        # QuantileTransformer(n_quantiles=100),
        # Normalizer(),
        "passthrough",
    ],
    "estimator": [
        DummyRegressor(),
        # RandomForestRegressor(),
        # XGBRegressor(),
        LinearRegression(),
    ],
}

# no `scoring` given : each candidate is scored with its estimator's default score
grid = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=3,
    n_jobs=1,
    verbose=1,
    return_train_score=True,
)

grid

'2c95b007'

In [66]:
# fit the search on the numeric training columns only : the object columns
# are left out here rather than encoded
grid.fit(data.X.train.select_dtypes(include=np.number), data.y.train)

# pass the fitted search to the results tracker and show the first rows it returns
# NOTE(review): y_log=0 / get_dummies=0 presumably label this run as
# "raw target, no dummy encoding" - confirm against gargaml's results.update
results.update(
    grid,
    token=token,
    y_log=0,
    get_dummies=0,
    verbose=0,
).head(5)

(0, 2246)
(2246, None)
(-1, -1)
Fitting 3 folds for each of 4 candidates, totalling 12 fits


  self._train_tuple = train_tuple


Unnamed: 0,mean_fit_time,std_fit_time,param_estimator,param_scaler,params,rank_val_score,grid,best_estimator_,datetime,token,cell,model_id,y_log,get_dummies,mean_val_score,std_val_score,mean_train_score,std_train_score
2,0.05,0.01,LinearRegression(),StandardScaler(),"{'estimator': LinearRegression(), 'scaler': St...",2.0,"GridSearchCV(cv=3,\n estimator=Pip...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 12:12:48,112ab94a,,5bad8b6e,0,0,0.56,0.08,0.66,0.05
3,0.04,0.01,LinearRegression(),passthrough,"{'estimator': LinearRegression(), 'scaler': 'p...",1.0,"GridSearchCV(cv=3,\n estimator=Pip...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 12:12:48,112ab94a,,5bad8b6e,0,0,0.56,0.08,0.66,0.05
0,0.03,0.01,DummyRegressor(),StandardScaler(),"{'estimator': DummyRegressor(), 'scaler': Stan...",3.0,"GridSearchCV(cv=3,\n estimator=Pip...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 12:12:48,112ab94a,,5bad8b6e,0,0,-0.01,0.01,0.0,0.0
1,0.02,0.0,DummyRegressor(),passthrough,"{'estimator': DummyRegressor(), 'scaler': 'pas...",3.0,"GridSearchCV(cv=3,\n estimator=Pip...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 12:12:48,112ab94a,,5bad8b6e,0,0,-0.01,0.01,0.0,0.0


In [68]:
results.res

Unnamed: 0,mean_fit_time,std_fit_time,param_estimator,param_scaler,params,rank_val_score,grid,best_estimator_,datetime,token,cell,model_id,y_log,get_dummies,mean_val_score,std_val_score,mean_train_score,std_train_score
2,0.0482,0.0061,LinearRegression(),StandardScaler(),"{'estimator': LinearRegression(), 'scaler': St...",2.0,"GridSearchCV(cv=3,\n estimator=Pip...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 12:12:48,112ab94a,,5bad8b6e,0,0,0.5615,0.078,0.6598,0.0518
3,0.0377,0.0095,LinearRegression(),passthrough,"{'estimator': LinearRegression(), 'scaler': 'p...",1.0,"GridSearchCV(cv=3,\n estimator=Pip...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 12:12:48,112ab94a,,5bad8b6e,0,0,0.5615,0.078,0.6598,0.0518
0,0.0312,0.013,DummyRegressor(),StandardScaler(),"{'estimator': DummyRegressor(), 'scaler': Stan...",3.0,"GridSearchCV(cv=3,\n estimator=Pip...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 12:12:48,112ab94a,,5bad8b6e,0,0,-0.0061,0.0072,0.0,0.0
1,0.025,0.0007,DummyRegressor(),passthrough,"{'estimator': DummyRegressor(), 'scaler': 'pas...",3.0,"GridSearchCV(cv=3,\n estimator=Pip...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 12:12:48,112ab94a,,5bad8b6e,0,0,-0.0061,0.0072,0.0,0.0


In [69]:
results.RES

Unnamed: 0,mean_fit_time,std_fit_time,param_estimator,param_scaler,params,rank_val_score,grid,best_estimator_,datetime,token,cell,model_id,y_log,get_dummies,mean_val_score,std_val_score,mean_train_score,std_train_score
2,0.0482,0.0061,LinearRegression(),StandardScaler(),"{'estimator': LinearRegression(), 'scaler': St...",2.0,"GridSearchCV(cv=3,\n estimator=Pip...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 12:12:48,112ab94a,,5bad8b6e,0,0,0.5615,0.078,0.6598,0.0518


In [78]:
# random tag passed to results.update() for every search of this cell
token = secrets.token_hex(4)
display(token)


# --- pipeline ---
# The "scaler" and "estimator" steps are placeholders overridden by every
# candidate of param_grid. The estimator placeholder is a regressor (not a
# classifier) because the target is a continuous energy consumption.

pipe = Pipeline(
    [
        ("imputer", KNNImputer()),
        ("scaler", StandardScaler()),
        ("estimator", DummyRegressor()),
    ]
)


# --- search space : 4 scalers x 4 estimators = 16 candidates ---

param_grid = {
    "scaler": [
        StandardScaler(),
        QuantileTransformer(n_quantiles=100),
        Normalizer(),
        "passthrough",
    ],
    "estimator": [
        DummyRegressor(),
        RandomForestRegressor(),
        XGBRegressor(),
        LinearRegression(),
    ],
}


# --- target and feature variants, each tagged with a 0/1 flag ---
# NOTE : with the log1p target the scores are computed on the log scale, so
# they are not directly comparable with the scores of the raw target.

log_list = [
    (0, data.y.train),
    (1, np.log1p(data.y.train)),
]
dummies = [
    (0, data.X.train.select_dtypes(include=np.number)),
    (1, pd.get_dummies(data.X.train)),
]


# --- one grid search per (target variant, feature variant) pair ---

for log, _y in log_list:
    for dummy, _X in dummies:

        logging.warning(log)
        logging.warning(dummy)

        # a fresh search object for every combination
        grid = GridSearchCV(
            pipe,
            param_grid=param_grid,
            cv=10,
            n_jobs=1,
            verbose=1,
            return_train_score=True,
        )

        grid.fit(_X, _y)

        # The former trailing .head(5) was removed : inside a loop its result
        # is discarded and never displayed.
        results.update(
            grid,
            token=token,
            y_log=log,
            get_dummies=dummy,
            verbose=1,
        )

'bbe54d25'

  self._train_tuple = train_tuple


(0, 2246)
(2246, None)
(-1, -1)
(0, 2246)
(2246, None)
(-1, -1)
Fitting 10 folds for each of 16 candidates, totalling 160 fits


Unnamed: 0,mean_fit_time,std_fit_time,param_estimator,param_scaler,params,rank_val_score,grid,best_estimator_,datetime,token,cell,model_id,y_log,get_dummies,mean_val_score,std_val_score,mean_train_score,std_train_score
12,0.06,0.0,LinearRegression(),StandardScaler(),"{'estimator': LinearRegression(), 'scaler': St...",1.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), LinearRegress...",2023-05-02 13:03:25,bfedf07e,,e96d031d,0,0,0.49,0.48,0.65,0.04
15,0.07,0.01,LinearRegression(),passthrough,"{'estimator': LinearRegression(), 'scaler': 'p...",2.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), LinearRegress...",2023-05-02 13:03:25,bfedf07e,,e96d031d,0,0,0.49,0.48,0.65,0.04
6,1.18,0.01,RandomForestRegressor(),Normalizer(),"{'estimator': RandomForestRegressor(), 'scaler...",3.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), LinearRegress...",2023-05-02 13:03:25,bfedf07e,,e96d031d,0,0,0.39,0.46,0.94,0.01
5,0.9,0.03,RandomForestRegressor(),QuantileTransformer(n_quantiles=100),"{'estimator': RandomForestRegressor(), 'scaler...",4.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), LinearRegress...",2023-05-02 13:03:25,bfedf07e,,e96d031d,0,0,0.17,0.93,0.92,0.01
4,0.87,0.01,RandomForestRegressor(),StandardScaler(),"{'estimator': RandomForestRegressor(), 'scaler...",5.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), LinearRegress...",2023-05-02 13:03:25,bfedf07e,,e96d031d,0,0,0.12,1.02,0.92,0.01
13,0.07,0.01,LinearRegression(),QuantileTransformer(n_quantiles=100),"{'estimator': LinearRegression(), 'scaler': Qu...",6.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), LinearRegress...",2023-05-02 13:03:25,bfedf07e,,e96d031d,0,0,0.03,0.3,0.24,0.02
14,0.06,0.0,LinearRegression(),Normalizer(),"{'estimator': LinearRegression(), 'scaler': No...",7.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), LinearRegress...",2023-05-02 13:03:25,bfedf07e,,e96d031d,0,0,0.01,0.49,0.33,0.02
0,0.05,0.01,DummyRegressor(),StandardScaler(),"{'estimator': DummyRegressor(), 'scaler': Stan...",8.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), LinearRegress...",2023-05-02 13:03:25,bfedf07e,,e96d031d,0,0,-0.01,0.02,0.0,0.0
1,0.06,0.0,DummyRegressor(),QuantileTransformer(n_quantiles=100),"{'estimator': DummyRegressor(), 'scaler': Quan...",8.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), LinearRegress...",2023-05-02 13:03:25,bfedf07e,,e96d031d,0,0,-0.01,0.02,0.0,0.0
2,0.05,0.0,DummyRegressor(),Normalizer(),"{'estimator': DummyRegressor(), 'scaler': Norm...",8.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), LinearRegress...",2023-05-02 13:03:25,bfedf07e,,e96d031d,0,0,-0.01,0.02,0.0,0.0




Fitting 10 folds for each of 16 candidates, totalling 160 fits


Unnamed: 0,mean_fit_time,std_fit_time,param_estimator,param_scaler,params,rank_val_score,grid,best_estimator_,datetime,token,cell,model_id,y_log,get_dummies,mean_val_score,std_val_score,mean_train_score,std_train_score
12,0.11,0.01,LinearRegression(),StandardScaler(),"{'estimator': LinearRegression(), 'scaler': St...",2.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 13:05:13,10551142,,a1122bc2,0,1,0.62,0.31,0.73,0.04
15,0.11,0.01,LinearRegression(),passthrough,"{'estimator': LinearRegression(), 'scaler': 'p...",1.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 13:05:13,10551142,,a1122bc2,0,1,0.62,0.31,0.73,0.04
6,1.9,0.02,RandomForestRegressor(),Normalizer(),"{'estimator': RandomForestRegressor(), 'scaler...",3.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 13:05:13,10551142,,a1122bc2,0,1,0.55,0.34,0.95,0.01
4,1.44,0.04,RandomForestRegressor(),StandardScaler(),"{'estimator': RandomForestRegressor(), 'scaler...",4.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 13:05:13,10551142,,a1122bc2,0,1,0.43,0.7,0.94,0.01
10,0.3,0.01,"XGBRegressor(base_score=None, booster=None, ca...",Normalizer(),"{'estimator': XGBRegressor(base_score=None, bo...",5.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 13:05:13,10551142,,a1122bc2,0,1,0.41,0.64,1.0,0.0
13,0.14,0.01,LinearRegression(),QuantileTransformer(n_quantiles=100),"{'estimator': LinearRegression(), 'scaler': Qu...",6.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 13:05:13,10551142,,a1122bc2,0,1,0.41,0.23,0.53,0.04
7,1.38,0.01,RandomForestRegressor(),passthrough,"{'estimator': RandomForestRegressor(), 'scaler...",7.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 13:05:13,10551142,,a1122bc2,0,1,0.38,0.76,0.94,0.01
5,1.41,0.01,RandomForestRegressor(),QuantileTransformer(n_quantiles=100),"{'estimator': RandomForestRegressor(), 'scaler...",8.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 13:05:13,10551142,,a1122bc2,0,1,0.35,0.9,0.94,0.01
8,0.29,0.01,"XGBRegressor(base_score=None, booster=None, ca...",StandardScaler(),"{'estimator': XGBRegressor(base_score=None, bo...",9.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 13:05:13,10551142,,a1122bc2,0,1,0.16,1.23,1.0,0.0
11,0.28,0.01,"XGBRegressor(base_score=None, booster=None, ca...",passthrough,"{'estimator': XGBRegressor(base_score=None, bo...",10.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 13:05:13,10551142,,a1122bc2,0,1,0.16,1.23,1.0,0.0




Fitting 10 folds for each of 16 candidates, totalling 160 fits


Unnamed: 0,mean_fit_time,std_fit_time,param_estimator,param_scaler,params,rank_val_score,grid,best_estimator_,datetime,token,cell,model_id,y_log,get_dummies,mean_val_score,std_val_score,mean_train_score,std_train_score
4,0.85,0.03,RandomForestRegressor(),StandardScaler(),"{'estimator': RandomForestRegressor(), 'scaler...",1.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), (DecisionTree...",2023-05-02 13:06:21,8eee30a9,,1c6c1212,1,0,0.71,0.04,0.96,0.0
5,0.81,0.01,RandomForestRegressor(),QuantileTransformer(n_quantiles=100),"{'estimator': RandomForestRegressor(), 'scaler...",2.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), (DecisionTree...",2023-05-02 13:06:21,8eee30a9,,1c6c1212,1,0,0.71,0.04,0.96,0.0
7,0.8,0.0,RandomForestRegressor(),passthrough,"{'estimator': RandomForestRegressor(), 'scaler...",3.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), (DecisionTree...",2023-05-02 13:06:21,8eee30a9,,1c6c1212,1,0,0.71,0.04,0.96,0.0
11,0.27,0.01,"XGBRegressor(base_score=None, booster=None, ca...",passthrough,"{'estimator': XGBRegressor(base_score=None, bo...",4.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), (DecisionTree...",2023-05-02 13:06:21,8eee30a9,,1c6c1212,1,0,0.69,0.04,0.99,0.0
8,0.28,0.01,"XGBRegressor(base_score=None, booster=None, ca...",StandardScaler(),"{'estimator': XGBRegressor(base_score=None, bo...",5.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), (DecisionTree...",2023-05-02 13:06:21,8eee30a9,,1c6c1212,1,0,0.69,0.04,0.99,0.0
6,1.0,0.01,RandomForestRegressor(),Normalizer(),"{'estimator': RandomForestRegressor(), 'scaler...",6.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), (DecisionTree...",2023-05-02 13:06:21,8eee30a9,,1c6c1212,1,0,0.69,0.04,0.96,0.0
9,0.28,0.01,"XGBRegressor(base_score=None, booster=None, ca...",QuantileTransformer(n_quantiles=100),"{'estimator': XGBRegressor(base_score=None, bo...",7.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), (DecisionTree...",2023-05-02 13:06:21,8eee30a9,,1c6c1212,1,0,0.69,0.04,0.99,0.0
10,0.28,0.01,"XGBRegressor(base_score=None, booster=None, ca...",Normalizer(),"{'estimator': XGBRegressor(base_score=None, bo...",8.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), (DecisionTree...",2023-05-02 13:06:21,8eee30a9,,1c6c1212,1,0,0.67,0.04,0.97,0.0
13,0.07,0.01,LinearRegression(),QuantileTransformer(n_quantiles=100),"{'estimator': LinearRegression(), 'scaler': Qu...",9.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), (DecisionTree...",2023-05-02 13:06:21,8eee30a9,,1c6c1212,1,0,0.66,0.03,0.66,0.0
14,0.06,0.01,LinearRegression(),Normalizer(),"{'estimator': LinearRegression(), 'scaler': No...",10.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), (DecisionTree...",2023-05-02 13:06:21,8eee30a9,,1c6c1212,1,0,0.62,0.03,0.63,0.0




Fitting 10 folds for each of 16 candidates, totalling 160 fits


Unnamed: 0,mean_fit_time,std_fit_time,param_estimator,param_scaler,params,rank_val_score,grid,best_estimator_,datetime,token,cell,model_id,y_log,get_dummies,mean_val_score,std_val_score,mean_train_score,std_train_score
11,0.3,0.01,"XGBRegressor(base_score=None, booster=None, ca...",passthrough,"{'estimator': XGBRegressor(base_score=None, bo...",1.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, XGBRegressor(base_...",2023-05-02 13:08:04,d2892e19,,a15a8590,1,1,0.8,0.03,0.99,0.0
8,0.3,0.02,"XGBRegressor(base_score=None, booster=None, ca...",StandardScaler(),"{'estimator': XGBRegressor(base_score=None, bo...",2.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, XGBRegressor(base_...",2023-05-02 13:08:04,d2892e19,,a15a8590,1,1,0.8,0.03,0.99,0.0
9,0.3,0.01,"XGBRegressor(base_score=None, booster=None, ca...",QuantileTransformer(n_quantiles=100),"{'estimator': XGBRegressor(base_score=None, bo...",3.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, XGBRegressor(base_...",2023-05-02 13:08:04,d2892e19,,a15a8590,1,1,0.8,0.03,0.99,0.0
4,1.17,0.02,RandomForestRegressor(),StandardScaler(),"{'estimator': RandomForestRegressor(), 'scaler...",5.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, XGBRegressor(base_...",2023-05-02 13:08:04,d2892e19,,a15a8590,1,1,0.78,0.03,0.97,0.0
7,1.14,0.0,RandomForestRegressor(),passthrough,"{'estimator': RandomForestRegressor(), 'scaler...",4.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, XGBRegressor(base_...",2023-05-02 13:08:04,d2892e19,,a15a8590,1,1,0.78,0.03,0.97,0.0
10,0.33,0.01,"XGBRegressor(base_score=None, booster=None, ca...",Normalizer(),"{'estimator': XGBRegressor(base_score=None, bo...",6.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, XGBRegressor(base_...",2023-05-02 13:08:04,d2892e19,,a15a8590,1,1,0.78,0.04,0.98,0.0
5,1.16,0.01,RandomForestRegressor(),QuantileTransformer(n_quantiles=100),"{'estimator': RandomForestRegressor(), 'scaler...",7.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, XGBRegressor(base_...",2023-05-02 13:08:04,d2892e19,,a15a8590,1,1,0.78,0.03,0.97,0.0
6,1.54,0.01,RandomForestRegressor(),Normalizer(),"{'estimator': RandomForestRegressor(), 'scaler...",8.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, XGBRegressor(base_...",2023-05-02 13:08:04,d2892e19,,a15a8590,1,1,0.77,0.04,0.97,0.0
13,0.16,0.01,LinearRegression(),QuantileTransformer(n_quantiles=100),"{'estimator': LinearRegression(), 'scaler': Qu...",9.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, XGBRegressor(base_...",2023-05-02 13:08:04,d2892e19,,a15a8590,1,1,0.76,0.04,0.77,0.0
15,0.15,0.02,LinearRegression(),passthrough,"{'estimator': LinearRegression(), 'scaler': 'p...",10.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, XGBRegressor(base_...",2023-05-02 13:08:04,d2892e19,,a15a8590,1,1,0.67,0.06,0.71,0.01


In [79]:
# Show the `RES` attribute of `results` as this cell's output.
# NOTE(review): judging from the rendered output, RES looks like a table of
# grid-search rows (one per model_id) ordered by mean_val_score — confirm
# against the gargaml implementation, which is not visible here.
results.RES

Unnamed: 0,mean_fit_time,std_fit_time,param_estimator,param_scaler,params,rank_val_score,grid,best_estimator_,datetime,token,...,model_id,y_log,get_dummies,mean_val_score,std_val_score,mean_train_score,std_train_score,run,exp,date
9,0.3036,0.0132,"XGBRegressor(base_score=None, booster=None, ca...",passthrough,"{'estimator': XGBRegressor(base_score=None, bo...",1.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, XGBRegressor(base_...",2023-05-02 13:08:04,d2892e19,...,a15a8590,1,1,0.7982,0.0287,0.9879,0.0011,,,
0,0.8546,0.0259,RandomForestRegressor(),StandardScaler(),"{'estimator': RandomForestRegressor(), 'scaler...",1.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), (DecisionTree...",2023-05-02 13:06:21,8eee30a9,...,1c6c1212,1,0,0.7101,0.036,0.9597,0.0009,,,
1,0.1096,0.0055,LinearRegression(),StandardScaler(),"{'estimator': LinearRegression(), 'scaler': St...",2.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 12:38:58,fa3b4b13,...,d565aa95,0,1,0.6197,0.3137,0.7344,0.0372,,,
2,0.1088,0.0099,LinearRegression(),StandardScaler(),"{'estimator': LinearRegression(), 'scaler': St...",2.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 13:05:13,10551142,...,a1122bc2,0,1,0.6197,0.3137,0.7344,0.0372,,,
3,0.0482,0.0061,LinearRegression(),StandardScaler(),"{'estimator': LinearRegression(), 'scaler': St...",2.0,"GridSearchCV(cv=3,\n estimator=Pip...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 12:12:48,112ab94a,...,5bad8b6e,0,0,0.5615,0.078,0.6598,0.0518,32f5a199,9f7d9144,2023-05-02 12:11:17
4,0.028,0.0014,LinearRegression(),StandardScaler(),"{'estimator': LinearRegression(), 'scaler': St...",2.0,"GridSearchCV(cv=3,\n estimator=Pip...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 12:24:16,439ba5d0,...,16a41809,1,0,0.5615,0.078,0.6598,0.0518,,,
5,0.0309,0.0015,LinearRegression(),StandardScaler(),"{'estimator': LinearRegression(), 'scaler': St...",2.0,"GridSearchCV(cv=3,\n estimator=Pip...","(KNNImputer(), passthrough, LinearRegression())",2023-05-02 12:25:04,67203a3f,...,ff4d2f08,0,0,0.5615,0.078,0.6598,0.0518,,,
6,0.0285,0.0006,LinearRegression(),StandardScaler(),"{'estimator': LinearRegression(), 'scaler': St...",1.0,"GridSearchCV(cv=3,\n estimator=Pip...","(KNNImputer(), StandardScaler(), LinearRegress...",2023-05-02 12:25:05,e37c90a2,...,f1dba282,1,1,0.5504,0.0026,0.58,0.0045,,,
7,0.0639,0.0091,LinearRegression(),StandardScaler(),"{'estimator': LinearRegression(), 'scaler': St...",1.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), LinearRegress...",2023-05-02 12:37:10,aa0b554b,...,3878ea3e,0,0,0.4921,0.4788,0.6534,0.0386,,,
8,0.0591,0.0037,LinearRegression(),StandardScaler(),"{'estimator': LinearRegression(), 'scaler': St...",1.0,"GridSearchCV(cv=10,\n estimator=Pi...","(KNNImputer(), StandardScaler(), LinearRegress...",2023-05-02 13:03:25,bfedf07e,...,e96d031d,0,0,0.4921,0.4788,0.6534,0.0386,,,


Unnamed: 0_level_0,best_estimator_,mean_val_score
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1
a15a8590,"(KNNImputer(), passthrough, XGBRegressor(base_...",0.7982
1c6c1212,"(KNNImputer(), StandardScaler(), (DecisionTree...",0.7101
a1122bc2,"(KNNImputer(), passthrough, LinearRegression())",0.6197
d565aa95,"(KNNImputer(), passthrough, LinearRegression())",0.6197
16a41809,"(KNNImputer(), passthrough, LinearRegression())",0.5615
5bad8b6e,"(KNNImputer(), passthrough, LinearRegression())",0.5615
ff4d2f08,"(KNNImputer(), passthrough, LinearRegression())",0.5615
f1dba282,"(KNNImputer(), StandardScaler(), LinearRegress...",0.5504
3878ea3e,"(KNNImputer(), StandardScaler(), LinearRegress...",0.4921
e96d031d,"(KNNImputer(), StandardScaler(), LinearRegress...",0.4921


0
