In [None]:
import pandas as pd
import numpy as np
!pip install pandas-profiling==2.*
!pip install category_encoders==2.*



In [None]:
# Columns kept for modelling; 'log_price' is separated out as the target further down.
col_list = [
    'room_type',
    'bathrooms',
    'bedrooms',
    'accommodates',
    'beds',
    'property_type',
    'cancellation_policy',
    'bed_type',
    'log_price',
]
# skiprows=[26044] leaves out that single line index of train.csv while parsing.
# NOTE(review): the reason is not recorded here -- presumably a malformed row; confirm.
df = pd.read_csv('train.csv', skiprows=[26044])
df = df[col_list]
df.head()

Unnamed: 0,room_type,bathrooms,bedrooms,accommodates,beds,property_type,cancellation_policy,bed_type,log_price
0,Entire home/apt,1.0,1.0,3,1.0,Apartment,strict,Real Bed,5.010635
1,Entire home/apt,1.0,3.0,7,3.0,Apartment,strict,Real Bed,5.129899
2,Entire home/apt,1.0,1.0,5,3.0,Apartment,moderate,Real Bed,4.976734
3,Entire home/apt,1.0,2.0,4,2.0,House,flexible,Real Bed,6.620073
4,Entire home/apt,1.0,0.0,2,1.0,Apartment,moderate,Real Bed,4.744932


In [None]:
import matplotlib.pyplot as plt
import datetime as dt
from pandas_profiling import ProfileReport
import re
# profile = ProfileReport(df, minimal=True).to_notebook_iframe()
# profile

In [None]:
def reduceCardinality(series, n=10, other='Other'):
  """Collapse the infrequent values of a Series into one catch-all label.

  The ``n`` most frequent values (as ranked by ``value_counts``, which
  ignores missing values) are kept as they are; every other entry,
  missing entries included, is replaced by ``other``.

  Parameters
  ----------
  series : pandas.Series
      Column whose number of distinct values should be reduced.
  n : int, default 10
      How many of the most frequent values to keep.
  other : object, default 'Other'
      Label written in place of every value outside the top ``n``.

  Returns
  -------
  pandas.Series
      A new Series with the same index as ``series``; the input is not
      modified.
  """
  # head(n) is always positional; plain [:n] slicing depends on the index type
  # of the value_counts() result (the distinct values themselves).
  keep = series.value_counts().head(n).index
  # Vectorised membership test instead of calling a Python lambda per row.
  return series.where(series.isin(keep), other)

def wrangle(df):
  """Return a wrangled copy of ``df`` for modelling.

  The only step at present is passing the 'property_type' column through
  ``reduceCardinality`` so that infrequent property types share one label.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing a 'property_type' column.

  Returns
  -------
  pandas.DataFrame
      A new frame with the same columns; ``df`` itself is left untouched.
  """
  # assign() builds a new DataFrame instead of writing into the caller's frame.
  # Writing in place made the cell non-idempotent (a second pass ranks the
  # catch-all label alongside the real categories) and risks a
  # SettingWithCopyWarning when ``df`` is itself a slice of another frame.
  return df.assign(property_type=reduceCardinality(df['property_type']))
# Run the wrangling step on the loaded frame and preview the first rows of the result.
df_trainable = wrangle(df)
df_trainable.head()

Unnamed: 0,room_type,bathrooms,bedrooms,accommodates,beds,property_type,cancellation_policy,bed_type,log_price
0,Entire home/apt,1.0,1.0,3,1.0,Apartment,strict,Real Bed,5.010635
1,Entire home/apt,1.0,3.0,7,3.0,Apartment,strict,Real Bed,5.129899
2,Entire home/apt,1.0,1.0,5,3.0,Apartment,moderate,Real Bed,4.976734
3,Entire home/apt,1.0,2.0,4,2.0,House,flexible,Real Bed,6.620073
4,Entire home/apt,1.0,0.0,2,1.0,Apartment,moderate,Real Bed,4.744932


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from xgboost import XGBRegressor

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from category_encoders import OneHotEncoder
from category_encoders import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor

#Non-performant models
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.linear_model import Ridge
# from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
# from sklearn.ensemble import VotingRegressor  # TODO

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

from sklearn.metrics import mean_squared_error as MSE

In [None]:
# Separate the target from the predictors, then hold out 25% of the rows for validation.
y = df_trainable['log_price']
X = df_trainable.drop(columns='log_price')
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Alternative components for each pipeline stage, grouped by role.
encoders = [OneHotEncoder(), OrdinalEncoder()]
imputers = [SimpleImputer(), IterativeImputer()]
scalers = [StandardScaler(), MinMaxScaler()]
models = [XGBRegressor()]

In [None]:
import joblib
# file = joblib.dump(pipe, 'pipe.pkl', )
# print(file)
# pipeNew = joblib.load('pipe.pkl')

In [None]:
# Hyper-parameter search space for the pipeline's 'model' step.
# The parameter names match scikit-learn's RandomForestRegressor -- presumably the
# intended estimator; confirm before use.
# Later cells rebind `params`, so on a top-to-bottom run only the last definition
# reaches the search.
params = dict(
    model__n_estimators=np.linspace(50, 150, 3, dtype='int64'),
    model__criterion=['mae', 'mse'],
    # model__max_depth=np.linspace(int(len(X_train) * (4/9)), int(len(X_train) * (6/9)),
    #                              3, dtype='int64').tolist(),
    model__min_samples_split=np.linspace(2, 750, 3, dtype='int64'),
    model__min_samples_leaf=np.linspace(1, 5, 3, dtype='int64'),
    model__min_weight_fraction_leaf=np.linspace(0, 0.4, 3),
    model__max_features=np.linspace(0.8, 0.999, 3).tolist(),
    # Leaf-node caps scale with the training-set size; None means no cap.
    model__max_leaf_nodes=[None] + np.linspace(
        int(len(X_train) * (1/4)), int(len(X_train) * (8/9)), 3, dtype='int64'
    ).tolist(),
    model__min_impurity_decrease=np.linspace(0, 0.1, 3),
    model__bootstrap=[True, False],
    model__ccp_alpha=np.linspace(0.8, 0.999, 3).tolist(),
)

In [None]:
# Hyper-parameter search space for the pipeline's 'model' step.
# The parameter names match scikit-learn's SVR -- presumably the intended
# estimator; confirm before use.
params = {
    'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'model__degree': np.arange(2, 6),
    'model__gamma': ['scale', 'auto'],
    'model__coef0': np.linspace(0, 1, 3),
    # Evenly spaced grid plus SVR's default C of 1.0.
    'model__C': np.linspace(0.1, 2, 3).tolist() + [1.0],
    # Evenly spaced grid plus SVR's default epsilon of 0.1. The grid already
    # ends at 1.0, so appending 1.0 again (as before) only duplicated a
    # candidate and made it twice as likely to be sampled.
    'model__epsilon': np.linspace(0.01, 1, 3).tolist() + [0.1],
    'model__shrinking': [True, False],
}

In [None]:
# Hyper-parameter search space for the XGBRegressor 'model' step of the pipeline.
params = dict(
    model__booster=['gbtree', 'dart'],
    model__max_depth=np.arange(2, 10),
    model__n_estimators=np.arange(60, 220, 40),
    model__learning_rate=np.linspace(0.01, 0.1, 3),
)

In [None]:
# Bookkeeping for searches that beat the best validation MSE seen so far.
performant_pipes = []
saved_files = []
# Validation MSE to beat.
# NOTE(review): hard-coded; presumably the best score of an earlier run -- confirm.
prev_score = 0.2411

# Preprocessing + model chain whose 'model__*' hyper-parameters are searched below.
pipe = Pipeline([
    ('encode', OrdinalEncoder()),
    ('impute', SimpleImputer()),
    ('scale', MinMaxScaler()),
    ('model', XGBRegressor()),
])

# NOTE(review): no `scoring` is passed, so candidates are ranked with the
# estimator's default score rather than the MSE used for the comparison below.
grid = RandomizedSearchCV(pipe, param_distributions=params, n_jobs=-1, cv=5, random_state=91)
grid.fit(X_train, y_train)

# Evaluate the refitted best candidate on the held-out split.
# mean_squared_error takes (y_true, y_pred); same order as the later evaluation cell.
y_pred = grid.predict(X_val)
score = MSE(y_val, y_pred)

if score < prev_score:
  prev_score = score
  performant_pipes.append(grid)
  # `pipe` is rebound to the fitted best pipeline only when the score improved.
  pipe = grid.best_estimator_
  fileName = '_'.join([str(score)[:6], 'grid.joblib'])
  # NOTE(review): pipe[:-1] drops the final 'model' step, so only the fitted
  # preprocessing steps are written to this file -- confirm that is intended.
  pipefile = joblib.dump(pipe[:-1], fileName)
  saved_files.append(pipefile)

In [None]:
# Write the fitted search object to disk, then load it back under a second name.
fileName = 'grid_XGB.joblib'
joblib.dump(grid, fileName)
grid2 = joblib.load(fileName)


In [None]:
# Score the reloaded search object on the validation split (MSE on log_price).
y_pred = grid2.predict(X_val)
MSE(y_val, y_pred)

0.25214087730507234

In [None]:
# Show the latest validation score and the current pipeline, one per line.
print(score, pipe, sep='\n')
performant_pipes.append(pipe)
# Persist the pipeline under a filename prefixed with the first six characters of the score.
score_tag = str(score)[:6]
fileName = '_'.join([score_tag, 'model_', '.joblib'])
file = joblib.dump(pipe, fileName)
saved_files.append(file)

0.24013775908768448
Pipeline(memory=None,
         steps=[('encode',
                 OrdinalEncoder(cols=['room_type', 'property_type',
                                      'cancellation_policy', 'bed_type'],
                                drop_invariant=False, handle_missing='value',
                                handle_unknown='value',
                                mapping=[{'col': 'room_type',
                                          'data_type': dtype('O'),
                                          'mapping': Entire home/apt    1
Private room       2
Shared room        3
NaN               -2
dtype: int64},
                                         {'col': 'property_type',
                                          'data_type': dtype('O')...
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, gamma=0,
                              importance_type='gain',
                              learning_rate=0.055000000

In [None]:
import joblib
# Reload the search object saved by this notebook. The previous empty path ('')
# could never be opened, so the cell always raised.
# NOTE(review): 'grid_XGB.joblib' is the search file written earlier in this
# notebook -- point this at a different artifact if another one was meant.
# joblib files are pickle-based: only load files from a trusted source.
grid = joblib.load('grid_XGB.joblib')