In [192]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

%matplotlib inline
sns.set_style('darkgrid')

In [193]:
df_train = pd.read_csv("/content/train.csv", index_col='Id')
df_test = pd.read_csv("/content/sample_submission.csv", usecols=[0,1])

In [194]:
df = pd.merge(df_train, df_test, on="Id")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Actual_SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [195]:
df_test.shape

(1460, 2)

In [196]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Id                1460 non-null   int64  
 1   MSSubClass        1460 non-null   int64  
 2   MSZoning          1460 non-null   object 
 3   LotFrontage       1201 non-null   float64
 4   LotArea           1460 non-null   int64  
 5   Street            1460 non-null   object 
 6   Alley             91 non-null     object 
 7   LotShape          1460 non-null   object 
 8   LandContour       1460 non-null   object 
 9   Utilities         1460 non-null   object 
 10  LotConfig         1460 non-null   object 
 11  LandSlope         1460 non-null   object 
 12  Neighborhood      1460 non-null   object 
 13  Condition1        1460 non-null   object 
 14  Condition2        1460 non-null   object 
 15  BldgType          1460 non-null   object 
 16  HouseStyle        1460 non-null   object 


In [197]:
print("Train data's size: ", df_train.shape)
print("Test data's size: ", df_test.shape)
print("Concanated Data size:", df.shape)

Train data's size:  (1460, 79)
Test data's size:  (1460, 2)
Concanated Data size: (1460, 81)


In [198]:
numCols = list(df_train.select_dtypes(exclude='object').columns)
print(f"There are {len(numCols)} numerical features:\n", numCols)

There are 36 numerical features:
 ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']


In [199]:
catCols = list(df_train.select_dtypes(include='object').columns)
print(f"There are {len(catCols)} numerical features:\n", catCols)

There are 43 numerical features:
 ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [200]:
#Top 10 numerical variables highly correlated with SalePrice:

corr_mat = df.corr().Actual_SalePrice.sort_values(ascending=False)
corr_mat.head(11)

Actual_SalePrice    1.000000
OverallQual         0.790982
GrLivArea           0.708624
GarageCars          0.640409
GarageArea          0.623431
TotalBsmtSF         0.613581
1stFlrSF            0.605852
FullBath            0.560664
TotRmsAbvGrd        0.533723
YearBuilt           0.522897
YearRemodAdd        0.507101
Name: Actual_SalePrice, dtype: float64

***Data Processing***

In [201]:
missing_data_count = df.isnull().sum()
missing_data_percent = df.isnull().sum() / len(df) * 100

missing_data = pd.DataFrame({
    'Count': missing_data_count,
    'Percent': missing_data_percent
})
missing_data = missing_data[missing_data.Count > 0]
missing_data.sort_values(by='Count', ascending=False, inplace=True)

print(f"There are {missing_data.shape[0]} features having missing data.\n")
print("Top 10 missing value features:")
missing_data.head(10)

There are 19 features having missing data.

Top 10 missing value features:


Unnamed: 0,Count,Percent
PoolQC,1453,99.520548
MiscFeature,1406,96.30137
Alley,1369,93.767123
Fence,1179,80.753425
FireplaceQu,690,47.260274
LotFrontage,259,17.739726
GarageType,81,5.547945
GarageYrBlt,81,5.547945
GarageFinish,81,5.547945
GarageQual,81,5.547945


In [202]:
from sklearn.impute import SimpleImputer
# Group 1:
group_1 = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType'
]
df[group_1] = df[group_1].fillna("None")

# Group 2:
group_2 = [
    'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea'
]

df[group_2] = df[group_2].fillna(0)

# Group 3:
group_3a = [
    'Functional', 'MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st',
    'Exterior2nd', 'SaleType', 'Utilities'
]

imputer = SimpleImputer(strategy='most_frequent')
df[group_3a] = pd.DataFrame(imputer.fit_transform(df[group_3a]), index=df.index)

df.LotFrontage = df.LotFrontage.fillna(df.LotFrontage.mean())
df.GarageYrBlt = df.GarageYrBlt.fillna(df.YearBuilt)

In [203]:
#Let's check whether there is any missing value left:
sum(df.isnull().sum())

0

In [204]:
#There are two observations lying separately from the rest. They have large living area but low price. They are the outliers that we are looking for. I will delete them from the training set.

outlier_index = df_train[(df_train.GrLivArea > 4000)].index
df.drop(outlier_index, axis=0, inplace=True)
df_test.drop(outlier_index, axis=0, inplace=True)

### **3.3. Feature Engineering**

3.3.1. Create New Variables
In this step I will create new features from weaker features in the training data. For example, the surface area of each floor has low correlation with house price; however, when we sum them up, the relationship becomes much stronger. In fact, TotalSqFeet becomes the strongest feature in the dataset. The new features I will create are total square feet, total number of bathrooms, age of the house, whether the house was remodeled, and whether the house was sold in the same year it was built.

In [205]:
df['totalSqFeet'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
df['totalBathroom'] = df.FullBath + df.BsmtFullBath + 0.5 * (df.HalfBath + df.BsmtHalfBath)
df['houseAge'] = df.YrSold - df.YearBuilt
df['reModeled'] = np.where(df.YearRemodAdd == df.YearBuilt, 0, 1)
df['isNew'] = np.where(df.YrSold == df.YearBuilt, 1, 0)

### 3.3.2. Label Encoding
Ordinal categorical features are label encoded. **bold text**

In [206]:
from sklearn.preprocessing import LabelEncoder

# Ordinal categorical columns
label_encoding_cols = [
    "Alley", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "BsmtQual", "ExterCond", "ExterQual", "FireplaceQu", "Functional",
    "GarageCond", "GarageQual", "HeatingQC", "KitchenQual", "LandSlope",
    "LotShape", "PavedDrive", "PoolQC", "Street", "Utilities"
]

# Apply Label Encoder
label_encoder = LabelEncoder()

for col in label_encoding_cols:
    df[col] = label_encoder.fit_transform(df[col])

### 3.3.3. Transform Numerical Variables to Categorical Variables
Because I have calculated age of houses, YearBuilt is no longer needed. However, YrSold could have a large impact on house price (e.g. In economic crisis years, house price could be lower). Therefore, I will transform it into categorical variables.

Like YrSold, some numerical variables don't have any ordinal meaning (e.g. MoSold, MSSubClass). I will transform them into categorical variables. 

In [207]:
to_factor_cols = ['YrSold', 'MoSold', 'MSSubClass']

for col in to_factor_cols:
    df[col] = df[col].apply(str)

### 3.5. Feature Scaling
Except for Decision Tree and Random Forest, it is highly recommended to standardize the data set before running machine learning algorithms since optimization methods and gradient descent run and converge faster on similarly scaled features.

However, outliers can often influence the sample mean and standard deviation in a negative way, and models like Lasso and Elastic Net are very sensitive to outliers. In such cases, the median and the interquartile range often give better results. I will use RobustScaler to transform the training data. 

In [208]:
from sklearn.preprocessing import RobustScaler
numerical_cols = list(df.select_dtypes(exclude=['object']).columns)
scaler = RobustScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

### 3.6. One-hot Encoding

In [209]:
X = pd.get_dummies(df, drop_first=True)
print("X.shape:", X.shape)
y = df_test["Actual_SalePrice"]

X.shape: (1456, 239)


In [210]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [211]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(975, 239)
(481, 239)
(975,)
(481,)


In [212]:
from sklearn.metrics import mean_squared_error

# Helper RMSE function
def rmse(model, X, y):
    predictions = model.predict(X)
    mse = mean_squared_error(y, predictions)
    return np.sqrt(mse)

In [213]:
from xgboost import XGBRegressor

In [214]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [215]:
def custom_function(Y_train, Y_pred):
    RMSE=(round(mean_squared_error((Y_train),(Y_pred), squared=False), 4))
    return RMSE

scorer=make_scorer(custom_function,greater_is_better=False)

In [216]:
import xgboost as xgb
reg = xgb.XGBRegressor()
reg.fit(X_train, y_train)
Y_pred = reg.predict(X_test)
print(reg.score(X_train,y_train))
print("RMSE: ",round(mean_squared_error((y_test),(Y_pred), squared=False), 4))

0.9999331837694213
RMSE:  1112.6481


In [217]:
from sklearn.metrics import mean_squared_error

# Helper RMSE function
def rmse(model, X, y):
    predictions = model.predict(X)
    mse = mean_squared_error(y, predictions)
    return np.sqrt(mse)
# Compute RMSE of linear regression model
rmse(reg, X_test, y_test)

1112.6480508479199

### **2nd Approach**

In [218]:
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/sample_submission.csv", usecols=[0,1])
df1 = pd.merge(df_train, df_test, on="Id")

In [219]:
train= df1[~((df1['BsmtFinSF2']>1200) |(df1['ScreenPorch']>350)|(df1['GrLivArea']>4000)|(df1['OpenPorchSF']>350)|(df1['EnclosedPorch']>350)| (df1['BsmtFinSF1']>3000) |(df1['MasVnrArea']>1200) |(df1['LotArea']>100000) | (df1['LotFrontage']>200))]

In [220]:
train.shape

(1432, 81)

In [221]:
df2=pd.DataFrame({'Type': df1.dtypes,
                  'Missing': df1.isna().sum(),
                  'Size':df1.shape[0],
                  'Unique': df1.nunique()})
df2['Missing_%']= (df2.Missing/df2.Size)*100
df2[df2['Missing']>0].sort_values(by=['Missing_%'], ascending=False)

Unnamed: 0,Type,Missing,Size,Unique,Missing_%
PoolQC,object,1449,1456,3,99.519231
MiscFeature,object,1402,1456,4,96.291209
Alley,object,1366,1456,2,93.818681
Fence,object,1176,1456,4,80.769231
FireplaceQu,object,689,1456,5,47.321429
LotFrontage,float64,259,1456,110,17.788462
GarageType,object,81,1456,6,5.563187
GarageYrBlt,float64,81,1456,97,5.563187
GarageFinish,object,81,1456,3,5.563187
GarageQual,object,81,1456,5,5.563187


In [222]:
df1['PoolQC']=df1['PoolQC'].fillna('NA')
df1['MiscFeature']=df1['MiscFeature'].fillna('NA')
df1['Alley']=df1['Alley'].fillna('NA')
df1['Fence']=df1['Fence'].fillna('NA')
df1['FireplaceQu']=df1['FireplaceQu'].fillna('NA')
df1['GarageType']=df1['GarageType'].fillna('NA')
df1['GarageFinish']=df1['GarageFinish'].fillna('NA')
df1['BsmtCond']=df1['BsmtCond'].fillna('NA')
df1['BsmtExposure']=df1['BsmtExposure'].fillna('NA')
df1['BsmtQual']=df1['BsmtQual'].fillna('NA')
df1['BsmtFinType2']=df1['BsmtFinType2'].fillna('NA')
df1['Electrical']=df1['Electrical'].fillna(df1['Electrical'].mode()[0])
df1['GarageCond']=df1['GarageCond'].fillna('NA')
df1['GarageQual']=df1['GarageQual'].fillna('NA')
df1['BsmtFinType1']=df1['BsmtFinType1'].fillna('NA')
df1['MasVnrType']=df1['MasVnrType'].fillna('None')
df1['MSZoning']=df1['MSZoning'].fillna('RL')
df1['Functional']=df1['Functional'].fillna('Typ')
df1['Utilities']=df1['Utilities'].fillna('AllPub')
df1['KitchenQual']=df1['KitchenQual'].fillna('TA')
df1['Exterior2nd']=df1['Exterior2nd'].fillna('VinylSd')
df1['Exterior1st']=df1['Exterior1st'].fillna('VinylSd')
df1['SaleType']=df1['SaleType'].fillna('WD')
df1['LotFrontage'] = df1['LotFrontage'].fillna(df1['LotFrontage'].median())
df1['GarageYrBlt'] = df1['GarageYrBlt'].fillna(df1['GarageYrBlt'].median())
df1['MasVnrArea'] = df1['MasVnrArea'].fillna(df1['MasVnrArea'].median())

In [223]:
df1['MSSubClass']=df1['MSSubClass'].astype(object)
df1['MoSold']=df1['MoSold'].astype(object)

In [224]:
'''
for i in df2[df2['Missing']>0].index:
    if i=='SalePrice':
        continue
    else:
        df1[i]=df1[i].fillna(df1[i].median())
        '''

"\nfor i in df2[df2['Missing']>0].index:\n    if i=='SalePrice':\n        continue\n    else:\n        df1[i]=df1[i].fillna(df1[i].median())\n        "

In [225]:
df2=pd.DataFrame({'Type': df1.dtypes,
                  'Missing': df1.isna().sum(),
                  'Size':df1.shape[0],
                  'Unique': df1.nunique()})
df2['Missing_%']= (df2.Missing/df2.Size)*100
df2[df2['Missing']>0].sort_values(by=['Missing_%'], ascending=False)

Unnamed: 0,Type,Missing,Size,Unique,Missing_%


In [226]:
categorical=['MSSubClass', 'MSZoning', 'LotConfig', 'Neighborhood', 'LandSlope', 'LandContour',
             'Condition1','Condition2','BldgType', 'HouseStyle', 'YearBuilt','YearBuilt',
             'YearRemodAdd', 'RoofStyle', 'Exterior1st', 'Exterior2nd','RoofMatl',
            'MasVnrType', 'Foundation', 'Heating', 'Electrical', 'GarageType', 'Fence',
            'MiscFeature', 'MoSold' ,'YrSold', 'SaleType', 'PavedDrive','Alley','SaleCondition' ]

In [227]:
ordinal=['LotShape', 'ExterQual','ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 
         'BsmtFinType1', 'BsmtFinType2','HeatingQC','KitchenQual', 'Functional', 
        'FireplaceQu', 'GarageFinish','GarageQual', 'GarageCond', ]

In [228]:
numerical=['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'MasVnrArea',
          'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', 
          '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath','FullBath',
          'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 
           'GarageCars', 'GarageArea', 'WoodDeckSF','OpenPorchSF', 'EnclosedPorch',
          'ScreenPorch', 'MiscVal', 'GarageYrBlt', 'Actual_SalePrice' ]

In [229]:
ex_qu= { 'Po':0, 'Fa': 0, 'TA': 1, 'Gd': 2, 'Ex': 3 }
ex_cond={ 'Po':0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4 }
Bsmt_Qual={"NA": 0, 'Po':1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5 }
BsmtFinType1={ "NA": 0, 'Unf':0, 'LwQ': 1, 'Rec': 2, 'BLQ': 3, 'ALQ': 4, 'GLQ':5 }
Bsmt_Exposure={ "NA":0, "No": 0, "Mn": 1, "Av": 2, "Gd": 3}
garage_fin={'NA': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
garage_qu= { "NA": 0, 'Po':0, 'Fa':1, 'TA': 2, 'Gd': 3, 'Ex': 4  }
LotShape={"IR3": 0, 'IR2':0, 'IR1': 1, 'Reg': 2}
Functional={"Sal": 0, 'Sev':1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2':5, 'Min1':6, 'Typ':7}

data=df1.replace({"LotShape": LotShape,
                    "ExterQual": ex_qu,
                   "ExterCond": ex_cond,
                   "BsmtQual": Bsmt_Qual,
                   "BsmtCond": Bsmt_Qual,
                   "BsmtExposure": Bsmt_Exposure, 
                   "BsmtFinType1": BsmtFinType1, 
                   "BsmtFinType2": BsmtFinType1,
                   "HeatingQC": ex_qu,
                   "KitchenQual": ex_qu,
                   "Functional": Functional,
                    "GarageFinish": garage_fin,
                    "GarageQual": garage_qu,
                    "GarageCond": garage_qu,
                    "FireplaceQu": garage_qu})

In [230]:
X1=data[ordinal]
X2=pd.get_dummies(data[categorical], drop_first=True)
X3=data[numerical]
X3

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,Fireplaces,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,MiscVal,GarageYrBlt,Actual_SalePrice
0,65.0,8450,7,5,196.0,706,0,150,856,856,...,0,2,548,0,61,0,0,0,2003.0,208500
1,80.0,9600,6,8,0.0,978,0,284,1262,1262,...,1,2,460,298,0,0,0,0,1976.0,181500
2,68.0,11250,7,5,162.0,486,0,434,920,920,...,1,2,608,0,42,0,0,0,2001.0,223500
3,60.0,9550,7,5,0.0,216,0,540,756,961,...,1,3,642,0,35,272,0,0,1998.0,140000
4,84.0,14260,8,5,350.0,655,0,490,1145,1145,...,1,3,836,192,84,0,0,0,2000.0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,62.0,7917,6,5,0.0,0,0,953,953,953,...,1,2,460,0,40,0,0,0,1999.0,175000
1452,85.0,13175,6,6,119.0,790,163,589,1542,2073,...,2,2,500,349,0,0,0,0,1978.0,210000
1453,66.0,9042,7,9,0.0,275,0,877,1152,1188,...,2,1,252,0,60,0,0,2500,1941.0,266500
1454,68.0,9717,5,6,0.0,49,1029,0,1078,1078,...,0,1,240,366,0,112,0,0,1950.0,142125


In [231]:
X3_train=X3.iloc[:len(train),:]
X3_test=X3.iloc[len(train):,:]

In [232]:
skewed_columns=[]
for i in X3_train.columns:
    if abs(X3_train[i].skew())> 0.5:
        skewed_columns.append(i)

from scipy.special import boxcox1p
lam=0.15
for i in skewed_columns:
    X3_train[i]= boxcox1p(X3_train[i],lam)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [233]:
from scipy.special import boxcox1p
lam=0.15
for i in skewed_columns:
    X3_test[i]= boxcox1p(X3_test[i],lam)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [234]:
X3=pd.concat([X3_train,X3_test], axis=0)

In [235]:
dataset=(pd.concat([X2, X1, X3], axis=1))
dataset.shape

(1456, 225)

In [236]:
Y=test['Actual_SalePrice']
Y = test.iloc[:len(dataset),:]
Y = Y["Actual_SalePrice"].values
X=dataset.iloc[:len(test),:].values
#X=dataset.values
#Y=test['Actual_SalePrice'].values
#y=Y.reshape(-1,1)
#Y=np.log1p(Y)
print(X.shape)
print(Y.shape)

(1456, 225)
(1456,)


In [237]:
test_dataset=dataset.iloc[:,:]

In [238]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size= 0.2, random_state=1)

In [239]:
from sklearn.preprocessing import RobustScaler
sc= RobustScaler()
X_train[:,(len(X1.columns)+len(X2.columns)):]= sc.fit_transform(X_train[:, (len(X1.columns)+len(X2.columns)):])
X_test[:,(len(X1.columns)+len(X2.columns)):]= sc.transform(X_test[:, (len(X1.columns)+len(X2.columns)):])

In [240]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [241]:
def custom_function(Y_train, Y_pred):
    RMSE=(round(mean_squared_error((Y_train),(Y_pred), squared=False), 4))
    return RMSE

scorer=make_scorer(custom_function,greater_is_better=False)

In [242]:
import xgboost as xgb
reg = xgb.XGBRegressor()
reg.fit(X_train, Y_train)
Y_pred = reg.predict(X_test)
print(reg.score(X_train,Y_train))
print("RMSE: ",round(mean_squared_error((Y_test),(Y_pred), squared=False), 4))
#print("MSE: ", round(mean_absolute_error(np.exp(Y_test), np.exp(Y_pred)), 4))


0.5045014038289568
RMSE:  86275.6113
