In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression


In [None]:
# Turn the Google Drive share link into a direct-download URL and read the CSV.
url = "https://drive.google.com/file/d/1t3Rxpb5Hr0baI1KZWSvrAujv68A_CdnK/view?usp=share_link"
file_id = url.split('/')[-2]  # second-to-last path segment of the share link
path = 'https://drive.google.com/uc?export=download&id=' + file_id

# `data` and `df` are two names bound to the same DataFrame object (no copy).
df = pd.read_csv(path)
data = df

In [None]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# ***SPLIT DATA***

In [None]:
# Target vector and feature matrix: `SalePrice` is the target, and `Id` is
# removed from the features along with it. `data` itself is left untouched.
y = data["SalePrice"].copy()
X = data.drop(columns=["Id", "SalePrice"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ***PREPROCESSING***

In [None]:
# 1. Split the features by dtype: non-numeric columns are handled as
#    categorical, numeric columns as numerical.
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

# 2. Numerical pipeline: impute missing values. The imputation strategy is left
#    at the SimpleImputer default here and is tuned later via the search grid.
numeric_pipe = make_pipeline(SimpleImputer())

# 3. Categorical pipeline
# 3.1 Columns that are ordinal-encoded; every other categorical column is
#     one-hot encoded.
ordinal_col_names = ['ExterQual',
                     'ExterCond',
                     'BsmtQual',
                     'BsmtCond',
                     'BsmtExposure',
                     'BsmtFinType1',
                     'KitchenQual',
                     'FireplaceQu',
                     'LotShape',
                     'BsmtFinType2',
                     'HeatingQC',
                     'GarageFinish',
                     'GarageQual',
                     'GarageCond',
                     'PoolQC',
                     'Fence']

# Keep the one-hot columns in their original X_cat order. A set difference
# would iterate in an order that depends on per-process string hashing, so the
# encoded column order (and anything fitted on it) could change between kernel
# restarts.
ohe_col_names = [col for col in X_cat.columns if col not in ordinal_col_names]

# Positions are relative to X_cat: the encoders run after the "N_A" imputer in
# `categorical_pipe`, so they only ever see the categorical columns.
ordinal_cols = X_cat.columns.get_indexer(ordinal_col_names)
ohe_cols = X_cat.columns.get_indexer(ohe_col_names)

# get_indexer reports a missing label as -1, which as a positional index would
# silently select the LAST column. Fail loudly instead.
missing_ordinal = [name for name, pos in zip(ordinal_col_names, ordinal_cols) if pos == -1]
if missing_ordinal:
    raise KeyError(f"Ordinal columns not found among the categorical features: {missing_ordinal}")

X_cat_ordinal = X_cat.columns[ordinal_cols]
X_cat_ohe = X_cat.columns[ohe_cols]

# 3.2 Explicit category order for each ordinal column. The position in the list
#     is the integer code the encoder assigns. "N_A" is the placeholder written
#     by the imputer in `categorical_pipe`, so it is listed first (code 0).
quality_scale = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
basement_finish_scale = ["N_A", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]

ExterQual_cats = list(quality_scale)
ExterCond_cats = list(quality_scale)
BsmtQual_cats = list(quality_scale)
BsmtCond_cats = list(quality_scale)
BsmtExposure_cats = ["N_A", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = list(basement_finish_scale)
KitchenQual_cats = list(quality_scale)
FireplaceQu_cats = list(quality_scale)
LotShape_cats = ["N_A", 'Reg', 'IR1', 'IR2', 'IR3']
BsmtFinType2_cats = list(basement_finish_scale)
HeatingQC_cats = list(quality_scale)
GarageFinish_cats = ['N_A', 'Unf', 'RFn', 'Fin']
GarageQual_cats = list(quality_scale)
GarageCond_cats = list(quality_scale)
PoolQC_cats = list(quality_scale)
# NOTE(review): pandas.read_csv parses the literal string "NA" as missing by
# default, so the 'NA' level below is presumably never observed. It is kept so
# the integer codes of the remaining levels stay unchanged.
Fence_cats = ["N_A", 'NA', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']

# Must stay in the same order as `ordinal_col_names`: OrdinalEncoder matches
# the i-th category list to the i-th column it receives.
cats_ord = [ExterQual_cats, ExterCond_cats, BsmtQual_cats, BsmtCond_cats,
            BsmtExposure_cats, BsmtFinType1_cats, KitchenQual_cats, FireplaceQu_cats,
            LotShape_cats, BsmtFinType2_cats, HeatingQC_cats, GarageFinish_cats, GarageQual_cats,
            GarageCond_cats, PoolQC_cats, Fence_cats]

if len(cats_ord) != len(ordinal_col_names):
    raise ValueError(
        f"{len(cats_ord)} category lists given for {len(ordinal_col_names)} ordinal columns"
    )

# 3.2.2 Categorical encoder: a ColumnTransformer with two branches (ordinal and
#       one-hot). sparse_threshold=0 forces a dense result: the one-hot branch
#       is sparse by default, and the downstream StandardScaler (with centering)
#       and PCA steps need dense input.
categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", OrdinalEncoder(categories=cats_ord), ordinal_cols),
        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), ohe_cols),
    ],
    sparse_threshold=0,
)

# 3.3 Categorical pipeline = "N_A" imputer + categorical encoder.
categorical_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    categorical_encoder,
)

# 4. Full preprocessing: a ColumnTransformer with two branches (numeric and
#    categorical), also forced to return a dense array.
full_preprocessing = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categorical_pipe, X_cat.columns),
    ],
    sparse_threshold=0,
)


# ***PCA***

In [None]:
from sklearn.decomposition import PCA

In [None]:
import time


In [None]:
from sklearn.tree import DecisionTreeRegressor

# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()

# Preprocess -> scale -> keep the principal components that explain 95% of the
# variance -> decision tree. The tree is seeded (same seed as the train/test
# split) so repeated runs give the same model.
DT_pipeline = make_pipeline(
    full_preprocessing,
    StandardScaler(),
    PCA(n_components=0.95),
    DecisionTreeRegressor(random_state=42),
)

# Keys follow make_pipeline's auto-generated step names (lower-cased class names).
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "constant"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    "decisiontreeregressor__max_depth": range(3, 15, 3),
    "decisiontreeregressor__min_samples_split": range(50, 200, 30),
}

# Every entry above is a finite sequence, so the search space is a finite grid.
# Asking for more iterations than there are grid points only triggers a
# scikit-learn warning and gets capped, so cap it explicitly.
n_candidates = 1
for values in param_grid.values():
    n_candidates *= len(values)

DT_search = RandomizedSearchCV(
    DT_pipeline,
    param_grid,
    cv=10,
    scoring="neg_mean_absolute_error",
    verbose=1,
    n_iter=min(100, n_candidates),
    random_state=42,  # reproducible sampling of candidates
)

DT_search.fit(X_train, y_train)

end_time = time.perf_counter()

time_taken_95_pca = end_time - start_time

# best_score_ is the mean cross-validated *negative* MAE (closer to 0 is
# better); it is an error in the target's units, not an accuracy percentage.
print(DT_search.best_score_)

print(f"Time taken: {time_taken_95_pca} seconds")


Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Time taken: 197.48345065116882 seconds
Accuracy -2440750.8%
