In [164]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression, RFECV, SelectFromModel


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 80)

In [165]:
def gd_path(file_id):
    """Generate a shareable link from Google Drive file id."""
    return f"https://drive.google.com/uc?export=download&id={file_id}"

files_id = {
    'housing_data':"1kX_jcLeBpBGvTo8FXDeU2MK-aw1a0voU", #iteration7 data
    'test_housing_data':"1CMsAWhWKWBjI6DDEHtcYmRVZRazfE9bo", #test data for housing
    'ids_com':"10gwiL49calkj-xbx-3rQEK4H2zoJcU11" #ID for the commiting of the project

}

housing_data = pd.read_csv(gd_path(files_id['housing_data']), sep=",")
test_housing_data = pd.read_csv(gd_path(files_id['test_housing_data']), sep=",")
ids_com = pd.read_csv(gd_path(files_id['ids_com']), sep=",")
df = housing_data

# Pre-Processing Pipe

## Ordinary columns

This part was done by hands. Many Bothans died to bring us this information.

In [166]:
from sklearn.model_selection import train_test_split

# X and y creation
y = df.pop("SalePrice")

# Feature Engineering
X = df

# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [167]:
# building the pipeline
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

#  numerical pipeline
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"),
    MinMaxScaler(),
    VarianceThreshold(threshold=0.02)) # maybe remove it
    


# categorical pipeline
# defining ordinal & onehot columns

ordinal_cols = X_cat.columns.get_indexer(['LandContour', 'LandSlope',
                     'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                     'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                     'HeatingQC', 'KitchenQual', 'Functional',
                     'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
                     'PoolQC']) 

onehot_cols = X_cat.columns.get_indexer(['MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir', 
                                         'Foundation', 'Alley', 'LotShape', 'Utilities', 'LotConfig', 
                                         'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle', 
                                         'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 
                                         'Electrical', 'GarageType', 'PavedDrive', 'Fence', 'MiscFeature',
                                         'SaleType', 'SaleCondition'])



# defining the categorical encoder
# we manually establish the order of the categories for our ordinal feature (Cabin), including "N_A"
LandContour = ['Lvl', 'Bnk', 'HLS', 'Low']
LandSlope = ['Gtl', 'Mod', 'Sev']
ExterQual = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
ExterCond = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
BsmtQual = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A']
BsmtCond = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A']
BsmtExposure = ['Gd', 'Av', 'Mn', 'No', 'N_A']
BsmtFinType1 = ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'N_A']
BsmtFinType2 = ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'N_A']
HeatingQC = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
KitchenQual = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
Functional = ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal']
FireplaceQu = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A']
GarageFinish = ['Fin', 'RFn', 'Unf', 'N_A']
GarageQual = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A']
GarageCond = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A']
PoolQC = ['Ex', 'Gd', 'TA', 'Fa', 'N_A']


categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", OrdinalEncoder(categories=[LandContour,LandSlope,
                                                    ExterQual, ExterCond, BsmtQual, BsmtCond,
                                                    BsmtExposure, BsmtFinType1, BsmtFinType2,
                                                    HeatingQC, KitchenQual, Functional,
                                                    FireplaceQu, GarageFinish, GarageQual, GarageCond,
                                                    PoolQC]), ordinal_cols),

        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), onehot_cols)
    ]
)


categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    categorical_encoder 
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)

In [168]:
pd.DataFrame(categoric_pipe.fit_transform(X_cat).toarray()).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,0.185616,0.062329,1.60411,1.916438,1.510959,2.065068,2.369863,2.45411,4.75274,0.854795,1.488356,0.158219,3.174658,1.284247,2.189726,2.191096,3.986986,0.006849,0.044521,0.010959,0.788356,0.149315,0.032877,0.055479,0.863014,0.005479,0.013014,0.007534,0.017808,0.00137,0.003425,0.000685,0.978082,0.012329,0.004795,0.00137,0.00274,0.00411,0.99589,0.065068,0.934932,0.1,0.434247,0.443151,0.016438,0.00411,0.002055,0.034247,0.937671,0.028082,0.331507,0.028082,0.006849,0.633562,0.999315,0.000685,0.180137,0.064384,0.032192,0.00274,0.720548,0.011644,0.00137,0.010959,0.039726,0.019178,0.10274,0.034932,0.068493,0.05411,0.025342,0.011644,0.033562,0.15411,0.006164,0.05,0.028082,0.05274,0.077397,0.017123,0.050685,0.040411,0.058904,0.017123,0.026027,0.007534,0.00137,0.00411,0.989726,0.000685,0.00137,0.000685,0.000685,0.00137,0.835616,0.021233,0.035616,0.029452,0.078082,0.105479,0.009589,0.49726,0.005479,0.007534,0.304795,0.025342,0.044521,0.008904,0.781507,0.007534,0.19589,0.004795,0.00137,0.000685,0.982192,0.000685,0.000685,0.000685,0.007534,0.003425,0.00411,0.013699,0.000685,0.00137,0.034247,0.000685,0.041781,0.152055,0.000685,0.150685,0.073973,0.00137,0.017123,0.35274,0.141096,0.017808,0.013699,0.002055,0.004795,0.017123,0.000685,0.041096,0.141781,0.006849,0.146575,0.000685,0.09726,0.003425,0.017808,0.345205,0.134932,0.026027,0.010274,0.304795,0.005479,0.591781,0.087671,0.064384,0.018493,0.002055,0.000685,0.000685,0.913699,0.00411,0.59589,0.013014,0.060274,0.006164,0.265068,0.055479,0.061644,0.020548,0.917808,0.040411,0.036986,0.107534,0.007534,0.807534,0.00137,0.963014,0.00137,0.033562,0.000685,0.029452,0.00274,0.00137,0.006164,0.003425,0.003425,0.083562,0.002055,0.867808,0.069178,0.00274,0.008219,0.013699,0.820548,0.085616
std,0.606509,0.276232,0.57428,0.351054,0.876478,0.552159,1.067391,2.107776,0.892332,0.959501,0.66376,0.667698,1.810877,0.892831,0.722898,0.719685,0.204059,0.082505,0.206319,0.104145,0.408614,0.356521,0.178375,0.228992,0.343951,0.073846,0.113372,0.086502,0.132299,0.036999,0.05844,0.026171,0.146465,0.110386,0.0691,0.036999,0.052289,0.063996,0.063996,0.246731,0.246731,0.300103,0.495827,0.496928,0.127198,0.063996,0.045299,0.181924,0.241835,0.165264,0.470916,0.165264,0.082505,0.481996,0.026171,0.026171,0.384433,0.245519,0.17657,0.052289,0.448884,0.107313,0.036999,0.104145,0.195382,0.137198,0.303723,0.183669,0.252677,0.226311,0.157217,0.107313,0.18016,0.361177,0.078298,0.21802,0.165264,0.22359,0.267312,0.129775,0.219429,0.196989,0.235526,0.129775,0.159271,0.086502,0.036999,0.063996,0.100873,0.026171,0.036999,0.026171,0.026171,0.036999,0.37075,0.144209,0.185395,0.169128,0.268393,0.307275,0.097486,0.500164,0.073846,0.086502,0.460478,0.157217,0.206319,0.093973,0.413365,0.086502,0.397021,0.0691,0.036999,0.026171,0.132299,0.026171,0.026171,0.026171,0.086502,0.05844,0.063996,0.116277,0.026171,0.036999,0.181924,0.026171,0.200157,0.359197,0.026171,0.357864,0.261816,0.036999,0.129775,0.477986,0.34824,0.132299,0.116277,0.045299,0.0691,0.129775,0.026171,0.19858,0.348945,0.082505,0.353803,0.026171,0.296413,0.05844,0.132299,0.475598,0.341767,0.159271,0.100873,0.460478,0.073846,0.491673,0.282913,0.245519,0.134772,0.045299,0.026171,0.026171,0.280905,0.063996,0.490887,0.113372,0.238075,0.078298,0.441521,0.228992,0.24059,0.141914,0.274751,0.196989,0.188793,0.309897,0.086502,0.394372,0.036999,0.188793,0.036999,0.18016,0.026171,0.169128,0.052289,0.036999,0.078298,0.05844,0.05844,0.276824,0.045299,0.338815,0.253844,0.052289,0.090317,0.116277,0.383862,0.279893
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,2.0,1.0,2.0,2.0,0.0,5.0,0.0,1.0,0.0,1.0,1.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,0.0,0.0,2.0,2.0,1.0,2.0,3.0,2.0,5.0,0.0,2.0,0.0,3.0,1.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,0.0,0.0,2.0,2.0,2.0,2.0,3.0,5.0,5.0,2.0,2.0,0.0,5.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
max,3.0,2.0,3.0,4.0,5.0,5.0,4.0,6.0,6.0,4.0,3.0,6.0,5.0,3.0,5.0,5.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Models

## Decisiontree

In [169]:
from sklearn.model_selection import GridSearchCV

full_pipeline = make_pipeline(preprocessor,
                              DecisionTreeClassifier())

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean"],
    "decisiontreeclassifier__max_depth": [10],
    "decisiontreeclassifier__min_samples_leaf": [11]
}


search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)

scores = {"dtree" : search.best_score_}
print("Best parameters: ", search.best_params_)

scores

Fitting 5 folds for each of 1 candidates, totalling 5 fits




Best parameters:  {'columntransformer__num_pipe__simpleimputer__strategy': 'mean', 'decisiontreeclassifier__max_depth': 10, 'decisiontreeclassifier__min_samples_leaf': 11}


{'dtree': 0.017989068632845457}

## Decisiontree - Analysis

In [170]:
# Get predictions for the training and testing datasets
y_train_pred = search.predict(X_train)
y_test_pred = search.predict(X_test)

# Create confusion matrices
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)

# Calculate accuracy for training and testing datasets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

print("Training Accuracy:", round(accuracy_train,3))
print("Testing Accuracy:", round(accuracy_test,3),"\n")


dtree_r2 = r2_score(y_true = y_test,
                    y_pred = y_test_pred)

print("R-squared:", round(dtree_r2,3))

dtree_r2 = mean_absolute_percentage_error(y_true = y_test,
                    y_pred = y_test_pred)

print("MAPE:", round(dtree_r2,3))

Training Accuracy: 0.152
Testing Accuracy: 0.017 

R-squared: 0.454
MAPE: 0.191


## KNN


In [171]:
# Modeling Pipe - 2

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier


full_pipeline = make_pipeline(preprocessor,
                              KNeighborsClassifier())

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean"],
    "kneighborsclassifier__n_neighbors": [10]
}

search2 = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=10,
                      verbose=1)

search2.fit(X_train, y_train)

scores2 = {"knn" : search2.best_score_}
print("Best parameters: ", search2.best_params_)

scores2


Fitting 10 folds for each of 1 candidates, totalling 10 fits




Best parameters:  {'columntransformer__num_pipe__simpleimputer__strategy': 'mean', 'kneighborsclassifier__n_neighbors': 10}


{'knn': 0.004273504273504274}

## KNN - Analysis

In [172]:
# Get predictions for the training and testing datasets
y_train_pred = search2.predict(X_train)
y_test_pred = search2.predict(X_test)

# Create confusion matrices
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)

# Calculate accuracy for training and testing datasets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

print("Training Accuracy:", round(accuracy_train,3))
print("Testing Accuracy:", round(accuracy_test,3),"\n")


dtree_r2 = r2_score(y_true = y_test,
                    y_pred = y_test_pred)

print("R-squared:", round(dtree_r2,3))

dtree_r2 = mean_absolute_percentage_error(y_true = y_test,
                    y_pred = y_test_pred)

print("MAPE:", round(dtree_r2,3))

Training Accuracy: 0.11
Testing Accuracy: 0.007 

R-squared: 0.108
MAPE: 0.236


## RandomForest

In [173]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

full_pipeline = make_pipeline(preprocessor,
                              RandomForestClassifier())

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["median"],
    "randomforestclassifier__n_estimators": [500],
    "randomforestclassifier__max_features": ['sqrt'],
    'randomforestclassifier__max_depth' : [4],
    'randomforestclassifier__criterion' :['entropy']
}


'''param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["median"],
    "randomforestclassifier__n_estimators": [150],
    "randomforestclassifier__max_depth": [8],
    "randomforestclassifier__max_features": ["sqrt"],
    'criterion' :['gini', 'entropy']
}
'''
# need to be rearange 
search3 = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search3.fit(X_train, y_train)

scores3 = {"rf" : search3.best_score_}
print("Best parameters: ", search3.best_params_)
scores3

Fitting 5 folds for each of 1 candidates, totalling 5 fits




Best parameters:  {'columntransformer__num_pipe__simpleimputer__strategy': 'median', 'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__max_depth': 4, 'randomforestclassifier__max_features': 'sqrt', 'randomforestclassifier__n_estimators': 500}


{'rf': 0.016272330435420566}

## RandomForest - Analysis

In [174]:
# Get predictions for the training and testing datasets
y_train_pred = search3.predict(X_train)
y_test_pred = search3.predict(X_test)

# Create confusion matrices
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)

# Calculate accuracy for training and testing datasets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

print("Training Accuracy:", round(accuracy_train,3))
print("Testing Accuracy:", round(accuracy_test,3),"\n")


dtree_r2 = r2_score(y_true = y_test,
                    y_pred = y_test_pred)

print("R-squared:", round(dtree_r2,3))

dtree_r2 = mean_absolute_percentage_error(y_true = y_test,
                    y_pred = y_test_pred)

print("MAPE:", round(dtree_r2,3))

Training Accuracy: 0.486
Testing Accuracy: 0.017 

R-squared: 0.595
MAPE: 0.159


### SGDRegressor

In [175]:
from sklearn.linear_model import LinearRegression, SGDRegressor

sgd_pipeline = make_pipeline(preprocessor,
                            SGDRegressor())

sgd_pipeline.fit(X_train, y_train)

sgd_predictions = sgd_pipeline.predict(X_test)

SGD_mape = mean_absolute_percentage_error(y_true = y_test,
                    y_pred = y_test_pred)

print("MAPE:", round(SGD_mape,3))

SGR_r2 = r2_score(y_true = y_test,
                    y_pred = sgd_predictions)

print("R-squared:", round(SGR_r2, 3))

MAPE: 0.159
R-squared: 0.836


### LinearRegression

In [176]:
# Scale the data! 

lr_pipeline = make_pipeline(preprocessor,
                            LinearRegression())

lr_pipeline.fit(X_train, y_train)

lr_predictions = lr_pipeline.predict(X_test)

lr_mape = mean_absolute_percentage_error(y_true = y_test,
                    y_pred = y_test_pred)

print("MAPE:", round(lr_mape,3))

lr_r2 = r2_score(y_true = y_test,
                    y_pred = lr_predictions)

print("R-squared:", round(lr_r2,3))

MAPE: 0.159
R-squared: 0.866


# Iteration 0

In [177]:
y_test_pred_tree = search.predict(X_test)
y_test_pred_knn = search2.predict(X_test)
y_test_pred_rf = search3.predict(X_test)
y_test_pred_SGDR = sgd_pipeline.predict(X_test)
y_test_pred_lr = lr_pipeline.predict(X_test)


baseline_tree_r2 = r2_score(y_test, y_test_pred_tree)
baseline_knn_r2 = r2_score(y_test, y_test_pred_knn)
baseline_rf = r2_score(y_test, y_test_pred_rf)
baseline_SGDR = r2_score(y_test, y_test_pred_SGDR)
baseline_lr = r2_score(y_test, y_test_pred_lr)


performances = pd.DataFrame({'decision_tree': round(baseline_tree_r2,3),
                             'knn': round(baseline_knn_r2,3), 
                             'RF': round(baseline_rf,3),
                             'SGDR': round(baseline_SGDR, 2),
                            'LR': round(baseline_lr,3)},
                            index=['baseline'])

performances

Unnamed: 0,decision_tree,knn,RF,SGDR,LR
baseline,0.454,0.108,0.595,0.84,0.866


## Variance Threshold

In [178]:
X_train_num = X_train.select_dtypes(include="number").copy()


range_var_df = (pd.DataFrame({
                'Range': X_train_num.max() - X_train_num.min(),
                'Variance': round(X_train_num.var(),2)})
                
                .sort_values(by='Variance'))

range_var_df

Unnamed: 0,Range,Variance
BsmtHalfBath,2.0,0.05
KitchenAbvGr,1.0,0.05
HalfBath,2.0,0.25
BsmtFullBath,3.0,0.27
FullBath,3.0,0.3
Fireplaces,3.0,0.41
GarageCars,4.0,0.56
BedroomAbvGr,8.0,0.69
OverallCond,8.0,1.28
YrSold,4.0,1.78


Drop: KitchenAbvGr, BsmtHalfBath, HalfBath, BsmtFullBath, FullBath, Fireplaces, GarageCars, BedroomAbvGr with Variance under 1   

In [179]:
# Initialize the scaler.
my_scaler = MinMaxScaler().set_output(transform="pandas")

# Fit the scaler to X_train and fit_transform the values.
X_scaled = my_scaler.fit_transform(X_train_num)

(
  pd.DataFrame({
  'Range': X_scaled.max() - X_scaled.min(),
  'Variance': X_scaled.var()})
  .sort_values(by='Variance')
)

Unnamed: 0,Range,Variance
MiscVal,1.0,0.001249
LotArea,1.0,0.001935
3SsnPorch,1.0,0.003677
PoolArea,1.0,0.003701
TotalBsmtSF,1.0,0.00538
LotFrontage,1.0,0.006315
BsmtFinSF1,1.0,0.006809
1stFlrSF,1.0,0.008265
LowQualFinSF,1.0,0.008288
GrLivArea,1.0,0.010239


In [180]:
selector = VarianceThreshold(threshold=0.02)
X_var = selector.fit_transform(X_scaled)

print("shape before:", X_scaled.shape)
print("shape after:", X_var.shape)

shape before: (1168, 37)
shape after: (1168, 19)


In [181]:
X_test_num = X_test.select_dtypes(include="number").copy()

# Scale the test set, transform for train 
X_test_scaled = my_scaler.transform(X_test_num) 

# Apply the variance threshold to the scaled test set
X_test_var = selector.transform(X_test_scaled)


In [182]:
y_test_pred_tree = search.predict(X_test_var)
y_test_pred_knn = search2.predict(X_test_var)
y_test_pred_rf = search3.predict(X_test_var)
y_test_pred_SGDR = sgd_pipeline.predict(X_test_var)
y_test_pred_lr = lr_pipeline.predict(X_test_var)


baseline_tree_r2 = r2_score(y_test, y_test_pred_tree)
baseline_knn_r2 = r2_score(y_test, y_test_pred_knn)
baseline_rf = r2_score(y_test, y_test_pred_rf)
baseline_SGDR = r2_score(y_test, y_test_pred_SGDR)
baseline_lr = r2_score(y_test, y_test_pred_lr)


performances.loc["varThreshold_0_02", 'decision_tree'] = round(baseline_tree_r2,3)
performances.loc["varThreshold_0_02", 'knn'] = round(baseline_knn_r2,3) 
performances.loc["varThreshold_0_02", 'RF'] = round(baseline_rf,3)
performances.loc["varThreshold_0_02", 'SGDR'] = round(baseline_SGDR, 2)
performances.loc["varThreshold_0_02", 'LR'] = round(baseline_lr,3)


performances

ValueError: X has 19 features, but ColumnTransformer is expecting 80 features as input.

### Scaling the data

# Downloand

In [None]:
X_sumbmition = housing_data[0:1459]
ids_com['Expensive'] = search4.predict(X_sumbmition) # only cange the piplene! 
ids_com.to_csv(r'submission_9.csv', index=False) #only cahnge ist 

NameError: name 'search4' is not defined