# 1. Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
#sklearn:
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

housing_classification = pd.read_csv(r'cleaned-housing-classification.csv')
test_housing_classification = pd.read_csv(r'cleaned-test-housing-classification.csv')

# 2. Tools

### 2.1 Train Test Split

In [2]:
# expensive_houses = housing_classification.pop('Expensive')
# X = housing_classification.copy()
# y = expensive_houses.copy()
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)

X_train, X_test, y_train, y_test = train_test_split(housing_classification.drop('Expensive', axis= 1),
                                                    housing_classification['Expensive'],
                                                    test_size=0.2,
                                                    random_state=1337)

### 2.2 Column transforming
As in Analysis - Object columns for OneHotEncoder: MSZoning, Neighborhood, Condition1, Condition2, CentralAir, MiscVal (exterior1st, and 2nd??, MasVnrType)

In [3]:
# # Numerical columns
#     numeric_cols = X_train.select_dtypes(include='number').columns
#     numeric_transformer = KNNImputer(missing_values=np.nan)

# # Object columns
# categorical_cols = X_train.select_dtypes(include='object').columns
# categorical_transformer = SimpleImputer(strategy='most_frequent') # test
# # X_train[categorical_cols] = X_train[categorical_cols].fillna(0)
# # X_test[categorical_cols] = X_test[categorical_cols].fillna(0)

# # Columns to be one-hot encoded
# ohe_cols = ['MSZoning', 'Neighborhood', 'Condition1', 'Condition2', 'CentralAir', 'MiscFeature']
# ohe_transformer = OneHotEncoder(dtype='int',sparse_output=False, handle_unknown='infrequent_if_exist', min_frequency=6)

# # Columns to be ordinal encoded
# ordinal_cols = categorical_cols.drop(ohe_cols)
# ordinal_transformer = OrdinalEncoder(dtype='int')

# # Column transformer
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numeric_cols),
#         ('cat', categorical_transformer, categorical_cols), # test
#         ('ohe', ohe_transformer, ohe_cols),
#         ('ord', ordinal_transformer, ordinal_cols)
#     ])

In [None]:
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"))
 
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist', min_frequency=6) # handle_unknown is set to ignore the values that are not familiar
)

preprocessor = make_column_transformer(
        (numeric_pipe, make_column_selector(dtype_include='number')),
        (categoric_pipe, make_column_selector(dtype_include='object'))
)
preprocessor



from sklearn.model_selection import GridSearchCV

full_pipeline = make_pipeline(preprocessor, 
                              DecisionTreeClassifier()).set_output(transform='pandas')


### 2.3 Modifying values

In [4]:
# Fill NA with 0 in objects
# X_train[categorical_cols] = X_train[categorical_cols].fillna(0) # put in comments for troubleshooting
# X_test[categorical_cols] = X_test[categorical_cols].fillna(0)

# Set scaler for numericals # discarded for now
scaler = QuantileTransformer()


## 3. Pipeline and GridSearchCV

### 3.1 Setting up pipeline

In [5]:
pipeline = make_pipeline([
                    ('preprocessor', preprocessor),
                    ('scaler', scaler),
                    ('model', RandomForestClassifier())]).set_output(transform='pandas')
                    
# params = pipeline.get_params()
# print(params)

In [6]:
# X_train = np.array(X_train)
# y_train = np.array(y_train)
# pipeline.fit(X_train, y_train)

In [7]:
# pipeline.get_params()['list'][2][1].get_params()

### 3.2. GridSearch

In [8]:
# y_train = np.array(y_train) # another shot to troupleshoot

In [9]:
param_grid = {
    'scaler__n_quantiles': [10, 25, 100, 200],
    'model__n_estimators': [10, 75, 250],
    'model__max_depth': [2, 10, 50]
}

In [10]:
grid_search = GridSearchCV(pipeline, param_grid, scoring=make_scorer(accuracy_score))

In [11]:
grid_search.fit(X_train, y_train)

ValueError: Invalid parameter 'model' for estimator Pipeline(steps=[('list',
                 [('preprocessor',
                   ColumnTransformer(transformers=[('num', KNNImputer(),
                                                    Index(['Id', 'LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr',
       'Fireplaces', 'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch',
       'MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF...
       'FireplaceQu', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'BldgType', 'HouseStyle', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'BsmtFinType2',
       'HeatingQC', 'Electrical', 'Functional', 'GarageType', 'GarageFinish',
       'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence'],
      dtype='object'))])),
                  ('model', RandomForestClassifier())])]). Valid parameters are: ['memory', 'steps', 'verbose'].

In [None]:
# print(grid_search.best_params_)