# Third Notebook on a House Price Prediction Model  
# Supervised Machine Learning  

## 1. Importing modules and .csv, Preparation: 

In [19]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
# from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

housing_classific = pd.read_csv(r'housing-classification-iter-0-2.csv')


In [20]:
# First split before Imputing, preventing data leakage:
X = housing_classific
y = housing_classific.pop('Expensive')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337) # there is only one true random_state.

This time, we want to keep the data of the rows missing a value within the LotFrontage column.  
Therefore we need to use an imputer, checking documentation leads to KNNImputer (k-Nearest Neighbors).

## 2. Applying KNN Imputer:

In [21]:
# Imputer setup:
frontage_imputer = KNNImputer(missing_values=np.nan, n_neighbors=10,add_indicator=True)
# Fit on train set:
frontage_imputer.fit(X_train)
# Transforming both, train and test set:
X_imputed_train = frontage_imputer.transform(X_train)
X_imputed_test = frontage_imputer.transform(X_test)

In [22]:
# Planting another tree:
imputed_tree = DecisionTreeClassifier(max_depth=12,
                                        min_samples_leaf=2)
imputed_tree.fit(X=X_imputed_train,
                y= y_train)
y_imputed_pred = imputed_tree.predict(X_imputed_train)
accuracy_score(y_true = y_train,
                y_pred = y_imputed_pred)




0.973458904109589

In [23]:
# And another tree, this time for testing:
imputed_tree.fit(X=X_imputed_test,
                y= y_test)
y_imputed_pred2 = imputed_tree.predict(X_imputed_test)
accuracy_score(y_true = y_test,
                y_pred = y_imputed_pred2)

0.9794520547945206

## 3. Building a pipeline:

### 3.1. Initial setups:

In [24]:
# As above, but using more elegant variable names:
imputer = KNNImputer(missing_values=np.nan, n_neighbors=10,add_indicator=True)

dtreecl = DecisionTreeClassifier(max_depth=8,
                                min_samples_leaf=4)

### 3.2. Pipe Creation:

In [25]:
pipe = make_pipeline(imputer, dtreecl).set_output(transform='pandas')

pipe.fit(X_train, y_train)

In [26]:
pipe2 = make_pipeline(imputer, dtreecl).set_output(transform='pandas')

pipe2.fit(X_test, y_test)

In [27]:
y_pipe_train_pred = pipe.predict(X_train)
accuracy_score(y_train,y_pipe_train_pred)

0.8946917808219178

## 4. Applying GridSearchCV: 