## Housing project_Classification
 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn import set_config
import pandas as pd

path = r"C:\Users\Aida\OneDrive\Documents\Bootcamp_WBS\Primer\Python\WBS_DATA\8_SUP_ML\Data\housing_iteration_4_classification.csv"
data = pd.read_csv(path)
data.head(10)


# X: All columns except 'Expensive' (features)
X = data.drop(columns=['Expensive'])

# y: The 'Expensive' column (target)
y = data['Expensive']

# data splitting
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31416)

In [None]:
# 0. Set the config so that we can view our preprocessor, and to transform output from numpy arrays to pandas dataframes
set_config(display="diagram")
set_config(transform_output="pandas")

# 1. defining categorical & numerical columns
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

# 2. numerical pipeline
numeric_pipe = make_pipeline(
    SimpleImputer(fill_value=None))

# 3. categorical pipeline

# # 3.1 defining ordinal & onehot columns
# .get_indexer() get's the index to solve the problem described above about losing column names
ordinal_cols = ["Street", "ExterQual", "ExterCond", "BsmtQual", "BsmtCond","BsmtExposure", "KitchenQual", "FireplaceQu"]
onehot_cols = ["MSZoning", "Condition1", "Heating", "CentralAir", "Foundation"]

# # 3.2. defining the categorical encoder

# # # 3.2.1. we manually establish the order of the categories for our ordinal feature (Cabin), including "N_A"
# Define the category orders for ordinal features
ordinal_categories = [
    ['N_A', 'Grvl', 'Pave'],  # Street
    ['N_A','Po', 'Fa', 'TA', 'Gd', 'Ex'],  # ExterQual
    ['N_A','Po', 'Fa', 'TA', 'Gd', 'Ex'],  # ExterCond
    ['N_A','NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # BsmtQual
    ['N_A','NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # BsmtCond
    ['N_A','NA', 'No', 'Mn', 'Av', 'Gd'],  # BsmtExposure
    ['N_A','Po', 'Fa', 'TA', 'Gd', 'Ex'],  # KitchenQual
    ['N_A','NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']  # FireplaceQu
]   
# # # 3.2.2. defining the categorical encoder: a ColumnTransformer with 2 branches: ordinal & onehot

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", OrdinalEncoder(categories=ordinal_categories), ordinal_cols),
        ("cat_onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False), onehot_cols)
    ]
)


# # 3.3. categorical pipeline = "N_A" imputer + categorical encoder
categorical_pipe = make_pipeline(SimpleImputer(strategy="constant", fill_value="N_A"),
                                 categorical_encoder
                                )

# 4. full preprocessing: a ColumnTransformer with 2 branches: numeric & categorical
full_preprocessing = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categorical_pipe, X_cat.columns),
    ]
)
full_preprocessing

In [4]:
from sklearn.model_selection import GridSearchCV

# full pipeline: preprocessor + model
full_pipeline = make_pipeline(full_preprocessing,
                              DecisionTreeClassifier())

# define parameter grid
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median", "constant"],
    "columntransformer__num_pipe__simpleimputer__fill_value": [10, 0, -1],
    "decisiontreeclassifier__max_depth": range(2, 20, 2),
    "decisiontreeclassifier__min_samples_leaf": range(2, 15, 2)
}

# define GridSearchCV
search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)

print("Best Parameters:", search.best_params_)
print("Best Score:", search.best_score_)

Fitting 5 folds for each of 567 candidates, totalling 2835 fits
Best Parameters: {'columntransformer__num_pipe__simpleimputer__fill_value': 0, 'columntransformer__num_pipe__simpleimputer__strategy': 'median', 'decisiontreeclassifier__max_depth': 6, 'decisiontreeclassifier__min_samples_leaf': 4}
Best Score: 0.9255053006125967


In [6]:
# Make predictions on the training data
y_train_pred = search.predict(X_train)

# Calculate accuracy score
accuracy = accuracy_score(y_train, y_train_pred)

print(f"Accuracy on training data: {accuracy:.4f}")

Accuracy on training data: 0.9503


In [7]:
# Make predictions on the test data
y_test_pred = search.predict(X_test)

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Accuracy on test data: {test_accuracy:.4f}")


Accuracy on test data: 0.9349
