In [614]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression, RFECV, SelectFromModel
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor


In [615]:
# Define a function to generate Google Drive shareable download links
def gd_path(file_id):
    """Generate a shareable link from Google Drive file id."""
    return f"https://drive.google.com/uc?export=download&id={file_id}"

# Dictionary containing Google Drive file IDs for different data files
files_id = {
    'housing_data': "1kX_jcLeBpBGvTo8FXDeU2MK-aw1a0voU",  # ID for iteration7 data
    'test_housing_data': "1CMsAWhWKWBjI6DDEHtcYmRVZRazfE9bo",  # ID for test data for housing
    'ids_com': "10gwiL49calkj-xbx-3rQEK4H2zoJcU11"  # ID for committing the project
}

# Load the 'housing_data' CSV file using the generated link
housing_data = pd.read_csv(gd_path(files_id['housing_data']), sep=",")

# Load the 'test_housing_data' CSV file using the generated link
test_housing_data = pd.read_csv(gd_path(files_id['test_housing_data']), sep=",")

# Load the 'ids_com' CSV file using the generated link
ids_com = pd.read_csv(gd_path(files_id['ids_com']), sep=",")

# Create a DataFrame 'df' and assign it the 'housing_data' for further processing
df = housing_data

# Pre-Processing Pipe

## Split

In [616]:
from sklearn.model_selection import train_test_split

# X and y creation
y = df.pop("SalePrice")

# Feature Engineering
X = df

# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

## Handling data types

In [651]:
# Separate categorical and numerical features from X
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

# Define a pipeline for numerical features
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean")
)

# Define columns that are treated as ordinal
ordinal_cols = X_cat.columns.get_indexer(['LandContour', 'LandSlope',
                     'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                     'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                     'HeatingQC', 'KitchenQual', 'Functional',
                     'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
                     'PoolQC'])

# Define columns that are one-hot encoded
onehot_cols = X_cat.columns.get_indexer(['MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir', 
                                         'Foundation', 'Alley', 'LotShape', 'Utilities', 'LotConfig', 
                                         'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle', 
                                         'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 
                                         'Electrical', 'GarageType', 'PavedDrive', 'Fence', 'MiscFeature',
                                         'SaleType', 'SaleCondition'])



# Define the categories for ordinal features
# These categories are manually established, including "N_A"
ordinal_categories = [['Lvl', 'Bnk', 'HLS', 'Low', 'N_A'],
                      ['Gtl', 'Mod', 'Sev', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Gd', 'Av', 'Mn', 'No', 'N_A'],
                      ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'N_A'],
                      ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Fin', 'RFn', 'Unf', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'N_A']]

# Create a categorical encoder using ColumnTransformer
categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", OrdinalEncoder(categories=ordinal_categories), ordinal_cols),
        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), onehot_cols)
    ]
)

# Create pipelines for numerical and categorical features

# Pipeline for numerical features
numeric_pipe = make_pipeline(
    SimpleImputer(strategy='mean'),          # Impute missing values with the mean
    SelectKBest(score_func=f_regression, k=15),  # Select the top 15 features using f_regression
    MinMaxScaler(),                         # Scale features to a specified range (0 to 1 by default)
    VarianceThreshold(threshold=0.005)      # Remove low-variance features
)

# Pipeline for categorical features
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),  # Impute missing values with 'N_A'
    categorical_encoder  # Apply the categorical encoder created above
)

# Create a preprocessor that applies the appropriate pipeline to each feature type using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num_train.columns),  # Apply the numeric pipeline to numerical features
        ("cat_pipe", categoric_pipe, X_train.columns)    # Apply the categorical pipeline to all features
    ]
)

# Models

## Decisiontree

In [None]:
# Create a full pipeline that includes preprocessing steps and a Decision Tree Classifier
full_pipeline = make_pipeline(preprocessor, DecisionTreeClassifier())

# Define a parameter grid for hyperparameter tuning
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean"],  # Impute missing values with the mean
    "decisiontreeclassifier__max_depth": [18],  # Maximum depth of the decision tree
    "decisiontreeclassifier__min_samples_leaf": [10]  # Minimum number of samples required to be at a leaf node
}

# Create a GridSearchCV object to search for the best hyperparameters
search = GridSearchCV(full_pipeline, param_grid, cv=5, verbose=1)

# Fit the GridSearchCV object on the training data
search.fit(X_train, y_train)

# Get the best cross-validation score from the grid search for Decision Tree Classifier
best_score_dtree = search.best_score_

# Store the best score in a dictionary
scores = {"dtree": best_score_dtree}

# Print the best parameters and the corresponding score
print("Best parameters: ", search.best_params_)
print("Best score:", best_score_dtree)

## Decisiontree - Analysis

In [619]:
# Get predictions for the training and testing datasets using the best Decision Tree Classifier model
y_train_pred = search.predict(X_train)
y_test_pred = search.predict(X_test)

# Create confusion matrices for training and testing datasets
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)

# Calculate accuracy for training and testing datasets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

# Print the training and testing accuracies
print("Training Accuracy:", round(accuracy_train, 3))
print("Testing Accuracy:", round(accuracy_test, 3), "\n")

# Calculate R-squared (coefficient of determination) for Decision Tree Classifier predictions
dtree_r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

# Print the R-squared score
print("R-squared:", round(dtree_r2, 3))

# Calculate Mean Absolute Percentage Error (MAPE) for Decision Tree Classifier predictions
dtree_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred)

# Print the MAPE score
print("MAPE:", round(dtree_mape, 3))

Training Accuracy: 0.168
Testing Accuracy: 0.01 

R-squared: 0.392
MAPE: 0.177


## KNN


In [None]:
# Create a full pipeline that includes preprocessing steps and a K-Nearest Neighbors (KNN) classifier
full_pipeline = make_pipeline(preprocessor, KNeighborsClassifier())

# Define a parameter grid for hyperparameter tuning
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean"],  # Impute missing values with the mean
    "kneighborsclassifier__n_neighbors": range(2, 29, 2)  # Range of values for the number of neighbors
}

# Create a GridSearchCV object to search for the best hyperparameters
search2 = GridSearchCV(full_pipeline, param_grid, cv=10, verbose=1)

# Fit the GridSearchCV object on the training data
search2.fit(X_train, y_train)

# Get the best cross-validation score from the grid search for KNN classifier
best_score_knn = search2.best_score_

# Store the best score in a dictionary
scores2 = {"knn": best_score_knn}

# Print the best parameters and corresponding score
print("Best parameters: ", search2.best_params_)
print("Best score:", best_score_knn)

## KNN - Analysis

In [621]:
# Get predictions for the training and testing datasets using the best KNN classifier model
y_train_pred = search2.predict(X_train)
y_test_pred = search2.predict(X_test)

# Create confusion matrices for training and testing datasets
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)

# Calculate accuracy for training and testing datasets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

# Print the training and testing accuracies
print("Training Accuracy:", round(accuracy_train, 3))
print("Testing Accuracy:", round(accuracy_test, 3), "\n")

# Calculate R-squared (coefficient of determination) for KNN classifier predictions
knn_r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

# Print the R-squared score
print("R-squared:", round(knn_r2, 3))

# Calculate Mean Absolute Percentage Error (MAPE) for KNN classifier predictions
knn_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred)

# Print the MAPE score
print("MAPE:", round(knn_mape, 3))

Training Accuracy: 0.071
Testing Accuracy: 0.014 

R-squared: 0.149
MAPE: 0.222


## RandomForest

In [None]:
# Create a full pipeline that includes preprocessing steps and a RandomForestRegressor
full_pipeline = make_pipeline(preprocessor, RandomForestRegressor())

# Define a parameter grid for hyperparameter tuning
param_grid = {
    'columntransformer__num_pipe__selectkbest__k': [19],  # Number of top features to select
    'columntransformer__num_pipe__simpleimputer__strategy': ['median'],  # Impute missing values with the median
    'columntransformer__num_pipe__variancethreshold__threshold': [0.005],  # Threshold for removing low-variance features
    'randomforestregressor__criterion': ['poisson'],  # Criterion for splitting nodes
    'randomforestregressor__max_depth': [8],  # Maximum depth of the trees
    'randomforestregressor__max_features': ['auto'],  # Maximum number of features to consider for splitting
    'randomforestregressor__n_estimators': [200]  # Number of trees in the forest
}

# Create a GridSearchCV object to search for the best hyperparameters
search3 = GridSearchCV(full_pipeline, param_grid, cv=10, verbose=0)

# Fit the GridSearchCV object on the training data
search3.fit(X_train, y_train)

# Get the best cross-validation score from the grid search for RandomForestRegressor
best_score_rf = search3.best_score_

# Store the best score in a dictionary
scores3 = {"rf": best_score_rf}

# Print the best parameters and corresponding score
print("Best parameters: ", search3.best_params_)
print(scores3)

## RandomForest - Analysis

In [636]:
# Get predictions for the training and testing datasets using the best RandomForestRegressor model
y_train_pred = search3.predict(X_train)
y_test_pred = search3.predict(X_test)

# Calculate R-squared (coefficient of determination) for RandomForestRegressor predictions
rf_r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

# Print the R-squared score
print("R-squared:", round(rf_r2, 3))

# Calculate Mean Absolute Percentage Error (MAPE) for RandomForestRegressor predictions
rf_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred)

# Print the MAPE score
print("MAPE:", round(rf_mape, 3))

R-squared: 0.872
MAPE: 0.1


## ExtraTreesRegressors

In [None]:
# Create a full pipeline that includes preprocessing steps and an ExtraTreesRegressor
full_pipeline = make_pipeline(preprocessor, ExtraTreesRegressor())

# Define a parameter grid for hyperparameter tuning
param_grid = {
    'columntransformer__num_pipe__simpleimputer__strategy': ['mean'],  # Impute missing values with the mean
    'columntransformer__num_pipe__selectkbest__k': [15],  # Number of top features to select
    'columntransformer__num_pipe__variancethreshold__threshold': [0.005],  # Threshold for removing low-variance features
    'extratreesregressor__n_estimators': [200],  # Number of trees in the forest
    'extratreesregressor__max_depth': [8],  # Maximum depth of the trees
}

# Create a GridSearchCV object to search for the best hyperparameters
ert_search = GridSearchCV(full_pipeline, param_grid, cv=5, verbose=1)

# Fit the GridSearchCV object on the training data
ert_search.fit(X_train, y_train)

# Get the best cross-validation score from the grid search for ExtraTreesRegressor
best_score_ert = ert_search.best_score_

# Store the best score in a dictionary
scores = {"ERT": best_score_ert}

# Print the best parameters and corresponding score
print("Best parameters: ", ert_search.best_params_)
print("Best score:", best_score_ert)

In [652]:
# Create an ExtraTreesRegressor pipeline that includes preprocessing steps
ETR_pipeline = make_pipeline(preprocessor, ExtraTreesRegressor())

# Fit the ExtraTreesRegressor pipeline on the training data
ETR_pipeline.fit(X_train, y_train)

# Make predictions on the testing dataset using the trained ExtraTreesRegressor model
ert_predictions = ETR_pipeline.predict(X_test)

# Calculate Mean Absolute Percentage Error (MAPE) for ExtraTreesRegressor predictions
ERT_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=ert_predictions)

# Print the MAPE score
print("MAPE:", round(ERT_mape, 3))

# Calculate R-squared (coefficient of determination) for ExtraTreesRegressor predictions
ERT_r2 = r2_score(y_true=y_test, y_pred=ert_predictions)

# Print the R-squared score
print("R-squared:", round(ERT_r2, 3))

MAPE: 0.095
R-squared: 0.887


## SGDRegressor

In [624]:
# Create an SGD Regressor pipeline that includes preprocessing steps
sgd_pipeline = make_pipeline(preprocessor, SGDRegressor())

# Fit the SGD Regressor pipeline on the training data
sgd_pipeline.fit(X_train, y_train)

# Make predictions on the testing dataset using the trained SGD Regressor model
sgd_predictions = sgd_pipeline.predict(X_test)

# Calculate Mean Absolute Percentage Error (MAPE) for SGD Regressor predictions
SGD_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=sgd_predictions)

# Print the MAPE score
print("MAPE:", round(SGD_mape, 3))

# Calculate R-squared (coefficient of determination) for SGD Regressor predictions
SGR_r2 = r2_score(y_true=y_test, y_pred=sgd_predictions)

# Print the R-squared score
print("R-squared:", round(SGR_r2, 3))

MAPE: 0.115
R-squared: 0.851


## LinearRegression

In [630]:
# Create a Linear Regression pipeline that includes preprocessing steps
lr_pipeline = make_pipeline(preprocessor, LinearRegression())

# Fit the Linear Regression pipeline on the training data
lr_pipeline.fit(X_train, y_train)

# Make predictions on the testing dataset using the trained Linear Regression model
lr_predictions = lr_pipeline.predict(X_test)

# Calculate Mean Absolute Percentage Error (MAPE) for Linear Regression predictions
lr_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=lr_predictions)

# Print the MAPE score
print("MAPE:", round(lr_mape, 3))

# Calculate R-squared (coefficient of determination) for Linear Regression predictions
lr_r2 = r2_score(y_true=y_test, y_pred=lr_predictions)

# Print the R-squared score
print("R-squared:", round(lr_r2, 3))

MAPE: 0.111
R-squared: 0.881


# Comparison Models

In [633]:
# Get predictions for different models on the testing dataset
y_test_pred_tree = search.predict(X_test)  # Decision Tree
y_test_pred_knn = search2.predict(X_test)  # K-Nearest Neighbors
y_test_pred_rf = search3.predict(X_test)    # Random Forest
y_test_pred_SGDR = sgd_pipeline.predict(X_test)  # Stochastic Gradient Descent Regressor
y_test_pred_lr = lr_pipeline.predict(X_test)    # Linear Regression

# Calculate R-squared scores for each model's predictions
baseline_tree_r2 = r2_score(y_test, y_test_pred_tree)        # R-squared for Decision Tree
baseline_knn_r2 = r2_score(y_test, y_test_pred_knn)          # R-squared for K-Nearest Neighbors
baseline_rf = r2_score(y_test, y_test_pred_rf)               # R-squared for Random Forest
baseline_SGDR = r2_score(y_test, y_test_pred_SGDR)           # R-squared for Stochastic Gradient Descent Regressor
baseline_lr = r2_score(y_test, y_test_pred_lr)               # R-squared for Linear Regression

# Create a DataFrame to compare the performance of different models
performances = pd.DataFrame({'decision_tree': round(baseline_tree_r2, 3),
                             'knn': round(baseline_knn_r2, 3), 
                             'RF': round(baseline_rf, 3),
                             'SGDR': round(baseline_SGDR, 2),
                             'LR': round(baseline_lr, 3)},
                            index=['baseline'])

# Print the DataFrame with R-squared scores
print(performances)


Unnamed: 0,decision_tree,knn,RF,SGDR,LR
baseline,0.392,0.149,0.876,0.85,0.881


With out Threshold and Skaler

| Model       | Decision Tree | KNN   | Random Forest | SGDR  | Logistic Regression |
|-------------|---------------|-------|---------------|-------|----------------------|
| Baseline    | 0.372         | -0.229 | 0.697         | -3.505539e+22  | 0.884               |

With Threshold and Skaler

| Model       | Decision Tree | KNN   | Random Forest | SGDR  | Logistic Regression |
|-------------|---------------|-------|---------------|-------|----------------------|
| Baseline    | 0.454         | 0.108 | 0.595         | 0.84  | 0.866                |


With Threshold and Skaler and GridSearh

| Model       | Decision Tree | KNN   | Random Forest | SGDR  | Logistic Regression |
|-------------|---------------|-------|---------------|-------|----------------------|
| Baseline    | 0.31           | 0.64 | 0.7           | 0.87  |   -7.169538e+19          |

# Downloand

In [655]:
# Define X_sumbmition as the test_housing_data
X_sumbmition = test_housing_data

# Use the trained ETR pipeline to predict SalePrice for X_sumbmition
ids_com['SalePrice'] = ETR_pipeline.predict(X_sumbmition)

# Save the predictions to a CSV file named 'submission_ert_old.csv'
ids_com.to_csv(r'submission_ert_old.csv', index=False)