In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, SGDRegressor

In [49]:
# Define a function to generate a shareable link from a Google Drive file ID
def gd_path(file_id):
    return f"https://drive.google.com/uc?export=download&id={file_id}"

# Define a dictionary of file IDs for different datasets
files_id = {
    'housing_data': "1kX_jcLeBpBGvTo8FXDeU2MK-aw1a0voU",  # File ID for housing data
    'test_housing_data': "1CMsAWhWKWBjI6DDEHtcYmRVZRazfE9bo",  # File ID for test housing data
    'ids_com': "10gwiL49calkj-xbx-3rQEK4H2zoJcU11"  # File ID for project commit IDs
}

# Read the housing data from Google Drive using the provided file ID
housing_data = pd.read_csv(gd_path(files_id['housing_data']), sep=",")

# Read the test housing data from Google Drive using the provided file ID
test_housing_data = pd.read_csv(gd_path(files_id['test_housing_data']), sep=",")

# Read the project commit IDs from Google Drive using the provided file ID
ids_com = pd.read_csv(gd_path(files_id['ids_com']), sep=",")

# Assign the housing data DataFrame to a variable 'df' for further processing
df = housing_data

# Pre-Processing Pipe

## Split

In [50]:
# Remove the "SalePrice" column from the DataFrame 'df' and assign it to 'y' (target variable)
y = df.pop("SalePrice")

# Assign the remaining columns in 'df' to 'X' as the features
X = df

# Split the data into training and testing sets
# X_train and y_train will be used for training the model, and X_test and y_test for evaluation
# test_size=0.2 means 20% of the data will be used for testing, and random_state=123 is for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

## Handling data types

In [51]:
# Separate categorical and numerical features from X
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

# Define a pipeline for numerical features
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean")
)

# Define columns that are treated as ordinal
ordinal_cols = X_cat.columns.get_indexer(['LandContour', 'LandSlope',
                     'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                     'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                     'HeatingQC', 'KitchenQual', 'Functional',
                     'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
                     'PoolQC'])

# Define columns that are one-hot encoded
onehot_cols = X_cat.columns.get_indexer(['MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir', 
                                         'Foundation', 'Alley', 'LotShape', 'Utilities', 'LotConfig', 
                                         'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle', 
                                         'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 
                                         'Electrical', 'GarageType', 'PavedDrive', 'Fence', 'MiscFeature',
                                         'SaleType', 'SaleCondition'])



# Define the categories for ordinal features
# These categories are manually established, including "N_A"
LandContour = ['Lvl', 'Bnk', 'HLS', 'Low']
LandSlope = ['Gtl', 'Mod', 'Sev']
ExterQual = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
ExterCond = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
BsmtQual = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A']
BsmtCond = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A']
BsmtExposure = ['Gd', 'Av', 'Mn', 'No', 'N_A']
BsmtFinType1 = ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'N_A']
BsmtFinType2 = ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'N_A']
HeatingQC = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
KitchenQual = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
Functional = ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal']
FireplaceQu = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A']
GarageFinish = ['Fin', 'RFn', 'Unf', 'N_A']
GarageQual = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A']
GarageCond = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A']
PoolQC = ['Ex', 'Gd', 'TA', 'Fa', 'N_A']


# Create a ColumnTransformer for categorical features
categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", OrdinalEncoder(categories=[LandContour, LandSlope, ExterQual, ExterCond, BsmtQual, BsmtCond,
                                                    BsmtExposure, BsmtFinType1, BsmtFinType2, HeatingQC, KitchenQual,
                                                    Functional, FireplaceQu, GarageFinish, GarageQual, GarageCond, PoolQC]), ordinal_cols),

        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), onehot_cols)
    ]
)

# Define a pipeline for categorical features
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    categorical_encoder 
)

# Create a preprocessor that handles both numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)   

# Modelling Pipe

## Decisiontree - Creation

In [None]:
# Create a full pipeline that includes the preprocessing steps and a DecisionTreeClassifier
full_pipeline = make_pipeline(preprocessor, DecisionTreeClassifier())

# Define a parameter grid for hyperparameter tuning
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "decisiontreeclassifier__max_depth": range(2, 14, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 12, 2)
}

# Create a GridSearchCV object to search for the best hyperparameters
search = GridSearchCV(full_pipeline, param_grid, cv=5, verbose=1)

# Fit the GridSearchCV object on the training data
search.fit(X_train, y_train)

# Get the best cross-validation score from the grid search
best_score = search.best_score_

# Store the best score in a dictionary
scores = {"dtree": best_score}


## Decisiontree - Analysis

In [53]:
# Get predictions for the training and testing datasets using the best model from the GridSearchCV
y_train_pred = search.predict(X_train)
y_test_pred = search.predict(X_test)

# Create confusion matrices for training and testing datasets
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)

# Calculate accuracy for training and testing datasets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

# Print the training and testing accuracies
print("Training Accuracy:", round(accuracy_train, 3))
print("Testing Accuracy:", round(accuracy_test, 3), "\n")

# Calculate R-squared (coefficient of determination) for the model's predictions
dtree_r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

# Print the R-squared score
print("R-squared:", round(dtree_r2, 3))

# Calculate Mean Absolute Percentage Error (MAPE) for the model's predictions
dtree_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred)

# Print the MAPE score
print("MAPE:", round(dtree_mape, 3))

Training Accuracy: 0.149
Testing Accuracy: 0.003 

R-squared: 0.375
MAPE: 0.191


## KNN - Creation


In [None]:
# Create a full pipeline that includes the preprocessing steps and a KNeighborsClassifier
full_pipeline = make_pipeline(preprocessor, KNeighborsClassifier())

# Define a parameter grid for hyperparameter tuning
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "kneighborsclassifier__n_neighbors": range(3, 20, 2)
}

# Create a GridSearchCV object to search for the best hyperparameters
search2 = GridSearchCV(full_pipeline, param_grid, cv=10, verbose=1)

# Fit the GridSearchCV object on the training data
search2.fit(X_train, y_train)

# Get the best cross-validation score from the grid search for KNN
best_score_knn = search2.best_score_

# Store the best score in a dictionary
scores2 = {"knn": best_score_knn}

## KNN - Anylysis

In [55]:
# Get predictions for the training and testing datasets using the best KNN model from the GridSearchCV
y_train_pred = search2.predict(X_train)
y_test_pred = search2.predict(X_test)

# Create confusion matrices for training and testing datasets
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)

# Calculate accuracy for training and testing datasets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

# Print the training and testing accuracies
print("Training Accuracy:", round(accuracy_train, 3))
print("Testing Accuracy:", round(accuracy_test, 3), "\n")

# Calculate R-squared (coefficient of determination) for the KNN model's predictions
knn_r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

# Print the R-squared score
print("R-squared:", round(knn_r2, 3))

# Calculate Mean Absolute Percentage Error (MAPE) for the KNN model's predictions
knn_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred)

# Print the MAPE score
print("MAPE:", round(knn_mape, 3))

Training Accuracy: 0.191
Testing Accuracy: 0.007 

R-squared: 0.056
MAPE: 0.253


## RandomForest- Creation

In [None]:
# Create a full pipeline that includes the preprocessing steps and a RandomForestClassifier
full_pipeline = make_pipeline(preprocessor, RandomForestClassifier())

# Define a parameter grid for hyperparameter tuning
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["median"],
    "randomforestclassifier__n_estimators": [200, 500],
    "randomforestclassifier__max_features": ['sqrt', 'log2'],
    'randomforestclassifier__max_depth': [4, 5, 6, 7, 8],
    'randomforestclassifier__criterion': ['gini', 'entropy']
}

# Create a GridSearchCV object to search for the best hyperparameters
search3 = GridSearchCV(full_pipeline, param_grid, cv=5, verbose=1)

# Fit the GridSearchCV object on the training data
search3.fit(X_train, y_train)

# Get the best cross-validation score from the grid search for Random Forest
best_score_rf = search3.best_score_

# Store the best score in a dictionary
scores3 = {"rf": best_score_rf}

# Print the best parameters and the corresponding score
print("Best parameters: ", search3.best_params_)
print("Best score:", best_score_rf)

## RandomForest - Analysis

In [57]:
# Get predictions for the training and testing datasets using the best Random Forest model from the GridSearchCV
y_train_pred = search3.predict(X_train)
y_test_pred = search3.predict(X_test)

# Create confusion matrices for training and testing datasets
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)

# Calculate accuracy for training and testing datasets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

# Print the training and testing accuracies
print("Training Accuracy:", round(accuracy_train, 3))
print("Testing Accuracy:", round(accuracy_test, 3), "\n")

# Calculate R-squared (coefficient of determination) for the Random Forest model's predictions
rf_r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

# Print the R-squared score
print("R-squared:", round(rf_r2, 3))

# Calculate Mean Absolute Percentage Error (MAPE) for the Random Forest model's predictions
rf_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred)

# Print the MAPE score
print("MAPE:", round(rf_mape, 3))

Training Accuracy: 0.545
Testing Accuracy: 0.027 

R-squared: 0.633
MAPE: 0.151


## SGDRegressor

In [58]:
# Create a pipeline for SGDRegressor that includes preprocessing steps
sgd_pipeline = make_pipeline(preprocessor, SGDRegressor())

# Fit the SGDRegressor pipeline on the training data
sgd_pipeline.fit(X_train, y_train)

# Make predictions on the testing data using SGDRegressor
sgd_predictions = sgd_pipeline.predict(X_test)

# Calculate the Mean Absolute Percentage Error (MAPE) for SGDRegressor predictions
SGD_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=sgd_predictions)

# Print the MAPE score
print("MAPE:", round(SGD_mape, 3))

# Calculate R-squared (coefficient of determination) for SGDRegressor predictions
SGR_r2 = r2_score(y_true=y_test, y_pred=sgd_predictions)

# Print the R-squared score
print("R-squared:", round(SGR_r2, 3))

MAPE: 0.151
R-squared: -1.8362582799528853e+23


## LinearRegression

In [59]:
# Create a pipeline for Linear Regression that includes preprocessing steps
lr_pipeline = make_pipeline(preprocessor, LinearRegression())

# Fit the Linear Regression pipeline on the training data
lr_pipeline.fit(X_train, y_train)

# Make predictions on the testing data using Linear Regression
lr_predictions = lr_pipeline.predict(X_test)

# Calculate the Mean Absolute Percentage Error (MAPE) for Linear Regression predictions
lr_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=lr_predictions)

# Print the MAPE score
print("MAPE:", round(lr_mape, 3))

# Calculate R-squared (coefficient of determination) for Linear Regression predictions
lr_r2 = r2_score(y_true=y_test, y_pred=lr_predictions)

# Print the R-squared score
print("R-squared:", round(lr_r2, 3))

MAPE: 0.151
R-squared: 0.884


# Downloand

In [None]:
# Select the portion of the 'housing_data' DataFrame for submission (assuming it contains features for submission)
X_submission = housing_data[0:1459]

# Make predictions using the trained model (in this case, 'search4') on the submission data
ids_com['Expensive'] = search4.predict(X_submission)  # Change the pipeline to 'search4'

# Save the predictions and the 'ids_com' DataFrame to a CSV file
ids_com.to_csv(r'submission_9.csv', index=False)  # Change the file name as needed