In [587]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression, RFECV, SelectFromModel
from sklearn.linear_model import LinearRegression, SGDRegressor

In [588]:
def gd_path(file_id):
    """Generate a shareable link from Google Drive file id."""
    return f"https://drive.google.com/uc?export=download&id={file_id}"

files_id = {
    'housing_data':"1kX_jcLeBpBGvTo8FXDeU2MK-aw1a0voU", #iteration7 data
    'test_housing_data':"1CMsAWhWKWBjI6DDEHtcYmRVZRazfE9bo", #test data for housing
    'ids_com':"10gwiL49calkj-xbx-3rQEK4H2zoJcU11" #ID for the commiting of the project

}

housing_data = pd.read_csv(gd_path(files_id['housing_data']), sep=",")
test_housing_data = pd.read_csv(gd_path(files_id['test_housing_data']), sep=",")
ids_com = pd.read_csv(gd_path(files_id['ids_com']), sep=",")
df = housing_data

# Pre-Processing Pipe

## Ordinary columns

This part was done by hands. Many Bothans died to bring us this information.

In [589]:
from sklearn.model_selection import train_test_split

# X and y creation
y = df.pop("SalePrice")

# Feature Engineering
X = df

# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [590]:
# Separating numerical features from your training and testing datasets
X_num_train = X_train.select_dtypes(include="number").copy()
X_num_test = X_test.select_dtypes(include="number").copy()


# Define ordinal and one-hot encoding columns
ordinal_cols = X_train.columns.get_indexer(['LandContour', 'LandSlope',
                     'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                     'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                     'HeatingQC', 'KitchenQual', 'Functional',
                     'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
                     'PoolQC'])

onehot_cols = X_train.columns.get_indexer(['MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir', 
                                         'Foundation', 'Alley', 'LotShape', 'Utilities', 'LotConfig', 
                                         'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle', 
                                         'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 
                                         'Electrical', 'GarageType', 'PavedDrive', 'Fence', 'MiscFeature',
                                         'SaleType', 'SaleCondition'])

# Manually establish the order of categories for ordinal features, including "N_A"
ordinal_categories = [['Lvl', 'Bnk', 'HLS', 'Low', 'N_A'],
                      ['Gtl', 'Mod', 'Sev', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Gd', 'Av', 'Mn', 'No', 'N_A'],
                      ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'N_A'],
                      ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Fin', 'RFn', 'Unf', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'N_A'],
                      ['Ex', 'Gd', 'TA', 'Fa', 'N_A']]

# Create a categorical encoder
categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", OrdinalEncoder(categories=ordinal_categories), ordinal_cols),
        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), onehot_cols)
    ]
)

# Create pipelines for numerical and categorical features
numeric_pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    SelectKBest(score_func=f_regression, k=12),
    MinMaxScaler(),
    VarianceThreshold(threshold=0.005)
)

categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    categorical_encoder
)

# Create a preprocessor that applies the appropriate pipeline to each feature type
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num_train.columns),
        ("cat_pipe", categoric_pipe, X_train.columns)
    ]
)



# Models

## Decisiontree

In [591]:
from sklearn.model_selection import GridSearchCV

full_pipeline = make_pipeline(preprocessor,
                              DecisionTreeClassifier())

'''param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean","median"],
    "decisiontreeclassifier__max_depth": range(1, 21),
    "decisiontreeclassifier__min_samples_leaf": range(1, 21)
}'''

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean"],
    "decisiontreeclassifier__max_depth": [18],
    "decisiontreeclassifier__min_samples_leaf": [10]
}

search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)

scores = {"dtree" : search.best_score_}
print("Best parameters: ", search.best_params_)

scores

Fitting 5 folds for each of 1 candidates, totalling 5 fits




Best parameters:  {'columntransformer__num_pipe__simpleimputer__strategy': 'mean', 'decisiontreeclassifier__max_depth': 18, 'decisiontreeclassifier__min_samples_leaf': 10}


{'dtree': 0.015413961336708118}

## Decisiontree - Analysis

In [592]:
# Get predictions for the training and testing datasets
y_train_pred = search.predict(X_train)
y_test_pred = search.predict(X_test)

# Create confusion matrices
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)

# Calculate accuracy for training and testing datasets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

print("Training Accuracy:", round(accuracy_train,3))
print("Testing Accuracy:", round(accuracy_test,3),"\n")


dtree_r2 = r2_score(y_true = y_test,
                    y_pred = y_test_pred)

print("R-squared:", round(dtree_r2,3))

dtree_r2 = mean_absolute_percentage_error(y_true = y_test,
                    y_pred = y_test_pred)

print("MAPE:", round(dtree_r2,3))

Training Accuracy: 0.169
Testing Accuracy: 0.01 

R-squared: 0.393
MAPE: 0.177


## KNN


In [593]:
# Modeling Pipe - 2

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier


full_pipeline = make_pipeline(preprocessor,
                              KNeighborsClassifier())

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean"],
    "kneighborsclassifier__n_neighbors": range(2,29,2)
}

search2 = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=10,
                      verbose=1)

search2.fit(X_train, y_train)

scores2 = {"knn" : search2.best_score_}
print("Best parameters: ", search2.best_params_)

scores2


Fitting 10 folds for each of 14 candidates, totalling 140 fits




Best parameters:  {'columntransformer__num_pipe__simpleimputer__strategy': 'mean', 'kneighborsclassifier__n_neighbors': 28}


{'knn': 0.009401709401709403}

## KNN - Analysis

In [594]:
# Get predictions for the training and testing datasets
y_train_pred = search2.predict(X_train)
y_test_pred = search2.predict(X_test)

# Create confusion matrices
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)

# Calculate accuracy for training and testing datasets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

print("Training Accuracy:", round(accuracy_train,3))
print("Testing Accuracy:", round(accuracy_test,3),"\n")


dtree_r2 = r2_score(y_true = y_test,
                    y_pred = y_test_pred)

print("R-squared:", round(dtree_r2,3))

dtree_r2 = mean_absolute_percentage_error(y_true = y_test,
                    y_pred = y_test_pred)

print("MAPE:", round(dtree_r2,3))

Training Accuracy: 0.071
Testing Accuracy: 0.014 

R-squared: 0.149
MAPE: 0.222


## RandomForest

In [612]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

full_pipeline = make_pipeline(preprocessor, RandomForestRegressor())

'''param_grid = {
    'columntransformer__num_pipe__simpleimputer__strategy': ['median','mean'],
    'columntransformer__num_pipe__selectkbest__k': range(5,30,2),
    'columntransformer__num_pipe__variancethreshold__threshold':[0.005, 0.01, 0.02, 0.03],
    'randomforestregressor__n_estimators': [200, 500], 
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 
    'randomforestregressor__max_depth': [4, 5, 6, 7, 8], 
    'randomforestregressor__criterion': ['mse', 'mae', 'poisson', 'friedman_mse']
}'''

param_grid = {
    'columntransformer__num_pipe__selectkbest__k': [19], 
    'columntransformer__num_pipe__simpleimputer__strategy': ['median'], 
    'columntransformer__num_pipe__variancethreshold__threshold': [0.005], 
    'randomforestregressor__criterion': ['poisson'], 
    'randomforestregressor__max_depth': [8], 
    'randomforestregressor__max_features': ['auto'], 
    'randomforestregressor__n_estimators': [200]}


search3 = GridSearchCV(full_pipeline, param_grid, cv=5, verbose=1)

search3.fit(X_train, y_train)

scores3 = {"rf": search3.best_score_}
print("Best parameters: ", search3.best_params_)
print(scores3)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best parameters:  {'columntransformer__num_pipe__selectkbest__k': 19, 'columntransformer__num_pipe__simpleimputer__strategy': 'median', 'columntransformer__num_pipe__variancethreshold__threshold': 0.005, 'randomforestregressor__criterion': 'poisson', 'randomforestregressor__max_depth': 8, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__n_estimators': 200}
{'rf': 0.8326449548383461}


Best parameters:  {'randomforestregressor__criterion': 'poisson', 'randomforestregressor__max_depth': 8, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__n_estimators': 200}
{'rf': 0.8212447308058735}

Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...
Best parameters:  {
    'columntransformer__num_pipe__selectkbest__k': 19, 
    'columntransformer__num_pipe__simpleimputer__strategy': 'median', 'columntransformer__num_pipe__variancethreshold__threshold': 0.005, 
    'randomforestregressor__criterion': 'poisson', 
    'randomforestregressor__max_depth': 8, 
    'randomforestregressor__max_features': 'auto', 
    'randomforestregressor__n_estimators': 200}
{'rf': 0.8384705128363066}

## RandomForest - Analysis

In [602]:
# Get predictions for the training and testing datasets
y_train_pred = search3.predict(X_train)
y_test_pred = search3.predict(X_test)


dtree_r2 = r2_score(y_true = y_test,
                    y_pred = y_test_pred)

print("R-squared:", round(dtree_r2,3))

dtree_r2 = mean_absolute_percentage_error(y_true = y_test,
                    y_pred = y_test_pred)

print("MAPE:", round(dtree_r2,3))

R-squared: 0.871
MAPE: 0.099


### SGDRegressor

In [603]:
from sklearn.linear_model import LinearRegression, SGDRegressor



sgd_pipeline = make_pipeline(preprocessor,
                            SGDRegressor())

sgd_pipeline.fit(X_train, y_train)

sgd_predictions = sgd_pipeline.predict(X_test)

SGD_mape = mean_absolute_percentage_error(y_true = y_test,
                    y_pred = sgd_predictions)

print("MAPE:", round(SGD_mape,3))

SGR_r2 = r2_score(y_true = y_test,
                    y_pred = sgd_predictions)

print("R-squared:", round(SGR_r2, 3))

MAPE: 0.122
R-squared: 0.844


#### With KBest

### LinearRegression

In [604]:
# Scale the data! 

lr_pipeline = make_pipeline(preprocessor,
                            LinearRegression())

lr_pipeline.fit(X_train, y_train)

lr_predictions = lr_pipeline.predict(X_test)

lr_mape = mean_absolute_percentage_error(y_true = y_test,
                    y_pred = lr_predictions)

print("MAPE:", round(lr_mape,3))

lr_r2 = r2_score(y_true = y_test,
                    y_pred = lr_predictions)

print("R-squared:", round(lr_r2,3))

MAPE: 0.111
R-squared: 0.881


# Iteration 0

In [605]:
y_test_pred_tree = search.predict(X_test)
y_test_pred_knn = search2.predict(X_test)
y_test_pred_rf = search3.predict(X_test)
y_test_pred_SGDR = sgd_pipeline.predict(X_test)
y_test_pred_lr = lr_pipeline.predict(X_test)


baseline_tree_r2 = r2_score(y_test, y_test_pred_tree)
baseline_knn_r2 = r2_score(y_test, y_test_pred_knn)
baseline_rf = r2_score(y_test, y_test_pred_rf)
baseline_SGDR = r2_score(y_test, y_test_pred_SGDR)
baseline_lr = r2_score(y_test, y_test_pred_lr)


performances = pd.DataFrame({'decision_tree': round(baseline_tree_r2,3),
                             'knn': round(baseline_knn_r2,3), 
                             'RF': round(baseline_rf,3),
                             'SGDR': round(baseline_SGDR, 2),
                            'LR': round(baseline_lr,3)},
                            index=['baseline'])

performances

Unnamed: 0,decision_tree,knn,RF,SGDR,LR
baseline,0.393,0.149,0.871,0.84,0.881


With out Threshold and Skaler

| Model       | Decision Tree | KNN   | Random Forest | SGDR  | Logistic Regression |
|-------------|---------------|-------|---------------|-------|----------------------|
| Baseline    | 0.372         | -0.229 | 0.697         | -3.505539e+22  | 0.884               |

With Threshold and Skaler

| Model       | Decision Tree | KNN   | Random Forest | SGDR  | Logistic Regression |
|-------------|---------------|-------|---------------|-------|----------------------|
| Baseline    | 0.454         | 0.108 | 0.595         | 0.84  | 0.866                |


With Threshold and Skaler and GridSearh

| Model       | Decision Tree | KNN   | Random Forest | SGDR  | Logistic Regression |
|-------------|---------------|-------|---------------|-------|----------------------|
| Baseline    | 0.31           | 0.64 | 0.7           | 0.87  |   -7.169538e+19          |

In [None]:
print('Readoy')

Readoy


# Downloand

In [613]:
X_sumbmition = test_housing_data
ids_com['SalePrice'] = search3.predict(X_sumbmition) # only cange the piplene! 
ids_com.to_csv(r'submission_rdf_800.csv', index=False) #only cahnge ist