In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

In [2]:
data = pd.read_csv('housing_prices7.csv')
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
y = data.pop('SalePrice')
X = data.drop(columns=['Id'])
X_num = X.select_dtypes(include='number')

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=8)

In [5]:
# # Impute missing values
# from sklearn.impute import SimpleImputer

# my_imputer = SimpleImputer().set_output(transform='pandas') # initialize
# my_imputer.fit(X_num_train) # fit on the train set
# X_num_imputed_train = my_imputer.transform(X_num_train) # transform the train set
# X_num_imputed_test = my_imputer.transform(X_num_test) # transform the test set

In [20]:
# transforming and creating the pipes
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

qualities = ['None', 'TA', 'Po', 'Fa', 'Gd', 'Ex']

numeric_pipe = make_pipeline(
    SimpleImputer(strategy='mean')
)
categorical_pipe_ordinal = make_pipeline(
    (SimpleImputer(strategy='constant',fill_value='None')),
    (OrdinalEncoder(categories=[qualities]*10))
)
categorical_pipe_onehot = make_pipeline(
    (SimpleImputer(strategy='constant',fill_value='NaN')),
    (OneHotEncoder(handle_unknown='ignore',sparse_output=False))
)


preprocessor = make_column_transformer(
    (numeric_pipe,make_column_selector(dtype_include='number')),
    (categorical_pipe_ordinal,make_column_selector(dtype_include='category')),
    (categorical_pipe_onehot,make_column_selector(dtype_include='object'))
)

In [21]:
dt_pipeline = make_pipeline(
                            preprocessor, 
                            DecisionTreeRegressor()).set_output(transform='pandas')

dt_pipeline

In [22]:
dt_pipeline.fit(X_train, y_train)

In [23]:
dt_predictions = dt_pipeline.predict(X_test)

In [24]:
from sklearn.metrics import mean_squared_log_error
mean_squared_log_error(y_true = y_test, y_pred = dt_predictions, squared=False)

0.20108681546498416

In [11]:
param_grid = {
    "decisiontreeregressor__criterion": ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    "decisiontreeregressor__splitter": ['best', 'random'],
    "decisiontreeregressor__min_samples_leaf": range(2, 50)
    
}


In [25]:
qualities = ['None', 'TA', 'Po', 'Fa', 'Gd', 'Ex']

numeric_pipe = make_pipeline(
    SimpleImputer(strategy='mean')
)
categorical_pipe_ordinal = make_pipeline(
    (SimpleImputer(strategy='constant',fill_value='None')),
    (OrdinalEncoder(categories=[qualities]*10))
)
categorical_pipe_onehot = make_pipeline(
    (SimpleImputer(strategy='constant',fill_value='NaN')),
    (OneHotEncoder(handle_unknown='ignore',sparse_output=False))
)


preprocessor = make_column_transformer(
    (numeric_pipe,make_column_selector(dtype_include='number')),
    (categorical_pipe_ordinal,make_column_selector(dtype_include='category')),
    (categorical_pipe_onehot,make_column_selector(dtype_include='object'))
)


from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
dt_pipeline2 = make_pipeline(
                            preprocessor, 
                            DecisionTreeRegressor()).set_output(transform='pandas')

dt_pipeline2.fit(X_train, y_train)

In [36]:
dt_predictions = dt_pipeline2.predict(X_test)
msle = mean_squared_log_error(y_true = y_test, y_pred = dt_predictions, squared=False)

In [38]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "decisiontreeregressor__criterion": ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    "decisiontreeregressor__splitter": ['best', 'random'],
    "decisiontreeregressor__min_samples_leaf": range(2, 50)
}

grid_search = GridSearchCV(
    dt_pipeline2,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    scoring='neg_mean_squared_log_error'
)

In [39]:
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


In [40]:
test_predictions = grid_search.predict(X_test)

In [41]:
grid_search.best_estimator_.fit(X,y)

In [28]:
id_column = data_new.pop('Id')

In [31]:
results = pd.DataFrame({'Id':id_column,'Mean_Squared_Log_Error':msle})

In [33]:
from IPython.display import FileLink, FileLinks
results.to_csv('results',index=False)

In [17]:
from sklearn.svm import SVC
qualities = ['None', 'TA', 'Po', 'Fa', 'Gd', 'Ex']

numeric_pipe = make_pipeline(
    SimpleImputer(strategy='mean')
)
categorical_pipe_ordinal = make_pipeline(
    (SimpleImputer(strategy='constant',fill_value='None')),
    (OrdinalEncoder(categories=[qualities]*10))
)
categorical_pipe_onehot = make_pipeline(
    (SimpleImputer(strategy='constant',fill_value='NaN')),
    (OneHotEncoder(handle_unknown='ignore',sparse_output=False))
)


preprocessor = make_column_transformer(
    (numeric_pipe,make_column_selector(dtype_include='number')),
    (categorical_pipe_ordinal,make_column_selector(dtype_include='category')),
    (categorical_pipe_onehot,make_column_selector(dtype_include='object'))
)


from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
dt_pipeline3 = make_pipeline(
                            preprocessor, 
                            SVC()).set_output(transform='pandas')

dt_pipeline3.fit(X_train, y_train)

In [18]:
dt_predictions = dt_pipeline3.predict(X_test)
mean_squared_log_error(y_true = y_test, y_pred = dt_predictions, squared=False)

0.43454847937900054

In [None]:
start_time = time.time()

model_pipeline = make_pipeline(StandardScaler(),
                               PCA(n_components=None),
                               RandomForestRegressor(criterion='friedman_mse',
                                                    max_depth= 30,
                                                    n_estimators= 30,
                                                    min_samples_leaf= 2))

model_pipeline.fit(X_num_imputed_train, y_train)

end_time = time.time()

time_taken_retrain_pca = end_time - start_time
best_score_retrain_pca = model_pipeline.score(X_num_imputed_train, y_train)

print("--- %s seconds ---" % (time_taken_retrain_pca))
print(f"{round(best_score_retrain_pca * 100, 2)}% accuracy")

In [None]:
start_time = time.time()

model_pipeline = make_pipeline(StandardScaler(),
                               PCA(n_components=None),
                               RandomForestRegressor(criterion='friedman_mse',
                                                    max_depth= 30,
                                                    n_estimators= 30,
                                                    min_samples_leaf= 2))

model_pipeline.fit(X_num_imputed_train, y_train)

end_time = time.time()

time_taken_retrain_pca = end_time - start_time
best_score_retrain_pca = model_pipeline.score(X_num_imputed_test, y_test)

print("--- %s seconds ---" % (time_taken_retrain_pca))
print(f"{round(best_score_retrain_pca * 100, 2)}% accuracy")

In [None]:
data_new = pd.read_csv('housing-classification-iter66.csv')

In [None]:
id_column = data_new.pop('Id')