In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
# from AntiPythonicFunctions import overview, feature_analyzer, feature_analyzer_iterator # another sort of bugfix to do
# new stuff for playing around:
import sweetviz as sv # Epic feature analysis as .html
from dataprep.eda import plot_correlation # Neat correlation plotting
# from lazypredict.Supervised import LazyRegressor, LazyClassifier # Uber-modelling
# sklearn:
from sklearn import set_config
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, QuantileTransformer, OrdinalEncoder, OneHotEncoder 
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.utils import shuffle
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, mean_absolute_percentage_error, mean_squared_log_error
from xgboost import XGBRegressor

# .csv
# Read the training and test tables from CSV files in the working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Ask scikit-learn transformers to emit pandas DataFrames from transform().
set_config(transform_output='pandas')
# Fix the seed of NumPy's global random state so reruns are repeatable.
np.random.seed(seed=1337)

In [None]:
# Move the 'Id' column into the index of both frames so it is not treated
# as a feature.
for frame in (train_df, test_df):
    frame.set_index('Id', inplace=True)

In [None]:
# Columns judged uninformative in the earlier EDA (see plot_correlation and
# the missing-value counts).  The list is defined once so the training and
# test frames can never end up with different column sets.
USELESS_COLUMNS = [
    'MSSubClass',
    'MoSold',
    'Fence',
    'Alley',
    'PoolArea',
    'MiscVal',
    'YrSold',
    'OverallCond',
    'EnclosedPorch',
    'LowQualFinSF',
    'KitchenAbvGr',
    '3SsnPorch',
    'ScreenPorch',
    'BsmtHalfBath',
    'GarageYrBlt',
    'MiscFeature',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'GarageFinish',
    'SaleType',
    'SaleCondition',
]

# Reassign instead of mutating in place: the same list is applied to both
# frames and each name is rebound to the reduced frame.
train_df = train_df.drop(columns=USELESS_COLUMNS)
test_df = test_df.drop(columns=USELESS_COLUMNS)

In [None]:
# Exploration helpers: the training frame split into its numeric
# (int64 / float64) columns and its object (string) columns.
numeric_train, categoric_train = (
    train_df.select_dtypes(include=dtype_names)
    for dtype_names in (['int64', 'float64'], ['object'])
)

In [None]:
# display(numeric_train.isnull().sum())
# plot_correlation(numeric_train, 'SalePrice')

In [None]:
# display(categoric_train.isnull().sum())

In [None]:
# train_df.hist(bins=50,figsize=(20,20))
# plt.show()

In [None]:
def feature_analyzer_iterator(df, target_variable):
    """Plot and summarise every column of ``df`` against ``target_variable``.

    For each column other than ``target_variable`` this shows a seaborn joint
    plot of the column against the target, then prints the column's
    ``describe()`` summary, its value counts, and the mean of the target
    grouped by the column's values.  For columns whose dtype is not ``object``
    the skewness of the column is printed as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    target_variable : str
        Name of the target column in ``df``; it is skipped as a feature.

    Returns
    -------
    None
        All output goes to the plot display and to standard output.
    """
    for feature_name in df.columns:
        # The target is never analysed against itself.
        if feature_name == target_variable:
            continue

        feature = df[feature_name]

        sns.jointplot(data=df, y=target_variable, x=feature_name)
        plt.show()

        # Description of feature in df.
        print("\n****Data DF Info****")
        print("Description of {} in df:".format(feature_name))
        print(feature.describe())

        # Value counts of feature in df.
        print("\n****Data DF Value Counts****")
        print("Value counts of {} in df:".format(feature_name))
        print(feature.value_counts())

        # Mean target_variable value by feature in df.
        print("\n****Data DF Mean {} by {}****".format(target_variable, feature_name))
        print("Mean {} by {} in df:".format(target_variable, feature_name))
        print(df.groupby(feature_name)[target_variable].mean())

        # Skewness of feature in df (non-object columns only).  Missing values
        # are dropped first: scipy's skew() propagates NaN by default, so a
        # single missing entry would otherwise make the result NaN.
        if feature.dtype != "O":
            print("\nSkewness:", str(skew(feature.dropna())))
                
# feature_analyzer_iterator(train_df, 'SalePrice') # Use with caution: one plot plus several printed summaries per column, so the output is very long.

In [None]:
# Now just from notebook:

# categories_ordinal = ['ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual','FireplaceQu','GarageQual','GarageCond','PoolQC']

# for col in categories_ordinal:
#     train_df[col] = train_df[col].astype('category')



In [None]:
# Separate the target ('SalePrice') from the feature columns without
# touching train_df itself.
y = train_df['SalePrice'].copy()
X = train_df.drop(columns=['SalePrice'])

# Hold out 20% of the labelled rows for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1337,
)

In [None]:
# Names of the quality / condition rating columns that get an integer encoding.
ordinal_names = (
    'ExterQual|ExterCond|BsmtQual|BsmtCond|HeatingQC|KitchenQual'
    '|FireplaceQu|GarageQual|GarageCond|PoolQC'
)
# Anchored so that only these exact column names match.
ordinal_pattern = f'^(?:{ordinal_names})$'
# Negative look-ahead: every column EXCEPT the rating columns above, so that a
# column is never encoded by both categorical pipelines.
nominal_pattern = f'^(?!(?:{ordinal_names})$)'

# Numeric columns: fill gaps from the 5 nearest neighbours.
numeric_pipe = make_pipeline(
    KNNImputer(n_neighbors=5))

# Rating columns.  KNNImputer only accepts numeric input, so string columns
# are imputed with a constant placeholder category instead.
# NOTE(review): OrdinalEncoder orders categories by sorted label unless
# `categories=` is given -- pass the real rating order if it matters.
categoric_pipe = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='Missing'),
    # Labels not seen during fit are mapped to -1 instead of raising.
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
)
# Remaining string columns: placeholder for missing values, then one-hot.
# Dense output is required because sklearn is configured for pandas output.
categoric_pipe2 = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='Missing'),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False)
)

preprocessor = make_column_transformer(
        (numeric_pipe, make_column_selector(dtype_include='number')),
        (categoric_pipe, make_column_selector(pattern=ordinal_pattern)),
        (categoric_pipe2, make_column_selector(pattern=nominal_pattern, dtype_include='object'))
)

scaler = StandardScaler()

# make_pipeline names each step after its lower-cased class name, e.g.
# 'histgradientboostingregressor'; hyper-parameter grids must use that name.
pipe = make_pipeline(
    preprocessor,
    scaler,
    HistGradientBoostingRegressor(random_state=1337)
)
pipe

In [None]:
# Fit the untuned pipeline once on the training split.
pipe.fit(X=X_train, y=y_train)

In [None]:
# Hyper-parameter grid.  Keys are '<step name>__<parameter>'; make_pipeline
# names a step after its lower-cased class name, so the prefix must be
# 'histgradientboostingregressor' (a capitalised prefix is rejected as an
# invalid parameter when the search is fitted).
param_grid = {
    'histgradientboostingregressor__max_depth': list(range(5, 20)),
    'histgradientboostingregressor__min_samples_leaf': [2, 5, 20],
    # HistGradientBoostingRegressor has no n_estimators; the number of
    # boosting iterations is controlled by max_iter:
    # 'histgradientboostingregressor__max_iter': [5, 25, 100]
}

# 5-fold cross-validated exhaustive search scored by (negated) mean squared
# log error.  `verbose` must be an integer, so 1 is used rather than 1.1.
grid_search = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    scoring='neg_mean_squared_log_error'
)

In [None]:
# Run the cross-validated search over param_grid on the training split.
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Score the tuned pipeline on the held-out split.
test_predictions = grid_search.predict(X_test)
print(f"MAE:   {mean_absolute_error(y_test, test_predictions)}")
print(f"MSE:   {mean_squared_error(y_test, test_predictions)}")
# Root of the mean squared log error, taken explicitly with np.sqrt: the
# `squared=False` argument is deprecated and absent from newer scikit-learn
# releases, whereas this form works on every version.
print(f"RMSLE: {np.sqrt(mean_squared_log_error(y_test, test_predictions))}")
print(f"MAPE:  {mean_absolute_percentage_error(y_test, test_predictions)}")

In [None]:
# Refit the best pipeline found by the search on all labelled rows.
best_pipeline = grid_search.best_estimator_
best_pipeline.fit(X, y)


In [None]:
# Build a direct-download link from the Google Drive share link: the file id
# is the second-to-last path segment of the share URL.
url = "https://drive.google.com/file/d/1Z4EAnUyTS3rLKq9ZW7OTCOlPh3fZQ5Mq/view?usp=share_link"
file_id = url.split('/')[-2]
path = 'https://drive.google.com/uc?export=download&id=' + file_id
data_new = pd.read_csv(path)

# 'Id' is kept aside for the output table; it is not a model feature.
id_column = data_new.pop('Id')

# Restrict the downloaded frame to exactly the feature columns the model was
# fitted on (same names, same order).  A missing column fails here with a
# KeyError instead of further down inside the pipeline.
data_new = data_new[X.columns]

predictions = grid_search.best_estimator_.predict(data_new)
results = pd.DataFrame({'Id': id_column, 'SalePrice': predictions})
results

In [None]:
# Write the Id / SalePrice table to disk without the DataFrame index.
results.to_csv(path_or_buf='test3.csv', index=False)
