# ToDos:
    1. Remove cardinality (combine low occuring cat variables in columns into "Other")

In [310]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import xgboost as xgb
import sklearn
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras import Sequential, layers, Model
from sklearn.metrics import mean_absolute_error

In [311]:
train_df = pd.read_csv("data/train.csv", index_col="Id")
test_df = pd.read_csv("data/test.csv", index_col="Id")

train_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [312]:
sample_submission = pd.read_csv("data/sample_submission.csv")
sample_submission

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.683570
3,1464,179317.477511
4,1465,150730.079977
...,...,...
1454,2915,167081.220949
1455,2916,164788.778231
1456,2917,219222.423400
1457,2918,184924.279659


## Split the data

In [313]:
X = data.drop(columns="SalePrice")
y = data["SalePrice"]

In [314]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)
for k, v in {"X_train": X_train, "y_train": y_train, "X_val": X_val, "y_val": y_val}.items():
    print(f"{k} shape = {v.shape}")


X_train shape = (1022, 71)
y_train shape = (1022,)
X_val shape = (438, 71)
y_val shape = (438,)


# Data Preprocessing
    - EDA is done in another notebook (EDA.ipynb)

In [315]:
# Create a copy of the training df to test preprocessing
data = train_df.copy()

In [316]:
def preprocess_data(df):
    # Drop columns with bad or unuseful data
    columns_to_drop = ["Street", "Alley","Utilities", "Condition1", "Condition2", "RoofMatl", "PoolQC", "MiscFeature"]
    df = df.drop(columns=columns_to_drop)
    
    # Transform some numerical columns to remove high skewness
    cols_to_transform = ["MasVnrArea", "BsmtFinSF1", "BsmtUnfSF", "2ndFlrSF", "WoodDeckSF",
                        "OpenPorchSF", "BsmtUnfSF", "BsmtFinSF2", "1stFlrSF", "2ndFlrSF", 
                        "GrLivArea", "MiscVal"]
    
    for i in cols_to_transform:
        df[i] = np.sqrt(df[i] + 1e-8)  
        
    # Reduce cardinality
    rarely_occuring = ["WdShing", "Stucco", "BrkComm", "AsbShng", "Stone", "ImStucc", "CBlock"]

    for i in rarely_occuring:
        df["Exterior1st"] = df["Exterior1st"].str.replace(i, "Other")
    
    return df
    
    
    

In [317]:
data = preprocess_data(data)
data

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,BldgType,...,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Reg,Lvl,Inside,Gtl,CollgCr,1Fam,...,0,0,0,,0.0001,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Reg,Lvl,FR2,Gtl,Veenker,1Fam,...,0,0,0,,0.0001,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,IR1,Lvl,Inside,Gtl,CollgCr,1Fam,...,0,0,0,,0.0001,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,IR1,Lvl,Corner,Gtl,Crawfor,1Fam,...,0,0,0,,0.0001,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,IR1,Lvl,FR2,Gtl,NoRidge,1Fam,...,0,0,0,,0.0001,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917,Reg,Lvl,Inside,Gtl,Gilbert,1Fam,...,0,0,0,,0.0001,8,2007,WD,Normal,175000
1457,20,RL,85.0,13175,Reg,Lvl,Inside,Gtl,NWAmes,1Fam,...,0,0,0,MnPrv,0.0001,2,2010,WD,Normal,210000
1458,70,RL,66.0,9042,Reg,Lvl,Inside,Gtl,Crawfor,1Fam,...,0,0,0,GdPrv,50.0000,5,2010,WD,Normal,266500
1459,20,RL,68.0,9717,Reg,Lvl,Inside,Gtl,NAmes,1Fam,...,0,0,0,,0.0001,4,2010,WD,Normal,142125


In [318]:
num_data = X.select_dtypes(exclude="object").columns.to_list()
categorical_cols = [cname for cname in data.columns if
                    data[cname].nunique() < 10 and 
                    data[cname].dtype == "object"]

In [319]:
cat_pipeline = Pipeline(steps=[
    #("imputer", KNNImputer(n_neighbors=5)),
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore", sparse=False))
])

In [320]:
num_pipeline = Pipeline(steps=[
    ("imputer", KNNImputer(n_neighbors=5))
])

In [321]:
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_data),
    ("cat", cat_pipeline, cat_data)
])

In [322]:
def get_scores(n_estimators):
    model = xgb.XGBRegressor(n_estimators=n_estimators, 
                             learning_rate=0.01, 
                             random_state=42,
                            n_jobs=-1)
    
    modeling_pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])
    
    modeling_pipeline.fit(X_train, y_train)
    
    val_preds = modeling_pipeline.predict(X_val)
    val_score = mean_absolute_error(y_val, val_preds)
    
    train_preds = modeling_pipeline.predict(X_train)
    train_score = mean_absolute_error(y_train, train_preds)
    
    return val_score, train_score

In [323]:
for i in range(10, 20):
    val_score, train_score = get_scores(i * 50)
    print(f"{i * 50} | Train Score: {train_score} | Val Score: {val_score} ")

500 | Train Score: 6100.809568401419 | Val Score: 18454.261674158104 
550 | Train Score: 5697.189219973092 | Val Score: 18345.049479166668 
600 | Train Score: 5402.247179244129 | Val Score: 18248.20514055365 
650 | Train Score: 5162.271946856654 | Val Score: 18177.79172017694 
700 | Train Score: 4949.475771312378 | Val Score: 18103.04749928653 
750 | Train Score: 4771.476933249755 | Val Score: 18054.504334332192 
800 | Train Score: 4613.012731623043 | Val Score: 18005.790864012557 
850 | Train Score: 4423.813872156311 | Val Score: 17967.459983233446 
900 | Train Score: 4227.57514753547 | Val Score: 17926.86815960331 
950 | Train Score: 4067.938486117906 | Val Score: 17889.172463613013 


In [None]:
model = xgb.XGBRegressor(n_estimators=1000, 
                         learning_rate=0.01, 
                         random_state=42,
                        n_jobs=-1)

modeling_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])


scores = -1 * cross_val_score(modeling_pipeline, X_train, y_train, cv=5, scoring="neg_mean_absolute_error")

scores

In [None]:
model = xgb.XGBRegressor(n_estimators=1000, 
                         learning_rate=0.01, 
                         random_state=42,
                        n_jobs=-1)

modeling_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

modeling_pipeline.fit(X_train, y_train)

# Making predictions on the test data

In [None]:
X_test = preprocess_data(test_df)

In [None]:
preds_test = modeling_pipeline.predict(X_test)

In [None]:
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)

In [None]:
output