# Purpose
- Do a regression competition: 
    - https://www.kaggle.com/c/house-prices-advanced-regression-techniques
    

- Inspiration (classification notebooks):
https://github.com/EmilMachine/ml_club/blob/master/05_intro_ml/notebooks/

## Import libs

In [65]:
# import the package before inspecting it so this cell also works in a fresh kernel
import sklearn
sklearn.__version__

'0.24.1'

In [67]:
# misc
from IPython.display import display, HTML
import numpy as np

# DATA - prep
#kaggle
import pandas as pd
import sklearn.model_selection

# ML - models 
from sklearn.linear_model import LinearRegression
# ML - accuracy
import sklearn.metrics

# helper

In [94]:

def ds_explore(df_src,target,cols_rm=None,col_id=None,**kwargs):
    """Sort the columns of a dataframe into modelling roles.

    Parameters
    ----------
    df_src : pandas.DataFrame
        Dataframe whose columns are classified.
    target : column label
        Name of the target column; it is excluded from every feature group.
    cols_rm : list, optional
        Column names to leave out of every feature group.
    col_id : list, optional
        Identifier column names; also left out of every feature group.
    **kwargs
        unique_lim (default 50): a non-numeric column with fewer distinct
        values than this is treated as categorical.

    Returns
    -------
    dict
        Keys "cols_str_cat", "cols_str_nlp", "cols_num", "col_id" and
        "target"; every value is a list of column names ("target" is the
        one-element list ``[target]``).
    """
    unique_lim = kwargs.get("unique_lim",50)

    # None defaults + copies: a shared mutable default list would be handed
    # back inside `meta` and could be modified by the caller between calls
    cols_rm = [] if cols_rm is None else list(cols_rm)
    col_id = [] if col_id is None else list(col_id)
    target = [target]

    remove = cols_rm + col_id + target

    # NOTE(review): _get_numeric_data is a private pandas method; a public
    # alternative (select_dtypes) may classify some dtypes differently -
    # confirm before switching
    cols_num = [c for c in df_src._get_numeric_data().columns if c not in remove]

    # everything that is neither numeric nor explicitly removed
    taken = cols_num + remove
    cols_str = [c for c in df_src.columns if c not in taken]

    # low-cardinality columns -> categorical, the remaining ones -> nlp
    unique_count = df_src[cols_str].nunique()
    cols_str_cat = list(unique_count[unique_count<unique_lim].index)
    cols_str_nlp = [c for c in cols_str if c not in cols_str_cat]

    # the feature columns are now split in 3 categories
    # cols_num = numeric columns
    # cols_str_cat = string columns that can be treated as categorical
    # cols_str_nlp = string columns that can be treated as nlp
    meta = {
        "cols_str_cat" : cols_str_cat
        ,"cols_str_nlp" : cols_str_nlp
        ,"cols_num" : cols_num
        ,"col_id": col_id
        ,"target" : target
    }

    return meta

## Get data

In [16]:

# Kaggle competition slug plus the csv file names used from its archive
metadata = dict(
    dataset='house-prices-advanced-regression-techniques',
    train='train.csv',
    test='test.csv',
)

In [15]:
#!mkdir -p {metadata['basepath']}
# download the competition archive, unpack it into a folder named after the
# competition, then delete the zip
# (-c takes the competition name itself; no extra literal token before it)
!kaggle competitions download -c {metadata['dataset']}
!unzip -o {metadata['dataset']}.zip -d {metadata['dataset']}/
!rm {metadata['dataset']}.zip


Downloading house-prices-advanced-regression-techniques.zip to /Users/emil/hack/ml_club/11_house_regression
  0%|                                                | 0.00/199k [00:00<?, ?B/s]
100%|████████████████████████████████████████| 199k/199k [00:00<00:00, 3.61MB/s]
Archive:  house-prices-advanced-regression-techniques.zip
  inflating: house-prices-advanced-regression-techniques/data_description.txt  
  inflating: house-prices-advanced-regression-techniques/sample_submission.csv  
  inflating: house-prices-advanced-regression-techniques/test.csv  
  inflating: house-prices-advanced-regression-techniques/train.csv  


In [18]:

# read the train / test csv files from the unpacked competition folder
train = pd.read_csv(f"{metadata['dataset']}/{metadata['train']}")
test = pd.read_csv(f"{metadata['dataset']}/{metadata['test']}")

In [39]:
# NA check: fraction of missing values per column of the test set
train.columns
# to show every row instead of the truncated view:
# with pd.option_context('display.max_rows', None):
#     display(pd.DataFrame(test.isna().mean() ,columns=["is na fraction"]))

test.isna().mean().to_frame(name="is_na_fraction")

# remove all columns with any na
# ---
# tmp = pd.DataFrame(test.isna().mean(),columns=["na_frac"])
# rm_columns_na = list(tmp[tmp["na_frac"]>0].index.values)
#tmp[tmp["na_frac"]>0]
                

In [96]:
# collect every column that has at least one missing value in the test set
# NOTE(review): only `test` is inspected here; a column with NAs solely in
# `train` would not be removed - confirm this is intended
tmp = test.isna().mean().to_frame(name="na_frac")
rm_columns_na = list(tmp.loc[tmp["na_frac"] > 0].index.values)

# classify the remaining columns of the training set
meta = ds_explore(
    train,
    target="SalePrice",
    cols_rm=rm_columns_na,
    col_id=["Id"],
)
meta

{'cols_str_cat': ['Street',
  'LotShape',
  'LandContour',
  'LotConfig',
  'LandSlope',
  'Neighborhood',
  'Condition1',
  'Condition2',
  'BldgType',
  'HouseStyle',
  'RoofStyle',
  'RoofMatl',
  'ExterQual',
  'ExterCond',
  'Foundation',
  'Heating',
  'HeatingQC',
  'CentralAir',
  'Electrical',
  'PavedDrive',
  'SaleCondition'],
 'cols_str_nlp': [],
 'cols_num': ['MSSubClass',
  'LotArea',
  'OverallQual',
  'OverallCond',
  'YearBuilt',
  'YearRemodAdd',
  '1stFlrSF',
  '2ndFlrSF',
  'LowQualFinSF',
  'GrLivArea',
  'FullBath',
  'HalfBath',
  'BedroomAbvGr',
  'KitchenAbvGr',
  'TotRmsAbvGrd',
  'Fireplaces',
  'WoodDeckSF',
  'OpenPorchSF',
  'EnclosedPorch',
  '3SsnPorch',
  'ScreenPorch',
  'PoolArea',
  'MiscVal',
  'MoSold',
  'YrSold'],
 'col_id': ['Id'],
 'target': ['SalePrice']}

In [68]:
# hold-out configuration
seed = 42
# NOTE(review): 0.7 sends 70% of the rows to validation and fits on 30% -
# confirm this is intended
test_size = 0.7

# features are the numeric columns kept by ds_explore;
# `target` is the one-element list stored in meta, so `y` is a dataframe
target = meta["target"]
keep_features = meta["cols_num"]

X = train[keep_features]
y = train[target]

X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(
    X, y, test_size=test_size, random_state=seed
)

# ML 

In [90]:
# Linear regression: fit() returns the fitted estimator itself
model_lin = LinearRegression().fit(X_train, y_train)

In [89]:
# naive baseline
class naive():
    """Baseline model that predicts one constant value for every row."""

    def __init__(self,avg):
        # constant returned for each prediction
        self.avg = avg

    def predict(self, df):
        """Return a 1-d float array holding `avg` once per row of `df`."""
        n_rows = df.shape[0]
        return np.full(n_rows, self.avg, dtype=np.float64)

# the baseline always predicts the mean of the training target
model_naive = naive(avg=y_train.values.mean())



# eval

In [93]:
# models to compare; each one only needs a .predict(X) method
models = {
    "model_naive" : model_naive,
    "model_lin" : model_lin
}

# score every model on the validation split
for name, model in models.items():
    mse = sklearn.metrics.mean_squared_error(
        y_true = y_val,
        y_pred = model.predict(X_val)
    )
    # root mean squared error
    rmse = np.sqrt(mse)

    print(name,round(rmse,4))

model_naive 82205.6264
model_lin 40988.6077


# predict and submit

In [98]:
# identifier column of the submission file
# (named id_col so the builtin `id` is not shadowed)
id_col = meta["col_id"][0]
out = test[[id_col]].copy()

# target: predict with the linear model fitted in this notebook
out_target = model_lin.predict(test[keep_features])
# `target` is a one-element list, so address the column by its single name;
# predictions are flattened to 1-d and explicitly truncated to int32
out[target[0]] = np.ravel(out_target).astype(np.int32)


# put them out
outfile = "output.csv"
out.to_csv(path_or_buf = outfile,
           index = False)

In [99]:
# Submit
#!kaggle competitions submit {metadata['dataset']} -f {outfile} -m "minimal model"

100%|██████████████████████████████████████| 17.0k/17.0k [00:02<00:00, 6.25kB/s]
Successfully submitted to House Prices - Advanced Regression Techniques

In [100]:
# list the submissions made so far and their scores
!kaggle competitions submissions "{metadata['dataset']}"

fileName    date                 description    status    publicScore  privateScore  
----------  -------------------  -------------  --------  -----------  ------------  
output.csv  2021-03-30 22:28:30  minimal model  complete  0.20947      None          
