In [25]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [26]:
# A single hand-written observation (one value per feature column); it is
# passed to `reg.predict` further down in the notebook.
rec_values = {
    'CRIM': 3.69,
    'ZN': 11.37,
    'INDUS': 11.15,
    'CHAS': 0.07,
    'NOX': 0.87,
    'RM': 6.29,
    'AGE': 68.91,
    'DIS': 3.77,
    'RAD': 9.50,
    'TAX': 410.95,
    'PTRATIO': 18.37,
    'B': 354.47,
    'LSTAT': 0.79,
}
# Wrapping the record in a list yields a one-row frame with a default 0-based index.
rec = pd.DataFrame([rec_values])
rec

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,3.69,11.37,11.15,0.07,0.87,6.29,68.91,3.77,9.5,410.95,18.37,354.47,0.79


In [27]:
# Load the training CSV (path is relative to the notebook's working directory).
train = pd.read_csv(filepath_or_buffer="../data/housing/housing_train.csv")

In [28]:
# Fraction of missing values per column (mean of the boolean NaN mask) ...
missing_mask = train.isna()
s = missing_mask.mean()
# ... and only the columns where more than 70% of the rows are missing.
s.loc[s > 0.7]

NOX      0.879070
LSTAT    0.795349
dtype: float64

In [29]:
# LSTAT and NOX are missing in more than 70% of the training rows (see the
# check above), so each column is replaced by a boolean "value was missing" flag.
#
# The dtype guard keeps this cell idempotent: without it, running the cell a
# second time would call .isna() on the already-boolean flags and silently
# turn every flag into False.
for sparse_col in ('LSTAT', 'NOX'):
    if not pd.api.types.is_bool_dtype(train[sparse_col]):
        train[sparse_col] = train[sparse_col].isna()

In [30]:
# Target column.
y = train['MEDV']
# Features: every column except the last one.
# NOTE(review): this relies on MEDV being the last column of `train` - confirm.
n_feature_cols = train.shape[1] - 1
X = train.iloc[:, :n_feature_cols]

In [31]:
# Learn per-column medians from the feature matrix, then fill the missing
# entries with them. `X` is rebound to the imputer's output from here on.
imputer = SimpleImputer(strategy="median").fit(X)
X = imputer.transform(X)

In [32]:
# Baseline model: ordinary least squares fitted on the imputed features.
# `fit` returns the estimator itself, so construction and fitting can be chained.
reg = LinearRegression().fit(X, y)
reg

In [33]:
# In-sample predictions of the linear model (made on the same rows it was fitted on).
y_pred = reg.predict(X=X)

In [34]:
# Prediction of the linear model for the single hand-written record, to 2 decimals.
rec_prediction = reg.predict(rec)
np.round(rec_prediction, 2)



array([96.67])

In [35]:
# Copy of the record with RM raised by 2; `rec` itself is left untouched.
rec2 = rec.copy()
rec2.loc[0, 'RM'] = rec2.loc[0, 'RM'] + 2

In [39]:
# Label each fitted coefficient with its feature name
# (all columns of `train` except the last one).
feature_names = train.columns[:-1]
coef = pd.Series(data=reg.coef_, index=feature_names)
coef

CRIM       -0.693008
ZN          0.210142
INDUS      -0.159057
CHAS       13.851840
NOX        20.619870
RM         25.280527
AGE        -0.160472
DIS        -5.301457
RAD         1.421446
TAX        -0.057192
PTRATIO    -3.679619
B           0.055114
LSTAT      20.989393
dtype: float64

In [40]:
# Change in the linear model's output implied by the RM coefficient for a +2 shift in RM.
2 * coef.loc['RM']

50.5610537224786

In [41]:
# Difference between the predictions for the shifted record (RM + 2) and the
# original one, for comparison with the coefficient-based value above.
pred_shifted = reg.predict(rec2)
pred_original = reg.predict(rec)
pred_shifted - pred_original



array([50.56105372])

## Advanced model

We will evaluate a few models on a train/test split of the dataset and then choose the best one for the final training before submission.

In [42]:
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRFRegressor
from sklearn.model_selection import train_test_split

In [43]:
# Hold out part of the data for model comparison; the fixed seed keeps the
# split the same between runs.
split_parts = train_test_split(X, y, random_state=2022)
X_train, X_test, y_train, y_test = split_parts

In [44]:
# Candidate models, keyed by the label printed in the comparison below.
# Every stochastic estimator gets an explicit seed (the same value as the
# train/test split) so that the reported scores can be reproduced on re-run.
MODEL_SEED = 2022
models = {
    'linear_regression': LinearRegression(),
    'rf': RandomForestRegressor(random_state=MODEL_SEED),
    'lgbm': LGBMRegressor(random_state=MODEL_SEED),
    'xgb': XGBRFRegressor(random_state=MODEL_SEED),
}

In [51]:
# Constant-prediction baselines: predict the training mean / median of the
# target for every test row.
# The built-in round() is used instead of the `.round()` method so this works
# whether mean_squared_error returns a NumPy scalar or a plain Python float.
naive_mean_mse = round(
    mean_squared_error(y_test, np.tile(y_train.mean(), len(y_test))), 2
)
naive_median_mse = round(
    mean_squared_error(y_test, np.tile(y_train.median(), len(y_test))), 2
)
print(f"Naive baseline y_train.mean() scores MSE {naive_mean_mse}")
print(f"Naive baseline y_train.median() scores MSE {naive_median_mse}")

# Fit every candidate on the training part and report its MSE on the held-out part.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = mean_squared_error(y_test, y_pred)
    print(f"{name} scores MSE: {round(score, 2)}")


Naive baseline y_train.mean() scores MSE 1421.11
Naive baseline y_train.median() scores MSE 1435.55
linear_regression scores MSE: 385.24
rf scores MSE: 338.15
lgbm scores MSE: 299.47
xgb scores MSE: 340.21


In [53]:
# Final training on the full training set, followed by preparation of the
# validation features for the submission.
train = pd.read_csv("../data/housing/housing_train.csv")
val = pd.read_csv("../data/housing/housing_validation.csv")

# Apply the SAME feature engineering to both frames: LSTAT and NOX are replaced
# by "value was missing" flags. Previously only `val` was transformed here, so
# the model was fitted on the raw LSTAT/NOX values but asked to predict from
# boolean flags, i.e. on features with a different meaning.
for frame in (train, val):
    for sparse_col in ('LSTAT', 'NOX'):
        frame[sparse_col] = frame[sparse_col].isna()

# Features are all columns except the last one; the target is MEDV.
X = train.iloc[:, :-1]
y = train['MEDV']

# The imputer is fitted on the training features only and then re-used,
# unchanged, on the validation features.
imputer2 = SimpleImputer(strategy="median")

X2 = imputer2.fit_transform(X)
val_t = imputer2.transform(val)

reg2 = LGBMRegressor()
reg2.fit(X2, y)

In [54]:
# Predict the target for the validation features and write the predictions
# out as a CSV file with a single MEDV column (plus the index).
y_pred = reg2.predict(val_t)
res = pd.Series(data=y_pred, name='MEDV')
res.to_csv(path_or_buf="light_gbm.csv")

In [55]:
# Persist the fitted final model next to the notebook.
# NOTE: only un-pickle this file from a trusted source; loading a pickle can
# execute arbitrary code.
with open("model_lgbm_regressor.pkl", mode='wb') as model_file:
    pickle.dump(reg2, model_file)