In [5]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score


In [9]:
# Read both CSV files (paths are relative to the notebook's working directory).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Preview the first five rows of the training frame.
train_df.head(5)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [10]:
# Predictor columns shared by every model in this notebook.
features = ["YearBuilt", "GrLivArea", "OverallQual", "GarageCars", "TotalBsmtSF"]

# Keep only the predictors plus the target, then drop any row with a missing
# value in one of those columns. Note that this rebinds `train_df`.
train_df = train_df.loc[:, [*features, "SalePrice"]].dropna(axis=0, how="any")

# Design matrix and regression target.
X = train_df.loc[:, features]
y = train_df.loc[:, "SalePrice"]


In [13]:
# Learn the per-column scaling statistics from every row of X, then apply them.
# The fitted `scaler` and the resulting `X_scaled` are reused by later cells.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [16]:
# Baseline regression: hold out 20% of the rows, fit ordinary least squares on
# the remaining 80%, and score the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=15
)

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

y_pred = lin_reg.predict(X_test)

# Root of the mean squared error, i.e. an error in the same units as SalePrice.
# (The original line read `rmse = rmse = ...`; the duplicated target was a typo.)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

rmse


39763.29526578059

In [17]:
# Binary label: 1 when a house sold strictly above the median sale price, else 0.
median_price = train_df["SalePrice"].median()
train_df["BuyHouse"] = train_df["SalePrice"].gt(median_price).astype(int)

# Classification target used by the logistic-regression cell.
y_clf = train_df["BuyHouse"]


In [18]:
# Classification run: 80/20 split of the scaled predictors against the binary
# label. This rebinds X_train / X_test / y_train / y_test / y_pred from the
# regression cell.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_clf,
    test_size=0.2,
    random_state=42,
)

# max_iter is raised to 1000 to give the solver more iterations to converge.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)

accuracy


0.9246575342465754

In [20]:
# Outlier sensitivity experiment: inflate GrLivArea five-fold on the first ten
# rows of a *copy* of the training frame, then refit and rescore the regression.
train_out = train_df.copy()
train_out.loc[train_out.index[:10], "GrLivArea"] *= 5

# Use experiment-local estimators. Re-fitting the shared `scaler` / `lin_reg`
# here would silently overwrite the statistics and coefficients learned on the
# clean data, so any later cell using them would depend on whether this cell ran.
scaler_out = StandardScaler()
X_out = scaler_out.fit_transform(train_out[features])
y_out = train_out["SalePrice"]

# NOTE(review): the baseline regression cell splits with random_state=15 while
# this cell uses 42, so `rmse` and `rmse_outliers` are measured on different
# held-out rows. Confirm whether that is intended before comparing them.
X_train, X_test, y_train, y_test = train_test_split(
    X_out, y_out, test_size=0.2, random_state=42
)

lin_reg_out = LinearRegression()
lin_reg_out.fit(X_train, y_train)
y_pred = lin_reg_out.predict(X_test)

# RMSE on the held-out rows of the corrupted data set.
rmse_outliers = mean_squared_error(y_test, y_pred) ** 0.5

rmse_outliers


43746.57130437883

In [21]:
# 5-fold cross-validated RMSE of the linear model on the scaled predictors.
# The "neg_root_mean_squared_error" scorer reports RMSE with its sign flipped
# (so that higher is better), hence the negation to recover positive RMSE values.
cv_rmse = np.negative(
    cross_val_score(lin_reg, X_scaled, y, scoring="neg_root_mean_squared_error", cv=5)
)

# Average RMSE across the five folds.
cv_rmse.mean()


np.float64(38698.783263832906)