In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the raw CSV into a DataFrame.
# NOTE(review): this is an absolute path on one specific Windows machine; the
# notebook will not run anywhere else until it points at a local copy of
# Melbourne_housing_FULL.csv (ideally via a configurable data directory).
url=r"C:\Users\dilee\Downloads\archive\Melbourne_housing_FULL.csv"
house_price=pd.read_csv(url)

# Rows without a target value cannot be used for supervised training, so drop
# them. Reassigning the result (instead of inplace=True) keeps this step a
# plain expression that can be chained and re-run safely.
house_price=house_price.dropna(subset=["Price"])

# Separate the regression target from the feature columns.
y=house_price["Price"]
x=house_price.drop(columns=["Price"])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test=train_test_split(x,y, test_size=0.2, random_state=42)

# Route every feature column to exactly one preprocessing branch.
# "number" matches any numeric dtype rather than only int64/float64, and the
# categorical list is its complement. Selecting by exact dtype names would
# leave a column of any other dtype out of both lists, and ColumnTransformer
# drops unlisted columns by default (remainder="drop").
# NOTE(review): assumes every non-numeric column is suitable for
# most-frequent imputation followed by one-hot encoding — confirm if
# datetime-like columns are ever parsed on load.
numerical_columns=x.select_dtypes(include="number").columns
categorical_columns=x.select_dtypes(exclude="number").columns

# Numeric features: fill missing values with the column median.
numeric_transformer=Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

# Categorical features: fill missing values with the most frequent value,
# then one-hot encode. handle_unknown="ignore" stops categories that were
# not seen during fit from raising an error at predict time.
categorical_transformer=Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Apply each transformer to its own column subset and concatenate the results.
preprocessing=ColumnTransformer(
    transformers=[
        ("numericals", numeric_transformer, numerical_columns),
        ("categoricals", categorical_transformer, categorical_columns),
    ]
)

# Full estimator: the column preprocessing followed by an ordinary
# least-squares regressor, bundled so that fit/predict run both stages.
model_steps=[
    ("PREPROCESSING", preprocessing),
    ("MODEL", LinearRegression()),
]
linear_model=Pipeline(steps=model_steps)

# Train on the training split, then predict prices for the held-out rows.
linear_model.fit(x_train, y_train)
y_prediction=linear_model.predict(x_test)

# Report mean absolute error, root mean squared error and R^2 on the test split.
for label, score in (
    ("MAE :", mean_absolute_error(y_test, y_prediction)),
    ("RMSE :", np.sqrt(mean_squared_error(y_test, y_prediction))),
    ("R2 :", r2_score(y_test, y_prediction)),
):
    print(label, score)

MAE : 255007.74240165798
RMSE : 403511.2773174009
R2 : 0.6189617279245645
