    #DECISION TREE REGRESSOR(Pipelines)

In [5]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

In [6]:
df = pd.read_csv("house_prices.csv")

In [7]:
df.head(5)

Unnamed: 0,Size,Bedrooms,Location,Price
0,1660.0,3.0,Rural,351000.0
1,4244.0,4.0,Rural,323000.0
2,3233.0,4.0,Suburb,434000.0
3,1274.0,3.0,Urban,431000.0
4,3534.0,3.0,Suburb,669000.0


In [8]:
df.shape

(3000, 4)

In [9]:
df.info

<bound method DataFrame.info of         Size  Bedrooms Location      Price
0     1660.0       3.0    Rural   351000.0
1     4244.0       4.0    Rural   323000.0
2     3233.0       4.0   Suburb   434000.0
3     1274.0       3.0    Urban   431000.0
4     3534.0       3.0   Suburb   669000.0
...      ...       ...      ...        ...
2995  2167.0       2.0   Suburb   147000.0
2996  1620.0       5.0    Rural   208000.0
2997  3929.0       1.0   Suburb   553000.0
2998  3000.0       5.0    Rural   268000.0
2999  4217.0       3.0   Suburb  1172000.0

[3000 rows x 4 columns]>

In [10]:
print(df.isna().sum())

Size        64
Bedrooms    87
Location     0
Price       29
dtype: int64


In [11]:
df = df.dropna()
print(df.shape)

(2822, 4)


In [12]:
print(df.isna().sum())

Size        0
Bedrooms    0
Location    0
Price       0
dtype: int64


In [13]:
df.dtypes

Size        float64
Bedrooms    float64
Location     object
Price       float64
dtype: object

In [14]:
x = df[["Size","Bedrooms","Location"]]
y = df["Price"]

In [15]:
preprocessor = ColumnTransformer([("location_ohe",OneHotEncoder(handle_unknown = "ignore"),["Location"])],
                                 remainder = "passthrough")

pipeline = Pipeline([
    ("preprocessing",preprocessor),
    ("regressor",DecisionTreeRegressor(random_state = 42 ))])

In [16]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 42)
pipeline.fit(x_train,y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [17]:
y_pred = pipeline.predict(x_test)
mae = mean_absolute_error(y_test,y_pred)
print(f"Mean Absolute Error on Test Set:${mae:,.2f}")

Mean Absolute Error on Test Set:$361,279.65


In [19]:
new_house = pd.DataFrame([{
    "Size":1660,
    "Bedrooms":3,
    "Location":"Rural"}])

predicted_price = pipeline.predict(new_house)

In [21]:
print(f"predicted price:$ {predicted_price[0]:,.2f}")

predicted price:$ 351,000.00
