In [1]:
import pandas as pd

In [2]:
df = pd.read_excel(r"D:\Ultimate Programming\Data Bases\Machine Learning Datasets\Regression\Property Price Prediction.xlsx")

In [3]:
df.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  int64  
dtypes: float64(4), int64(5)
memory usage: 1.4 MB


In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [6]:
trf = ColumnTransformer(transformers=[
    ('imputer', SimpleImputer(), [4])
], remainder='passthrough')

In [7]:
x = df.iloc[ : , : -1]
y = df['median_house_value']

In [8]:
x_sc = trf.fit_transform(x)
x_sc.shape

(20640, 8)

In [9]:
trf2 = ColumnTransformer(
    transformers=[
        ('scaled', StandardScaler(), slice(0, 8))
    ]
)

In [10]:
x_sc2 = trf2.fit_transform(x_sc)
x_sc2

array([[-0.97522785, -1.32783522,  1.05254828, ..., -0.9744286 ,
        -0.97703285,  2.34476576],
       [ 1.3550882 , -1.32284391,  1.04318455, ...,  0.86143887,
         1.66996103,  2.33223796],
       [-0.82973217, -1.33282653,  1.03850269, ..., -0.82077735,
        -0.84363692,  1.7826994 ],
       ...,
       [-0.12610552, -0.8237132 ,  1.77823747, ..., -0.3695372 ,
        -0.17404163, -1.14259331],
       [-0.30737883, -0.87362627,  1.77823747, ..., -0.60442933,
        -0.39375258, -1.05458292],
       [ 0.18635241, -0.83369581,  1.75014627, ..., -0.03397701,
         0.07967221, -0.78012947]])

In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [12]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline

In [13]:
models = {
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    'Linear Regression' : LinearRegression(),
    'Decision Tree' : DecisionTreeRegressor(),
    'Random Forest' : RandomForestRegressor(),
    'Support Vector' : SVR()
}

In [14]:
for name, model in models.items():
    pipeline = make_pipeline(trf, trf2, model)
    pipeline.fit(x_train, y_train)
    y_pred = pipeline.predict(x_test)
    print(f" {name} : mse : {mean_squared_error(y_test, y_pred)*100:.2f}")
    print(f" {name} : mae : {mean_absolute_error(y_test, y_pred)*100:.2f}")

 Lasso : mse : 505287634440.21
 Lasso : mse : 5183594.91
 Ridge : mse : 505247737940.71
 Ridge : mse : 5183420.96
 Linear Regression : mse : 505295517459.70
 Linear Regression : mse : 5183614.04
 Decision Tree : mse : 491208359747.72
 Decision Tree : mse : 4463856.25
 Random Forest : mse : 248656051317.88
 Random Forest : mse : 3217801.96
 Support Vector : mse : 1367640301403.27
 Support Vector : mse : 8709830.78


In [15]:
lr = LinearRegression()

In [16]:
pipeline = make_pipeline(trf, trf2, lr)

In [17]:
pipeline.fit(x_train, y_train)

In [19]:
from sklearn import set_config
set_config(display='diagram')

In [20]:
import joblib

In [23]:
joblib.dump(pipeline, 'model_ppp.pkl')

['model_ppp.pkl']