In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [4]:
df = pd.read_csv(r"C:\Users\abdul\Desktop\Pandas Practices\housing_price_dataset.csv")

In [5]:
df

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065
...,...,...,...,...,...,...
49995,1282,5,3,Rural,1975,100080.865895
49996,2854,2,2,Suburb,1988,374507.656727
49997,2979,5,3,Suburb,1962,384110.555590
49998,2596,5,2,Rural,1984,380512.685957


In [6]:
df.dtypes

SquareFeet        int64
Bedrooms          int64
Bathrooms         int64
Neighborhood     object
YearBuilt         int64
Price           float64
dtype: object

In [8]:
df['Price'] = df['Price'].astype(int)

In [9]:
df

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355
1,2459,3,2,Rural,1980,195014
2,1860,2,1,Suburb,1970,306891
3,2294,2,1,Urban,1996,206786
4,2130,5,2,Suburb,2001,272436
...,...,...,...,...,...,...
49995,1282,5,3,Rural,1975,100080
49996,2854,2,2,Suburb,1988,374507
49997,2979,5,3,Suburb,1962,384110
49998,2596,5,2,Rural,1984,380512


In [11]:
X = df.drop('Price',axis=1)
y = df['Price']

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [13]:
numeric_features = ['SquareFeet','Bedrooms','Bathrooms','YearBuilt']
categorial_features = ['Neighborhood']

In [44]:
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [52]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorial_features)])

In [53]:
model = Pipeline(steps=[("preprocessor",preprocessor),("regression",RandomForestRegressor())])

In [54]:
model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [55]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 2811257973.180499


In [56]:
y_pred

array([222595.38, 159432.36, 272772.39, ..., 289046.46, 153678.23,
       224235.18], shape=(15000,))

In [57]:
comparison_df = pd.DataFrame({
    "Actual": y_test.values,
    "Predicted": y_pred,
})

In [58]:
comparison_df

Unnamed: 0,Actual,Predicted
0,170835,222595.38
1,126913,159432.36
2,246611,272772.39
3,244250,329148.70
4,271127,269114.61
...,...,...
14995,233518,286903.73
14996,183539,145997.41
14997,357100,289046.46
14998,229644,153678.23


In [59]:
comparison_df["Error"] = comparison_df["Actual"] - comparison_df["Predicted"]
comparison_df["Absolute Error"] = abs(comparison_df["Error"])
comparison_df["% Error"] = (comparison_df["Absolute Error"] / comparison_df["Actual"]) * 100
comparison_df["% Accuracy"] = 100 - comparison_df["% Error"]

In [60]:
comparison_df

Unnamed: 0,Actual,Predicted,Error,Absolute Error,% Error,% Accuracy
0,170835,222595.38,-51760.38,51760.38,30.298463,69.701537
1,126913,159432.36,-32519.36,32519.36,25.623348,74.376652
2,246611,272772.39,-26161.39,26161.39,10.608363,89.391637
3,244250,329148.70,-84898.70,84898.70,34.758936,65.241064
4,271127,269114.61,2012.39,2012.39,0.742232,99.257768
...,...,...,...,...,...,...
14995,233518,286903.73,-53385.73,53385.73,22.861505,77.138495
14996,183539,145997.41,37541.59,37541.59,20.454285,79.545715
14997,357100,289046.46,68053.54,68053.54,19.057278,80.942722
14998,229644,153678.23,75965.77,75965.77,33.079797,66.920203


In [61]:
comparison_df.describe()

Unnamed: 0,Actual,Predicted,Error,Absolute Error,% Error,% Accuracy
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,224885.139933,224780.19882,104.941113,42318.397916,23.360596,76.639404
std,76025.877255,60713.602244,53022.960965,31944.940178,177.737857,177.737857
min,-18159.0,44619.81,-188818.55,0.48,-19465.760331,-5514.583051
25%,170492.25,174668.033929,-36012.855,16774.8475,7.423435,71.732768
50%,225075.5,224218.49,18.455,35728.095,15.95711,84.04289
75%,279335.0,275021.9275,35507.6575,60960.0875,28.267232,92.576565
max,482577.0,408626.55,204070.29,204070.29,5614.583051,19565.760331


In [62]:
mse = mean_squared_error(y_test, y_pred)
print("\nMean Squared Error (MSE):", round(mse, 2))
print("Average % Accuracy:", round(comparison_df['% Accuracy'].mean(), 2), "%")


Mean Squared Error (MSE): 2811257973.18
Average % Accuracy: 76.64 %
