In [35]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [36]:
df= pd.read_csv('/content/House Price Prediction Dataset.csv')

In [37]:
df.head()

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,1,1360,5,4,3,1970,Downtown,Excellent,No,149919
1,2,4272,5,4,3,1958,Downtown,Excellent,No,424998
2,3,3592,2,2,3,1938,Downtown,Good,No,266746
3,4,966,4,2,2,1902,Suburban,Fair,Yes,244020
4,5,4926,1,4,2,1975,Downtown,Fair,Yes,636056


In [38]:
df.shape

(2000, 10)

In [39]:
df.isnull().sum()

Unnamed: 0,0
Id,0
Area,0
Bedrooms,0
Bathrooms,0
Floors,0
YearBuilt,0
Location,0
Condition,0
Garage,0
Price,0


In [40]:
df['Age']= 2025 - df['YearBuilt']

In [41]:
df['Age']

Unnamed: 0,Age
0,55
1,67
2,87
3,123
4,50
...,...
1995,102
1996,6
1997,122
1998,89


In [42]:
df = df.drop('YearBuilt', axis=1)

In [43]:
df.sample(6)

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,Location,Condition,Garage,Price,Age
448,449,4991,1,2,3,Downtown,Good,No,282275,104
1903,1904,2630,4,1,1,Downtown,Good,No,838525,45
1837,1838,2908,5,3,1,Urban,Good,Yes,778927,111
175,176,1159,4,2,1,Suburban,Good,No,759687,74
200,201,2556,2,4,1,Urban,Fair,No,214340,100
1105,1106,900,1,1,2,Urban,Excellent,No,62875,124


In [44]:
ordinal_encoder = OrdinalEncoder(categories= [['Poor','Fair','Good','Excellent']])
df["Condition"] = ordinal_encoder.fit_transform(df[["Condition"]])

In [45]:
df.sample(3)

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,Location,Condition,Garage,Price,Age
1059,1060,2262,1,1,2,Suburban,1.0,No,805247,48
1102,1103,988,4,4,3,Rural,3.0,No,419432,124
1886,1887,1042,4,3,2,Downtown,1.0,No,264776,102


In [46]:
onehot_encoder = OneHotEncoder(drop = None, sparse_output = False)
encoded_df = onehot_encoder.fit_transform(df[["Garage","Location"]])

In [47]:
encoded_df

array([[1., 0., 1., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0.]])

In [48]:
encoded_cols = onehot_encoder.get_feature_names_out(['Garage','Location'])

In [49]:
encoded_cols

array(['Garage_No', 'Garage_Yes', 'Location_Downtown', 'Location_Rural',
       'Location_Suburban', 'Location_Urban'], dtype=object)

In [50]:
encoded_df = pd.DataFrame(encoded_df, columns=encoded_cols, index = df.index)

In [51]:
encoded_df

Unnamed: 0,Garage_No,Garage_Yes,Location_Downtown,Location_Rural,Location_Suburban,Location_Urban
0,1.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...
1995,1.0,0.0,0.0,0.0,1.0,0.0
1996,0.0,1.0,0.0,0.0,1.0,0.0
1997,1.0,0.0,0.0,1.0,0.0,0.0
1998,0.0,1.0,0.0,0.0,0.0,1.0


In [52]:
df

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,Location,Condition,Garage,Price,Age
0,1,1360,5,4,3,Downtown,3.0,No,149919,55
1,2,4272,5,4,3,Downtown,3.0,No,424998,67
2,3,3592,2,2,3,Downtown,2.0,No,266746,87
3,4,966,4,2,2,Suburban,1.0,Yes,244020,123
4,5,4926,1,4,2,Downtown,1.0,Yes,636056,50
...,...,...,...,...,...,...,...,...,...,...
1995,1996,4994,5,4,3,Suburban,0.0,No,295620,102
1996,1997,3046,5,2,1,Suburban,0.0,Yes,580929,6
1997,1998,1062,5,1,2,Rural,0.0,No,476925,122
1998,1999,4062,3,1,2,Urban,3.0,Yes,161119,89


In [53]:
df.drop(['Garage','Location'], axis=1, inplace=True)

In [54]:
df

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,Condition,Price,Age
0,1,1360,5,4,3,3.0,149919,55
1,2,4272,5,4,3,3.0,424998,67
2,3,3592,2,2,3,2.0,266746,87
3,4,966,4,2,2,1.0,244020,123
4,5,4926,1,4,2,1.0,636056,50
...,...,...,...,...,...,...,...,...
1995,1996,4994,5,4,3,0.0,295620,102
1996,1997,3046,5,2,1,0.0,580929,6
1997,1998,1062,5,1,2,0.0,476925,122
1998,1999,4062,3,1,2,3.0,161119,89


In [55]:
df= pd.concat([df, encoded_df], axis=1)

In [56]:
df

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,Condition,Price,Age,Garage_No,Garage_Yes,Location_Downtown,Location_Rural,Location_Suburban,Location_Urban
0,1,1360,5,4,3,3.0,149919,55,1.0,0.0,1.0,0.0,0.0,0.0
1,2,4272,5,4,3,3.0,424998,67,1.0,0.0,1.0,0.0,0.0,0.0
2,3,3592,2,2,3,2.0,266746,87,1.0,0.0,1.0,0.0,0.0,0.0
3,4,966,4,2,2,1.0,244020,123,0.0,1.0,0.0,0.0,1.0,0.0
4,5,4926,1,4,2,1.0,636056,50,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,4994,5,4,3,0.0,295620,102,1.0,0.0,0.0,0.0,1.0,0.0
1996,1997,3046,5,2,1,0.0,580929,6,0.0,1.0,0.0,0.0,1.0,0.0
1997,1998,1062,5,1,2,0.0,476925,122,1.0,0.0,0.0,1.0,0.0,0.0
1998,1999,4062,3,1,2,3.0,161119,89,0.0,1.0,0.0,0.0,0.0,1.0


In [57]:
df = df.drop('Id', axis=1)

In [58]:
df

Unnamed: 0,Area,Bedrooms,Bathrooms,Floors,Condition,Price,Age,Garage_No,Garage_Yes,Location_Downtown,Location_Rural,Location_Suburban,Location_Urban
0,1360,5,4,3,3.0,149919,55,1.0,0.0,1.0,0.0,0.0,0.0
1,4272,5,4,3,3.0,424998,67,1.0,0.0,1.0,0.0,0.0,0.0
2,3592,2,2,3,2.0,266746,87,1.0,0.0,1.0,0.0,0.0,0.0
3,966,4,2,2,1.0,244020,123,0.0,1.0,0.0,0.0,1.0,0.0
4,4926,1,4,2,1.0,636056,50,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,4994,5,4,3,0.0,295620,102,1.0,0.0,0.0,0.0,1.0,0.0
1996,3046,5,2,1,0.0,580929,6,0.0,1.0,0.0,0.0,1.0,0.0
1997,1062,5,1,2,0.0,476925,122,1.0,0.0,0.0,1.0,0.0,0.0
1998,4062,3,1,2,3.0,161119,89,0.0,1.0,0.0,0.0,0.0,1.0


In [61]:
price_df = df.pop('Price')

In [62]:
price_df

Unnamed: 0,Price
0,149919
1,424998
2,266746
3,244020
4,636056
...,...
1995,295620
1996,580929
1997,476925
1998,161119


In [63]:
df

Unnamed: 0,Area,Bedrooms,Bathrooms,Floors,Condition,Age,Garage_No,Garage_Yes,Location_Downtown,Location_Rural,Location_Suburban,Location_Urban
0,1360,5,4,3,3.0,55,1.0,0.0,1.0,0.0,0.0,0.0
1,4272,5,4,3,3.0,67,1.0,0.0,1.0,0.0,0.0,0.0
2,3592,2,2,3,2.0,87,1.0,0.0,1.0,0.0,0.0,0.0
3,966,4,2,2,1.0,123,0.0,1.0,0.0,0.0,1.0,0.0
4,4926,1,4,2,1.0,50,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,4994,5,4,3,0.0,102,1.0,0.0,0.0,0.0,1.0,0.0
1996,3046,5,2,1,0.0,6,0.0,1.0,0.0,0.0,1.0,0.0
1997,1062,5,1,2,0.0,122,1.0,0.0,0.0,1.0,0.0,0.0
1998,4062,3,1,2,3.0,89,0.0,1.0,0.0,0.0,0.0,1.0


In [65]:
df = pd.concat([df, price_df], axis =1)

In [67]:
df.head()

Unnamed: 0,Area,Bedrooms,Bathrooms,Floors,Condition,Age,Garage_No,Garage_Yes,Location_Downtown,Location_Rural,Location_Suburban,Location_Urban,Price
0,1360,5,4,3,3.0,55,1.0,0.0,1.0,0.0,0.0,0.0,149919
1,4272,5,4,3,3.0,67,1.0,0.0,1.0,0.0,0.0,0.0,424998
2,3592,2,2,3,2.0,87,1.0,0.0,1.0,0.0,0.0,0.0,266746
3,966,4,2,2,1.0,123,0.0,1.0,0.0,0.0,1.0,0.0,244020
4,4926,1,4,2,1.0,50,0.0,1.0,1.0,0.0,0.0,0.0,636056


In [68]:
df.to_csv("PreProcessed_Data.csv", index = False)

In [69]:
preprocessed_df = pd.read_csv('/content/PreProcessed_Data.csv')

In [70]:
preprocessed_df

Unnamed: 0,Area,Bedrooms,Bathrooms,Floors,Condition,Age,Garage_No,Garage_Yes,Location_Downtown,Location_Rural,Location_Suburban,Location_Urban,Price
0,1360,5,4,3,3.0,55,1.0,0.0,1.0,0.0,0.0,0.0,149919
1,4272,5,4,3,3.0,67,1.0,0.0,1.0,0.0,0.0,0.0,424998
2,3592,2,2,3,2.0,87,1.0,0.0,1.0,0.0,0.0,0.0,266746
3,966,4,2,2,1.0,123,0.0,1.0,0.0,0.0,1.0,0.0,244020
4,4926,1,4,2,1.0,50,0.0,1.0,1.0,0.0,0.0,0.0,636056
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,4994,5,4,3,0.0,102,1.0,0.0,0.0,0.0,1.0,0.0,295620
1996,3046,5,2,1,0.0,6,0.0,1.0,0.0,0.0,1.0,0.0,580929
1997,1062,5,1,2,0.0,122,1.0,0.0,0.0,1.0,0.0,0.0,476925
1998,4062,3,1,2,3.0,89,0.0,1.0,0.0,0.0,0.0,1.0,161119
