In [559]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

In [560]:
df = pd.read_csv('bengaluru_house_prices.csv')


In [561]:
df = df.drop(['area_type','society','balcony','availability'],axis='columns')

In [562]:
df.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [563]:
#Print out the number of missing values in each column
df.isna().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [564]:
df.shape

(13320, 5)

In [565]:
df.drop_duplicates(keep='first', inplace=True)

In [566]:
df[df.isna().any(axis=1)]

Unnamed: 0,location,size,total_sqft,bath,price
56,Devanahalli,4 Bedroom,3010 - 3410,,192.00
81,Hennur Road,4 Bedroom,2957 - 3450,,224.50
224,Devanahalli,3 BHK,1520 - 1740,,74.82
344,Kanakpura Road,1 BHK,525,,21.53
568,,3 BHK,1600,3.0,86.00
...,...,...,...,...,...
11297,Hennur,4 BHK,3484 - 3550,,161.50
11569,Hosur Road,,1350,,8.44
12768,Bettahalsoor,5 Bedroom,3210,,353.00
12861,KR Puram,4 BHK,2204 - 2362,,121.00


In [567]:
df = df.dropna()

In [568]:
def extract_numeric_size(size_str):
    try:
        return int(size_str.split(" ")[0])  # Extract the numeric part before space
    except AttributeError:  # Handling NaN if present
        return size_str

# Apply the function to the entire 'size' column
df['size'] = df['size'].apply(extract_numeric_size)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['size'] = df['size'].apply(extract_numeric_size)


In [569]:
df.shape

(12365, 5)

In [570]:
df.describe()

Unnamed: 0,size,bath,price
count,12365.0,12365.0,12365.0
mean,2.825879,2.719693,115.22923
std,1.324621,1.369955,153.201909
min,1.0,1.0,8.0
25%,2.0,2.0,50.0
50%,3.0,2.0,73.87
75%,3.0,3.0,123.0
max,43.0,40.0,3600.0


In [571]:
# mean_prices = df.groupby('location')['price'].mean()

# # Map location to mean price
# df['location_encoded'] = df['location'].map(mean_prices)

# # Drop the original location column
# df = df.drop(columns=['location'])

# print(df)

In [572]:
df.location = df.location.apply(lambda x: x.strip())
location_stats = df['location'].value_counts(ascending=False)
location_stats

location
Whitefield                502
Sarjapur  Road            357
Electronic City           275
Thanisandra               225
Kanakpura Road            217
                         ... 
1Channasandra               1
Ring Road Nagarbhavi        1
Mango Garden Layout         1
Vijayabank bank layout      1
Abshot Layout               1
Name: count, Length: 1293, dtype: int64

In [573]:
location_stats.values.sum()

12365

In [574]:
len(location_stats[location_stats>10])


231

In [575]:
len(location_stats[location_stats<=10])

1062

In [576]:
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

location
BEML Layout               10
Poorna Pragna Layout      10
BTM 1st Stage             10
Nagappa Reddy Layout      10
Dairy Circle              10
                          ..
1Channasandra              1
Ring Road Nagarbhavi       1
Mango Garden Layout        1
Vijayabank bank layout     1
Abshot Layout              1
Name: count, Length: 1062, dtype: int64

In [577]:
df.location = df.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
df = df[df['location']!='other']
len(df.location.unique())

231

In [578]:
def convert_sqft_to_numeric(sqft):
    if any(c.isalpha() for c in sqft):  # Check if there are alphabets
        return np.nan  # Return NaN if there are alphabets
    elif '-' in sqft:
        start, end = map(float, sqft.split(' - '))
        return (start + end) / 2
    else:
        return float(sqft)

# Apply the function to the 'total_sqft' column
df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_numeric)

# Drop rows with NaN values
df = df.dropna()
print(df)

                       location  size  total_sqft  bath   price
0      Electronic City Phase II     2      1056.0   2.0   39.07
1              Chikka Tirupathi     4      2600.0   5.0  120.00
2                   Uttarahalli     3      1440.0   2.0   62.00
3            Lingadheeranahalli     3      1521.0   3.0   95.00
4                      Kothanur     2      1200.0   2.0   51.00
...                         ...   ...         ...   ...     ...
13312                 Bellandur     2      1262.0   2.0   47.00
13314         Green Glen Layout     3      1715.0   3.0  112.00
13315                Whitefield     5      3453.0   4.0  231.00
13317     Raja Rajeshwari Nagar     2      1141.0   2.0   60.00
13318           Padmanabhanagar     4      4689.0   4.0  488.00

[9415 rows x 5 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_numeric)


In [579]:
def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price)
        st = np.std(subdf.price)
        reduced_df = subdf[(subdf.price>(m-st)) & (subdf.price<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out
df = remove_pps_outliers(df)
df.shape

(7875, 5)

In [580]:
df = df[df.bath<df.size+2]

In [581]:
df_cleaned = df.copy()

In [582]:
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
# ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
# X = np.array(ct.fit_transform(X))

In [583]:
dummies = pd.get_dummies(df.location)
# df = pd.concat([df,dummies.drop('other',axis='columns')],axis='columns')
df = pd.concat([df,dummies],axis='columns')
df.head()


Unnamed: 0,location,size,total_sqft,bath,price,1st Block Jayanagar,1st Phase JP Nagar,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1st Block Jayanagar,3,1630.0,3.0,194.0,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1st Block Jayanagar,6,1200.0,6.0,125.0,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1st Block Jayanagar,3,1875.0,2.0,235.0,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1st Block Jayanagar,7,930.0,4.0,85.0,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1st Block Jayanagar,8,700.0,4.0,104.0,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [584]:
df = df.drop('location',axis='columns')
df.head(2)

Unnamed: 0,size,total_sqft,bath,price,1st Block Jayanagar,1st Phase JP Nagar,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,3,1630.0,3.0,194.0,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,6,1200.0,6.0,125.0,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [585]:
second_last_column = df.columns[3]
X = df.drop(columns=[second_last_column])
y = df[second_last_column]
print(X)
print(y)

      size  total_sqft  bath  1st Block Jayanagar  1st Phase JP Nagar  \
0        3      1630.0   3.0                 True               False   
1        6      1200.0   6.0                 True               False   
2        3      1875.0   2.0                 True               False   
3        7       930.0   4.0                 True               False   
4        8       700.0   4.0                 True               False   
...    ...         ...   ...                  ...                 ...   
7870     2      1160.0   2.0                False               False   
7871     3      1676.0   3.0                False               False   
7872     3      2503.0   3.0                False               False   
7873     3      1855.0   3.0                False               False   
7874     3      1675.0   3.0                False               False   

      2nd Stage Nagarbhavi  5th Block Hbr Layout  5th Phase JP Nagar  \
0                    False                 False   

In [586]:
print(X)

      size  total_sqft  bath  1st Block Jayanagar  1st Phase JP Nagar  \
0        3      1630.0   3.0                 True               False   
1        6      1200.0   6.0                 True               False   
2        3      1875.0   2.0                 True               False   
3        7       930.0   4.0                 True               False   
4        8       700.0   4.0                 True               False   
...    ...         ...   ...                  ...                 ...   
7870     2      1160.0   2.0                False               False   
7871     3      1676.0   3.0                False               False   
7872     3      2503.0   3.0                False               False   
7873     3      1855.0   3.0                False               False   
7874     3      1675.0   3.0                False               False   

      2nd Stage Nagarbhavi  5th Block Hbr Layout  5th Phase JP Nagar  \
0                    False                 False   

In [587]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

SVR

In [588]:
# from sklearn.preprocessing import StandardScaler
# sc_X = StandardScaler()
# sc_y = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# # Convert y_train to a NumPy array before reshaping
# y_train_array = y_train.to_numpy()
# y_train = sc_y.fit_transform(y_train_array.reshape(-1, 1))

In [589]:
# from sklearn.svm import SVR
# regressor = SVR(kernel = 'rbf')
# regressor.fit(X_train, y_train)

In [590]:
# y_pred = sc_y.inverse_transform(regressor.predict(sc_X.transform(X_test)).reshape(-1,1))
# np.set_printoptions(precision=2)
# print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.values.reshape(len(y_test),1)),1))

In [591]:
# from sklearn.metrics import r2_score
# r2_score(y_test, y_pred)

0.59

Random Forest Regressor

In [592]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(X_train, y_train)

In [593]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.values.reshape(len(y_test),1)),1))

[[180.44 200.  ]
 [ 99.38  95.  ]
 [276.56 260.  ]
 ...
 [ 84.2   96.  ]
 [128.15 150.  ]
 [ 60.8   60.  ]]


In [594]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.760895423829094

0.75

Polynomial Regression

In [595]:
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

In [596]:
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LinearRegression
# poly_reg = PolynomialFeatures(degree = 4)
# X_poly = poly_reg.fit_transform(X_train)
# regressor = LinearRegression()
# regressor.fit(X_poly, y_train)

In [597]:
# y_pred = regressor.predict(poly_reg.transform(X_test))
# np.set_printoptions(precision=2)
# print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.values.reshape(len(y_test),1)),1))

In [598]:
# from sklearn.metrics import r2_score
# r2_score(y_test, y_pred)

-677790479510.8116

multiple linear regression

In [599]:
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

In [600]:
# from sklearn.linear_model import LinearRegression
# regressor = LinearRegression()
# regressor.fit(X_train, y_train)

In [601]:
# y_pred = regressor.predict(X_test)
# np.set_printoptions(precision=2)
# print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.values.reshape(len(y_test),1)),1))

In [602]:
# from sklearn.metrics import r2_score
# r2_score(y_test, y_pred)

0.62


decision tree


In [603]:
# from sklearn.tree import DecisionTreeRegressor
# regressor = DecisionTreeRegressor(random_state = 0)
# regressor.fit(X_train, y_train)

In [604]:
# y_pred = regressor.predict(X_test)
# np.set_printoptions(precision=2)
# # Convert y_test to a NumPy array before reshaping
# print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.values.reshape(len(y_test),1)),1))

In [605]:
# from sklearn.metrics import r2_score
# r2_score(y_test, y_pred)

0.70