In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

%matplotlib inline


df = pd.read_csv('Blr.csv')
print(df.keys(), "\nTotal No. of features : ",len(df.keys()) - 1)


df = df.filter(['location', 'size','total_sqft', 'price','balcony'])
print("\nFeatures after keeping needed features\n")
print(df.keys())

print("\nFirst 5 rows of New data frame after keeping needed features \n")
print(df.head(6))

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object') 
Total No. of features :  8

Features after keeping needed features

Index(['location', 'size', 'total_sqft', 'price', 'balcony'], dtype='object')

First 5 rows of New data frame after keeping needed features

                   location       size total_sqft   price  balcony
0  Electronic City Phase II      2 BHK       1056   39.07      1.0
1                       NaN  4 Bedroom       2600  120.00      3.0
2               Uttarahalli      3 BHK  1000-1440   62.00      3.0
3        Lingadheeranahalli      3 BHK       1521   95.00      1.0
4                  Kothanur      2 BHK   3456sqft   51.00      1.0
5                Whitefield      2 BHK       1170   38.00      1.0


In [2]:
def format_area(area):
    parameter_type = type(area)
    if parameter_type == type(1.0) or parameter_type == type(1) : return area
    
    token = area.split("-")
    if len(token) == 1 :
        return area 
    else:
        return (float(token[0]) + float(token[1])) / 2

def format_size(size):
    if type(size) == type(1.0) or type(size) == type(1) : return size 
    if not size : return 0 
    ans = 0 
    i = 0 
    
    
    while i < len(size):
        while i < len(size) and size[i].isdigit():
            ans = ans*10 + int(size[i])
            i += 1
        
        i += 1
        
    return ans 
    

df["total_sqft"] = df["total_sqft"].apply(format_area)
df["total_sqft"] = df["total_sqft"].apply(format_size)
df["size"] = df["size"].apply(lambda x : x if type(x) == type(1.0) else float(x.split(" ")[0]))


df.isna().sum()
df = df.dropna()

"""
plt.xlabel(" Price per sqft")
plt.ylabel(" Price ")
plt.scatter(df["pps"][:20] , df["price"][:20] , color='black' ,  marker ='o')
"""


df["pps"] = df['price']*100000 / df["total_sqft"]
print(df.head())
df.shape

                   location  size  total_sqft  price  balcony          pps
0  Electronic City Phase II   2.0      1056.0  39.07      1.0  3699.810606
2               Uttarahalli   3.0      1220.0  62.00      3.0  5081.967213
3        Lingadheeranahalli   3.0      1521.0  95.00      1.0  6245.890861
4                  Kothanur   2.0      3456.0  51.00      1.0  1475.694444
5                Whitefield   2.0      1170.0  38.00      1.0  3247.863248


(12709, 6)

In [4]:

def remv_pps_outliers2_stdDev(df):
    new_df = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.pps)
        st = np.std(subdf.pps)
        reduced_df = subdf[(subdf.pps>(m-st)) & (subdf.pps<=(m+st))]
        new_df = pd.concat([new_df,reduced_df],ignore_index=True)
    return new_df


def remv_pps_outliers_logic(df):
    df_res = pd.DataFrame()
    new_df = df[(df.pps >= 1000) & (df.pps <= 50000)]
    df_res = pd.concat([df_res , new_df] , ignore_index = True)

    return df_res


df2 = remv_pps_outliers_logic(df)
df4 = remv_pps_outliers2_stdDev(df2)


In [5]:
df4.pps.describe()

count     9431.000000
mean      5823.687745
std       2744.064499
min       1250.000000
25%       4251.083985
50%       5200.000000
75%       6500.000000
max      50000.000000
Name: pps, dtype: float64

In [6]:
# similar to sql query : select Location , count(*) from df groupby Location
location_counts = df4.groupby('location')['location'].agg('count')


print("Entries with <= threshold data points ",sum(location_counts[location_counts <= 10]))
rename_as_others = location_counts[location_counts  <=10]
df4.location = df4.location.apply(lambda x : "others" if x in rename_as_others else x)

print("New number of unique Locations After grouping : ",len(df4.location.unique()))


Entries with <= threshold data points  1873
New number of unique Locations After grouping :  184


In [11]:
dummy_locations = pd.get_dummies(df4.location)

df_temp = df4.drop(labels = ["location" , "pps"] , axis = 'columns')
final_df = pd.concat([dummy_locations , df_temp] , axis = "columns")

x = final_df.drop(labels = "price" , axis = 'columns')
y = final_df["price"]

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


x_train, x_test, y_train, y_test = train_test_split(
 x, y, test_size=0.33, random_state=42)


model = LinearRegression()
model.fit(x_train , y_train)

y_predicted = model.predict(x_test) 

from math import sqrt
from sklearn.metrics import mean_squared_error

print("Score ",model.score(x_test , y_test))
print('root Mean squared error: ', math.sqrt(mean_squared_error(y_test, y_predicted)))


Score  0.7381065478237536
root Mean squared error:  47.88265712635865


In [8]:
print("total Entries ",len(df2))
print(x_train.keys())


total Entries  12585
Index([' Devarachikkanahalli', '1st Phase JP Nagar', '2nd Stage Nagarbhavi',
       '5th Phase JP Nagar', '6th Phase JP Nagar', '7th Phase JP Nagar',
       '8th Phase JP Nagar', '9th Phase JP Nagar', 'Abbigere', 'Akshaya Nagar',
       ...
       'Vittasandra', 'Whitefield', 'Yelachenahalli', 'Yelahanka',
       'Yelahanka New Town', 'Yeshwanthpur', 'others', 'size', 'total_sqft',
       'balcony'],
      dtype='object', length=187)


In [12]:
def predict(location , size , total_sqft , balcony):
    
    loc_index = x_train.columns.get_loc(location)
    size_index = x_train.columns.get_loc("size")
    sqft_index = x_train.columns.get_loc("total_sqft")
    bal_index = x_train.columns.get_loc("balcony")
    
    x = [0]*len(x_train.columns)

    x[loc_index] = 1
    x[size_index] = size 
    x[sqft_index] = total_sqft 
    x[bal_index] = balcony 
    
    return model.predict([x])[0]


print(predict("Uttarahalli", 3.0 , 1220.0 , 3.0) , " lakhs ")

51.82720184326172  lakhs 
