Two simple ways to detect outliers:

1. Domain knowledge: set the valid value range directly.
2. Quantile method: set percentile thresholds and treat values outside them as outliers.

In [1]:
import pandas as pd
# Read bhp.csv into a DataFrame and display it.
df = pd.read_csv('bhp.csv')
df

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,4615
2,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,4305
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,6245
4,Kothanur,2 BHK,1200.0,2.0,51.00,2,4250
...,...,...,...,...,...,...,...
13195,Whitefield,5 Bedroom,3453.0,4.0,231.00,5,6689
13196,other,4 BHK,3600.0,5.0,400.00,4,11111
13197,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2,5258
13198,Padmanabhanagar,4 BHK,4689.0,4.0,488.00,4,10407


In [2]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(13200, 7)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns;
# a quick way to spot suspicious extremes before choosing outlier thresholds.
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13200.0,13200.0,13200.0,13200.0,13200.0
mean,1555.302783,2.691136,112.276178,2.800833,7920.337
std,1237.323445,1.338915,149.175995,1.292843,106727.2
min,1.0,1.0,8.0,1.0,267.0
25%,1100.0,2.0,50.0,2.0,4267.0
50%,1275.0,2.0,71.85,3.0,5438.0
75%,1672.0,3.0,120.0,3.0,7317.0
max,52272.0,40.0,3600.0,43.0,12000000.0


__Detecting outliers using the quantile method__

In [6]:
# Outlier cut-offs: the 0.1% and 99.9% quantiles of price_per_sqft.
min_threshold, max_threshold = (
    df['price_per_sqft'].quantile(q=[0.001, 0.999]).tolist()
)
(min_threshold, max_threshold)

(1366.184, 23674.45000000001)

In [8]:
# Rows whose price_per_sqft falls below the min threshold (low-end outliers).
df.loc[df['price_per_sqft'] < min_threshold]

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
665,Yelahanka,3 BHK,35000.0,3.0,130.0,3,371
798,other,4 Bedroom,10961.0,4.0,80.0,4,729
1867,other,3 Bedroom,52272.0,2.0,140.0,3,267
2392,other,4 Bedroom,2000.0,3.0,25.0,4,1250
3934,other,1 BHK,1500.0,1.0,19.5,1,1300
5343,other,9 BHK,42000.0,8.0,175.0,9,416
5417,Ulsoor,4 BHK,36000.0,4.0,450.0,4,1250
5597,JP Nagar,2 BHK,1100.0,1.0,15.0,2,1363
7166,Yelahanka,1 Bedroom,26136.0,1.0,150.0,1,573
7862,JP Nagar,3 BHK,20000.0,3.0,175.0,3,875


In [9]:
# Rows whose price_per_sqft rises above the max threshold (high-end outliers).
df.loc[df['price_per_sqft'] > max_threshold]

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
9,other,6 Bedroom,1020.0,6.0,370.0,6,36274
45,HSR Layout,8 Bedroom,600.0,9.0,200.0,8,33333
190,Bellandur,4 Bedroom,1200.0,5.0,325.0,4,27083
345,other,3 Bedroom,11.0,3.0,74.0,3,672727
733,Cunningham Road,4 BHK,5270.0,4.0,1250.0,4,23719
...,...,...,...,...,...,...,...
13081,other,6 Bedroom,8000.0,6.0,2800.0,6,35000
13094,other,4 Bedroom,1200.0,5.0,325.0,4,27083
13127,other,4 Bedroom,1200.0,5.0,325.0,4,27083
13185,Hulimavu,1 BHK,500.0,1.0,220.0,1,44000


In [23]:
# Keep every row that is not an outlier. Outliers are the rows strictly below
# min_threshold or strictly above max_threshold, so the bounds are inclusive
# here (Series.between defaults to inclusive on both ends): a row sitting
# exactly on a threshold is kept rather than silently dropped.
df_no_outliers = df[df['price_per_sqft'].between(min_threshold, max_threshold)]
df_no_outliers

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,4615
2,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,4305
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,6245
4,Kothanur,2 BHK,1200.0,2.0,51.00,2,4250
...,...,...,...,...,...,...,...
13195,Whitefield,5 Bedroom,3453.0,4.0,231.00,5,6689
13196,other,4 BHK,3600.0,5.0,400.00,4,11111
13197,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2,5258
13198,Padmanabhanagar,4 BHK,4689.0,4.0,488.00,4,10407


In [25]:
# Display 10 randomly chosen rows; no seed is set, so the rows differ on each run.
df_no_outliers.sample(n=10)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
11395,Kumaraswami Layout,4 Bedroom,600.0,2.0,72.0,4,12000
8146,7th Phase JP Nagar,3 BHK,1790.0,3.0,120.0,3,6703
10971,Banashankari,3 BHK,1800.0,3.0,125.0,3,6944
3630,Bhoganhalli,2 BHK,1447.0,2.0,75.97,2,5250
6394,Dasanapura,2 BHK,814.0,2.0,42.0,2,5159
11463,Chandapura,2 BHK,900.0,2.0,35.0,2,3888
8004,Electronic City,2 BHK,1025.0,1.0,26.63,2,2598
2039,Indira Nagar,3 BHK,1650.0,3.0,200.0,3,12121
12388,other,3 BHK,1720.0,3.0,95.0,3,5523
7542,other,1 Bedroom,660.0,1.0,95.0,1,14393
