# Outliers

Outliers are unusual data points that differ markedly from the rest of the observations.


## Topics covered

- Introduction to outliers
- What is a percentile?
- Removing outliers from a simple dataset
- Removing outliers from a complex dataset
- Exercise

In [1]:
import os
from pathlib import Path

import pandas as pd

In [2]:
# Folder containing the tutorial CSV files. The default is the original
# hard-coded location; set the OUTLIERS_DATA_DIR environment variable to point
# at your own copy so the notebook runs on other machines.
DATA_DIR = Path(os.environ.get(
    'OUTLIERS_DATA_DIR',
    'D:\\Data Science\\Code basics\\py-master\\ML\\FeatureEngineering\\1_outliers',
))

# Load heights.csv, the small dataset used for the first percentile example.
df = pd.read_csv(DATA_DIR / 'heights.csv')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,name,height
0,mohan,5.9
1,maria,5.2
2,sakib,5.1
3,tao,5.5
4,virat,4.9


# Detect outliers using percentiles

In [5]:
# Display every value of the height column.
df.loc[:, 'height']

0      5.9
1      5.2
2      5.1
3      5.5
4      4.9
5      5.4
6      6.2
7      6.5
8      7.1
9     14.5
10     6.1
11     5.6
12     1.2
13     5.5
Name: height, dtype: float64

In [8]:
# Upper cut-off: the 95th percentile of the height column.
max_thresold = df.height.quantile(q=0.95)
max_thresold

9.689999999999998

In [10]:
# Rows whose height exceeds the 95th-percentile cut-off are the high outliers.
above_max = df['height'] > max_thresold
df.loc[above_max]

Unnamed: 0,name,height
9,imran,14.5


In [12]:
# Lower cut-off: the 5th percentile of the height column.
min_thresold = df.height.quantile(q=0.05)
min_thresold

3.6050000000000004

In [14]:
# Rows whose height falls below the 5th-percentile cut-off.
# Assuming the dataset describes adults, such a small value can be removed.
below_min = df.height < min_thresold
df.loc[below_min]

Unnamed: 0,name,height
12,yoseph,1.2


In [15]:
# Remove the outliers: keep only rows strictly between the two cut-offs.
# The filtered frame is displayed, not assigned, so df itself is unchanged.
keep = (df.height > min_thresold) & (df.height < max_thresold)
df.loc[keep]

Unnamed: 0,name,height
0,mohan,5.9
1,maria,5.2
2,sakib,5.1
3,tao,5.5
4,virat,4.9
5,khusbu,5.4
6,dmitry,6.2
7,selena,6.5
8,john,7.1
10,jose,6.1


## Applying this to a complex dataset

In [17]:
# Folder containing the tutorial CSV files, declared in this cell so it runs on
# its own. The default is the original hard-coded location; set the
# OUTLIERS_DATA_DIR environment variable to point at your own copy.
DATA_DIR = Path(os.environ.get(
    'OUTLIERS_DATA_DIR',
    'D:\\Data Science\\Code basics\\py-master\\ML\\FeatureEngineering\\1_outliers',
))

# Load bhp.csv, the larger dataset used for the second example.
# NOTE: this rebinds df, so the previously loaded frame is no longer reachable.
df = pd.read_csv(DATA_DIR / 'bhp.csv')

In [18]:
# Preview the first five rows of the newly loaded frame.
df.head(n=5)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250


In [19]:
# (number of rows, number of columns) of the loaded frame.
(len(df.index), len(df.columns))

(13200, 7)

In [21]:
# Summary statistics for the numeric columns, with the quartiles spelled out.
# Maximum values far above the 75% row point to outliers at the top end.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13200.0,13200.0,13200.0,13200.0,13200.0
mean,1555.302783,2.691136,112.276178,2.800833,7920.337
std,1237.323445,1.338915,149.175995,1.292843,106727.2
min,1.0,1.0,8.0,1.0,267.0
25%,1100.0,2.0,50.0,2.0,4267.0
50%,1275.0,2.0,71.85,3.0,5438.0
75%,1672.0,3.0,120.0,3.0,7317.0
max,52272.0,40.0,3600.0,43.0,12000000.0


In [23]:
# quantile() also accepts a list and returns one value per requested quantile.
# Outliers are judged on price_per_sqft; the 0.1% / 99.9% cut-offs are a
# judgement call and can be changed to suit the situation.
price_cutoffs = df['price_per_sqft'].quantile([0.001, 0.999])
min_thresold, max_thresold = price_cutoffs
(min_thresold, max_thresold)

(1366.184, 50959.36200000098)

In [24]:
# Rows whose price per square foot falls below the lower cut-off.
too_cheap = df.price_per_sqft < min_thresold
df.loc[too_cheap]

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
665,Yelahanka,3 BHK,35000.0,3.0,130.0,3,371
798,other,4 Bedroom,10961.0,4.0,80.0,4,729
1867,other,3 Bedroom,52272.0,2.0,140.0,3,267
2392,other,4 Bedroom,2000.0,3.0,25.0,4,1250
3934,other,1 BHK,1500.0,1.0,19.5,1,1300
5343,other,9 BHK,42000.0,8.0,175.0,9,416
5417,Ulsoor,4 BHK,36000.0,4.0,450.0,4,1250
5597,JP Nagar,2 BHK,1100.0,1.0,15.0,2,1363
7166,Yelahanka,1 Bedroom,26136.0,1.0,150.0,1,573
7862,JP Nagar,3 BHK,20000.0,3.0,175.0,3,875


In [26]:
# Rows whose price per square foot lies above the upper cut-off.
too_expensive = df.price_per_sqft > max_thresold
df.loc[too_expensive]

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
345,other,3 Bedroom,11.0,3.0,74.0,3,672727
1005,other,1 BHK,15.0,1.0,30.0,1,200000
1106,other,5 Bedroom,24.0,2.0,150.0,5,625000
4044,Sarjapur Road,4 Bedroom,1.0,4.0,120.0,4,12000000
4924,other,7 BHK,5.0,7.0,115.0,7,2300000
5911,Mysore Road,1 Bedroom,45.0,1.0,23.0,1,51111
6356,Bommenahalli,4 Bedroom,2940.0,3.0,2250.0,4,76530
7012,other,1 BHK,650.0,1.0,500.0,1,76923
7575,other,1 BHK,425.0,1.0,750.0,1,176470
7799,other,4 BHK,2000.0,3.0,1063.0,4,53150


In [27]:
# Build a new frame without the outliers: keep rows whose price_per_sqft is
#   1) strictly below the max threshold, and
#   2) strictly above the min threshold.
ppsf = df['price_per_sqft']
in_range = (ppsf > min_thresold) & (ppsf < max_thresold)
df2 = df.loc[in_range]
df2.shape

(13172, 7)

In [52]:
# Spot-check ten random rows of the filtered frame. The fixed random_state
# makes the sample identical on every Restart & Run All, so the displayed
# output stays reproducible.
df2.sample(10, random_state=42)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
12514,other,2 BHK,1057.0,2.0,42.0,2,3973
2430,Rayasandra,7 Bedroom,700.0,6.0,85.0,7,12142
8186,Electronic City,3 BHK,1470.0,2.0,44.1,3,3000
4827,other,5 Bedroom,2400.0,5.0,400.0,5,16666
7175,Kasturi Nagar,2 BHK,1000.0,2.0,58.0,2,5800
6194,Kadugodi,2 BHK,1314.0,2.0,83.0,2,6316
7049,other,3 BHK,1439.0,3.0,57.23,3,3977
257,Ambedkar Nagar,4 Bedroom,2900.0,3.0,300.0,4,10344
4192,other,3 BHK,2000.0,3.0,85.0,3,4250
12326,other,3 Bedroom,600.0,3.0,90.0,3,15000
