# Outlier Detection and Removal Using Percentiles

In [1]:
import pandas as pd

## 1. Height Dataset

In [2]:
# Load the heights sample from 'heights.csv' (path relative to the working directory)
# and display the resulting frame.
data = pd.read_csv('heights.csv')
data

Unnamed: 0,name,height
0,mohan,5.9
1,maria,5.2
2,sakib,5.1
3,tao,5.5
4,virat,4.9
5,khusbu,5.4
6,dmitry,6.2
7,selena,6.5
8,john,7.1
9,imran,14.5


In [3]:
# (rows, columns) of the heights frame.
data.shape

(14, 2)

In [4]:
# Detect outliers with percentiles.
# Lower cut-off: the height value sitting at the 5th percentile.
min_threshold = data.loc[:, 'height'].quantile(q=0.05)
min_threshold

3.6050000000000004

In [5]:
# Upper cut-off: the height value sitting at the 95th percentile.
max_threshold = data.loc[:, 'height'].quantile(q=0.95)
max_threshold

9.689999999999998

In [6]:
# Removing outliers: keep only the rows whose height lies strictly between the
# two percentile cut-offs and renumber the index from 0.
# The result is bound to a name (it was previously only displayed and then
# discarded) so the cleaned frame stays available to later cells, matching how
# the other sections keep their cleaned frames.
height_in_range = data['height'].gt(min_threshold) & data['height'].lt(max_threshold)
data_no_outliers = data.loc[height_in_range].reset_index(drop=True)
data_no_outliers

Unnamed: 0,name,height
0,mohan,5.9
1,maria,5.2
2,sakib,5.1
3,tao,5.5
4,virat,4.9
5,khusbu,5.4
6,dmitry,6.2
7,selena,6.5
8,john,7.1
9,jose,6.1


## 2. Bangalore property price dataset

In [7]:
# Load the Bangalore property-price data from 'bhp.csv' (path relative to the
# working directory) and preview the first rows.
df = pd.read_csv('bhp.csv')
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250


In [8]:
# (rows, columns) of the property-price frame before any filtering.
df.shape

(13200, 7)

In [9]:
# Summary statistics of the frame before filtering; the min/max of
# price_per_sqft are the values to compare against the percentile cut-offs below.
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13200.0,13200.0,13200.0,13200.0,13200.0
mean,1555.302783,2.691136,112.276178,2.800833,7920.337
std,1237.323445,1.338915,149.175995,1.292843,106727.2
min,1.0,1.0,8.0,1.0,267.0
25%,1100.0,2.0,50.0,2.0,4267.0
50%,1275.0,2.0,71.85,3.0,5438.0
75%,1672.0,3.0,120.0,3.0,7317.0
max,52272.0,40.0,3600.0,43.0,12000000.0


In [10]:
# Cut-offs for price_per_sqft: the 0.1th and 99.9th percentiles, taken in a
# single quantile call. Rows below the first or above the second are inspected
# in the cells that follow.
price_per_sqft_bounds = df['price_per_sqft'].quantile([0.001, 0.999])
min_threshold, max_threshold = price_per_sqft_bounds.tolist()
min_threshold, max_threshold

(1366.184, 50959.36200000099)

In [11]:
# Rows whose price_per_sqft falls below the lower cut-off.
df.loc[df['price_per_sqft'].lt(min_threshold)]

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
665,Yelahanka,3 BHK,35000.0,3.0,130.0,3,371
798,other,4 Bedroom,10961.0,4.0,80.0,4,729
1867,other,3 Bedroom,52272.0,2.0,140.0,3,267
2392,other,4 Bedroom,2000.0,3.0,25.0,4,1250
3934,other,1 BHK,1500.0,1.0,19.5,1,1300
5343,other,9 BHK,42000.0,8.0,175.0,9,416
5417,Ulsoor,4 BHK,36000.0,4.0,450.0,4,1250
5597,JP Nagar,2 BHK,1100.0,1.0,15.0,2,1363
7166,Yelahanka,1 Bedroom,26136.0,1.0,150.0,1,573
7862,JP Nagar,3 BHK,20000.0,3.0,175.0,3,875


In [12]:
# Rows whose price_per_sqft exceeds the upper cut-off.
df.loc[df['price_per_sqft'].gt(max_threshold)]

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
345,other,3 Bedroom,11.0,3.0,74.0,3,672727
1005,other,1 BHK,15.0,1.0,30.0,1,200000
1106,other,5 Bedroom,24.0,2.0,150.0,5,625000
4044,Sarjapur Road,4 Bedroom,1.0,4.0,120.0,4,12000000
4924,other,7 BHK,5.0,7.0,115.0,7,2300000
5911,Mysore Road,1 Bedroom,45.0,1.0,23.0,1,51111
6356,Bommenahalli,4 Bedroom,2940.0,3.0,2250.0,4,76530
7012,other,1 BHK,650.0,1.0,500.0,1,76923
7575,other,1 BHK,425.0,1.0,750.0,1,176470
7799,other,4 BHK,2000.0,3.0,1063.0,4,53150


In [16]:
# Remove outliers: keep rows whose price_per_sqft lies strictly between the
# two cut-offs (a row exactly equal to either cut-off is dropped as well).
df_new = df[(df['price_per_sqft'] < max_threshold) & (df['price_per_sqft'] > min_threshold)]
# Fixed seed so the 10-row preview is identical on every Restart & Run All;
# without it each run displays a different random sample.
df_new.sample(10, random_state=42)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
6478,Vijayanagar,3 BHK,1760.0,3.0,140.0,3,7954
3588,Yelachenahalli,2 BHK,1080.0,2.0,55.0,2,5092
5442,Old Madras Road,3 BHK,2640.0,5.0,142.0,3,5378
5584,Kasturi Nagar,4 Bedroom,1200.0,4.0,250.0,4,20833
1089,Hoodi,8 Bedroom,1120.0,8.0,145.0,8,12946
5721,Yelahanka,4 Bedroom,1800.0,4.0,180.0,4,10000
5443,Akshaya Nagar,2 BHK,1314.0,2.0,68.8,2,5235
10467,Electronic City,2 BHK,1258.0,2.0,85.5,2,6796
2584,Bannerghatta Road,3 BHK,1550.0,3.0,82.0,3,5290
8915,Kengeri,2 BHK,1052.0,2.0,51.0,2,4847


In [17]:
# (rows, columns) after outlier removal; compare with df.shape to see how many rows were dropped.
df_new.shape

(13172, 7)

In [15]:
# Summary statistics after outlier removal; the price_per_sqft min/max should
# now sit inside the percentile cut-offs.
df_new.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13172.0,13172.0,13172.0,13172.0,13172.0
mean,1537.861049,2.6901,111.591865,2.799651,6663.653735
std,967.123711,1.337026,145.372047,1.29113,4141.0207
min,250.0,1.0,8.0,1.0,1379.0
25%,1100.0,2.0,50.0,2.0,4271.0
50%,1274.5,2.0,71.55,3.0,5438.0
75%,1670.0,3.0,120.0,3.0,7311.0
max,30400.0,40.0,3600.0,43.0,50349.0


## 3. Airbnb NYC 2019 Dataset 

In [19]:
# Load the Airbnb listings from 'AB_NYC_2019.csv' (path relative to the working
# directory) and preview the first rows.
airbnb_data = pd.read_csv('AB_NYC_2019.csv')
airbnb_data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [20]:
# (rows, columns) of the Airbnb frame before any filtering.
airbnb_data.shape

(48895, 16)

In [45]:
# Summary statistics of the price column before filtering.
airbnb_data['price'].describe()

count    48895.000000
mean       152.720687
std        240.154170
min          0.000000
25%         69.000000
50%        106.000000
75%        175.000000
max      10000.000000
Name: price, dtype: float64

In [46]:
# Price at several low percentiles (0.1% up to 6%), used to choose a lower cut-off.
airbnb_data['price'].quantile([0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.06])

0.001    18.0
0.005    26.0
0.010    30.0
0.020    35.0
0.030    36.0
0.040    39.0
0.060    42.0
Name: price, dtype: float64

In [48]:
# Price at several high percentiles (99.9% down to 99.5%), used to choose an upper cut-off.
airbnb_data['price'].quantile([0.999, 0.998, 0.997, 0.995])

0.999    3000.0
0.998    2000.0
0.997    1500.0
0.995    1000.0
Name: price, dtype: float64

In [51]:
# Cut-offs for the Airbnb price: the 0.5th and 99.9th percentiles, taken in a
# single quantile call. Rows below the first or above the second are inspected
# in the cells that follow.
price_bounds = airbnb_data['price'].quantile([0.005, 0.999])
min_threshold, max_threshold = price_bounds.tolist()
min_threshold, max_threshold

(26.0, 3000.0)

In [52]:
# Listings whose price falls below the lower cut-off.
airbnb_data.loc[airbnb_data['price'].lt(min_threshold)]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,price_per_night
957,375249,Enjoy Staten Island Hospitality,1887999,Rimma & Jim,Staten Island,Graniteville,40.62109,-74.16534,Private room,20,3,80,2019-05-26,0.92,1,226,6.666667
2860,1620248,Large furnished 2 bedrooms- - 30 days Minimum,2196224,Sally,Manhattan,East Village,40.73051,-73.98140,Entire home/apt,10,30,0,,,4,137,0.333333
3918,2431607,"Bright, Airy Room Share for 2",4973668,Gloria,Brooklyn,Bedford-Stuyvesant,40.68642,-73.93440,Shared room,25,5,76,2019-06-06,1.22,3,258,5.000000
3950,2459916,"$455 Cozy 1bd, BKLYN Sublet March",12577771,Victor,Brooklyn,Bedford-Stuyvesant,40.68948,-73.93528,Private room,18,1,0,,,1,0,18.000000
4647,3258197,Large 1br Duplex in Heart of Upper East Side,16477306,Jeff,Manhattan,Upper East Side,40.76866,-73.95553,Entire home/apt,16,2,21,2019-06-30,1.69,1,9,8.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48312,36199363,Furnished room for rent in Bronx SINGLE FEMALE,272247972,Kadeen,Bronx,Olinville,40.88116,-73.86547,Shared room,25,90,0,,,1,190,0.277778
48486,36280646,"Cable and wfi, L/G included.",272872092,Chris,Queens,Forest Hills,40.73657,-73.85088,Entire home/apt,16,9,1,2019-07-07,1.00,1,322,1.777778
48832,36450814,FLATBUSH HANG OUT AND GO,267223765,Jarmel,Brooklyn,Flatbush,40.64922,-73.96078,Shared room,20,1,0,,,3,363,20.000000
48867,36473044,The place you were dreaming for.(only for guys),261338177,Diana,Brooklyn,Gravesend,40.59080,-73.97116,Shared room,25,1,0,,,6,338,25.000000


In [54]:
# Listings whose price exceeds the upper cut-off.
airbnb_data.loc[airbnb_data['price'].gt(max_threshold)]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,price_per_night
1862,826690,"Sunny, Family-Friendly 2 Bedroom",4289240,Lucy,Brooklyn,Prospect Heights,40.67919,-73.97191,Entire home/apt,4000,4,0,,,1,83,1000.0
2698,1448703,Beautiful 1 Bedroom in Nolita/Soho,213266,Jessica,Manhattan,Nolita,40.72193,-73.99379,Entire home/apt,5000,1,2,2013-09-28,0.03,1,365,5000.0
3537,2110145,UWS 1BR w/backyard + block from CP,2151325,Jay And Liz,Manhattan,Upper West Side,40.77782,-73.97848,Entire home/apt,6000,14,17,2015-02-17,0.27,1,359,428.571429
3695,2224896,NYC SuperBowl Wk 5 Bdrs River View,11353904,Todd,Manhattan,Upper West Side,40.79476,-73.97299,Entire home/apt,4000,1,0,,,1,0,4000.0
3720,2243699,"SuperBowl Penthouse Loft 3,000 sqft",1483320,Omri,Manhattan,Little Italy,40.71895,-73.99793,Entire home/apt,5250,1,0,,,1,0,5250.0
3774,2271504,SUPER BOWL Brooklyn Duplex Apt!!,11598359,Jonathan,Brooklyn,Clinton Hill,40.68766,-73.96439,Entire home/apt,6500,1,0,,,1,0,6500.0
3788,2281142,Prime NYC Location for Super Bowl,1427243,Jordana,Manhattan,East Village,40.73323,-73.98859,Entire home/apt,3750,1,0,,,1,0,3750.0
4345,2919330,NearWilliamsburg bridge 11211 BK,14908606,Bianca,Brooklyn,Bedford-Stuyvesant,40.69572,-73.95731,Private room,5000,6,10,2016-01-02,0.16,1,363,833.333333
4376,2952861,Photography Location,1177497,Jessica,Brooklyn,Clinton Hill,40.69127,-73.96563,Entire home/apt,4500,1,5,2018-12-29,0.09,11,365,4500.0
4377,2953058,Film Location,1177497,Jessica,Brooklyn,Clinton Hill,40.69137,-73.96723,Entire home/apt,8000,1,1,2016-09-15,0.03,11,365,8000.0


In [56]:
# Remove outliers: keep listings whose price lies strictly between the two
# cut-offs. Because both comparisons are strict, listings priced exactly at a
# cut-off are dropped as well, even though the two exploration cells (price <
# min_threshold, price > max_threshold) do not display them.
new_data = airbnb_data[(airbnb_data['price'] < max_threshold) & (airbnb_data['price'] > min_threshold)]
# Fixed seed so the 10-row preview is identical on every Restart & Run All;
# without it each run displays a different random sample.
new_data.sample(10, random_state=42)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,price_per_night
31944,24935048,Surrealist Luxury Loft,11949316,Andrea,Manhattan,Chelsea,40.74903,-73.99198,Entire home/apt,795,3,21,2019-06-19,2.56,1,16,265.0
28662,22186114,Private Room in Brooklyn,16467711,Tori,Brooklyn,Bushwick,40.69743,-73.92171,Private room,75,1,2,2018-01-01,0.11,1,0,75.0
2240,1059531,Staten Island 1st floor Apartment,7875272,Mary,Staten Island,Port Richmond,40.62922,-74.13354,Entire home/apt,221,30,0,,,3,0,7.366667
1654,751851,Spacious 3 bedroom in Park Slope,3010682,Philippa,Brooklyn,Park Slope,40.66777,-73.97781,Entire home/apt,250,10,4,2015-12-28,0.05,1,0,25.0
47016,35546298,★Official & Only 6★Star Airbnb w/ TempurPedic Bed,52062343,Ronald,Bronx,Bronxdale,40.856,-73.86705,Shared room,57,1,4,2019-06-30,4.0,1,340,57.0
12286,9496372,W 4th Ground Floor 1bd Apt,8472477,Scott,Manhattan,West Village,40.73095,-74.00242,Entire home/apt,132,1,1,2016-01-03,0.02,1,0,132.0
47178,35616402,Cozy room for NYC summer rental,152132335,Ursula,Queens,Woodside,40.74164,-73.90289,Private room,100,5,0,,,1,197,20.0
7174,5236234,"1BR, elevator, kitchen, doorman!",12762559,Jeremy,Manhattan,Upper East Side,40.78633,-73.95218,Entire home/apt,150,1,0,,,1,0,150.0
23958,19322801,Romantic art-filled apartment with private yard,21493738,Natalie,Brooklyn,Crown Heights,40.67139,-73.94842,Private room,85,3,0,,,2,0,28.333333
26032,20773149,Nice room across the street from the high bridge.,128692351,Nahuel,Bronx,Highbridge,40.84103,-73.92701,Private room,42,2,62,2019-06-24,2.76,5,1,21.0


In [59]:
# Summary statistics of the price column after outlier removal; min and max
# should now fall strictly inside the cut-offs.
new_data['price'].describe()

count    48589.000000
mean       147.774167
std        153.337583
min         27.000000
25%         70.000000
50%        108.000000
75%        175.000000
max       2999.000000
Name: price, dtype: float64