## Creating And Cleaning Features: Cap And Floor Data To Remove Outliers

### Read In Data

In [1]:
# Read in data
import pandas as pd
import numpy as np

# Load the prepared Titanic CSV and preview its first five rows.
# NOTE(review): the preview shows 'Unnamed: 0' and 'Unnamed: 0.1' columns --
# presumably row indexes saved by earlier to_csv calls; confirm whether they
# should be dropped before further feature work.
titanic = pd.read_csv('../../../data/titanic_no_missing.csv')
titanic.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_clean,Embarked_clean
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,22.0,S
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0,C
2,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,26.0,S
3,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,35.0,S
4,4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,35.0,S


### Remove Outliers

In [2]:
# See where outliers might be an issue: in the summary statistics, look for
# columns whose max sits far above the 75th percentile and the mean
titanic.describe()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Age_clean
count,891.0,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,445.0,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,29.699118
std,257.353842,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,13.002015
min,0.0,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.42
25%,222.5,223.5,0.0,2.0,20.125,0.0,0.0,7.9104,22.0
50%,445.0,446.0,0.0,3.0,28.0,0.0,0.0,14.4542,29.699118
75%,667.5,668.5,1.0,3.0,38.0,1.0,0.0,31.0,35.0
max,890.0,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,80.0


In [5]:
def detect_outlier(feature, data_frame=None):
    """Print candidate upper caps for one numeric feature.

    Three candidate caps are reported -- the 95th percentile, the mean plus
    three standard deviations, and the 99th percentile -- each followed by the
    number of values lying strictly above that cap.

    Parameters
    ----------
    feature : str
        Name of the column to inspect.
    data_frame : pandas.DataFrame, optional
        Frame that holds the column. When omitted, the notebook-level
        ``titanic`` frame is used, so ``detect_outlier(feature)`` behaves as
        before.

    Returns
    -------
    None
        The report is written to stdout.
    """
    if data_frame is None:
        data_frame = titanic
    data = data_frame[feature]

    # np.std uses ddof=0 (population standard deviation), unlike Series.std().
    mean = np.mean(data)
    std = np.std(data)

    # Each cap is computed once, then values above it are counted with a
    # vectorised comparison. The 3sd count only includes values above
    # mean + 3*std, so the number matches the "exceed that" wording; comparing
    # against the cap directly also avoids dividing by a zero std.
    caps = (
        ('95p', data.quantile(.95)),
        ('3sd', mean + 3 * std),
        ('99p', data.quantile(.99)),
    )

    print('\nOutlier caps for {}:'.format(feature))
    for label, cap in caps:
        n_above = int((data > cap).sum())
        print('  --{}: {:.1f} / {} values exceed that'.format(label, cap, n_above))

In [6]:
# Determine what the upper bound should be for continuous features by printing
# the candidate caps reported by detect_outlier for each of them
for feat in ['Age_clean', 'SibSp', 'Parch', 'Fare']:
    detect_outlier(feat)


Outlier caps for Age_clean:
  --95p: 54.0 / 42 values exceed that
  --3sd: 68.7 / 7 values exceed that
  --99p: 65.0 / 8 values exceed that

Outlier caps for SibSp:
  --95p: 3.0 / 30 values exceed that
  --3sd: 3.8 / 30 values exceed that
  --99p: 5.0 / 7 values exceed that

Outlier caps for Parch:
  --95p: 2.0 / 15 values exceed that
  --3sd: 2.8 / 15 values exceed that
  --99p: 4.0 / 6 values exceed that

Outlier caps for Fare:
  --95p: 112.1 / 45 values exceed that
  --3sd: 181.2 / 20 values exceed that
  --99p: 249.0 / 9 values exceed that


In [12]:
# Cap features
# Cap values that lie above the 99th percentile of each feature.
# The clipped Series is assigned back to the frame rather than calling
# clip(inplace=True) on titanic['Age_clean']: an in-place call on a selected
# column is chained assignment and leaves the frame unchanged when pandas
# Copy-on-Write is enabled.
titanic['Age_clean'] = titanic['Age_clean'].clip(upper=titanic['Age_clean'].quantile(.99))
titanic['Fare_clean'] = titanic['Fare'].clip(upper=titanic['Fare'].quantile(.99))

In [13]:
# Describe the dataframe again to make sure the capping was successful: the max
# of Age_clean and Fare_clean should now equal their 99th-percentile caps
titanic.describe()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Age_clean,Fare_clean
count,891.0,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0
mean,445.0,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,29.640195,31.224767
std,257.353842,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,12.820616,42.524125
min,0.0,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.42,0.0
25%,222.5,223.5,0.0,2.0,20.125,0.0,0.0,7.9104,22.0,7.9104
50%,445.0,446.0,0.0,3.0,28.0,0.0,0.0,14.4542,29.699118,14.4542
75%,667.5,668.5,1.0,3.0,38.0,1.0,0.0,31.0,35.0,31.0
max,890.0,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,65.0,249.00622


In [None]:
# Write out capped data; index=False keeps the row index out of the CSV
titanic.to_csv('../../../data/titanic_capped.csv', index=False)