# Data Wrangling

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import env

In [None]:
def _connection_url(database):
    '''
    Build the MySQL (pymysql driver) connection URL for the given database
    name, using the credentials and host stored in the env module.
    '''
    return f'mysql+pymysql://{env.user}:{env.password}@{env.host}/{database}'


# Titanic passengers table.
database = 'titanic_db'
url = _connection_url(database)
titanic = pd.read_sql('SELECT * FROM passengers', url)

# Mall customers table, indexed by customer_id.
database = 'mall_customers'
url = _connection_url(database)
df = pd.read_sql('SELECT * FROM customers;', url).set_index('customer_id')

In [None]:
# Quick overview of df: dimensions, dtypes / non-null counts, summary stats.
print(f'--- Shape: {df.shape}')

print('--- Info')
df.info()  # info() writes its report itself, so it is not wrapped in print

print('--- Descriptions')
summary = df.describe(include='all')  # include='all' covers non-numeric columns too
print(summary)

In [None]:
# Plot the distributions of df's columns as histograms (20 bins each) on a
# wide 24x10 figure.
df.hist(figsize=(24, 10), bins=20)

## Investigate Nulls

In [None]:
# Nulls by column (titanic): 'count' is the number of missing values in each
# column, 'percent' is the fraction of rows missing (0-1, not multiplied by 100).
titanic_is_null = titanic.isna()
pd.concat(
    [titanic_is_null.sum(), titanic_is_null.mean()],
    axis=1,
    keys=['count', 'percent'],
)

In [None]:
# Nulls by row (df): for each row, how many values are missing and what
# fraction of the columns that is; then count how many rows share each
# (n_missing, percent_missing) combination.
row_is_null = df.isna()
missing_per_row = pd.concat(
    [row_is_null.sum(axis=1), row_is_null.mean(axis=1)],
    axis=1,
    keys=['n_missing', 'percent_missing'],
)
missing_per_row.value_counts().sort_index()

## Investigate Outliers

In [None]:
def get_upper_outliers(s, k):
    '''
    Given a series and a cutoff value, k, returns the upper outliers for the
    series.

    The upper bound is Q3 + k * IQR, where IQR = Q3 - Q1 is computed from the
    series itself.

    The values returned will be either 0 (if the point is not an outlier), or a
    number that indicates how far away from the upper bound the observation is.
    Missing values stay missing. The result keeps the index and name of s and
    is always float-valued.
    '''
    q1, q3 = s.quantile([.25, .75])
    iqr = q3 - q1
    upper_bound = q3 + k * iqr
    # Vectorised: distance above the bound, with everything at or below the
    # bound floored to 0 (replaces a per-element Python lambda).
    return (s - upper_bound).clip(lower=0)

def add_upper_outlier_columns(df, k):
    '''
    Add a column with the suffix _outliers for all the numeric columns
    in the given dataframe.

    The dataframe is modified in place and also returned. Each new column
    holds the result of get_upper_outliers for the matching source column
    with cutoff k.

    Columns whose name already ends in _outliers are skipped, so calling this
    again on the same dataframe (e.g. re-running a notebook cell) refreshes
    the existing outlier columns instead of stacking *_outliers_outliers
    columns on top of them.
    '''
    suffix = '_outliers'

    # select_dtypes returns a separate frame, so the column list is fixed
    # before any new columns are added to df below.
    source_cols = [col for col in df.select_dtypes('number')
                   if not str(col).endswith(suffix)]

    for col in source_cols:
        df[col + suffix] = get_upper_outliers(df[col], k)

    return df

# Flag upper outliers with the 1.5 * IQR rule. The function returns the same
# dataframe it was given (it adds the columns in place), so binding the result
# back to df does not change which object df refers to.
df = add_upper_outlier_columns(df, k=1.5)

# Preview the first rows, including the new *_outliers columns.
df.head()

In [None]:
# For every *_outliers column, summarise only the rows that actually are
# outliers (distance above the upper bound greater than 0).
outlier_cols = list(filter(lambda name: name.endswith('_outliers'), df))
for col in outlier_cols:
    print(f'~~~\n{col}')
    is_outlier = df[col] > 0
    data = df.loc[is_outlier, col]
    print(data.describe())