In [21]:
import warnings

# Install a blanket "ignore" filter: every warning category is silenced from
# here on, including deprecation and chained-assignment warnings raised by
# pandas. Remove this line while debugging so those warnings become visible.
warnings.simplefilter("ignore")

In [22]:
import numpy as np
import pandas as pd

# Read the marketing CSV (path is relative to the working directory) and
# preview the first rows.
DATA_FILE = "DirectMarketing.csv"
marketing = pd.read_csv(DATA_FILE)
marketing.head()

Unnamed: 0,Age,Gender,OwnHome,Married,Location,Salary,Children,History,Catalogs,AmountSpent
0,Old,Female,Own,Single,Far,47500,0,High,6,755
1,Middle,Male,Rent,Single,Close,63600,0,High,6,1318
2,Young,Female,Rent,Single,Close,13500,0,Low,18,296
3,Middle,Male,Own,Married,Close,85600,1,High,18,2436
4,Middle,Female,Own,Single,Close,68400,0,High,12,1304


In [2]:
# Keep only the columns that have at least 60% non-missing values.
# The result is bound to a new name instead of using inplace=True, so this
# cell leaves `marketing` itself untouched and can be re-run safely.
thresh = len(marketing) * 0.6
marketing_no_missing = marketing.dropna(axis=1, thresh=thresh)

In [3]:
# Keep the rows whose Salary lies between the 5th and 95th percentiles
# (both bounds included, which is the default of Series.between).
# Assigning the result back to `marketing` would trim a further 10% on every
# re-run of this cell, so the filtered rows get their own name instead.
low, high = np.quantile(marketing["Salary"], [0.05, 0.95])

marketing_trimmed = marketing[marketing["Salary"].between(low, high)]

In [5]:
# Convert object columns to 'category' when their number of distinct values
# is below 5% of the row count.
# A single astype call returns a new frame; assigning column by column would
# modify `marketing` in place and can trigger a chained-assignment warning
# when `marketing` is a filtered slice.
cols = marketing.select_dtypes(include='object').columns
low_cardinality = [
    col for col in cols
    if marketing[col].nunique() / len(marketing) < 0.05
]
marketing_typed = marketing.astype({col: 'category' for col in low_cardinality})

In [14]:
def drop_missing(df):
    """Return a new frame without the columns that are mostly missing.

    A column is kept when it has at least ``len(df) * 0.6`` non-missing
    values. The input frame is not modified, so the function is safe to use
    in a ``DataFrame.pipe`` chain without a defensive copy.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the sufficiently populated columns.
    """
    thresh = len(df) * 0.6
    # No inplace=True: dropna returns a new object and leaves `df` as it was.
    return df.dropna(axis=1, thresh=thresh)

def remove_outliers(df, column_name):
    """Return the rows whose ``column_name`` value lies within the 5th-95th
    percentile range, both bounds included.

    Parameters
    ----------
    df : pandas.DataFrame
    column_name : str
        Name of a numeric column in ``df``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass the filter. Rows with a missing value in
        ``column_name`` never pass, because ``between`` is False for NaN.
    """
    values = df[column_name]
    # Series.quantile skips missing values. np.quantile would return NaN as
    # soon as one NaN is present, which would silently filter out every row.
    low, high = values.quantile([0.05, 0.95])
    # `between` includes both bounds by default. The boolean form
    # inclusive=True is rejected by pandas 2.x, so the argument is omitted.
    return df[values.between(low, high)]

def to_category(df):
    """Return a new frame with low-cardinality object columns as 'category'.

    An object-dtype column is converted when its number of distinct
    non-missing values divided by the row count is below 0.05. All other
    columns keep their dtype. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame; an empty input is returned as an unchanged copy.
    """
    n_rows = len(df)
    if n_rows == 0:
        # The cardinality ratio is undefined without rows; avoid dividing by zero.
        return df.copy()

    object_cols = df.select_dtypes(include='object').columns
    dtype_map = {
        col: 'category'
        for col in object_cols
        if df[col].nunique() / n_rows < 0.05
    }
    # astype builds a new frame, so nothing is written into `df`. This also
    # avoids chained-assignment warnings when `df` is a filtered slice.
    return df.astype(dtype_map)

In [23]:
# Chain the three cleaning helpers with DataFrame.pipe. `marketing` is handed
# to the first helper directly (no copy is taken here), so an in-place change
# made inside a helper would also show up in `marketing`.
marketing_cleaned = (
    marketing
    .pipe(drop_missing)
    .pipe(remove_outliers, 'Salary')
    .pipe(to_category)
)

In [24]:
def copy_df(df):
    """Return a copy of ``df`` (``DataFrame.copy`` with default arguments).

    Intended as the first step of a ``pipe`` chain so that later steps work
    on the copy rather than on the caller's frame.
    """
    duplicate = df.copy()
    return duplicate

# Same chain as before, but starting from a copy so the cleaning helpers
# never receive `marketing` itself.
marketing_cleaned = (
    marketing
    .pipe(copy_df)
    .pipe(drop_missing)
    .pipe(remove_outliers, 'Salary')
    .pipe(to_category)
)

In [25]:
# (rows, columns) of the source frame that was fed into the pipeline.
marketing.shape

(1000, 10)

In [26]:
# (rows, columns) of the pipeline result, for comparison with the source frame.
marketing_cleaned.shape

(900, 10)

In [27]:
# Column dtypes of the source frame.
marketing.dtypes

Age            object
Gender         object
OwnHome        object
Married        object
Location       object
Salary          int64
Children        int64
History        object
Catalogs        int64
AmountSpent     int64
dtype: object

In [28]:
# Column dtypes of the pipeline result.
marketing_cleaned.dtypes

Age            category
Gender         category
OwnHome        category
Married        category
Location       category
Salary            int64
Children          int64
History        category
Catalogs          int64
AmountSpent       int64
dtype: object