![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# Lab | Cleaning categorical data

For this lab, we will be using the dataset from the Customer Analysis Business Case. This dataset can be found in the `files_for_lab` folder. In this lab we will explore categorical data.

## Instructions

### 1. Import the necessary libraries if you are starting a new notebook.
Using the same data as the previous lab: we_fn_use_c_marketing_customer_value_analysis.csv

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from functions import lowercase_cols
import re

In [2]:
# Read the lab's CSV into a DataFrame; the bare name on the last line displays it.
csv_path = "files_for_lab/we_fn_use_c_marketing_customer_value_analysis.csv"
customer = pd.read_csv(csv_path)
customer

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.431650,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,LA72316,California,23405.987980,No,Basic,Bachelor,2/10/11,Employed,M,71941,...,89,0,2,Personal Auto,Personal L1,Offer2,Web,198.234764,Four-Door Car,Medsize
9130,PK87824,California,3096.511217,Yes,Extended,College,2/12/11,Employed,F,21604,...,28,0,1,Corporate Auto,Corporate L3,Offer1,Branch,379.200000,Four-Door Car,Medsize
9131,TD14365,California,8163.890428,No,Extended,Bachelor,2/6/11,Unemployed,M,0,...,37,3,2,Corporate Auto,Corporate L2,Offer1,Branch,790.784983,Four-Door Car,Medsize
9132,UP19263,California,7524.442436,No,Extended,College,2/3/11,Employed,M,21941,...,3,0,3,Personal Auto,Personal L2,Offer3,Branch,691.200000,Four-Door Car,Large


In [3]:
# lowercase_cols is a project helper imported from functions.py (not shown here).
# Its result is then given an underscore in the one run-together column name.
# Chaining .rename() rebinds `customer` instead of mutating it in place.
customer = lowercase_cols(customer).rename(
    columns={"employmentstatus": "employment_status"}
)
customer.columns

Index(['customer', 'state', 'customer_lifetime_value', 'response', 'coverage',
       'education', 'effective_to_date', 'employment_status', 'gender',
       'income', 'location_code', 'marital_status', 'monthly_premium_auto',
       'months_since_last_claim', 'months_since_policy_inception',
       'number_of_open_complaints', 'number_of_policies', 'policy_type',
       'policy', 'renew_offer_type', 'sales_channel', 'total_claim_amount',
       'vehicle_class', 'vehicle_size'],
      dtype='object')

### 2. Find  all of the categorical data.

Save it in a categorical_df variable.

In [4]:
# Keep only the object-dtype (string) columns of `customer`.
# The explicit .copy() makes categorical_df an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning and can never
# write through to `customer`.
categorical_df = customer.select_dtypes("object").copy()
categorical_df

Unnamed: 0,customer,state,response,coverage,education,effective_to_date,employment_status,gender,location_code,marital_status,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size
0,BU79786,Washington,No,Basic,Bachelor,2/24/11,Employed,F,Suburban,Married,Corporate Auto,Corporate L3,Offer1,Agent,Two-Door Car,Medsize
1,QZ44356,Arizona,No,Extended,Bachelor,1/31/11,Unemployed,F,Suburban,Single,Personal Auto,Personal L3,Offer3,Agent,Four-Door Car,Medsize
2,AI49188,Nevada,No,Premium,Bachelor,2/19/11,Employed,F,Suburban,Married,Personal Auto,Personal L3,Offer1,Agent,Two-Door Car,Medsize
3,WW63253,California,No,Basic,Bachelor,1/20/11,Unemployed,M,Suburban,Married,Corporate Auto,Corporate L2,Offer1,Call Center,SUV,Medsize
4,HB64268,Washington,No,Basic,Bachelor,2/3/11,Employed,M,Rural,Single,Personal Auto,Personal L1,Offer1,Agent,Four-Door Car,Medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,LA72316,California,No,Basic,Bachelor,2/10/11,Employed,M,Urban,Married,Personal Auto,Personal L1,Offer2,Web,Four-Door Car,Medsize
9130,PK87824,California,Yes,Extended,College,2/12/11,Employed,F,Suburban,Divorced,Corporate Auto,Corporate L3,Offer1,Branch,Four-Door Car,Medsize
9131,TD14365,California,No,Extended,Bachelor,2/6/11,Unemployed,M,Suburban,Single,Corporate Auto,Corporate L2,Offer1,Branch,Four-Door Car,Medsize
9132,UP19263,California,No,Extended,College,2/3/11,Employed,M,Suburban,Married,Personal Auto,Personal L2,Offer3,Branch,Four-Door Car,Large


### 3. Check for NaN values.

In [5]:
# Count the missing (NaN) entries in each column of categorical_df.
categorical_df.isna().sum()

customer             0
state                0
response             0
coverage             0
education            0
effective_to_date    0
employment_status    0
gender               0
location_code        0
marital_status       0
policy_type          0
policy               0
renew_offer_type     0
sales_channel        0
vehicle_class        0
vehicle_size         0
dtype: int64

### 4. Check all unique values of columns.

In [6]:
# Print each column name followed by the array of distinct values it holds.
for name, column in categorical_df.items():
    print(name, "-", column.unique())

customer - ['BU79786' 'QZ44356' 'AI49188' ... 'TD14365' 'UP19263' 'Y167826']
state - ['Washington' 'Arizona' 'Nevada' 'California' 'Oregon']
response - ['No' 'Yes']
coverage - ['Basic' 'Extended' 'Premium']
education - ['Bachelor' 'College' 'Master' 'High School or Below' 'Doctor']
effective_to_date - ['2/24/11' '1/31/11' '2/19/11' '1/20/11' '2/3/11' '1/25/11' '1/18/11'
 '1/26/11' '2/17/11' '2/21/11' '1/6/11' '2/6/11' '1/10/11' '1/17/11'
 '1/5/11' '2/27/11' '1/14/11' '1/21/11' '2/5/11' '1/29/11' '2/28/11'
 '2/12/11' '2/2/11' '2/7/11' '1/22/11' '2/13/11' '1/15/11' '1/8/11'
 '1/11/11' '1/28/11' '2/8/11' '2/23/11' '1/2/11' '2/16/11' '1/27/11'
 '1/23/11' '1/9/11' '2/11/11' '2/4/11' '2/1/11' '2/15/11' '2/26/11'
 '1/16/11' '1/1/11' '2/10/11' '1/24/11' '2/25/11' '1/12/11' '2/9/11'
 '1/19/11' '1/4/11' '2/14/11' '2/20/11' '2/18/11' '1/3/11' '1/13/11'
 '1/30/11' '2/22/11' '1/7/11']
employment_status - ['Employed' 'Unemployed' 'Medical Leave' 'Disabled' 'Retired']
gender - ['F' 'M']
location_code

### 5. Check dtypes.

Do they all make sense as categorical data?

In [7]:
# Show the dtype pandas assigned to each column of categorical_df.
categorical_df.dtypes

customer             object
state                object
response             object
coverage             object
education            object
effective_to_date    object
employment_status    object
gender               object
location_code        object
marital_status       object
policy_type          object
policy               object
renew_offer_type     object
sales_channel        object
vehicle_class        object
vehicle_size         object
dtype: object

No: `effective_to_date` holds dates, so it should be stored as a datetime type rather than as `object`.

In [8]:
# Convert the month/day/two-digit-year strings to monthly periods.
# - format="%m/%d/%y" states the layout explicitly, so pandas does not have to
#   infer it and cannot swap day and month.
# - "M" is the documented alias for monthly periods.
# - The strings are read from `customer`, which still holds the untouched text
#   column, so this cell can be re-run without failing on already-converted data.
categorical_df["effective_to_date"] = pd.to_datetime(
    customer["effective_to_date"], format="%m/%d/%y"
).dt.to_period("M")
categorical_df["effective_to_date"]

0       2011-02
1       2011-01
2       2011-02
3       2011-01
4       2011-02
         ...   
9129    2011-02
9130    2011-02
9131    2011-02
9132    2011-02
9133    2011-02
Name: effective_to_date, Length: 9134, dtype: period[M]

Here we also reduce the date to a year-month format, which groups the dates into only two values.

In [9]:
# Count how many rows carry each distinct effective_to_date value.
categorical_df["effective_to_date"].value_counts()

2011-01    4898
2011-02    4236
Freq: M, Name: effective_to_date, dtype: int64

### 6. Does any column contain alpha and numeric data?

Decide how to clean it.

The only column we could clean is `renew_offer_type`. It looks like an ordinal categorical column, so we are going to remove the "Offer" part and keep just the numbers.

In [10]:
# Drop the literal "Offer" text so only the offer number remains (still a string).
# str.strip("Offer") would treat its argument as a set of characters to trim from
# both ends, not as a prefix; it only appears to work because the digits are not
# in that set. A literal (non-regex) replace states the intent directly, and the
# cell stays safe to re-run.
categorical_df["renew_offer_type"] = categorical_df["renew_offer_type"].str.replace(
    "Offer", "", regex=False
)
categorical_df["renew_offer_type"].value_counts()

1    3752
2    2926
3    1432
4    1024
Name: renew_offer_type, dtype: int64

### 7. Would you choose to do anything else to clean or wrangle the categorical data?

Comment your decisions.

In [11]:
# Let's check how often each value occurs, column by column.

for column_name in categorical_df.columns:
    counts = categorical_df[column_name].value_counts()
    print(counts)
    print("\n")

BU79786    1
PU81096    1
CO75086    1
WW52683    1
XO38850    1
          ..
HS14476    1
YL91587    1
CT18212    1
EW35231    1
Y167826    1
Name: customer, Length: 9134, dtype: int64


California    3150
Oregon        2601
Arizona       1703
Nevada         882
Washington     798
Name: state, dtype: int64


No     7826
Yes    1308
Name: response, dtype: int64


Basic       5568
Extended    2742
Premium      824
Name: coverage, dtype: int64


Bachelor                2748
College                 2681
High School or Below    2622
Master                   741
Doctor                   342
Name: education, dtype: int64


2011-01    4898
2011-02    4236
Freq: M, Name: effective_to_date, dtype: int64


Employed         5698
Unemployed       2317
Medical Leave     432
Disabled          405
Retired           282
Name: employment_status, dtype: int64


F    4658
M    4476
Name: gender, dtype: int64


Suburban    5779
Rural       1773
Urban       1582
Name: location_code, dtype: int64


Married 

We will take care of the `education`, `employment_status`, `policy_type`, and `policy` columns below by grouping values and removing redundancies.

There is also a case where we would clean the `vehicle_class` column, grouping:
- "Four-Door Car" with "Luxury Car"
- "Two-Door Car" with "Sports Car"
- "SUV" with "Luxury SUV"

But this would depend on the purpose of our analysis.

A more conservative approach would be to group the three lowest-count values together in a "High-end" group.

In [12]:
# Relabel the three classes listed below as "High-end"; every other value is kept.
high_end_classes = ["Luxury Car", "Sports Car", "Luxury SUV"]
is_high_end = categorical_df["vehicle_class"].isin(high_end_classes)
categorical_df["vehicle_class"] = np.where(
    is_high_end, "High-end", categorical_df["vehicle_class"]
)
categorical_df["vehicle_class"].value_counts()

Four-Door Car    4621
Two-Door Car     1886
SUV              1796
High-end          831
Name: vehicle_class, dtype: int64

### 8. Compare policy_type and policy.

What information is contained in these columns?

Can you identify what is important?  

In [13]:
# Count how many rows carry each distinct policy_type value.
categorical_df["policy_type"].value_counts()

Personal Auto     6788
Corporate Auto    1968
Special Auto       378
Name: policy_type, dtype: int64

In [14]:
# Count how many rows carry each distinct policy value.
categorical_df["policy"].value_counts()

Personal L3     3426
Personal L2     2122
Personal L1     1240
Corporate L3    1014
Corporate L2     595
Corporate L1     359
Special L2       164
Special L3       148
Special L1        66
Name: policy, dtype: int64

There is a lot of duplicated information in these columns: "Auto" is a redundant word in `policy_type`, and in the `policy` column we only need the level (L1, L2, or L3).

In [15]:
# Remove the redundant " Auto" suffix, including the space in front of it.
# str.strip("Auto") would trim the *characters* A, u, t and o from both ends and
# leave the separating space behind ("Personal Auto" -> "Personal "), producing
# labels with trailing whitespace. A literal (non-regex) replace yields clean
# labels and is safe to re-run.
categorical_df["policy_type"] = categorical_df["policy_type"].str.replace(
    " Auto", "", regex=False
)
categorical_df["policy_type"].value_counts()

Personal      6788
Corporate     1968
Special        378
Name: policy_type, dtype: int64

In [16]:
def last_two(x):
    """Return the final two items of ``x`` (all of ``x`` if it is shorter)."""
    return x[-2:]

# Shorten every policy label with last_two, then show the resulting frequencies.
categorical_df["policy"] = categorical_df["policy"].map(last_two)
categorical_df["policy"].value_counts()

L3    4588
L2    2881
L1    1665
Name: policy, dtype: int64

### 9. Check number of unique values in each column.

Can they be combined in any way to ease encoding?

Comment your thoughts and make those changes.

In [17]:
# Print each column name with the number of distinct values it holds.
for name, column in categorical_df.items():
    print(name, "-", column.nunique())

customer - 9134
state - 5
response - 2
coverage - 3
education - 5
effective_to_date - 2
employment_status - 5
gender - 2
location_code - 3
marital_status - 3
policy_type - 3
policy - 3
renew_offer_type - 4
sales_channel - 4
vehicle_class - 4
vehicle_size - 3


"Master" and "Doctor" can be grouped in a common Postgraduate group.

In [18]:
# Count how many rows carry each distinct education value.
categorical_df["education"].value_counts()

Bachelor                2748
College                 2681
High School or Below    2622
Master                   741
Doctor                   342
Name: education, dtype: int64

In [19]:
# Relabel "Master" and "Doctor" as "Postgraduate"; every other value is kept.
postgraduate_levels = ["Master", "Doctor"]
is_postgraduate = categorical_df["education"].isin(postgraduate_levels)
categorical_df["education"] = np.where(
    is_postgraduate, "Postgraduate", categorical_df["education"]
)
categorical_df["education"].value_counts()

Bachelor                2748
College                 2681
High School or Below    2622
Postgraduate            1083
Name: education, dtype: int64

"Medical Leave", "Disabled", and "Retired" can also be grouped into an "Excempt" group.

In [20]:
# Count how many rows carry each distinct employment_status value.
categorical_df["employment_status"].value_counts()

Employed         5698
Unemployed       2317
Medical Leave     432
Disabled          405
Retired           282
Name: employment_status, dtype: int64

In [21]:
# Relabel the three statuses listed below as "Excempt"; every other value is kept.
exempt_statuses = ["Medical Leave", "Disabled", "Retired"]
is_exempt = categorical_df["employment_status"].isin(exempt_statuses)
categorical_df["employment_status"] = np.where(
    is_exempt, "Excempt", categorical_df["employment_status"]
)
categorical_df["employment_status"].value_counts()

Employed      5698
Unemployed    2317
Excempt       1119
Name: employment_status, dtype: int64

In [22]:
# Print the number of distinct values per column again to see the effect of the grouping.
for name, column in categorical_df.items():
    print(name, "-", column.nunique())

customer - 9134
state - 5
response - 2
coverage - 3
education - 4
effective_to_date - 2
employment_status - 3
gender - 2
location_code - 3
marital_status - 3
policy_type - 3
policy - 3
renew_offer_type - 4
sales_channel - 4
vehicle_class - 4
vehicle_size - 3
