In [356]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler

In [357]:
# Load the original dataset
df = pd.read_csv('../data/customer.csv')
print(df.shape)
print(df.columns)

(72458, 15)
Index(['Unnamed: 0', 'custid', 'sex', 'is_employed', 'income',
       'marital_status', 'health_ins', 'housing_type', 'num_vehicles', 'age',
       'state_of_res', 'code_column', 'gas_usage', 'rooms', 'recent_move_b'],
      dtype='object')


In [358]:
# Check the number of missing values in 'is_employed'
df['is_employed'].value_counts(dropna=False)

is_employed
True     44630
NaN      25515
False     2313
Name: count, dtype: int64

In [359]:
# make is_emplyed categorical with 0 and 1, and replace NaN with 2
# 0 is for unemployed, 1 is for employed, 2 is for people who are not in workforce
df['is_employed'] = df['is_employed'].fillna(False)
df['is_employed'].value_counts()

  df['is_employed'] = df['is_employed'].fillna(False)


is_employed
True     44630
False    27828
Name: count, dtype: int64

In [360]:
# Maximum number of code_column associated with a state_of_res
max(df.groupby('state_of_res')['code_column'].nunique())

1

In [361]:
# Since each state has 1 code, we can drop the 'code_column' feature. We can also drop 'Unnamed: 0' feature
df.drop(['Unnamed: 0','custid','code_column','recent_move_b'], axis=1, inplace=True)
print(df.shape)
print(df.columns)

(72458, 11)
Index(['sex', 'is_employed', 'income', 'marital_status', 'health_ins',
       'housing_type', 'num_vehicles', 'age', 'state_of_res', 'gas_usage',
       'rooms'],
      dtype='object')


In [362]:
df.isnull().sum()

sex                  0
is_employed          0
income               0
marital_status       0
health_ins           0
housing_type      1686
num_vehicles      1686
age                  0
state_of_res         0
gas_usage         1686
rooms                0
dtype: int64

In [363]:
num = df[df.isnull().any(axis=1)].shape[0]
print(print(f'{num} rows have missing values. \nApprox. {num/df.shape[0]*100:.2f}% of the orignal dataset.'))

1686 rows have missing values. 
Approx. 2.33% of the orignal dataset.
None


In [364]:
# Missing values are all in the same rows. We can drop them
df.dropna(inplace=True)
df.shape

(70772, 11)

In [365]:
df[df.age < 21].age.value_counts()

age
0    76
Name: count, dtype: int64

In [366]:
df[df.age < 21]

Unnamed: 0,sex,is_employed,income,marital_status,health_ins,housing_type,num_vehicles,age,state_of_res,gas_usage,rooms
594,Male,True,50000.0,Never married,False,Rented,1.0,0,Alabama,3.0,3
1260,Male,False,0.0,Married,True,Rented,0.0,0,Arizona,3.0,4
1658,Female,True,24700.0,Never married,True,Rented,3.0,0,Arizona,3.0,5
2340,Female,True,2400.0,Divorced/Separated,True,Rented,0.0,0,Arizona,3.0,4
2859,Female,False,9700.0,Married,True,Homeowner free and clear,3.0,0,Arkansas,3.0,2
...,...,...,...,...,...,...,...,...,...,...,...
67967,Female,False,5000.0,Widowed,True,Homeowner with mortgage/loan,0.0,0,Virginia,3.0,2
68681,Female,True,80000.0,Married,True,Homeowner with mortgage/loan,2.0,0,Virginia,90.0,3
69200,Male,False,0.0,Never married,True,Rented,2.0,0,Washington,3.0,6
70015,Male,True,75000.0,Divorced/Separated,True,Homeowner free and clear,2.0,0,Washington,3.0,4


In [367]:
# For variable 'age', truncate values to 21-99. Values outside this range will be replaced to th closest endpoint.
print(f'Max age: {df['age'].max()} | Min age: {df['age'].min()}')
df['age'] = df['age'].clip(lower=21, upper=99)
print(f'Max age: {df['age'].max()} | Min age: {df['age'].min()}')
print(df.shape)

Max age: 120 | Min age: 0
Max age: 99 | Min age: 21
(70772, 11)


In [368]:
# scale age feature to 0-1 and round to 2 decimal places (people with similar ages will be grouped together)
scaler = MinMaxScaler()
df['age'] = scaler.fit_transform(df[['age']]).round(2)
df['age'].describe()

count    70772.000000
mean         0.361520
std          0.227517
min          0.000000
25%          0.170000
50%          0.350000
75%          0.530000
max          1.000000
Name: age, dtype: float64

In [369]:
# 'income' and 'gas_usage' will be scaled to a normal distribution
scaler = StandardScaler()
df[['income', 'gas_usage']] = scaler.fit_transform(df[['income', 'gas_usage']]).round(2)
df[['income', 'gas_usage']].describe()

Unnamed: 0,income,gas_usage
count,70772.0,70772.0
mean,-0.00011,-0.001815
std,1.00022,1.000975
min,-0.84,-0.64
25%,-0.53,-0.61
50%,-0.26,-0.49
75%,0.16,0.3
max,20.71,8.37


In [370]:
df.head(10)

Unnamed: 0,sex,is_employed,income,marital_status,health_ins,housing_type,num_vehicles,age,state_of_res,gas_usage,rooms
0,Male,True,-0.35,Never married,True,Homeowner free and clear,0.0,0.04,Alabama,2.67,3
1,Female,False,-0.33,Divorced/Separated,True,Rented,0.0,0.78,Alabama,-0.61,6
2,Female,True,-0.37,Never married,True,Homeowner with mortgage/loan,2.0,0.13,Alabama,-0.02,3
3,Female,False,-0.08,Widowed,True,Homeowner free and clear,1.0,0.92,Alabama,1.25,2
4,Male,True,-0.06,Divorced/Separated,True,Rented,2.0,0.59,Alabama,-0.61,2
5,Male,False,-0.54,Married,True,Homeowner free and clear,2.0,0.71,Alabama,2.51,6
6,Female,True,-0.29,Married,False,Rented,2.0,0.06,Alabama,-0.61,3
7,Female,False,-0.14,Married,True,Homeowner free and clear,2.0,0.67,Alabama,0.14,5
8,Female,True,-0.3,Never married,True,Homeowner free and clear,5.0,0.08,Alabama,-0.61,4
9,Male,True,-0.19,Married,True,Homeowner with mortgage/loan,3.0,0.42,Alabama,-0.34,6


- sex - categorical nominal (binary)
- is_emplyed - categorical nominal (binary)
- income - numerical
- marital_status - categorical nominal (multiclass)
- health_ins - categorical nominal (binary)
- housing_type - categorical nominal (multiclass)
- num_vehicles - numerical
- age - numerical
- state_of_residence - categorical nominal (multiclass)
- gas_usage - numerical
- rooms - numerical

In [371]:
df.health_ins.value_counts(dropna=False)

health_ins
True     64339
False     6433
Name: count, dtype: int64

In [372]:
cols_label_encode = ['sex','is_employed','health_ins', 'state_of_res']
cols_one_hot_encode = ['marital_status', 'housing_type']

label_encoder = LabelEncoder()
for col in cols_label_encode:
    df[col] = label_encoder.fit_transform(df[col])

In [373]:
df = pd.get_dummies(df, columns=cols_one_hot_encode)
df.head()

Unnamed: 0,sex,is_employed,income,health_ins,num_vehicles,age,state_of_res,gas_usage,rooms,marital_status_Divorced/Separated,marital_status_Married,marital_status_Never married,marital_status_Widowed,housing_type_Homeowner free and clear,housing_type_Homeowner with mortgage/loan,housing_type_Occupied with no rent,housing_type_Rented
0,1,1,-0.35,1,0.0,0.04,0,2.67,3,False,False,True,False,True,False,False,False
1,0,0,-0.33,1,0.0,0.78,0,-0.61,6,True,False,False,False,False,False,False,True
2,0,1,-0.37,1,2.0,0.13,0,-0.02,3,False,False,True,False,False,True,False,False
3,0,0,-0.08,1,1.0,0.92,0,1.25,2,False,False,False,True,True,False,False,False
4,1,1,-0.06,1,2.0,0.59,0,-0.61,2,True,False,False,False,False,False,False,True


In [374]:
encoded_columns = list(filter(lambda x: x.startswith(tuple(cols_one_hot_encode)), df.columns))
encoded_columns

['marital_status_Divorced/Separated',
 'marital_status_Married',
 'marital_status_Never married',
 'marital_status_Widowed',
 'housing_type_Homeowner free and clear',
 'housing_type_Homeowner with mortgage/loan',
 'housing_type_Occupied with no rent',
 'housing_type_Rented']

In [375]:
for col in encoded_columns:
    df[col] = label_encoder.fit_transform(df[col])

df.head()

Unnamed: 0,sex,is_employed,income,health_ins,num_vehicles,age,state_of_res,gas_usage,rooms,marital_status_Divorced/Separated,marital_status_Married,marital_status_Never married,marital_status_Widowed,housing_type_Homeowner free and clear,housing_type_Homeowner with mortgage/loan,housing_type_Occupied with no rent,housing_type_Rented
0,1,1,-0.35,1,0.0,0.04,0,2.67,3,0,0,1,0,1,0,0,0
1,0,0,-0.33,1,0.0,0.78,0,-0.61,6,1,0,0,0,0,0,0,1
2,0,1,-0.37,1,2.0,0.13,0,-0.02,3,0,0,1,0,0,1,0,0
3,0,0,-0.08,1,1.0,0.92,0,1.25,2,0,0,0,1,1,0,0,0
4,1,1,-0.06,1,2.0,0.59,0,-0.61,2,1,0,0,0,0,0,0,1


In [376]:
# save the cleaned data to a new csv file
df.to_csv('../data/customer_cleaned.csv', index=False)