In [68]:
import pandas as pd

In [69]:
# Load the original dataset
df = pd.read_csv('../data/customer.csv')
print(df.shape)
print(df.columns)

(72458, 15)
Index(['Unnamed: 0', 'custid', 'sex', 'is_employed', 'income',
       'marital_status', 'health_ins', 'housing_type', 'num_vehicles', 'age',
       'state_of_res', 'code_column', 'gas_usage', 'rooms', 'recent_move_b'],
      dtype='object')


In [70]:
# Check the number of missing values in 'is_employed'
df['is_employed'].value_counts(dropna=False)

is_employed
True     44630
NaN      25515
False     2313
Name: count, dtype: int64

In [71]:
# make is_emplyed categorical with 0 and 1, and replace NaN with 2
# 0 is for unemployed, 1 is for employed, 2 is for people who are not in workforce
df['is_employed'] = df['is_employed'].map({True: 1, False: 0}).fillna(2)
df['is_employed'].value_counts()

is_employed
1.0    44630
2.0    25515
0.0     2313
Name: count, dtype: int64

In [72]:
df.groupby('state_of_res')['code_column'].nunique()

state_of_res
Alabama                 1
Alaska                  1
Arizona                 1
Arkansas                1
California              1
Colorado                1
Connecticut             1
Delaware                1
District of Columbia    1
Florida                 1
Georgia                 1
Hawaii                  1
Idaho                   1
Illinois                1
Indiana                 1
Iowa                    1
Kansas                  1
Kentucky                1
Louisiana               1
Maine                   1
Maryland                1
Massachusetts           1
Michigan                1
Minnesota               1
Mississippi             1
Missouri                1
Montana                 1
Nebraska                1
Nevada                  1
New Hampshire           1
New Jersey              1
New Mexico              1
New York                1
North Carolina          1
North Dakota            1
Ohio                    1
Oklahoma                1
Oregon                  1

In [73]:
# Since each state has 1 code, we can drop the 'code_column' feature. We can also drop 'Unnamed: 0' feature
df.drop(['Unnamed: 0', 'code_column'], axis=1, inplace=True)
print(df.shape)
print(df.columns)

(72458, 13)
Index(['custid', 'sex', 'is_employed', 'income', 'marital_status',
       'health_ins', 'housing_type', 'num_vehicles', 'age', 'state_of_res',
       'gas_usage', 'rooms', 'recent_move_b'],
      dtype='object')


In [74]:
df.isnull().sum()

custid               0
sex                  0
is_employed          0
income               0
marital_status       0
health_ins           0
housing_type      1686
num_vehicles      1686
age                  0
state_of_res         0
gas_usage         1686
rooms                0
recent_move_b     1687
dtype: int64

In [75]:
num = df[df.isnull().any(axis=1)].shape[0]
print(print(f'{num} rows have missing values. \nApprox. {num/df.shape[0]*100:.2f}% of the orignal dataset.'))

1687 rows have missing values. 
Approx. 2.33% of the orignal dataset.
None


In [76]:
# Missing values are all in the same rows. We can drop them
df.dropna(inplace=True)
df.shape

(70771, 13)

In [77]:
# Filter out the outliers

columns_with_outliers = ['age', 'income', 'gas_usage']

for col in columns_with_outliers:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    print(f'{col}: {df[(df[col] < lower_bound) | (df[col] > upper_bound)].shape[0]} outliers')
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]


df.shape

age: 187 outliers
income: 4507 outliers
gas_usage: 5183 outliers


(60894, 13)

In [78]:
# save the cleaned data to a new csv file
df.to_csv('../data/customer_cleaned.csv', index=False)