# Preprocessing of testing data

This section will be dedicated to applying the same preprocessing steps to the testing data as we did for the training data. The objective is to obtain a testing dataset that is ready to be used for prediction, made by models trained on the training data.

In [51]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

In [52]:
# Load the original dataset
df = pd.read_csv('../data/customer_test_masked.csv')
print(df.shape)
print(df.columns)

(804, 15)
Index(['Unnamed: 0', 'custid', 'sex', 'is_employed', 'income',
       'marital_status', 'health_ins', 'housing_type', 'num_vehicles', 'age',
       'state_of_res', 'code_column', 'gas_usage', 'rooms', 'recent_move_b'],
      dtype='object')


In [53]:
# people with missing values in 'is_employed' will be considered as unemployed
df['is_employed'] = df['is_employed'].fillna(False)
df['is_employed'].value_counts()

  df['is_employed'] = df['is_employed'].fillna(False)


is_employed
True     507
False    297
Name: count, dtype: int64

In [54]:
# We can also drop the target column
df.drop(['Unnamed: 0','code_column','recent_move_b', 'health_ins'], axis=1, inplace=True)
print(df.shape)
print(df.columns)

(804, 11)
Index(['custid', 'sex', 'is_employed', 'income', 'marital_status',
       'housing_type', 'num_vehicles', 'age', 'state_of_res', 'gas_usage',
       'rooms'],
      dtype='object')


In [55]:
df.isnull().sum()

custid             0
sex                0
is_employed        0
income             0
marital_status     0
housing_type      34
num_vehicles      34
age                0
state_of_res       0
gas_usage         34
rooms              0
dtype: int64

In [56]:
df['housing_type'].fillna(df['housing_type'].mode()[0], inplace=True)
df['num_vehicles'].fillna(df['num_vehicles'].median(), inplace=True)
df['gas_usage'].fillna(df['gas_usage'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['housing_type'].fillna(df['housing_type'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['num_vehicles'].fillna(df['num_vehicles'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the inter

In [57]:
# For variable 'age', we will truncate values to 21-99. 
# Values outside this range will be replaced to th closest endpoint.
print(f'Max age: {df['age'].max()} | Min age: {df['age'].min()}')
df['age'] = df['age'].clip(lower=21, upper=99)
print(f'Max age: {df['age'].max()} | Min age: {df['age'].min()}')
print(df.shape)

Max age: 114 | Min age: 21
Max age: 99 | Min age: 21
(804, 11)


In [58]:
# Columns to be scaled to min-max range: 'age', 'num_vehicles', 'rooms'
min_max_columns = ['age', 'num_vehicles', 'rooms']
scaler = MinMaxScaler()
# Round the scaled values to 2 decimal places, to group similar values
df[min_max_columns] = scaler.fit_transform(df[min_max_columns]).round(2)
df[min_max_columns].describe()

Unnamed: 0,age,num_vehicles,rooms
count,804.0,804.0,804.0
mean,0.309428,0.324005,0.5
std,0.215016,0.190028,0.345258
min,0.0,0.0,0.0
25%,0.13,0.17,0.2
50%,0.28,0.33,0.4
75%,0.46,0.33,0.8
max,1.0,1.0,1.0


In [59]:
# Columns to be scaled to standard normal distribution: 'income', 'gas_usage'
standard_columns = ['income', 'gas_usage']
scaler = StandardScaler()
df[standard_columns] = scaler.fit_transform(df[['income', 'gas_usage']]).round(2)
df[standard_columns].describe()

Unnamed: 0,income,gas_usage
count,804.0,804.0
mean,-0.000609,-0.001567
std,1.001039,1.001372
min,-0.79,-0.63
25%,-0.63,-0.6
50%,-0.265,-0.6
75%,0.22,0.3
max,9.44,8.11


In [60]:
df.head(10)

Unnamed: 0,custid,sex,is_employed,income,marital_status,housing_type,num_vehicles,age,state_of_res,gas_usage,rooms
0,001115999_01,Male,False,-0.06,Married,Homeowner free and clear,0.17,0.78,Arkansas,-0.27,1.0
1,000566299_01,Male,True,0.22,Never married,Rented,0.17,0.24,New Mexico,0.11,0.8
2,001397329_01,Female,True,4.33,Married,Homeowner with mortgage/loan,0.5,0.42,Colorado,0.87,0.2
3,000843100_01,Female,False,-0.79,Married,Homeowner free and clear,0.17,0.55,California,-0.08,0.2
4,000260071_03,Male,True,0.22,Married,Homeowner with mortgage/loan,0.67,0.18,New Jersey,2.2,0.0
5,000254582_02,Male,True,2.24,Married,Homeowner with mortgage/loan,0.33,0.13,California,-0.61,0.2
6,000916197_03,Female,False,-0.79,Divorced/Separated,Homeowner with mortgage/loan,0.33,0.06,Kentucky,0.49,0.2
7,000295424_04,Female,True,-0.53,Married,Homeowner with mortgage/loan,0.33,0.51,Connecticut,-0.6,0.4
8,001284781_02,Female,False,-0.57,Married,Homeowner free and clear,0.67,0.64,North Carolina,-0.46,0.8
9,001021897_02,Male,True,0.47,Never married,Rented,0.33,0.13,Louisiana,-0.6,1.0


- sex - categorical nominal (binary)
- is_emplyed - categorical nominal (binary)
- income - numerical
- marital_status - categorical nominal (multiclass)
- health_ins - categorical nominal (binary)
- housing_type - categorical nominal (multiclass)
- num_vehicles - numerical
- age - numerical
- state_of_residence - categorical nominal (multiclass)
- gas_usage - numerical
- rooms - numerical

In [61]:
cols_label_encode = ['sex','is_employed', 'state_of_res']
cols_one_hot_encode = ['marital_status', 'housing_type']

In [62]:
label_encoder = LabelEncoder()
for col in cols_label_encode:
    df[col] = label_encoder.fit_transform(df[col])

In [63]:
df = pd.get_dummies(df, columns=cols_one_hot_encode)
df.head()

Unnamed: 0,custid,sex,is_employed,income,num_vehicles,age,state_of_res,gas_usage,rooms,marital_status_Divorced/Separated,marital_status_Married,marital_status_Never married,marital_status_Widowed,housing_type_Homeowner free and clear,housing_type_Homeowner with mortgage/loan,housing_type_Occupied with no rent,housing_type_Rented
0,001115999_01,1,0,-0.06,0.17,0.78,3,-0.27,1.0,False,True,False,False,True,False,False,False
1,000566299_01,1,1,0.22,0.17,0.24,30,0.11,0.8,False,False,True,False,False,False,False,True
2,001397329_01,0,1,4.33,0.5,0.42,5,0.87,0.2,False,True,False,False,False,True,False,False
3,000843100_01,0,0,-0.79,0.17,0.55,4,-0.08,0.2,False,True,False,False,True,False,False,False
4,000260071_03,1,1,0.22,0.67,0.18,29,2.2,0.0,False,True,False,False,False,True,False,False


In [64]:
dummies = list(filter(lambda x: x.startswith(tuple(cols_one_hot_encode)), df.columns))
dummies

['marital_status_Divorced/Separated',
 'marital_status_Married',
 'marital_status_Never married',
 'marital_status_Widowed',
 'housing_type_Homeowner free and clear',
 'housing_type_Homeowner with mortgage/loan',
 'housing_type_Occupied with no rent',
 'housing_type_Rented']

In [65]:
for col in dummies:
    df[col] = label_encoder.fit_transform(df[col])

df.head()

Unnamed: 0,custid,sex,is_employed,income,num_vehicles,age,state_of_res,gas_usage,rooms,marital_status_Divorced/Separated,marital_status_Married,marital_status_Never married,marital_status_Widowed,housing_type_Homeowner free and clear,housing_type_Homeowner with mortgage/loan,housing_type_Occupied with no rent,housing_type_Rented
0,001115999_01,1,0,-0.06,0.17,0.78,3,-0.27,1.0,0,1,0,0,1,0,0,0
1,000566299_01,1,1,0.22,0.17,0.24,30,0.11,0.8,0,0,1,0,0,0,0,1
2,001397329_01,0,1,4.33,0.5,0.42,5,0.87,0.2,0,1,0,0,0,1,0,0
3,000843100_01,0,0,-0.79,0.17,0.55,4,-0.08,0.2,0,1,0,0,1,0,0,0
4,000260071_03,1,1,0.22,0.67,0.18,29,2.2,0.0,0,1,0,0,0,1,0,0


In [66]:
df['state_of_res'] = MinMaxScaler().fit_transform(df[['state_of_res']])
df['state_of_res'].describe()

count    804.000000
mean       0.483805
std        0.307417
min        0.000000
25%        0.187500
50%        0.479167
75%        0.770833
max        1.000000
Name: state_of_res, dtype: float64

In [67]:
# save the cleaned data to a new csv file
df.to_csv('../data/to_predict.csv', index=False)

The cleaned dataset is saved in a new file: `to_predict.csv`. It contains the dataset ready to be used for prediction.