In [234]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.impute import KNNImputer
from imblearn.over_sampling import ADASYN
from sklearn.ensemble import RandomForestClassifier

In [235]:
def preprocess(data, isTest = False):
    """Clean, encode, scale and impute a raw customer frame.

    Steps, in order: drop identifier columns, blank out implausible ages,
    min-max scale ``age``, log-transform ``income``/``gas_usage``, label-encode
    the categorical columns, one-hot encode ``marital_status``,
    ``housing_type`` and ``is_employed``, scale ``state_of_res``, encode
    ``recent_move_b`` while keeping its missing values, and finally fill the
    remaining gaps with a KNN imputer.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing at least the columns referenced below
        (``Unnamed: 0``, ``custid``, ``code_column``, ``health_ins``, ``age``,
        ``income``, ``gas_usage``, ``sex``, ``is_employed``, ``state_of_res``,
        ``marital_status``, ``housing_type``, ``recent_move_b``).
        The caller's frame is not modified.
    isTest : bool, default False
        When True the ``health_ins`` column is dropped instead of being
        label-encoded and returned.

    Returns
    -------
    pandas.DataFrame
        All-numeric (float) frame with a fresh ``RangeIndex`` and no missing
        values in the imputed feature columns.

    Notes
    -----
    Every scaler/encoder is fit on the frame passed in, so two calls (e.g.
    train and test) are scaled and encoded independently of each other.
    """
    # drop identifier columns (and the target column for the test file)
    cols_to_drop = ['Unnamed: 0', 'custid', 'code_column']
    if isTest:
        cols_to_drop.append('health_ins')
    data = data.drop(cols_to_drop, axis=1)

    # ages outside [21, 99] are treated as missing (already-missing ages stay NaN)
    data['age'] = data['age'].where(data['age'].between(21, 99))

    # min max scaling (MinMaxScaler ignores NaN when fitting and keeps it on transform)
    min_max_columns = ['age']
    data[min_max_columns] = MinMaxScaler().fit_transform(data[min_max_columns]).round(4) * 10

    # log normalization, shifted so the smallest value of each column maps to log1p(1)
    log_columns = ['income', 'gas_usage']
    data[log_columns] = data[log_columns].apply(lambda x: np.log1p(x - x.min() + 1)).round(4)

    # label encoding; is_employed is encoded first so that its missing values
    # become their own category before one-hot encoding below
    cols_label_encode = ['sex', 'is_employed', 'state_of_res']
    if not isTest:
        cols_label_encode.append('health_ins')
    for col in cols_label_encode:
        data[col] = LabelEncoder().fit_transform(data[col])

    # one hot encoding
    cols_one_hot_encode = ['marital_status', 'housing_type', 'is_employed']
    data = pd.get_dummies(data, columns=cols_one_hot_encode, dummy_na=False)
    dummies = [c for c in data.columns if c.startswith(tuple(cols_one_hot_encode))]
    # Cast instead of label-encoding: a LabelEncoder would map a constant
    # all-True indicator column to 0.
    data[dummies] = data[dummies].astype(int)

    # scale the state_of_res column
    data['state_of_res'] = MinMaxScaler().fit_transform(data[['state_of_res']])

    # Encode recent_move_b (sorted classes -> 0, 1, ...) but leave missing
    # values as NaN. factorize marks missing entries with -1, so no assumption
    # about which integer code NaN would have received is needed.
    move_codes, _ = pd.factorize(data['recent_move_b'], sort=True)
    data['recent_move_b'] = np.where(move_codes == -1, np.nan, move_codes)

    # KNN imputation on the feature columns only: the target must not take
    # part in the neighbour distances (it is unavailable for the test file).
    target_col = 'health_ins'
    feature_cols = [c for c in data.columns if c != target_col]
    knn_imputer = KNNImputer(n_neighbors=5, weights='distance')
    df_imputed = pd.DataFrame(knn_imputer.fit_transform(data[feature_cols]), columns=feature_cols)
    if target_col in data.columns:
        # put the target back at its original position, as float like the rest
        df_imputed.insert(
            data.columns.get_loc(target_col),
            target_col,
            data[target_col].to_numpy(dtype=float),
        )
    return df_imputed

In [236]:
# Move into the data directory only if we are not already there, so that
# re-running this cell does not climb one more level up on each execution.
if os.path.basename(os.getcwd()) != 'data':
    os.chdir('../data')

In [237]:
# Read the training file and hand it straight to the preprocessing function.
df = pd.read_csv('customer.csv').pipe(preprocess)
df.head()

Unnamed: 0,sex,income,health_ins,num_vehicles,age,state_of_res,gas_usage,rooms,recent_move_b,marital_status_Divorced/Separated,marital_status_Married,marital_status_Never married,marital_status_Widowed,housing_type_Homeowner free and clear,housing_type_Homeowner with mortgage/loan,housing_type_Occupied with no rent,housing_type_Rented,is_employed_0,is_employed_1,is_employed_2
0,1,10.2717,1,0.0,0.4,0.0,5.3519,3,0,0,0,1,0,1,0,0,0,0,1,0
1,0,10.3123,1,0.0,8.133,0.0,1.3863,6,1,1,0,0,0,0,0,0,1,0,0,1
2,0,10.2365,1,2.0,1.333,0.0,3.7136,3,0,0,0,1,0,0,1,0,0,0,1,0
3,0,10.7071,1,1.0,9.6,0.0,4.7958,2,0,0,0,0,1,1,0,0,0,0,0,1
4,1,10.7343,1,2.0,6.133,0.0,1.3863,2,0,1,0,0,0,0,0,0,1,0,1,0


In [238]:
# Per-column count of missing values remaining after preprocessing.
df.isna().sum()

sex                                          0
income                                       0
health_ins                                   0
num_vehicles                                 0
age                                          0
state_of_res                                 0
gas_usage                                    0
rooms                                        0
recent_move_b                                0
marital_status_Divorced/Separated            0
marital_status_Married                       0
marital_status_Never married                 0
marital_status_Widowed                       0
housing_type_Homeowner free and clear        0
housing_type_Homeowner with mortgage/loan    0
housing_type_Occupied with no rent           0
housing_type_Rented                          0
is_employed_0                                0
is_employed_1                                0
is_employed_2                                0
dtype: int64

In [239]:
# Split features/target, then oversample the minority class.
# A fixed random_state makes the synthetic samples reproducible across runs.
X, y = df.drop('health_ins', axis=1), df['health_ins']
X, y = ADASYN(random_state=42).fit_resample(X, y)
X.shape, y.shape

((128805, 19), (128805,))

In [240]:
# Seed the forest so that repeated runs give the same trees and predictions.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

In [241]:
# Read the file to be scored and preview its first rows.
df_to_predict = pd.read_csv(filepath_or_buffer='customer_test_masked.csv')
df_to_predict.head(n=5)

Unnamed: 0.1,Unnamed: 0,custid,sex,is_employed,income,marital_status,health_ins,housing_type,num_vehicles,age,state_of_res,code_column,gas_usage,rooms,recent_move_b
0,4523,001115999_01,Male,,28900.0,Married,,Homeowner free and clear,1.0,82,Arkansas,653,20.0,6,F
1,58780,000566299_01,Male,True,40000.0,Never married,,Rented,1.0,40,New Mexico,404,40.0,5,T
2,18628,001397329_01,Female,True,203000.0,Married,,Homeowner with mortgage/loan,3.0,54,Colorado,1291,80.0,2,F
3,11525,000843100_01,Female,,0.0,Married,,Homeowner free and clear,1.0,64,California,8962,30.0,2,F
4,56266,000260071_03,Male,True,40000.0,Married,,Homeowner with mortgage/loan,4.0,35,New Jersey,2059,150.0,1,F


In [242]:
# preprocess() drops 'custid', and this cell overwrites df_to_predict, so a
# second run used to fail with KeyError. Only process the frame while it is
# still raw; on a re-run keep the ids and features captured the first time.
if 'custid' in df_to_predict.columns:
    ids = df_to_predict['custid']
    df_to_predict = preprocess(df_to_predict, isTest=True)
df_to_predict.head()

Unnamed: 0,sex,income,num_vehicles,age,state_of_res,gas_usage,rooms,recent_move_b,marital_status_Divorced/Separated,marital_status_Married,marital_status_Never married,marital_status_Widowed,housing_type_Homeowner free and clear,housing_type_Homeowner with mortgage/loan,housing_type_Occupied with no rent,housing_type_Rented,is_employed_0,is_employed_1,is_employed_2
0,1.0,10.2717,1.0,8.243,0.0625,3.0445,6.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,10.5967,1.0,2.568,0.625,3.7136,5.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,12.221,3.0,4.459,0.104167,4.3944,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.6931,1.0,5.811,0.083333,3.434,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,10.5967,4.0,1.892,0.604167,5.0173,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [243]:
# Train and test dummies are produced by separate get_dummies calls, so line
# the test features up with the training columns (same names, same order).
# A category missing from the test file becomes an all-zero indicator column.
features_to_score = df_to_predict.reindex(columns=X.columns, fill_value=0)
predictions = model.predict(features_to_score)
predictions = ['TRUE' if p == 1 else 'FALSE' for p in predictions]
output = pd.DataFrame({'custid': ids, 'health_ins': predictions})
output

Unnamed: 0,custid,health_ins
0,001115999_01,TRUE
1,000566299_01,FALSE
2,001397329_01,TRUE
3,000843100_01,TRUE
4,000260071_03,TRUE
...,...,...
799,000961606_04,FALSE
800,001063357_01,TRUE
801,000613999_02,TRUE
802,000087463_03,TRUE


In [244]:
# `output` is already a DataFrame; write it out without the index column.
output.to_csv('predictions.csv', index=False)