## In this notebook we will be creating functions to run the preprocessing in the validation/Testing set and to execute the model on the processed data

1. Create a function to preprocess the data
2. Create a function to run the model on the data

In [83]:
import pandas as pd
import os
import pickle
import numpy as np

In [84]:
#### Load the required files

with open('column_encoder.pickle', 'rb') as f:
  encoder = pickle.load(f)

with open('column_unique_values.pickle', 'rb') as f:
  uniq_values = pickle.load(f)

with open('columns_to_remove.pickle', 'rb') as f:
  cols_to_remove = pickle.load(f)

with open('missing_value_replacement.pickle', 'rb') as f:
  missing_replacement = pickle.load(f)

with open('column_sequence.pickle', 'rb') as f:
  column_seq = pickle.load(f)

In [85]:
### Initialize the required predefined functions

def children_numeric_converter(children_data, unique_values):
  children_data[~(children_data.isin(unique_values))] = "Zero" # This replaces every other value to Zero
  children_data[children_data=='Zero'] = 0  # Converts Zero to numeric 0
  children_data[children_data=='4+'] = 4    # Converts 4+ to 4
  children_data = pd.to_numeric(children_data) # Converts children_data to numeric
  return children_data

def ageband_numeric_converter(ageband_data, uniq_values):

  def __split(x):
    splits = x.split('-')
    left_val = splits[0]
    right_val = splits[1]
    mean = (float(left_val) + float(right_val))/2
    return mean

  ageband_data[~(ageband_data.isin(uniq_values))] = '45-50'
  ageband_data[ageband_data=='71+'] = '71-75'
  ageband_data[ageband_data=='Unknown'] = '45-50'
  mean_vals = ageband_data.apply(__split)
  return mean_vals

def gender_numeric_converter(gender_data, uniq_values):
  gender_data[~(gender_data.isin(uniq_values))] = 'Female'
  gender_data[gender_data=='Unknown'] = 'Female'
  gender_data[gender_data=='Male'] = 0
  gender_data[gender_data=='Female'] = 1
  gender_data = pd.to_numeric(gender_data)
  return gender_data

In [86]:
### Preprocessing function

def preprocess_val_test(data):
  ### Drop columns ####
  for column in cols_to_remove:
    data.drop([column], axis=1, inplace=True)

  children_uniq = uniq_values['children']
  children_numeric = children_numeric_converter(data['children'], children_uniq)
  data.drop(['children'], axis=1, inplace=True)
  data['children'] = children_numeric

  ageband_uniq = uniq_values['age_band']
  ageband_numeric = ageband_numeric_converter(data['age_band'], ageband_uniq)
  data.drop(['age_band'], axis=1, inplace=True)
  data['age_band'] = ageband_numeric

  gender_uniq = uniq_values['gender']
  gender_numeric = gender_numeric_converter(data['gender'], gender_uniq)
  data.drop(['gender'], axis=1, inplace=True)
  data['gender'] = gender_numeric

  ### Missing value Imputation ######
  missing_value_columns = list(data.isnull().sum()[data.isnull().sum()>0].index)

  for column in missing_value_columns:
    replacement_values = missing_replacement[column]
    if len(replacement_values) != 1:
      replace_with = np.random.choice(replacement_values, data[column].isnull().sum())
    else:
      replace_with = replacement_values[0]
    data[data[column].isnull(), column] = replace_with


  #### One hot encoding ######
  one_hot_converted_columns = ['status', 'occupation', 'occupation_partner', 'home_status', 'self_employed', 'self_employed_partner', 'TVarea', 'region', 'family_income']
  encoded_data = pd.DataFrame(encoder.transform(data[one_hot_converted_columns]).toarray())
  encoded_data.columns = encoder.get_feature_names_out()
  data.drop(one_hot_converted_columns, axis=1, inplace=True)
  data = pd.concat([data, encoded_data], axis=1)

  ### Ensure that the column ordering and the columns are same #####
  data = data[column_seq]

  return data

In [87]:
##### Load the validation set
##### Pass the validation set to the preprocess_test_val_funct

features_val = pd.read_csv('features_val.csv')

features_val = preprocess_val_test(features_val)

##### Load the model here ######
#### Predict from the model ######


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  children_data[~(children_data.isin(unique_values))] = "Zero" # This replaces every other value to Zero
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  children_data[children_data=='Zero'] = 0  # Converts Zero to numeric 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  children_data[children_data=='4+'] = 4    # Converts 4+ to 4
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-doc

In [89]:
features_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2031 entries, 0 to 2030
Data columns (total 90 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   age_band                              2031 non-null   float64
 1   Average Credit Card Transaction       2031 non-null   float64
 2   Balance Transfer                      2031 non-null   float64
 3   Term Deposit                          2031 non-null   float64
 4   Life Insurance                        2031 non-null   float64
 5   Medical Insurance                     2031 non-null   float64
 6   Average A/C Balance                   2031 non-null   float64
 7   Personal Loan                         2031 non-null   float64
 8   Investment in Mutual Fund             2031 non-null   float64
 9   Investment Tax Saving Bond            2031 non-null   float64
 10  Home Loan                             2031 non-null   float64
 11  Online Purchase A