<H1 align="center">Data Preprocessing</H1>

<br>
<strong>This notebook performs data preprocessing on the training and testing datasets and saves the preprocessed data to disk.</strong>

## Import Libraries

### Python Standard Library

In [1]:
import warnings

### External Libraries

In [2]:
import pandas as pd

### Custom Libraries

In [3]:
from utils_data import PopulationData

## Set Parameters

### Data Paths

In [4]:
# Raw CSV inputs, relative to this notebook's directory.
path_train_raw, path_test_raw = (
    '../../data/raw/train.csv',
    '../../data/raw/test.csv',
)

# Destinations for the preprocessed CSV outputs.
path_train_preprocessed, path_test_preprocessed = (
    '../../data/processed/train.csv',
    '../../data/processed/test.csv',
)

## Setup Environment

### Suppress Warnings

In [5]:
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

## Load Data

### Load Training Data

In [6]:
# Create a fresh PopulationData and hand it the raw training path.
data_train = (
    PopulationData()
    .load_raw(path_train_raw)
)

### Load Testing Data

In [7]:
# Create a fresh PopulationData and hand it the raw testing path.
data_test = (
    PopulationData()
    .load_raw(path_test_raw)
)

## Preprocess Data

### Preprocess Training Data

In [8]:
# Runs the 99 `encode_*` steps on the training data, in a fixed order.
# The step names follow a regular grid, so they are generated instead of being
# written out by hand; the resulting call sequence is unchanged.
_node_attributes = ("age", "constitution", "behavior")
_graph_metrics = ("degree", "degree_centrality", "clustering_coefficient")
_scalings = ("normalized", "standardized")
_prefixes = ("", "normalized_", "standardized_")
_aggregates = ("sum", "mean")
_index_distance = "distance_to_index_patient"

_encoders = []

# Scaled per-node attributes.
for _scaling in _scalings:
    _encoders += [f"{_scaling}_{_attr}" for _attr in _node_attributes]

# Connection lists and graph, then the raw and scaled graph metrics.
_encoders += ["connection_lists", "graph_nx"]
_encoders += list(_graph_metrics)
for _scaling in _scalings:
    _encoders += [f"{_scaling}_{_metric}" for _metric in _graph_metrics]

# Index-patient features: raw first, then scaled distance.
_encoders += ["connected_index_patient", _index_distance]
_encoders += [f"{_scaling}_{_index_distance}" for _scaling in _scalings]

# Neighbor- and population-level aggregates of every attribute and metric.
for _scope in ("neighbor", "population"):
    for _aggregate in _aggregates:
        for _prefix in _prefixes:
            _encoders += [
                f"{_prefix}{_aggregate}_{_scope}_{_feature}"
                for _feature in _node_attributes + _graph_metrics
            ]

# Population-level aggregates of the distance to the index patient.
for _aggregate in _aggregates:
    for _prefix in _prefixes:
        _encoders.append(f"{_prefix}{_aggregate}_population_{_index_distance}")

# Dispatch each step by name; keep the last return value so the cell displays
# the same result as a trailing method call would.
_last_result = None
for _encoder in _encoders:
    _last_result = getattr(data_train, f"encode_{_encoder}")()
_last_result

<utils_data.PopulationData.PopulationData at 0x24f8b57a810>

### Preprocess Testing Data

In [9]:
# Runs the 99 `encode_*` steps on the testing data, in a fixed order.
# The step names follow a regular grid, so they are generated instead of being
# written out by hand; the resulting call sequence is unchanged.
_node_attributes = ("age", "constitution", "behavior")
_graph_metrics = ("degree", "degree_centrality", "clustering_coefficient")
_scalings = ("normalized", "standardized")
_prefixes = ("", "normalized_", "standardized_")
_aggregates = ("sum", "mean")
_index_distance = "distance_to_index_patient"

_encoders = []

# Scaled per-node attributes.
for _scaling in _scalings:
    _encoders += [f"{_scaling}_{_attr}" for _attr in _node_attributes]

# Connection lists and graph, then the raw and scaled graph metrics.
_encoders += ["connection_lists", "graph_nx"]
_encoders += list(_graph_metrics)
for _scaling in _scalings:
    _encoders += [f"{_scaling}_{_metric}" for _metric in _graph_metrics]

# Index-patient features: raw first, then scaled distance.
_encoders += ["connected_index_patient", _index_distance]
_encoders += [f"{_scaling}_{_index_distance}" for _scaling in _scalings]

# Neighbor- and population-level aggregates of every attribute and metric.
for _scope in ("neighbor", "population"):
    for _aggregate in _aggregates:
        for _prefix in _prefixes:
            _encoders += [
                f"{_prefix}{_aggregate}_{_scope}_{_feature}"
                for _feature in _node_attributes + _graph_metrics
            ]

# Population-level aggregates of the distance to the index patient.
for _aggregate in _aggregates:
    for _prefix in _prefixes:
        _encoders.append(f"{_prefix}{_aggregate}_population_{_index_distance}")

# Dispatch each step by name; keep the last return value so the cell displays
# the same result as a trailing method call would.
_last_result = None
for _encoder in _encoders:
    _last_result = getattr(data_test, f"encode_{_encoder}")()
_last_result

<utils_data.PopulationData.PopulationData at 0x24fda3fa910>

## Optimize DataFrames

### Optimize Training Data

In [10]:
# NOTE(review): optimize() is defined in utils_data and not visible here;
# presumably it reduces the training frame's memory footprint before saving — confirm.
data_train.optimize()

### Optimize Testing Data

In [11]:
# NOTE(review): optimize() is defined in utils_data and not visible here;
# presumably it reduces the testing frame's memory footprint before saving — confirm.
data_test.optimize()

## Save Data

### Save Training Data

In [12]:
# Save the preprocessed training data to disk at path_train_preprocessed.
data_train.save_processed(path_train_preprocessed)

### Save Testing Data

In [13]:
# Save the preprocessed testing data to disk at path_test_preprocessed.
data_test.save_processed(path_test_preprocessed)