# Preprocessing and Imputing

In [None]:
import pandas as pd
import warnings
warnings.simplefilter(action='ignore')
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import RobustScaler

## Read data

In [None]:
# Numeric feature columns that the scaling and imputation steps operate on.
numerical_columns = [
    "gdp_percap",
    "life_expect",
    "population",
    "birth_rate",
    "neonat_mortal_rate",
]

# Load the raw table; the bare expression at the end displays it in the notebook.
nations_data = pd.read_csv("nations.csv")
nations_data

## Upgrade scales
Higher-order scales encode more information and are more useful to ML algorithms than lower-order scales. It's worth checking whether, e.g., a nominal feature can be converted to an ordinal or ratio scale. This is the case for the income feature:

"For the current 2023 fiscal year, low-income economies are defined as those with a GNI per capita, calculated using the World Bank Atlas method, of 1,085 or less in 2021; lower middle-income economies are those with a GNI per capita between 1,086 and 4,255; upper middle-income economies are those with a GNI per capita between 4,256 and 13,205; high-income economies are those with a GNI per capita of $13,205 or more."

https://datahelpdesk.worldbank.org/knowledgebase/articles/906519-world-bank-country-and-lending-groups

https://data.worldbank.org/indicator/NY.GNP.PCAP.CD?end=2021&locations=XD-XM-XN-XT-OE&most_recent_value_desc=false&start=1962&view=map&year=2021

In [None]:
# Bar chart of the number of rows per income category.
nations_data["income"].value_counts().plot.bar(color="#266662")
plt.show()

In [None]:
def replace_income(data):
    """Convert the nominal ``income`` column to a numeric scale, in place.

    Every known income-group label is swapped for a fixed per-group figure
    (the notebook plots the result as "Average Gross National Income per
    Capita").  Entries that are already numeric are kept unchanged, so the
    function can safely be applied more than once.  Labels without an entry
    in the lookup table become NaN, which keeps the column numeric.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an ``income`` column.  The frame is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, so the function can be chained or wrapped
        in a transformer.
    """
    income_by_group = {
        "Low income": 741,
        "Lower middle income": 2470,
        "Upper middle income": 10358,
        "High income": 48120,
        "High income: OECD": 42360,
    }

    # Unknown entries are passed through here and coerced to NaN below.
    translated = data["income"].map(
        lambda label: income_by_group.get(label, label)
    )

    # Assign the converted column back explicitly.  An in-place edit on the
    # ``data["income"]`` selection can act on a temporary object and be lost,
    # and a bare ``astype`` call returns a new Series without touching data.
    data["income"] = pd.to_numeric(translated, errors="coerce")

    return data

# Replace the income labels by numbers and inspect the resulting distribution.
replace_income(nations_data)
nations_data["income"].plot.hist(color="#266662")
plt.xlabel("Average Gross National Income per Capita")
plt.show()

# Income is numeric from here on.  The guard keeps a re-run of this cell
# from listing the column twice.
if "income" not in numerical_columns:
    numerical_columns.append("income")

## Outlier Detection
Outliers can be detected by characterizing feature distributions or via machine learning algorithms (see e.g. https://scikit-learn.org/stable/modules/outlier_detection.html). Our approach for now will be purely statistical, until we cover unsupervised methods in the lectures.

In [None]:
# Boolean mask selecting the rows recorded for the year 2014.
data_2014 = nations_data["year"] == 2014
# Histogram of the 2014 population values with the counts on a log axis.
nations_data["population"][data_2014].plot.hist(bins=50, color="#266662")
plt.yscale("log")
plt.xlabel("Population")
# Write the figure to disk before displaying it.
plt.savefig("outlier_histo.png", dpi=200)
plt.show()

In [None]:
# Mean and standard deviation of the 2014 population distribution.
# Task: write a function that returns the data points of a series lying
# more than 5 sigma away, and check which countries fall outside.
mean = nations_data.loc[data_2014, "population"].mean()
std = nations_data.loc[data_2014, "population"].std()

print(mean)
print(std)
def find_outliers(data, sigma=5):
    """Return the entries of ``data`` that lie far away from its mean.

    An entry counts as an outlier when its absolute distance to the mean
    exceeds ``sigma`` times the standard deviation, so unusually small and
    unusually large values are both reported.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to inspect.
    sigma : float, default 5
        Number of standard deviations an entry must deviate by.

    Returns
    -------
    pandas.Series
        The outlying entries, carrying their original index labels.
    """
    # Measure the distance from the mean; comparing the raw magnitude with
    # ``mean + sigma * std`` would miss low outliers and misfire on
    # negative-valued data.
    deviation = np.abs(data - data.mean())
    return data[deviation > sigma * data.std()]

# find_outliers returns a slice of the original Series, so its index holds
# row labels.  Use the label-based indexer to fetch the matching rows.
outlier_index = find_outliers(nations_data["population"][data_2014]).index
print(nations_data.loc[outlier_index])

## Scaling
The numerical features have vastly different scales and distributions, so we apply robust scaling.

In [None]:
# Box plot of every numeric column on a shared, log-scaled axis.
nations_data[numerical_columns].plot.box()
plt.title("Before Scaling")
plt.xticks(rotation=45)
plt.yscale("log")
plt.show()

In [None]:
# Fit the robust scaler on the numeric columns and overwrite them with the
# scaled values.
scaler = RobustScaler()
nations_data[numerical_columns] = scaler.fit_transform(nations_data[numerical_columns])

# Same box plot as before, now on a linear axis limited to [-4, 4].
nations_data[numerical_columns].plot.box()
plt.title("After Scaling")
plt.xticks(rotation=45)
plt.ylim((-4, 4))
plt.show()

## Check for missing values
In this case, there are quite a few values missing. Dropping full rows is not a good option here, since that would delete several countries from the data set

In [None]:
# Cell-wise mask of the missing entries in the table.
nas = nations_data.isna()

# Missing entries per column.
print("\n Sum of NAs in data: \n", nas.sum())

# Countries of the rows that contain at least one missing entry, limited to
# the first five entries of the count table.
print(
    "\n Countries with NAs: \n",
    nations_data["country"][nas.any(axis=1)].value_counts().head(5),
)

## Imputation: Interpolation of Missing Values
It is often advisable to impute missing data using other features instead of dropping them. First, several known values are dropped to test the accuracy of the imputed values.

In [None]:
# Start from a copy without missing values, so the only gaps are the ones
# created below.
impute_test_data = nations_data.dropna().copy()
drop_values = impute_test_data[impute_test_data["country"] == "Germany"].index

# Blank the Germany values with one label-based assignment.  A chained
# ``frame[column][rows] = ...`` may write to a temporary object and leave
# the frame unchanged.
impute_test_data.loc[drop_values, "gdp_percap"] = np.nan
print(impute_test_data.isna().sum())

Now impute the missing values using sklearn imputers

In [None]:
from sklearn.impute import SimpleImputer
# Mean imputation.  add_indicator appends a column flagging the imputed rows.
imputer = SimpleImputer(strategy='mean', add_indicator=True)
simple_imputer_data = pd.DataFrame(
    imputer.fit_transform(impute_test_data[numerical_columns]),
    columns=numerical_columns + ["indicator"],
)

# Row positions (in the imputer output) whose value was filled in.
was_imputed = simple_imputer_data["indicator"] == 1.
imputed_index = simple_imputer_data[was_imputed].index
imputed_index

In [None]:
from sklearn.impute import KNNImputer
# Nearest-neighbour imputation on the same test frame; the extra output
# column flags the imputed rows.
imputer = KNNImputer(add_indicator=True)
knn_imputer_data = pd.DataFrame(
    imputer.fit_transform(impute_test_data[numerical_columns]),
    columns=numerical_columns + ["indicator"],
)

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# Iterative imputation with a small random forest as the regressor; the
# extra output column flags the imputed rows.
estimator = RandomForestRegressor(n_estimators=10)
imputer = IterativeImputer(estimator=estimator, add_indicator=True)
iterative_imputer_data = pd.DataFrame(
    imputer.fit_transform(impute_test_data[numerical_columns]),
    columns=numerical_columns + ["indicator"],
)

In [None]:
# Reference rows taken from the untouched table.  Selecting them through
# drop_values picks exactly the rows whose gdp_percap was blanked, so there
# is one original row per imputed row, in the same order.
original_values = nations_data.loc[drop_values]

# imputed_index holds row positions in the imputer outputs, which all share
# the same row order.
simply_imputed = simple_imputer_data.loc[imputed_index]
knn_imputed = knn_imputer_data.loc[imputed_index]
iterativly_imputed = iterative_imputer_data.loc[imputed_index]

# Draw the true series and the three reconstructions over the same years.
plt.plot(original_values["year"], original_values["gdp_percap"],
         marker="o", label="Original Values", color="#266662")
plt.plot(original_values["year"], simply_imputed["gdp_percap"],
         marker="x", label="Simple Imputer Values", color="#ED5654")
plt.plot(original_values["year"], knn_imputed["gdp_percap"],
         marker="v", label="KNN Imputer Values", color="#9E5E9B")
plt.plot(original_values["year"], iterativly_imputed["gdp_percap"],
         marker="s", label="Random Forrest Values", color="#0070C0")
plt.legend()
plt.xlabel("Year")
plt.ylabel("Scaled GDP per Capita")
plt.savefig("Imputer_Test.png", dpi=200)
plt.show()

## Impute Original Data
The iterative imputer is capable of imputing several features with missing values iteratively

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import IterativeImputer

# Iterative imputer driven by a 5-nearest-neighbour regressor.  No indicator
# column is requested, so the output has one column per input column.
knn = KNeighborsRegressor(n_neighbors=5)
iterative_imputer = IterativeImputer(estimator=knn, add_indicator=False)
# Overwrite the numeric columns of the full table with the imputed values.
nations_data[numerical_columns] = iterative_imputer.fit_transform(nations_data[numerical_columns])

# Per-column count of the values that are still missing.
nations_data.isna().sum()

In [None]:
# Write the scaled and imputed table to disk.
nations_data.to_csv("preprocessed_nations_data.csv")

## Assembling a Data Pipeline
The above steps can be put into a pipeline for deployment. For this example, we'll perform a regression on the life expectancy. We'll use the life expectancy of Romania as a test case

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestRegressor

# Reload the raw table: the pipeline below does all preprocessing itself.
nations_data = pd.read_csv("nations.csv")

# Only rows with a known life expectancy can serve as regression samples.
valid = nations_data["life_expect"].notna()
# We'll use life_expect of Romania as test sample
is_test_country = nations_data["iso3c"] == "ROU"

# Build both masks on the full index and select by label.  This avoids
# indexing a filtered frame with a full-length mask and avoids feeding
# index labels to a positional indexer.
test_rows = valid & is_test_country
train_rows = valid & ~is_test_country
test_country_index = nations_data.index[test_rows]

test_target = nations_data.loc[test_rows, "life_expect"]
test_features = nations_data.loc[test_rows].drop(columns="life_expect")

train_target = nations_data.loc[train_rows, "life_expect"]
train_features = nations_data.loc[train_rows].drop(columns="life_expect")

# for pipelining feature columns have to be enumerated
numerical_columns = ["gdp_percap",
                     "population",
                     "birth_rate",
                     "neonat_mortal_rate"]

def encoder(nations_data):
    """Replace the ``iso3c`` column by integer category codes, in place.

    The codes are derived from the values present in the frame passed in.
    NOTE(review): because of that, the same country can receive different
    codes in different frames, e.g. a training frame versus a
    single-country test frame.

    Parameters
    ----------
    nations_data : pandas.DataFrame
        Frame with an ``iso3c`` column.  The frame is modified.

    Returns
    -------
    pandas.DataFrame
        The same frame object with ``iso3c`` replaced by its codes.
    """
    country_codes = pd.Categorical(nations_data["iso3c"]).codes
    nations_data["iso3c"] = country_codes
    return nations_data

In [None]:
# Wrap the two plain functions so they can be used as pipeline steps.
label_encoder = FunctionTransformer(func=encoder)
scale_converter = FunctionTransformer(func=replace_income)

# Pass the identifier columns through unchanged, robust-scale the numeric
# features and drop every other column.
scaler = ColumnTransformer(
    transformers=[
        ("passthrough", "passthrough", ["iso3c", "year"]),
        ("robust_scaling", RobustScaler(), numerical_columns + ["income"]),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)
scaler.set_output(transform="pandas")

# Impute the scaled numeric features; all other columns pass through.
imputer = ColumnTransformer(
    transformers=[
        ("imputer", iterative_imputer, numerical_columns + ["income"]),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)
imputer.set_output(transform="pandas")

estimator = RandomForestRegressor(n_estimators=20)

# Step order: income conversion, country encoding, scaling, imputation,
# regression.
pipeline = Pipeline(
    steps=[
        ("preprocessor", scale_converter),
        ("label_encoder", label_encoder),
        ("scaler", scaler),
        ("imputer", imputer),
        ("estimator", estimator),
    ]
)

pipeline.fit(train_features, train_target)
pipeline

In [None]:
predicted_values = pipeline.predict(test_features)

# Collect everything needed for the plot in a separate frame.  assign()
# returns a new frame, so the prediction and target columns are not written
# into test_features.
plot_data = test_features.assign(
    predicted_values=predicted_values,
    target_values=test_target,
)

# Measured life expectancy as points, model output as a line.
plt.scatter(plot_data["year"], plot_data["target_values"],
            marker="o", label="Original Values", color="#266662")
plt.plot(plot_data["year"], plot_data["predicted_values"],
         marker="", label="Model Prediction", color="#ED5654")
plt.xlabel("Year")
plt.ylim(64,80)
plt.ylabel("Life Expectancy in Romania in Years")
plt.legend()
plt.show()