# Correlation

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [None]:
# Load the preprocessed nations table and keep only its float64 columns.
nations_data = pd.read_csv("preprocessed_nations_data.csv")
numerical_data = nations_data.select_dtypes(include=[np.float64])

# Fit a StandardScaler on the numeric columns and wrap the transformed array
# in a DataFrame that carries the original column names.
scaler = StandardScaler()
scaled_data = pd.DataFrame(
    scaler.fit_transform(numerical_data),
    columns=numerical_data.columns,
)
scaled_data.describe()

## Looking at Dependencies

### Birth Rate vs. Neonatal Mortality Rate

#### Variance
Variance can be used to measure the scatter of a single feature.

In [None]:
# Reproducible 1% subsample used for plotting, plus six of its rows that the
# later cells highlight and use for the worked calculation.
data_sample = scaled_data.sample(frac=0.01, replace=False, random_state=1)
calculation_sample = data_sample.sample(n=6, random_state=1)
# Evenly spaced y positions, one per sampled row; they only spread the points
# out vertically and carry no meaning of their own.
arbitrary_y_data = np.linspace(0, 1, num=len(data_sample))

birth_rate_sample = data_sample["birth_rate"]

# Histogram first, then the individual points drawn on top of it.
plt.hist(birth_rate_sample, bins=10, density=True, color="#C3DED3")
plt.scatter(birth_rate_sample, arbitrary_y_data, color="#266662")

plt.xlabel("Birth Rate Sample")
plt.ylabel("Arbitrary Scale / Standardized Frequency")
plt.savefig("variance_plot.png", dpi=200)
plt.show()

birth_rate_variance = np.var(birth_rate_sample)
birth_rate_std = np.std(birth_rate_sample)
print("Variance of birth rate data: ", birth_rate_variance)
print("Standard Deviation = Sqrt(Variance): ", birth_rate_std)

#### Covariance
Covariance can be used to measure the common scatter of two variables, and hence gives a measure of their statistical dependence.

In [None]:
# Draw the whole sample as dots, then overlay the six calculation points as
# crosses in a contrasting color.
for frame, marker, color in (
    (data_sample, ".", "#266662"),
    (calculation_sample, "x", "#ED5654"),
):
    plt.scatter(frame["birth_rate"], frame["neonat_mortal_rate"],
                marker=marker, color=color)

plt.ylabel("Standardized Neonatal Mortality Rate")
plt.xlabel("Standardized Birth Rate")

plt.savefig("covariance_plot.png", dpi=200)
plt.show()

##### Calculation Example

In [None]:
# Round the six calculation points to one decimal so the example can be
# followed by hand.
birth_rate_calculation = calculation_sample["birth_rate"].round(1)
neonat_rate_calculation = calculation_sample["neonat_mortal_rate"].round(1)

plt.scatter(birth_rate_calculation, neonat_rate_calculation,
            marker="x", color="#ED5654")
# Vertical line at the mean birth rate (x), spanning the y-range of the points.
plt.vlines(birth_rate_calculation.mean().round(1),
           neonat_rate_calculation.min(), neonat_rate_calculation.max(),
           color="#9E5E9B")
# Horizontal line at the mean neonatal mortality rate (y), spanning the
# x-range of the points. It has to sit at the y-mean so that the two lines
# cross at the centroid from which the deviations are measured.
plt.hlines(neonat_rate_calculation.mean().round(1),
           birth_rate_calculation.min(), birth_rate_calculation.max(),
           color="#266662")

plt.xlabel("Standardized Birth Rate")
plt.ylabel("Standardized Neonatal Mortality Rate")
plt.savefig("covariance_calculation.png", dpi=200)
plt.show()


In [None]:
def covariance(x, y):
    """Return the population covariance of ``x`` and ``y``.

    Both inputs are centered on their own mean, multiplied element-wise and
    summed; the sum is scaled by ``1 / len(x)`` (no Bessel correction).
    """
    x_centered = x - np.mean(x)
    y_centered = y - np.mean(y)
    cross_products = x_centered * y_centered
    return 1/len(x) * np.sum(cross_products)
# Covariance of birth rate and neonatal mortality rate over the plotting sample.
birth_rate_column = data_sample["birth_rate"]
neonat_rate_column = data_sample["neonat_mortal_rate"]
covariance_birth_neonat = covariance(birth_rate_column, neonat_rate_column)
print("Covariance between Birth Rate and Neonatal Mortality Rate: ",
      covariance_birth_neonat)




#### Pearson Correlation

In [None]:
# Pearson correlation: the covariance divided by the product of the standard
# deviations of the two features.
birth_rate_std = np.std(data_sample["birth_rate"])
neonat_rate_std = np.std(data_sample["neonat_mortal_rate"])
normalization = birth_rate_std * neonat_rate_std
correlation = covariance_birth_neonat / normalization

# Sample as dots, calculation points as crosses.
for frame, marker, color in (
    (data_sample, ".", "#266662"),
    (calculation_sample, "x", "#ED5654"),
):
    plt.scatter(frame["birth_rate"], frame["neonat_mortal_rate"],
                marker=marker, color=color)
# Straight line through the origin whose slope is the correlation coefficient.
correlation_line = correlation * data_sample["birth_rate"]
plt.plot(data_sample["birth_rate"], correlation_line, color="#266662")

plt.ylabel("Standardized Neonatal Mortality Rate")
plt.xlabel("Standardized Birth Rate")
plt.savefig("lin_corr.png", dpi=200)
plt.show()
print("Correlation between Birth Rate and Neonatal Mortality Rate: ", correlation)


#### Spearman Correlation
Pearson correlation underestimates non-linear dependencies between features. Spearman correlation is a non-parametric correlation metric (it does not assume a probability distribution of the data) and also works on ordinal scales. We'll investigate the standardized birth rate and the standardized GDP per capita.

In [None]:
# Rows of the sample that belong to the six calculation points.
highlighted_rows = data_sample.loc[calculation_sample.index]

# Whole sample as dots, calculation points overlaid as crosses.
plt.scatter(data_sample["birth_rate"], data_sample["gdp_percap"],
            marker=".", color="#266662")
plt.scatter(highlighted_rows["birth_rate"], highlighted_rows["gdp_percap"],
            marker="x", color="#ED5654")

plt.ylabel("Standardized GDP per Capita")
plt.xlabel("Standardized Birth Rate")
plt.savefig("birth_gdp.png", dpi=200)

plt.show()

Spearman correlation works like Pearson correlation but uses the ranked values of the variables.

In [None]:
# Spearman correlation is the Pearson correlation of the ranks. Applying
# argsort twice also yields ranks, but it gives tied values different ranks
# depending on their position; Series.rank assigns ties their average rank,
# which is what Spearman's definition requires. Subtracting 1 keeps the ranks
# zero-based (0 .. n-1 when there are no ties).
ranked_birth_rate = data_sample["birth_rate"].rank(method="average") - 1
ranked_gdp = data_sample["gdp_percap"].rank(method="average") - 1

# Same recipe as for Pearson: covariance over the product of the standard
# deviations, only computed on the ranks.
ranked_covariance_birth_gdp = covariance(ranked_birth_rate, ranked_gdp)
normalization = (np.std(ranked_birth_rate) * np.std(ranked_gdp))
spearman_correlation = ranked_covariance_birth_gdp / normalization

# Standardized ranks (zero mean, unit standard deviation); used by the
# following cell to draw the Spearman line.
stand_birth_rate = (ranked_birth_rate - np.mean(ranked_birth_rate)) / np.std(ranked_birth_rate)
stand_gdp = (ranked_gdp - np.mean(ranked_gdp)) / np.std(ranked_gdp)

# Rank-vs-rank scatter, with the six calculation points marked by crosses.
plt.scatter(ranked_birth_rate, ranked_gdp, marker="o", color="#266662")
plt.scatter(ranked_birth_rate[calculation_sample.index], ranked_gdp[calculation_sample.index],
            marker="x", color="#ED5654")


plt.xlabel("Standardized Birth Rate Rank")
plt.ylabel("Standardized GDP per Capita Rank")
plt.savefig("ranked_plot.png", dpi=200)
plt.show()

In [None]:
# Line through the origin with the Spearman coefficient as its slope, drawn
# underneath the scatter of the standardized ranks.
spearman_line = spearman_correlation * stand_birth_rate
plt.plot(stand_birth_rate, spearman_line, color="#266662")
plt.scatter(stand_birth_rate, stand_gdp, marker="o", color="#266662")

plt.ylabel("Ranked Standardized GDP per Capita")
plt.xlabel("Ranked Standardized Birth Rate")
plt.savefig("Spearman_corr.png", dpi=200)
plt.show()
print(spearman_correlation)

### Correlation Matrices
To get an overview, it helps to plot all possible pairs of correlations in a matrix.

In [None]:
# Plotting 2D scatter plots for all pairs of features gives a good overview
# of the dependencies.
fig, axes = plt.subplots(6, 6, figsize=(10,10), sharey=True, sharex=True)
plot_features = data_sample.columns
last_row_index = len(axes) - 1

for row_index, axes_row in enumerate(axes):
    for column_index, ax in enumerate(axes_row):
        # The column feature goes on the x-axis and the row feature on the
        # y-axis, so the data agrees with the axis labels set below.
        ax.scatter(data_sample[plot_features[column_index]],
                   data_sample[plot_features[row_index]],
                   color="#266662", marker=".")
        # Label only the outer panels: y labels on the first column,
        # x labels on the bottom row.
        if column_index == 0:
            ax.set_ylabel(plot_features[row_index])
        if row_index == last_row_index:
            ax.set_xlabel(plot_features[column_index])

plt.savefig("all_plots.png", dpi=200)
plt.show()

In [None]:
# We'll plot one matrix for each of the methods discussed above.
fig, ax = plt.subplots(1, 3, figsize=(10,3), sharey=True)
methods = ["Covariance", "Pearson", "Spearman"]
matrix_list = [
    numerical_data.cov(),
    numerical_data.corr(method="pearson"),
    numerical_data.corr(method="spearman"),
]

feature_names = numerical_data.columns
tick_positions = range(len(feature_names))
colormap = plt.get_cmap("coolwarm")

# Loop over all three matrices, one subplot per matrix.
for plot_index, matrix in enumerate(matrix_list):
    panel = ax[plot_index]

    # Per-panel setup is done once, not once per matrix cell: draw the matrix
    # with the "coolwarm" color map, then set the title and tick labels.
    panel.matshow(matrix, cmap=colormap)
    panel.set_title(methods[plot_index])
    panel.set_xticks(tick_positions, feature_names, rotation=90)
    panel.set_yticks(tick_positions, feature_names)

    # Print each value into its matrix cell. ndenumerate yields
    # (row, column) indices, while text() expects (x, y) = (column, row).
    for (row, column), value in np.ndenumerate(matrix.values):
        panel.text(column, row, f"{value:.1f}", va="center", ha="center")

plt.savefig("corralation_matrices.png", dpi=200, bbox_inches='tight')
plt.show()

In [None]:
# Show the raw values of the second entry of matrix_list (the Pearson matrix).
pearson_matrix = matrix_list[1]
pearson_matrix.values