# Chapter 6: Importing, Cleaning and Analyzing Data

In [3]:
import numpy
import pandas as pd
from datlib import stats 
# First attempt at loading the workbook. No sheet is named here, so pandas
# falls back to its default sheet. Columns 2 and 1 form a two-level row
# index and row 4 supplies the column headings.
# The workbook has to be in the working directory; otherwise this raises
# FileNotFoundError.
data = pd.read_excel(
    "efw-2019-master-index-data-for-researchers.xlsx",
    index_col=[2, 1],
    header=[4],
)

FileNotFoundError: [Errno 2] No such file or directory: 'efw-2019-master-index-data-for-researchers.xlsx'

In [None]:
# Load the workbook again, this time naming the sheet to read.
# Columns 2 and 1 form a two-level row index and row 4 supplies the
# column headings.
data = pd.read_excel(
    "efw-2019-master-index-data-for-researchers.xlsx",
    sheet_name="EFW Panel Data 2019 Report",
    index_col=[2, 1],
    header=[4],
)
# Bare expression so the notebook displays the loaded frame.
data

In [None]:
# Drop every column (axis=1) in which all values are missing.
data = data.dropna(how="all", axis=1)

In [None]:
# Display the column labels of data.
data.columns

## Dropping countries from the columns

In [None]:
# Columns that are left out of the statistics.
skip_keys = ["Countries"]
# Remove the skipped columns (axis=1), then drop every row (axis=0) that
# still has at least one missing value.
data_for_stats = data.drop(skip_keys, axis=1).dropna(how="any", axis=0)
data_for_stats

In [None]:
# Gets rid of the numbers in each of the column headings: the first three
# characters of every heading are dropped and the rest is title-cased.
keys = list(data_for_stats.keys())
data_for_stats.rename(
    # rename() needs a mapping of old name -> new name, so this must be a
    # dict comprehension (curly braces), not a list.
    columns={key: key[3:].title() for key in keys},
    inplace=True,
)
data_for_stats

In [None]:
# Removing the first three characters of every heading left the first
# column with an empty name, so give that column the label "EFW".
data_for_stats.rename(
    columns={"": "EFW"},
    inplace=True,
)
data_for_stats

## Building dictionaries with statistics

In [None]:
# Three separate, initially empty dictionaries: summary statistics,
# covariances and correlations.
stats_dict, cov_dict, corr_dict = {}, {}, {}

In [None]:
# key1 / key2 are column names, vec1 / vec2 the matching columns (vectors).
for key1, vec1 in data_for_stats.items():
    # Summary statistics of this column, keyed by the statistic's name.
    # NOTE(review): variance is called without sample=True while the
    # standard deviation passes it -- confirm the default is intended.
    stats_dict[key1] = {
        "mean": stats.mean(list(vec1)),
        "median": stats.median(vec1),
        "variance": stats.variance(vec1),
        "standard deviation": stats.SD(vec1, sample=True),
        "skewness": stats.skewness(vec1, sample=True),
        "kurtosis": stats.kurtosis(vec1, sample=True),
    }
    # One inner dictionary per column for both cov and corr, filled by
    # pairing this column with every column (itself included).
    cov_dict[key1] = {}
    corr_dict[key1] = {}
    for key2, vec2 in data_for_stats.items():
        cov_dict[key1][key2] = stats.covariance(vec1, vec2, sample=True)
        corr_dict[key1][key2] = stats.correlation(vec1, vec2)

In [None]:
# Turn the nested dictionary into a table, then transpose it so that each
# outer key of stats_dict becomes a row.
# The result is stored in stats_df so it can be used again later.
stats_df = pd.DataFrame(stats_dict)
stats_df = stats_df.transpose()
stats_df

In [None]:
# Covariance table built from the nested cov_dict.
# The class is spelled DataFrame (capital F).
cov_df = pd.DataFrame(cov_dict)
cov_df

In [None]:
# Correlation table built from the nested corr_dict.
# The class is spelled DataFrame (capital F).
corr_df = pd.DataFrame(corr_dict)
corr_df

In [None]:
# Save the cleaned table so it can be reloaded without repeating the
# cleaning steps. Names are case-sensitive: the variable is data_for_stats.
data_for_stats.to_csv("cleanedEconFreedomData.csv")

# Importing the data that was saved

In [None]:
# Reload the saved CSV; its first two columns are used as a two-level
# row index.
data = pd.read_csv("cleanedEconFreedomData.csv", index_col=[0, 1])
data

# Visualizing data with scatter plots (three dimensions: x, y and color)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

def color_dim_scatter(data, pp):
    """Plot every ordered triple of distinct columns as a colored scatter.

    For each triple the first column goes on the x axis, the second on the
    y axis and the third sets the color of the dots. Every figure is saved
    to ``pp`` as its own page and then shown. Nothing is plotted when
    ``data`` has fewer than three columns.

    Args:
        data: Table whose columns are plotted against each other; it must
            provide ``plot.scatter`` (the caller passes a DataFrame).
        pp: Open multi-page PDF (the caller passes a ``PdfPages``). It is
            only written to here; closing it is left to the caller.
    """
    # Function-scope import so the block does not depend on other cells.
    from itertools import permutations

    # permutations() yields the triples in the same order as three nested
    # loops would and never repeats a column within a triple.
    # NOTE(review): assumes column labels are unique.
    for x, y, c in permutations(data, 3):
        fig, ax = plt.subplots(figsize=(20, 20))
        data.plot.scatter(
            x=x,
            y=y,
            c=c,
            s=50,  # Size of the dots
            alpha=0.7,
            colormap="viridis",
            ax=ax,
        )
        plt.xticks(rotation=90)
        # Hide the tick marks on both axes. Axes has get_xticklines() and
        # get_yticklines(); there is no Axes.get_ticklines().
        plt.setp(ax.get_xticklines(), visible=False)
        plt.setp(ax.get_yticklines(), visible=False)
        # Save before showing: some backends discard the figure once it
        # has been shown.
        pp.savefig(fig, bbox_inches="tight")
        plt.show()
        # Close this specific figure so figures do not pile up in memory.
        plt.close(fig)
                        
# The with-block closes (and thereby finalises) the PDF even when one of
# the plots raises, so the file handle is never left open.
with PdfPages("EconomicFreedomPlots.pdf") as pp:
    # Font size applied to all text in the plots.
    plt.rcParams.update({"font.size": 26})
    color_dim_scatter(data, pp)