In [1]:
import matplotlib.pyplot as plt

from IPython.display import display, Image

import numpy as np
import pandas as pd
# Show two decimals in DataFrame and ndarray output.
# The fully qualified key is used because newer pandas releases also define
# 'styler.format.precision', so the bare pattern 'precision' can match more
# than one option and make set_option raise OptionError.
pd.set_option('display.precision', 2)
np.set_printoptions(precision=2)

### For EDA, you often produce summary tables (after an untold number of processing steps).  
### The summary table for categorical responses to a survey may look like this:

In [19]:
import pathlib
from pathlib import Path
import os

# Relative location of the survey summary CSV, assembled with the `/` path
# operator; the bare name on the last line displays the resulting Path.
summary_data = Path('.') / 'data' / 'ordered_survey_pct.csv'
summary_data

WindowsPath('data/ordered_survey_pct.csv')

In [None]:
,Very interested,Somewhat interested,Not interested
Data Analysis,0.77,0.2,0.03
Machine Learning,0.75,0.22,0.03
Data Vis.,0.62,0.34,0.05
Big Data,0.61,0.33,0.06
Deep Learning,0.58,0.36,0.06
Data Journalism,0.2,0.51,0.29

In [22]:
# Load the summary table through `summary_data` (the Path built in the earlier
# cell) rather than repeating the location as a second string literal.
#
# header=0 discards the file's own header row and `names` supplies the three
# response columns. The CSV shown earlier has four fields per row, i.e. one more
# than the number of names, so pandas is expected to use the leading (unnamed)
# field as the row index -- NOTE(review): confirm against the real file.
df = pd.read_csv(
    summary_data,
    header=0,
    names=['Very interested', 'Somewhat interested', 'Not interested'],
)
df

FileNotFoundError: File b'./data/ordered_survey_pct.csv' does not exist

In [None]:
# Render the pre-made picture of the summary table inline.
survey_table_png = Image('./assets/images/Survey_pct_table.png')
display(survey_table_png)

## With this kind of data in a pandas DataFrame, the typical visualization is a bar plot, which is produced using the pandas plotting function:

> <span style="font-size:2em; color:darkblue;"> DataFrame.plot( kind='bar' , ...) </span>


## Default output after specifying the bar colors, figure size & title:

In [None]:
# Close every figure pyplot still has open so the next cell starts on a fresh one.
plt.close('all');

In [None]:
# Default grouped bar chart of the survey table; only the bar colors, bar width,
# transparency, figure size, tick-label font size and title are specified.
colors = ['#5cb85c', '#5bc0de', '#d9534f']
ax = df.plot(kind='bar', color=colors, alpha=0.8, width=0.8,
             figsize=(16, 6), fontsize=14,
             title="Percentage of Respondents' interest in Data Science Areas")

# Adjust the layout BEFORE saving: savefig writes the figure as it is at call
# time, so calling tight_layout() afterwards would leave the PNG on disk with
# the unadjusted margins while the inline figure shows the adjusted ones.
plt.tight_layout()
plt.savefig('./assets/images/barplot_First_output.png', format='png')
plt.show()

## By default:
 -  <span style="font-size:2em; ">the y-axis is visible </span>
 -  <span style="font-size:2em; ">the plot frames, called __spines__, are visible </span>
 -  <span style="font-size:2em; ">the legend is placed in the upper right corner </span>


## Because the bar heights equal the values in the series, the y-axis is redundant and a more desirable output would be this one:  
![Image](Bar_plot_Percentage_of_Respondents.png)

# Anatomy of a bar plot via its *containers* collection:  

## Containers are defined for each data series. They have 2 attributes:  
> <span style=" font-size:2em;">__Attributes:__ </span>
>> <span style=" font-size:2em;"><span style="color:darkblue;">patches</span>: list of Rectangle objects. </span>  
    <span style=" font-size:1em;">The artists of the bars. </span>

>> <span style=" font-size:2em;">errorbar: None or ErrorbarContainer. </span>  
    <span style=" font-size:1em;">A container for the error bar artists if error bars are present. None otherwise.  </span>
   

In [None]:
# Number of rows (categories) in the table, for comparison with the bar counts below.
n_categories = df.index.size
print('Categories in df: {}\n'.format(n_categories))

# One line per container: its label and how many bar patches it holds.
for container in ax.containers:
    n_bars = len(container.patches)
    print(container.get_label(), '\t', n_bars)

# In the matplotlib object model, the bars (rectangles) of a bar plot belong to the patches collection. You can read their properties by iterating over the patches collection of an Axes object and calling the associated
><span style="font-size:2em; color:darkblue">*.get_...()* methods: </span>

In [None]:
# Print, for every bar on the Axes, the x coordinate returned by get_xy()
# and the bar height, both rounded to two decimals.
for bar in ax.patches:
    left_edge = bar.get_xy()[0]
    rounded = (round(left_edge, 2), round(bar.get_height(), 2))
    print('p_x: {}, p_h: {}'.format(*rounded))

# How to label the bars?  
><span style="font-size:2em; color:darkblue">*axis.text()* method: </span>

In [None]:
# Start from a clean pyplot state.
plt.close('all')

# Parameters:
size = (16, 6)
bar_width = 0.8
a = 0.8
font_size = 14
colors = ['#5cb85c', '#5bc0de', '#d9534f']

# Every parameter above is passed through, including bar_width: without
# width=bar_width the bars fall back to the pandas default width and the
# setting has no effect on the figure.
ax = df.plot(kind='bar', color=colors, alpha=a, width=bar_width,
             figsize=size, fontsize=font_size)

# Hide the frame except for the bottom spine, which is kept in light grey.
for side in ('left', 'top', 'right'):
    ax.spines[side].set_visible(False)
ax.spines['bottom'].set_color('#CCCCCC')

# Adjust vertical limits to 100% to get more whitespace below the title:
ax.set_ylim(0, 1)

# Remove axes tick marks:
ax.set_yticks([])  # remove both ticks and labels on y-axis
ax.tick_params(axis='x', which='major', bottom=False)  # remove ticks only

# Annotate Text
x_offset = 0.00  # (bar_width/len(df.columns))/2  #:: mid bar width
y_offset = 0.03

# Write each bar's height as a whole-number percentage just above the bar,
# starting at the x coordinate returned by get_xy().
for p in ax.patches:
    p_x = p.get_xy()[0]
    p_h = p.get_height()
    ax.text(x=p_x + x_offset, y=p_h + y_offset, s="{:.0f}%".format(round(p_h*100, 0)),
            ha='left', va='center', fontsize=11)

# NOTE: ha='left' goes with x_offset = 0. If instead ha='center',
#       the label will 'pre-hang' over the bar, so an x-position offset will be needed.

# Reset font size from smaller default:
ax.legend(fontsize=font_size)

ax.set_title("Percentage of Respondents' interest in Data Science Areas", fontsize=16)

# Adjust the layout BEFORE saving so the PNG on disk matches the figure shown
# inline; savefig writes the figure as it is at call time.
plt.tight_layout()
plt.savefig('barplot_Final_output.png', format='png', transparent=True)
plt.show()