# Explore the CDP Global 500 emissions (2011) data

In [None]:
import pandas as pd
import matplotlib
matplotlib.style.use('ggplot')
import matplotlib.pyplot as plt

%matplotlib inline

## Read the data from the CSV file

In [None]:
# Load the emissions / response-status CSV into a DataFrame.
df = pd.read_csv(
    'data/2011_-_Global_500_Emissions_and_Response_Status.csv'
)

# Clean up the headers in one chained pass: trim surrounding whitespace,
# then turn every remaining space into an underscore so the columns can be
# referenced by names such as Performance_Band and Disclosure_Score.
df.columns = df.columns.str.strip().str.replace(' ', '_')

# Leave the frame as the last expression so the notebook displays it.
df

## Sort data on `Performance Band`

In [None]:
# Show the rows ordered by Performance_Band, ascending.
# The result is a sorted copy; df itself is not modified.
df.sort_values('Performance_Band', ascending=True)

## Filter data on `Country = USA`

In [None]:
# Filter: keep only the rows whose Country is "USA".
# Country is the only criterion applied here.
df[df['Country'] == 'USA']

## Aggregate data...

* Compute mean `Disclosure Score` by `Country` and sort from highest to lowest

In [None]:
# Mean Disclosure_Score per Country, ordered from highest to lowest;
# countries whose mean is NaN are removed after sorting.
seriesDS = (
    df.groupby('Country')['Disclosure_Score']
      .mean()
      .sort_values(ascending=False)
      .dropna()
)
seriesDS

## Bar plot of mean `Disclosure Score`

In [None]:
# Re-order the means ascending before plotting.
# NOTE: seriesDS keeps this ascending order for any later cell.
seriesDS = seriesDS.sort_values()

# Horizontal bar chart, one bar per country (the Series index supplies the
# labels). The trailing semicolon suppresses the notebook's text echo.
seriesDS.plot.barh(figsize=(10, 10),
                   use_index=True,
                   title='Average Disclosure Score');

## Box plot of disclosure scores by country

In [None]:
# Restrict the frame to the grouping column and the score column, then draw
# one box per country. The wide figure, large font and 90-degree label
# rotation are passed through unchanged from the plotting call below.
country_scores = df[['Country', 'Disclosure_Score']]
country_scores.boxplot(by='Country', fontsize=20, rot=90, figsize=(30, 10));

## Plot histograms of `Disclosure Score` for all countries and for the USA only

In [None]:
# Histogram of every company's Disclosure_Score, drawn in gray.
df['Disclosure_Score'].plot.hist(figsize=(20, 5), color='gray')

# Second histogram, in blue, for the USA rows only. Rows and column are
# selected in a single .loc lookup.
usa_scores = df.loc[df['Country'] == 'USA', 'Disclosure_Score']
usa_scores.plot.hist(figsize=(20, 5), color='blue');