# Project 3 - Pandas for DS

In [None]:
import pandas as pd
import numpy as np

##  Helper functions  ................................................................
##
def get_energy():
    """Load and clean the UN energy indicators (year 2013) into a DataFrame.

    Reads the Excel workbook ``./data/Energy Indicators.xls`` (not a CSV),
    skipping the leading header rows and the trailing footer rows, and keeps
    only spreadsheet columns 2-5 (the first two columns are unnecessary).
    The kept columns are labelled:
    ['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable']

    Cleaning applied:
      * placeholder cells such as "..." are read as NaN;
      * 'Energy Supply' is converted from petajoules to gigajoules
        (there are 1,000,000 gigajoules in a petajoule);
      * digits and any trailing " (...)" qualifier are stripped from the
        country names, then the following renames are applied:
          "Republic of Korea": "South Korea"
          "United States of America": "United States"
          "United Kingdom of Great Britain and Northern Ireland": "United Kingdom"
          "China, Hong Kong Special Administrative Region": "Hong Kong"
          "Bolivia (Plurinational State of)": "Bolivia"
          "Switzerland17": "Switzerland"

    Returns:
        DataFrame indexed by 'Country' whose three data columns are float64.
    """
    rename_dict1 = {"Republic of Korea": "South Korea",
                    "United States of America": "United States",
                    "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
                    "China, Hong Kong Special Administrative Region": "Hong Kong",
                    "Bolivia (Plurinational State of)": "Bolivia",
                    "Switzerland17": "Switzerland"}

    col_names = ['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable']
    value_cols = col_names[1:]

    footer_skipped = 31
    header_skipped = 9

    # `skipfooter` / `usecols` are the current spellings of the removed
    # `skip_footer` / `parse_cols` keyword arguments.
    df = pd.read_excel('./data/Energy Indicators.xls', header=1,
                       skipfooter=footer_skipped,
                       skiprows=header_skipped,
                       usecols=[2, 3, 4, 5], names=col_names,
                       na_values=['...', '%', 'Renewable Electricity Production',
                                  'Petajoules', 'Gigajoules'],
                       keep_default_na=True)

    # NOTE(review): rows 2 and 5 are blanked by position (flagged as outliers
    # in the original notes); this depends on the exact layout of the file.
    df.loc[[2], ['% Renewable']] = np.nan
    df.loc[[5], ['Energy Supply', 'Energy Supply per Capita']] = np.nan

    df = df.dropna(axis=0, how='all')

    # convert petajoules to gigajoules to match the per-capita column's units
    df['Energy Supply'] = df['Energy Supply'] * 10**6

    # Strip digits, then everything from ' (' onwards, then outer whitespace.
    # regex=True is explicit because newer pandas treats the pattern literally
    # by default.
    countries = df['Country'].str.replace(r'[0-9]+', '', regex=True)
    countries = countries.str.split(r' \(').str[0].str.strip()

    # Assign the renamed Series back instead of calling replace(inplace=True)
    # on a column selection, which may act on a temporary copy.
    df['Country'] = countries.replace(rename_dict1)

    df = df.set_index('Country')
    return df.astype({col: 'float64' for col in value_cols})


def get_gdp(path='./data/world_bank.csv'):
    """Load World Bank GDP figures and keep the years 2006-2015.

    The CSV is read without skipping its preamble, so the first parsed column
    is labelled 'Data Source' (it holds the country names) and the first four
    parsed rows are leftover header lines, which are discarded.

    The following countries are renamed:
      "Korea, Rep.": "South Korea",
      "Iran, Islamic Rep.": "Iran",
      "Hong Kong SAR, China": "Hong Kong"

    Args:
        path: Location of the CSV file; defaults to ``./data/world_bank.csv``.

    Returns:
        DataFrame indexed by 'Country'. The first 49 remaining columns are
        dropped and the ones left are relabelled '2006', '2007', ... in order.
    """
    rename_dict2 = {"Korea, Rep.": "South Korea", "Iran, Islamic Rep.": "Iran",
                    "Hong Kong SAR, China": "Hong Kong"}

    raw = pd.read_csv(path)

    # Drop the four leftover header rows and give the name column its label.
    df = raw.iloc[4:].rename(columns={'Data Source': 'Country'})

    # Assign the result back rather than replace(inplace=True) on a column
    # selection, which may silently act on a temporary copy.
    df['Country'] = df['Country'].replace(rename_dict2)

    df = df.set_index('Country')
    df = df.drop(columns=df.columns[:49])

    # rename_axis() only sets the axis *name*; relabelling the columns
    # themselves is done with rename(columns=...).
    year_labels = {old: str(2006 + pos) for pos, old in enumerate(df.columns)}
    return df.rename(columns=year_labels)
    
    
def get_Scim():
    """Load the Scimago Journal and Country Rank table for
    Energy Engineering and Power Technology
    (http://www.scimagojr.com/countryrank.php?category=2102).

    The data is read from ./data/scimagojr-3.xlsx and ranks countries by
    their journal contributions in that area.

    Returns:
        DataFrame (referred to as ScimEn elsewhere) indexed by 'Country'.
    """
    scimago_path = './data/scimagojr-3.xlsx'
    return pd.read_excel(scimago_path).set_index('Country')
    
    
def merge_dfs():
    """Combine the ScimEn, energy and GDP tables on their country index.

    ScimEn is the anchor table: energy and then GDP (years 2006-2015) are
    attached with index-on-index *left* joins, so every ScimEn country is
    kept and a country missing from energy or GDP gets NaN in those columns.
    Filtering down to complete / top-ranked rows is left to the callers.

    Expected columns of the result:
        ['Rank', 'Documents', 'Citable documents', 'Citations', 'Self-citations',
         'Citations per document', 'H index', 'Energy Supply', 'Energy Supply per Capita',
         '% Renewable', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
         '2014', '2015']

    Returns:
        DataFrame indexed by country name, sorted by ascending 'Rank'.
    """
    combined = get_Scim()

    # Attach the other two tables one after the other, always keeping the
    # rows of the left-hand (ScimEn-based) frame.
    for extra in (get_energy(), get_gdp()):
        combined = combined.merge(extra, how='left', left_index=True, right_index=True)

    return combined.sort_values('Rank')


def answer_11():
    """Return the top-15 DataFrame with an added 'Continent' column.

    Each country (the index of the frame returned by answer_one) is mapped to
    its continent through ContinentDict; a country not present in the
    dictionary keeps its own name in the 'Continent' column.

    Used by answer_eleven() and answer_twelve().

    Returns:
        DataFrame indexed on Country with the extra 'Continent' column.
    """
    ContinentDict = {'China': 'Asia',
                     'United States': 'North America',
                     'Japan': 'Asia',
                     'United Kingdom': 'Europe',
                     'Russian Federation': 'Europe',
                     'Canada': 'North America',
                     'Germany': 'Europe',
                     'India': 'Asia',
                     'France': 'Europe',
                     'South Korea': 'Asia',
                     'Italy': 'Europe',
                     'Spain': 'Europe',
                     'Iran': 'Asia',
                     'Australia': 'Australia',
                     'Brazil': 'South America'}

    # Work on an explicit copy so that adding a column never touches (or
    # warns about) the frame this one was sliced from.
    df11 = answer_one().copy()  # Indexed on Country

    # Build the column in a single assignment; calling
    # replace(inplace=True) on df11['Continent'] may act on a temporary copy
    # and leave the frame unchanged. Values are assigned positionally.
    df11['Continent'] = df11.index.to_series().replace(ContinentDict).to_numpy()

    return df11

## Answer functions:  ...................................................................................
##
def answer_one():
    """Return the top 15 countries by Scimagojr 'Rank' (Rank 1 through 15).

    Returns:
        An independent DataFrame (expected: 20 columns and 15 entries) taken
        from the merged data. It is an explicit copy because the callers add
        their own columns to it, and doing that on a filtered slice triggers
        pandas' SettingWithCopyWarning.
    """
    merged_df = merge_dfs()
    return merged_df[merged_df['Rank'] < 16].copy()


def answer_two():
    """Count the entries lost when the three tables are reduced to their
    common countries (i.e. before the cut to the top 15 in answer_one).

    The count is the number of distinct country labels that occur in at
    least one of energy / GDP / ScimEn but not in all three of them.

    The result depends on how the source tables were cleaned (how many
    header/footer rows were skipped is not given by the assignment).

    Returns:
        A single number.
    """
    index_sets = [set(frame.index) for frame in (get_energy(), get_gdp(), get_Scim())]

    in_any = set().union(*index_sets)
    in_all = set.intersection(*index_sets)

    return len(in_any - in_all)


def answer_three():
    """Average GDP over the last 10 years (2006-2015) for each top-15 country.

    Missing yearly values are excluded from each country's mean, and a
    country with no GDP value at all is dropped from the result.

    Returns:
        A Series named 'avgGDP', indexed by country, sorted in descending
        order of average GDP.
    """
    Top15 = answer_one()

    # Select the GDP years by label rather than by position so the result
    # does not depend on the GDP columns happening to be the last ten.
    gdp_years = [str(year) for year in range(2006, 2016)]

    avgGDP = Top15[gdp_years].mean(axis=1).dropna().sort_values(ascending=False)
    return avgGDP.rename('avgGDP')


def answer_four():
    """GDP change over the 10 year span for the country with the 6th
    largest average GDP.

    The country is taken from position 5 of the (descending) answer_three
    result; the change is its '2015' value minus its '2006' value.

    Returns:
        A single number.
    """
    sixth_by_avg = answer_three().index[5]
    Top15 = answer_one()

    gdp_first = Top15.at[sixth_by_avg, '2006']
    gdp_last = Top15.at[sixth_by_avg, '2015']
    return gdp_last - gdp_first


def answer_five():
    """Mean of 'Energy Supply per Capita' across the top-15 countries.

    Returns:
        A single number.
    """
    per_capita = answer_one()['Energy Supply per Capita']
    return per_capita.mean()


def answer_six():
    """Find the top-15 country with the maximum % Renewable.

    Returns:
        A tuple (country name, percentage).
    """
    renewable = answer_one()['% Renewable']

    # idxmax() yields the index *label* (the country). argmax() yields an
    # integer position on current pandas, which cannot be used with .loc on a
    # country-name index.
    top_country = renewable.idxmax()
    return (top_country, renewable.loc[top_country])


def answer_seven():
    """Find the country with the highest ratio of Self-Citations to
    Total Citations.

    A new column 'prop' holding Self-citations / Citations is added to the
    top-15 data and its maximum is located.

    Returns:
        A tuple (country name, ratio).
    """
    Top15 = answer_one()
    Top15 = Top15.assign(prop=Top15['Self-citations'] / Top15['Citations'])

    # idxmax() gives the country label; argmax() would give an integer
    # position on current pandas, which .loc cannot resolve on this index.
    top_country = Top15['prop'].idxmax()
    return (top_country, Top15.loc[top_country, 'prop'])


def answer_eight():
    """Name the third most populous top-15 country by estimated population.

    The population estimate is 'Energy Supply' divided by
    'Energy Supply per Capita'.

    Returns:
        A single string value (the country name).
    """
    Top15 = answer_one()
    Top15['pop_est'] = Top15['Energy Supply'] / Top15['Energy Supply per Capita']

    by_population = Top15.sort_values(by='pop_est', ascending=False)

    # position 2 == THIRD most populous
    return by_population.index[2]


def answer_nine():
    """Pearson correlation between citable documents per capita and
    energy supply per capita for the top-15 countries.

    Population is estimated as 'Energy Supply' / 'Energy Supply per Capita';
    citable documents per person is 'Citable documents' / that estimate.

    Returns:
        A single number.
    """
    Top15 = answer_one()

    population = Top15['Energy Supply'] / Top15['Energy Supply per Capita']
    Top15['pop_est'] = population
    Top15['cit_per_cap_est'] = Top15['Citable documents'] / population

    # The pairwise correlation *matrix* is used on purpose: per the original
    # notes, Series.corr() on the two columns differs in the last digits
    # (0.79400104354429424) from the accepted 0.79400104354429435.
    pair = ['Energy Supply per Capita', 'cit_per_cap_est']
    corr_matrix = Top15[pair].corr()
    return corr_matrix.loc[pair[0], pair[1]]


def answer_ten():
    """Flag the top-15 countries whose % Renewable is at or above the median.

    A new column 'HighRenew' is 1 where the country's % Renewable value is
    greater than or equal to the median over the top 15, and 0 where it is
    below (a missing value compares as "not at or above", hence 0).

    Returns:
        The 'HighRenew' Series, indexed by country name in the row order of
        the top-15 frame (ascending rank).
    """
    Top15 = answer_one()

    renewable = Top15['% Renewable']
    threshold = renewable.median()

    Top15['HighRenew'] = renewable.ge(threshold).astype('int64')
    return Top15['HighRenew']


def answer_eleven():
    """Summarise the estimated population of the top-15 countries per continent.

    The population estimate is 'Energy Supply' / 'Energy Supply per Capita'.

    Returns:
        DataFrame indexed by 'Continent' with the columns
        ['size', 'sum', 'mean', 'std']: the number of countries in each
        continent bin, and the sum, mean and standard deviation of the
        estimated population.
    """
    Top15 = answer_11()  # :: Top15 w/Continents
    pop_est = Top15['Energy Supply'] / Top15['Energy Supply per Capita']
    Top15['pop_est'] = pop_est.astype(np.float64)

    # String aggregation names select pandas' own group-wise implementations
    # and produce the column labels directly. Passing len / np.sum / np.mean /
    # np.std callables is deprecated and may change meaning in future pandas.
    return Top15.groupby('Continent')['pop_est'].agg(['size', 'sum', 'mean', 'std'])


def answer_twelve():
    """Count the top-15 countries per (Continent, % Renewable bin) group.

    % Renewable is cut into 5 equal-width bins, and the countries are grouped
    by Continent and then by bin.

    Returns:
        A Series of group sizes with a MultiIndex of Continent, then the
        % Renewable bin. Groups without countries are not included.
    """
    Top15 = answer_11()  # :: Top15 w/Continent column
    buckets = pd.cut(Top15['% Renewable'], bins=5)

    # pd.cut returns a categorical; observed=True keeps only the
    # (continent, bin) combinations that actually occur, so empty bins do not
    # show up as zero-sized groups.
    return Top15.groupby(['Continent', buckets], observed=True).size()


def answer_thirteen():
    """Render the population estimate as strings with thousands separators.

    The estimate ('Energy Supply' / 'Energy Supply per Capita') is not
    rounded, e.g. 317615384.61538464 -> 317,615,384.61538464

    Returns:
        A Series PopEst whose index is the country name and whose values are
        the formatted population estimate strings.
    """
    Top15 = answer_one()

    estimate = Top15['Energy Supply'] / Top15['Energy Supply per Capita']
    Top15['PopEst'] = estimate.astype(float)

    with_commas = '{0:,}'.format
    return Top15['PopEst'].map(with_commas)

In [None]:
answer_one()

In [None]:
answer_two()

In [None]:
%%HTML
<!-- Answer 2 hint -->
<svg width="800" height="300">
  <circle cx="150" cy="180" r="80" fill-opacity="0.2" stroke="black" stroke-width="2" fill="blue" />
  <circle cx="200" cy="100" r="80" fill-opacity="0.2" stroke="black" stroke-width="2" fill="red" />
  <circle cx="100" cy="100" r="80" fill-opacity="0.2" stroke="black" stroke-width="2" fill="green" />
  <line x1="150" y1="125" x2="300" y2="150" stroke="black" stroke-width="2" fill="black" stroke-dasharray="5,3"/>
  <text  x="300" y="165" font-family="Verdana" font-size="15">answer_two(): Everything but this!</text>
</svg>

In [None]:
answer_three()

In [None]:
answer_four()

In [None]:
answer_five()

In [None]:
answer_six()

In [None]:
answer_seven()

In [None]:
answer_eight()

In [None]:
answer_nine()

In [None]:
answer_ten()

In [None]:
answer_eleven()

In [None]:
answer_twelve()

In [None]:
answer_thirteen()

### Optional

Use the built-in function `plot_optional()` to see an example visualization.

"""
def plot_optional():
    '''Draw an example bubble chart of % Renewable against Rank.

    One bubble per top-15 country: the bubble size scales with the
    country's 2014 GDP and, per the printed caption, the colour corresponds
    to the continent. Each bubble is annotated with the country name.

    This docstring uses single quotes on purpose: the cell is wrapped in a
    triple-double-quoted string to keep it disabled, and a nested
    triple-double-quote would end that string early.
    '''
    # Rows with any missing value are dropped before plotting.
    Top15 = answer_one().dropna(axis=0, how='any')

    # NOTE(review): the earlier attempt failed with "scatter requires y column
    # to be numeric"; the axes are coerced explicitly here instead of the
    # previous `* 1` workaround - confirm this resolves it.
    for column in ('Rank', '% Renewable'):
        Top15[column] = pd.to_numeric(Top15[column])

    # NOTE(review): the colour list has 15 fixed entries, one per row, so it
    # assumes no row was dropped above.
    ax = Top15.plot(x='Rank', y='% Renewable', kind='scatter',
                    c=['#e41a1c', '#377eb8', '#e41a1c', '#4daf4a', '#4daf4a', '#377eb8', '#4daf4a', '#e41a1c',
                       '#4daf4a', '#e41a1c', '#4daf4a', '#4daf4a', '#e41a1c', '#dede00', '#ff7f00'],
                    xticks=range(1, 16), s=6*Top15['2014']/10**10, alpha=.75, figsize=[16, 6])

    # Walk the rows directly instead of indexing the label-indexed columns
    # with integer positions through [].
    for country, rank, renewable in zip(Top15.index, Top15['Rank'], Top15['% Renewable']):
        ax.annotate(country, [rank, renewable], ha='center')

    print("This is an example of a visualization that can be created to help understand the data. \
This is a bubble chart showing % Renewable vs. Rank. The size of the bubble corresponds to the countries' \
2014 GDP, and the color corresponds to the continent.")
    
plot_optional()
"""