Code/Python/bootcamp_pandas-input.py

"""
Data input with Pandas for Data Bootcamp course.

Course materials
* http://databootcamp.nyuecon.com/
* https://github.com/DaveBackus/Data_Bootcamp

Warning:  This is a working file and has some things in it that aren't
completely debugged.

Written by Dave Backus, August 2015
Created with Python 3.4
"""
"""
Read csv file from internet (and why we like csv's)
The result is a data frame:  like a sheet with row and column labels
"""
import pandas as pd

# read file from url
url1 = 'https://raw.githubusercontent.com/DaveBackus'
url2 = '/Data_Bootcamp/master/Code/Python/test.csv'
url  = url1 + url2
df = pd.read_csv(url)

# if the internet is down
#df_fromdict = pd.DataFrame({'name': ['Dave', 'Chase', 'Spencer'],
#            'x1': [1, 4, 5], 'x2': [2, 3, 6], 'x3': [3.5, 4.3, 7.8]})

#%%
"""
Examples
"""
import pandas as pd

# Penn World Table
url = 'http://www.rug.nl/research/ggdc/data/pwt/v81/pwt81.xlsx'
pwt = pd.read_excel(url, sheetname='Data')

#%%
# World Economic Outlook
url1 = 'https://www.imf.org/external/pubs/ft/weo/'
url2 = '2015/02/weodata/WEOOct2015all.xls'
weo = pd.read_csv(url1+url2,
                  sep='\t',                 # \t = tab
                  thousands=',',            # kill commas
                  na_values=['n/a', '--'])  # missing values

#%%
# PISA test scores
url = 'http://dx.doi.org/10.1787/888932937035'
pisa = pd.read_excel(url,
                     skiprows=18,       # skip the first 18 rows
                     skipfooter=7,      # skip the last 7
                     parse_cols=[0,1,9,13],   # select columns of interest
                     index_col=0,       # set the index as the first column
                     header=[0,1]       # set the variable names
                     )


#%%
pisa = pisa.dropna()                            # drop blank lines
pisa.columns = ['Math', 'Reading', 'Science']   # simplify variable names
pisa['Math'].plot(kind='barh', figsize=(5, 12))

#%%
# UN population data
url1 = 'http://esa.un.org/unpd/wpp/DVD/Files/'
url2 = '1_Indicators%20(Standard)/EXCEL_FILES/1_Population/'
url3 = 'WPP2015_POP_F07_1_POPULATION_BY_AGE_BOTH_SEXES.XLS'
url = url1 + url2 + url3

cols = [2, 4, 5] + list(range(6,28))
est = pd.read_excel(url, sheetname=0, skiprows=16, parse_cols=cols)

#%%
# income by colleage major
url1 = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/'
url2 = 'college-majors/recent-grads.csv'
url = url1 + url2
df538 = pd.read_csv(url)
df538 = df538.set_index("Major")
df538["Median"].plot(kind="barh", figsize=(5, 12))

#%%
# IMDb movies and parts
url  = 'http://pages.stern.nyu.edu/~dbackus/Data/cast.csv'
cast = pd.read_csv(url, encoding='utf-8')

#%%
"""
APIs
"""
import pandas.io.data as web    # package to access FRED
import datetime as dt           # package to handle dates

start = dt.datetime(2010, 1, 1)  # start date
codes = ['GDPC1', 'PCECC96']    # real GDP, real consumption
fred  = web.DataReader(codes, 'fred', start)
fred = fred/1000                # convert trillions to billions

fred.plot()

#%%
# World Bank
from pandas.io import wb            # World Bank api

var = ['NY.GDP.PCAP.PP.KD']         # GDP per capita
iso = ['USA', 'FRA', 'JPN', 'CHN', 'IND', 'BRA', 'MEX']  # country codes
year = 2013
wbdf = wb.download(indicator=var, country=iso, start=year, end=year)

#%%
wbdf = wbdf.reset_index(level='year', drop=True)
wbdf.plot(kind='barh')

#%%
# Fama-French equity returns
import pandas.io.data as web

ff = web.DataReader('F-F_Research_Data_factors', 'famafrench')[0]
ff.columns = ['xsm', 'smb', 'hml', 'rf']      # rename variables

#%%
"""
Review
"""
data = {'EG.ELC.ACCS.ZS': [53.2, 47.3, 85.4, 22.1],    # access to elec (%)
        'IT.CEL.SETS.P2': [153.8, 95.0, 130.6, 74.8],  # cell contracts per 100
        'IT.NET.USER.P2': [11.5, 12.9, 41.0, 13.5],    # internet access (%)
        'Country': ['Botswana', 'Namibia', 'South Africa', 'Zambia']}
af = pd.DataFrame(data)