-
Notifications
You must be signed in to change notification settings - Fork 53
/
bootcamp_pandas-input.py
132 lines (108 loc) · 3.84 KB
/
bootcamp_pandas-input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
Data input with Pandas for Data Bootcamp course.
Course materials
* http://databootcamp.nyuecon.com/
* https://github.com/DaveBackus/Data_Bootcamp
Warning: This is a working file and has some things in it that aren't
completely debugged.
Written by Dave Backus, August 2015
Created with Python 3.4
"""
"""
Read csv file from internet (and why we like csv's)
The result is a data frame: like a sheet with row and column labels
"""
import pandas as pd
# read file from url
url1 = 'https://raw.githubusercontent.com/DaveBackus'
url2 = '/Data_Bootcamp/master/Code/Python/test.csv'
url = url1 + url2
df = pd.read_csv(url)
# if the internet is down
#df_fromdict = pd.DataFrame({'name': ['Dave', 'Chase', 'Spencer'],
# 'x1': [1, 4, 5], 'x2': [2, 3, 6], 'x3': [3.5, 4.3, 7.8]})
#%%
"""
Examples
"""
import pandas as pd
# Penn World Table
url = 'http://www.rug.nl/research/ggdc/data/pwt/v81/pwt81.xlsx'
pwt = pd.read_excel(url, sheetname='Data')
#%%
# World Economic Outlook
url1 = 'https://www.imf.org/external/pubs/ft/weo/'
url2 = '2015/02/weodata/WEOOct2015all.xls'
weo = pd.read_csv(url1+url2,
sep='\t', # \t = tab
thousands=',', # kill commas
na_values=['n/a', '--']) # missing values
#%%
# PISA test scores
url = 'http://dx.doi.org/10.1787/888932937035'
pisa = pd.read_excel(url,
skiprows=18, # skip the first 18 rows
skipfooter=7, # skip the last 7
parse_cols=[0,1,9,13], # select columns of interest
index_col=0, # set the index as the first column
header=[0,1] # set the variable names
)
#%%
pisa = pisa.dropna() # drop blank lines
pisa.columns = ['Math', 'Reading', 'Science'] # simplify variable names
pisa['Math'].plot(kind='barh', figsize=(5, 12))
#%%
# UN population data
url1 = 'http://esa.un.org/unpd/wpp/DVD/Files/'
url2 = '1_Indicators%20(Standard)/EXCEL_FILES/1_Population/'
url3 = 'WPP2015_POP_F07_1_POPULATION_BY_AGE_BOTH_SEXES.XLS'
url = url1 + url2 + url3
cols = [2, 4, 5] + list(range(6,28))
est = pd.read_excel(url, sheetname=0, skiprows=16, parse_cols=cols)
#%%
# income by colleage major
url1 = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/'
url2 = 'college-majors/recent-grads.csv'
url = url1 + url2
df538 = pd.read_csv(url)
df538 = df538.set_index("Major")
df538["Median"].plot(kind="barh", figsize=(5, 12))
#%%
# IMDb movies and parts
url = 'http://pages.stern.nyu.edu/~dbackus/Data/cast.csv'
cast = pd.read_csv(url, encoding='utf-8')
#%%
"""
APIs
"""
import pandas.io.data as web # package to access FRED
import datetime as dt # package to handle dates
start = dt.datetime(2010, 1, 1) # start date
codes = ['GDPC1', 'PCECC96'] # real GDP, real consumption
fred = web.DataReader(codes, 'fred', start)
fred = fred/1000 # convert trillions to billions
fred.plot()
#%%
# World Bank
from pandas.io import wb # World Bank api
var = ['NY.GDP.PCAP.PP.KD'] # GDP per capita
iso = ['USA', 'FRA', 'JPN', 'CHN', 'IND', 'BRA', 'MEX'] # country codes
year = 2013
wbdf = wb.download(indicator=var, country=iso, start=year, end=year)
#%%
wbdf = wbdf.reset_index(level='year', drop=True)
wbdf.plot(kind='barh')
#%%
# Fama-French equity returns
import pandas.io.data as web
ff = web.DataReader('F-F_Research_Data_factors', 'famafrench')[0]
ff.columns = ['xsm', 'smb', 'hml', 'rf'] # rename variables
#%%
"""
Review
"""
data = {'EG.ELC.ACCS.ZS': [53.2, 47.3, 85.4, 22.1], # access to elec (%)
'IT.CEL.SETS.P2': [153.8, 95.0, 130.6, 74.8], # cell contracts per 100
'IT.NET.USER.P2': [11.5, 12.9, 41.0, 13.5], # internet access (%)
'Country': ['Botswana', 'Namibia', 'South Africa', 'Zambia']}
af = pd.DataFrame(data)