/
hinc.py
68 lines (50 loc) · 1.46 KB
/
hinc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""This file contains code used in "Think Stats",
by Allen B. Downey, available from greenteapress.com
Copyright 2014 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
from __future__ import print_function
import numpy as np
import pandas
import thinkplot
import thinkstats2
def Clean(s):
    """Converts one token of an income label to a number.

    s: string, e.g. '$5,000', 'Under' or 'over'

    returns: int for a dollar amount (leading '$' and commas are ignored),
             0 for 'Under', np.inf for 'over', None for anything else
    """
    digits = s.lstrip('$').replace(',', '')
    try:
        return int(digits)
    except ValueError:
        # not a dollar amount; fall through to the keyword tokens
        pass

    keywords = {'Under': 0, 'over': np.inf}
    return keywords.get(s)
def ReadData(filename='hinc06.csv'):
    """Reads filename and returns the income distribution as a table.

    The first 9 lines of the file are skipped.  In every remaining row,
    column 0 is read as an income-range label and column 1 as a frequency
    written as a string with thousands separators (populations in
    thousands, per the original description of this function).

    filename: string

    returns: pandas DataFrame with one row per income range and columns
             income: upper bound of the range, taken from the last token
                     of the label (see Clean)
             freq: frequency of the range
             cumsum: running total of freq
             ps: cumsum divided by the total of all freqs
    """
    data = pandas.read_csv(filename, header=None, skiprows=9)

    res = []
    for label, freq in zip(data[0], data[1]):
        freq = int(freq.replace(',', ''))

        # the last token of the label is the upper bound of the range
        high = Clean(label.split()[-1])
        res.append((high, freq))

    df = pandas.DataFrame(res)

    # correct the first range
    # NOTE(review): presumably the first label is an exclusive bound
    # ("Under $X"), so its upper bound is X-1 -- confirm against the data
    df.loc[0, 0] -= 1

    # compute the cumulative sum of the freqs
    df[2] = df[1].cumsum()

    # normalize the cumulative freqs by the grand total, which is the last
    # cumulative value; this works for any number of rows rather than
    # assuming the table has exactly 42 of them
    total = df[2].iloc[-1]
    df[3] = df[2] / total

    # add column names
    df.columns = ['income', 'freq', 'cumsum', 'ps']
    return df
def main():
    """Reads the default data file and prints the resulting table."""
    print(ReadData())
# run main only when this file is executed as a script, not when imported
if __name__ == '__main__':
    main()