-
Notifications
You must be signed in to change notification settings - Fork 210
/
stat_bin.py
149 lines (125 loc) · 4.84 KB
/
stat_bin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import numpy as np
import pandas as pd
import pandas.core.common as com
from six.moves import range, zip
from ..utils import seq, make_iterable_ntimes
from ..utils.exceptions import GgplotError, gg_warn
from ..scales.utils import fullseq
from .stat import stat
class stat_bin(stat):
    """
    Statistic that bins the ``x`` aesthetic.

    ``setup_params`` validates the inputs and picks a default number
    of bins when none of the binning parameters were given;
    ``compute_group`` hands the data of one group over to the
    module-level ``bin`` function.
    """

    REQUIRED_AES = {'x'}
    DEFAULT_PARAMS = {
        'geom': 'histogram',
        'position': 'stack',
        'width': 0.9,
        'drop': False,
        'right': False,
        'binwidth': None,
        'bins': None,
        'origin': None,
        'breaks': None,
    }
    DEFAULT_AES = {'y': '..count..', 'weight': None}
    CREATES = {'y', 'width'}

    def setup_params(self, data):
        """
        Validate the inputs and return the parameters to compute with.

        Raises ``GgplotError`` when a ``y`` is present in the data or
        the parameters, or when ``x`` has a signed-integer dtype.
        When ``breaks``, ``binwidth`` and ``bins`` are all ``None`` a
        copy of the parameters with ``bins = 30`` is returned and a
        warning is issued; otherwise ``self.params`` is returned as is.
        """
        settings = self.params

        y_present = 'y' in data or 'y' in settings
        if y_present:
            raise GgplotError(
                "stat_bin() must not be used with a y aesthetic.")

        if data['x'].dtype.kind == 'i':
            raise GgplotError(
                "stat_bin requires a continuous x variable "
                "the x variable is discrete. "
                "Perhaps you want stat='count'?")

        binning_keys = ('breaks', 'binwidth', 'bins')
        if not all(settings[key] is None for key in binning_keys):
            # The user chose how to bin; nothing to default
            return settings

        # Work on a copy so that self.params is left untouched
        settings = settings.copy()
        settings['bins'] = 30
        gg_warn("'stat_bin()' using 'bins = 30'. "
                "Pick better value with 'binwidth'.")
        return settings

    @classmethod
    def compute_group(cls, data, scales, **params):
        """
        Bin the ``x`` values of a single group.

        The dimension of the x scale is added to the parameters under
        the key ``'range'`` before they are forwarded to ``bin``.
        """
        x_limits = np.asarray(scales.x.dimension())
        params.update({'range': x_limits})
        values = data['x']
        weights = data.get('weight')
        return bin(values, weights, **params)
def bin(x, weight, **params):
    """
    Assign ``x`` to bins and compute the (weighted) count per bin.

    Parameters
    ----------
    x : array-like
        Values to bin.
    weight : array-like or None
        Weight of every value in ``x``. ``None`` means each value
        counts as one. NaN weights are counted as zero.
    **params : dict
        Must contain the keys ``breaks``, ``right``, ``origin``,
        ``range``, ``binwidth``, ``bins`` and ``width``.

        - ``breaks`` : bin edges, or ``None`` to derive them from
          ``range`` and ``binwidth`` (starting at ``origin`` when
          that is given).
        - ``right`` : passed on to ``pandas.cut``; whether bins are
          closed on the right.
        - ``bins`` / ``binwidth`` : when ``binwidth`` is ``None`` it
          is the extent of ``range`` divided by ``bins`` (30 when
          ``bins`` is ``None``).
        - ``width`` : width used for the result rows when ``x`` is
          not cut into bins (see below).

    Returns
    -------
    pandas.DataFrame
        One row per bin with the columns ``x``, ``count``, ``width``,
        ``density``, ``ncount`` and ``ndensity``.

    Raises
    ------
    GgplotError
        If ``x`` is neither of the default integer dtype, nor
        numeric, and ``range`` has a non-zero extent.

    Notes
    -----
    If ``x`` has the default integer dtype or ``range`` has zero
    extent, no breaks are used: every distinct value of ``x`` is its
    own bin and the result has one row per distinct (non-missing)
    value.
    """
    x = np.asarray(x)
    breaks = params['breaks']
    right = params['right']
    origin = params['origin']
    rangee = params['range']
    binwidth = params['binwidth']
    num_bins = params['bins']

    if num_bins is None:
        num_bins = 30

    if binwidth is None:
        binwidth = np.ptp(rangee) / num_bins

    # ``np.int`` was nothing but an alias of the builtin ``int`` and
    # has been removed from NumPy, so compare against ``int`` itself.
    if x.dtype == int or np.ptp(rangee) == 0:
        # Every distinct value is its own bin; x and width are
        # derived from the frequency table further down
        binned = False
        bins = x
    elif pd.api.types.is_numeric_dtype(x):
        binned = True
        if breaks is None:
            if origin is None:
                breaks = fullseq(rangee, binwidth, pad=True)
            else:
                breaks = seq(origin, np.max(rangee)+binwidth,
                             binwidth)
        breaks = np.asarray(breaks)
        fuzzybreaks = adjust_breaks(breaks, right)
        # Integer bin codes; values outside the breaks become NaN
        bins = pd.cut(x, bins=fuzzybreaks, labels=False,
                      right=right)
        width = np.diff(breaks)
        # Bin centres
        x = breaks[:-1] + width / 2
    else:
        # Proper scale training and mapping should never let
        # the code path get here. If there is a problem here,
        # something is probably wrong with the chosen scale
        raise GgplotError("Cannot recognise the type of x")

    # If weight is not supplied, use one (no weight)
    if weight is None:
        weight = np.ones(len(bins))
    else:
        weight = np.asarray(
            make_iterable_ntimes(weight, len(bins)))
        # np.where builds a new array, so the caller's weights are
        # never modified in place
        weight = np.where(np.isnan(weight), 0, weight)

    # Create a dataframe with two columns:
    #   - the bins to which each x is assigned
    #   - the weight of each x value
    # Then create a weighted frequency table. groupby().sum() gives
    # a Series indexed by bin (sorted, missing bins dropped), unlike
    # pivot_table which returns a DataFrame.
    df = pd.DataFrame({'bins': bins,
                       'weight': weight})
    wftable = df.groupby('bins')['weight'].sum()

    if binned:
        # Empty bins get no entry in the frequency table; add them
        # with a zero count and keep the bins in order
        wftable = wftable.reindex(np.arange(len(x)), fill_value=0)
    else:
        # One row per distinct value, aligned with the counts
        x = wftable.index.values
        width = make_iterable_ntimes(params['width'], len(x))

    count = wftable.tolist()

    res = pd.DataFrame({
        'x': x,
        'count': count,
        'width': width})

    # other computed stats
    res['density'] = (res['count'] / width) / res['count'].abs().sum()
    res['ncount'] = res['count'] / res['count'].abs().max()
    res['ndensity'] = res['density'] / res['density'].abs().max()

    return res
def adjust_breaks(breaks, right):
    """
    Shift the break points by a tiny amount.

    This protects the binning from floating point rounding errors.
    The amount is ``1e-07`` times the median distance between
    consecutive breaks.

    Parameters
    ----------
    breaks : array-like
        Break points.
    right : bool
        If true, the first break is moved down and all the others
        are moved up. Otherwise the last break is moved up and all
        the others are moved down.

    Returns
    -------
    numpy.ndarray
        The shifted break points.
    """
    eps = 1e-07 * np.median(np.diff(breaks))
    n_others = len(breaks) - 1

    if right:
        offsets = np.concatenate(
            ([-eps], np.repeat(eps, n_others)))
    else:
        offsets = np.concatenate(
            (np.repeat(-eps, n_others), [eps]))

    return breaks + offsets