Skip to content

Commit

Permalink
PERF: Rank categorical perf
Browse files Browse the repository at this point in the history
closes pandas-dev#15498

Author: Prasanjit Prakash <jeet@gmail.com>

Closes pandas-dev#15518 from ikilledthecat/rank_categorical_perf and squashes the following commits:

30b49b9 [Prasanjit Prakash] PERF: GH15498 - pep8 changes
ad38544 [Prasanjit Prakash] PERF: GH15498 - asv tests and whatsnew
1ebdb56 [Prasanjit Prakash]  PERF: categorical rank GH#15498
a67cd85 [Prasanjit Prakash] PERF: categorical rank GH#15498
81df7df [Prasanjit Prakash]  PERF: categorical rank GH#15498
45dd125 [Prasanjit Prakash]  PERF: categorical rank GH#15498
33249b3 [Prasanjit Prakash] PERF: categorical rank GH#15498
  • Loading branch information
jeet63 authored and AnkurDedania committed Mar 21, 2017
1 parent 95f545d commit 9732cd6
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 9 deletions.
34 changes: 34 additions & 0 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,37 @@ def time_value_counts_dropna(self):

def time_rendering(self):
str(self.sel)


class Categoricals3(object):
    """Timings for ``Series.rank()`` on two data sets.

    ``s1`` wraps the values of ``tm.makeCategoricalIndex`` (the benchmarks
    refer to it as "string" data) and ``s2`` wraps the output of
    ``np.random.randint``.  Each one is ranked as-is, after conversion to
    an unordered categorical, and after conversion to an ordered one.
    """

    goal_time = 0.2

    def setup(self):
        """Create the six series ranked by the ``time_rank_*`` methods."""
        n_rows = 100000
        n_categories = 100

        # Data set 1: values materialised from a generated categorical index.
        cat_index = tm.makeCategoricalIndex(n_rows, n_categories)
        self.s1 = Series(np.array(cat_index))
        self.s1_cat = self.s1.astype('category')
        self.s1_cat_ordered = self.s1.astype('category', ordered=True)

        # Data set 2: integers drawn from ``np.random.randint``, with
        # ``n_categories`` as the upper bound.
        random_ints = np.random.randint(0, n_categories, size=n_rows)
        self.s2 = Series(random_ints)
        self.s2_cat = self.s2.astype('category')
        self.s2_cat_ordered = self.s2.astype('category', ordered=True)

    # --- data set 1 ("string") -------------------------------------------

    def time_rank_string(self):
        """Rank the non-categorical ``s1`` series."""
        self.s1.rank()

    def time_rank_string_cat(self):
        """Rank ``s1`` as an unordered categorical."""
        self.s1_cat.rank()

    def time_rank_string_cat_ordered(self):
        """Rank ``s1`` as an ordered categorical."""
        self.s1_cat_ordered.rank()

    # --- data set 2 ("int") ----------------------------------------------

    def time_rank_int(self):
        """Rank the non-categorical ``s2`` series."""
        self.s2.rank()

    def time_rank_int_cat(self):
        """Rank ``s2`` as an unordered categorical."""
        self.s2_cat.rank()

    def time_rank_int_cat_ordered(self):
        """Rank ``s2`` as an ordered categorical."""
        self.s2_cat_ordered.rank()
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,7 @@ Performance Improvements
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
- Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`)
- When reading a buffer object in the ``read_sas()`` method without a specified format, the filepath string is inferred rather than the buffer object. (:issue:`14947`)
- Improved performance of ``rank()`` for categorical data (:issue:`15498`)



Expand Down
1 change: 1 addition & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -992,6 +992,7 @@ def _get_data_algo(values, func_map):
elif is_unsigned_integer_dtype(values):
f = func_map['uint64']
values = _ensure_uint64(values)

else:
values = _ensure_object(values)

Expand Down
9 changes: 8 additions & 1 deletion pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1416,14 +1416,21 @@ def _values_for_rank(self):
numpy array
"""
from pandas import Series
if self.ordered:
values = self.codes
mask = values == -1
if mask.any():
values = values.astype('float64')
values[mask] = np.nan
else:
elif self.categories.is_numeric():
values = np.array(self)
else:
# reorder the categories (so rank can use the float codes)
# instead of passing an object array to rank
values = np.array(
self.rename_categories(Series(self.categories).rank())
)
return values

def order(self, inplace=False, ascending=True, na_position='last'):
Expand Down
33 changes: 25 additions & 8 deletions pandas/tests/series/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1065,8 +1065,10 @@ def test_rank_categorical(self):
exp_desc = pd.Series([6., 5., 4., 3., 2., 1.])
ordered = pd.Series(
['first', 'second', 'third', 'fourth', 'fifth', 'sixth']
).astype('category', ).cat.set_categories(
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
).astype(
'category',
categories=['first', 'second', 'third',
'fourth', 'fifth', 'sixth'],
ordered=True
)
assert_series_equal(ordered.rank(), exp)
Expand All @@ -1075,19 +1077,33 @@ def test_rank_categorical(self):
# Unordered categoricals should be ranked as objects
unordered = pd.Series(
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
).astype('category').cat.set_categories(
['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
).astype(
'category',
categories=['first', 'second', 'third',
'fourth', 'fifth', 'sixth'],
ordered=False
)
exp_unordered = pd.Series([2., 4., 6., 3., 1., 5.])
res = unordered.rank()
assert_series_equal(res, exp_unordered)

unordered1 = pd.Series(
[1, 2, 3, 4, 5, 6],
).astype(
'category',
categories=[1, 2, 3, 4, 5, 6],
ordered=False
)
exp_unordered1 = pd.Series([1., 2., 3., 4., 5., 6.])
res1 = unordered1.rank()
assert_series_equal(res1, exp_unordered1)

# Test na_option for rank data
na_ser = pd.Series(
['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN]
).astype('category', ).cat.set_categories(
[
).astype(
'category',
categories=[
'first', 'second', 'third', 'fourth',
'fifth', 'sixth', 'seventh'
],
Expand Down Expand Up @@ -1123,8 +1139,9 @@ def test_rank_categorical(self):
# Test with pct=True
na_ser = pd.Series(
['first', 'second', 'third', 'fourth', np.NaN],
).astype('category').cat.set_categories(
['first', 'second', 'third', 'fourth'],
).astype(
'category',
categories=['first', 'second', 'third', 'fourth'],
ordered=True
)
exp_top = pd.Series([0.4, 0.6, 0.8, 1., 0.2])
Expand Down

0 comments on commit 9732cd6

Please sign in to comment.