In [11]:
import numpy as np
import pandas as pd
import statsmodels
# Toy frame: three labelled groups (a, b, c) with a handful of integer readings each.
data_1 = pd.DataFrame(
    {
        "group": ["a"] * 3 + ["b"] * 4 + ["c"] * 3,
        "data": [4, 2, 1, 32, 25, 45, 4, 2, 7, 1],
    }
)
data_1

Unnamed: 0,group,data
0,a,4
1,a,2
2,a,1
3,b,32
4,b,25
5,b,45
6,b,4
7,c,2
8,c,7
9,c,1


In [5]:
# Order rows by group (descending) and, inside each group, by value (ascending).
# The result is reassigned instead of passing inplace=True: the sorted frame is
# the same, but the call stays chainable and the mutation is explicit.
data_1 = data_1.sort_values(by=["group", "data"], ascending=[False, True])
data_1

Unnamed: 0,group,data
9,c,1
7,c,2
8,c,7
6,b,4
4,b,25
3,b,32
5,b,45
2,a,1
1,a,2
0,a,4


In [4]:
# Toy frame with repeated keys; rows 5 and 6 are exact duplicates of each other.
data_2 = pd.DataFrame(
    {
        "k1": ["one", "one", "one", "two", "two", "two", "two"],
        "k2": [3, 2, 1, 1, 2, 4, 4],
    }
)
data_2

Unnamed: 0,k1,k2
0,one,3
1,one,2
2,one,1
3,two,1
4,two,2
5,two,4
6,two,4


In [6]:
# Show the rows ordered by the k2 column (ascending); data_2 itself is not modified.
data_2.sort_values(by=["k2"])

Unnamed: 0,k1,k2
2,one,1
3,two,1
1,one,2
4,two,2
0,one,3
5,two,4
6,two,4


In [7]:
# Drop rows that repeat an earlier row in every column, keeping the first occurrence.
data_2.drop_duplicates(keep="first")

Unnamed: 0,k1,k2
0,one,3
1,one,2
2,one,1
3,two,1
4,two,2
5,two,4


In [9]:
# Only k2 decides what counts as a duplicate here; the first row per k2 value is kept.
data_2.drop_duplicates(subset=["k2"])

Unnamed: 0,k1,k2
0,one,3
1,one,2
2,one,1
5,two,4


In [8]:
# Toy frame: food codes (category letter + number) paired with an integer value.
data_3 = pd.DataFrame(
    {
        "food": ["A1", "A2", "B1", "B2", "B3", "C1", "C2"],
        "data": list(range(1, 8)),
    }
)
data_3

Unnamed: 0,data,food
0,1,A1
1,2,A2
2,3,B1
3,4,B2
4,5,B3
5,6,C1
6,7,C2


In [9]:
# Lookup table from each food code to its category letter.
food_dict = dict(
    A1="A", A2="A",
    B1="B", B2="B", B3="B",
    C1="C", C2="C",
)
# Series.map translates every code through the table; the result is stored
# as a new column on data_3.
data_3["food_1"] = data_3["food"].map(food_dict)
data_3

Unnamed: 0,data,food,food_1
0,1,A1,A
1,2,A2,A
2,3,B1,B
3,4,B2,B
4,5,B3,B
5,6,C1,C
6,7,C2,C


In [None]:
def detect_peaks(x, mph=None, mpd=3, threshold=0, edge=None,
                 kpsh=False, valley=False, show=False, ax=None):
    """Detect peaks in data based on their amplitude and other features.

    Parameters
    ----------
    x : 1D array_like
        data. It is copied into a float64 array, so the caller's object is
        never modified.
    mph : {None, number}, optional (default = None)
        minimum peak height: keep only the peaks whose value in `x` is
        greater than or equal to `mph`. With `valley=True` the comparison is
        still made against the original (non-negated) values.
    mpd : positive integer, optional (default = 3)
        minimum peak distance (in number of data): a peak is discarded when
        a higher peak lies within `mpd` samples of it. Only applied when
        `mpd > 1`.
    threshold : positive number, optional (default = 0)
        detect peaks (valleys) that are greater (smaller) than `threshold`
        in relation to their immediate neighbors.
    edge : {None, 'rising', 'falling', 'both'}, optional (default = None)
        for a flat peak, keep only the rising edge ('rising'), only the
        falling edge ('falling'), both edges ('both'), or don't detect a
        flat peak (None).
    kpsh : bool, optional (default = False)
        keep peaks with same height even if they are closer than `mpd`.
    valley : bool, optional (default = False)
        if True (1), detect valleys (local minima) instead of peaks.
    show : bool, optional (default = False)
        if True (1), plot the data and the detected peaks by calling the
        `_plot` helper. `_plot` is not defined inside this function and
        must be available in the enclosing namespace.
    ax : a matplotlib.axes.Axes instance, optional (default = None).
        forwarded unchanged to `_plot`.

    Returns
    -------
    ind : 1D numpy array of int
        indices of the peaks in `x`, in increasing order.

    Notes
    -----
    The detection of valleys instead of peaks is performed internally by
    simply negating the data: `ind_valleys = detect_peaks(-x)`

    The function can handle NaN's: a NaN, and the samples immediately next
    to a NaN, are never reported as peaks.

    The first and the last sample of `x` are never reported as peaks.

    See this IPython Notebook [1]_.

    References
    ----------
    .. [1] http://nbviewer.ipython.org/github/demotu/BMC/blob/master/notebooks/DetectPeaks.ipynb

    Examples
    --------
    >>> import numpy as np
    >>> x = np.random.randn(100)
    >>> x[60:81] = np.nan
    >>> # detect all peaks
    >>> ind = detect_peaks(x)

    >>> x = np.sin(2*np.pi*5*np.linspace(0, 1, 200)) + np.random.randn(200)/5
    >>> # set minimum peak height = 0 and minimum peak distance = 20
    >>> ind = detect_peaks(x, mph=0, mpd=20)

    >>> # set minimum peak distance = 2
    >>> detect_peaks([0, 1, 0, 2, 0, 3, 0, 2, 0, 1, 0], mpd=2)
    array([1, 5, 9])

    >>> # detect both edges of the flat peaks
    >>> detect_peaks([0, 1, 1, 0, 1, 1, 0], edge='both', mpd=1)
    array([1, 2, 4, 5])

    >>> # set threshold = 2
    >>> detect_peaks([-2, 1, -2, 2, 1, 1, 3, 0], threshold=2, mpd=1)
    array([1, 6])
    """

    # astype() returns a copy, so the NaN patching below never touches the input
    x = np.atleast_1d(x).astype('float64')
    if x.size < 3:
        return np.array([], dtype=int)
    if valley:
        x = -x
    # first difference; its sign changes mark the candidate peaks
    dx = x[1:] - x[:-1]
    # handle NaN's: replace them by +inf so that the comparisons below are
    # well defined (the affected indices are filtered out afterwards)
    indnan = np.where(np.isnan(x))[0]
    if indnan.size:
        x[indnan] = np.inf
        dx[np.where(np.isnan(dx))[0]] = np.inf
    # slope_after[i] = x[i+1] - x[i] and slope_before[i] = x[i] - x[i-1],
    # zero-padded so that both arrays line up with x
    slope_after = np.hstack((dx, 0))
    slope_before = np.hstack((0, dx))
    ine, ire, ife = np.array([[], [], []], dtype=int)
    if not edge:
        # strict peak: rising before and falling after
        ine = np.where((slope_after < 0) & (slope_before > 0))[0]
    else:
        edge_kind = edge.lower()
        if edge_kind in ('rising', 'both'):
            # first sample of a plateau
            ire = np.where((slope_after <= 0) & (slope_before > 0))[0]
        if edge_kind in ('falling', 'both'):
            # last sample of a plateau
            ife = np.where((slope_after < 0) & (slope_before >= 0))[0]
    ind = np.unique(np.hstack((ine, ire, ife)))
    # handle NaN's
    if ind.size and indnan.size:
        # NaN's and values close to NaN's cannot be peaks
        near_nan = np.unique(np.hstack((indnan, indnan - 1, indnan + 1)))
        # np.isin supersedes np.in1d, which is deprecated in NumPy 2.x
        ind = ind[np.isin(ind, near_nan, invert=True)]
    # first and last values of x cannot be peaks
    if ind.size and ind[0] == 0:
        ind = ind[1:]
    if ind.size and ind[-1] == x.size - 1:
        ind = ind[:-1]

    # remove peaks < minimum peak height
    if ind.size and mph is not None:
        if valley:
            # x was negated above; negate back to compare original values
            ind = ind[-x[ind] >= mph]
        else:
            ind = ind[x[ind] >= mph]
    # remove peaks - neighbors < threshold
    if ind.size and threshold > 0:
        # smallest rise of each peak over its two immediate neighbours
        min_rise = np.min(np.vstack([x[ind] - x[ind - 1], x[ind] - x[ind + 1]]), axis=0)
        ind = np.delete(ind, np.where(min_rise < threshold)[0])
    # detect small peaks closer than minimum peak distance
    if ind.size and mpd > 1:
        ind = ind[np.argsort(x[ind])][::-1]  # sort ind by peak height
        idel = np.zeros(ind.size, dtype=bool)
        for i in range(ind.size):
            if not idel[i]:
                # keep peaks with the same height if kpsh is True
                idel = idel | (ind >= ind[i] - mpd) & (ind <= ind[i] + mpd) \
                       & (x[ind[i]] > x[ind] if kpsh else True)
                idel[i] = 0  # Keep current peak
        # remove the small peaks and sort back the indices by their occurrence
        ind = np.sort(ind[~idel])

    if show:
        # restore the original data (NaN's and sign) before plotting
        if indnan.size:
            x[indnan] = np.nan
        if valley:
            x = -x
        _plot(x, mph, mpd, threshold, edge, valley, ax, ind)

    return ind