# Analyse Support Tickets
author: Fabian Peschke<br>
created_at: 03.05.2023

In [22]:
from typing import List, Tuple, Dict

In [23]:
DATA_PATH: str = "./data/bitext_free_dataset.csv"

TEXT_COL: str = "utterance"
DATE_COL: str = "created_at"

## Step 1: Download the data
https://www.bitext.com/blog/free-customer-support-dataset/

## Step 2: Data ingestion

In [24]:
import pandas as pd
from pandas import DataFrame

In [25]:
raw_df: DataFrame = pd.read_csv(DATA_PATH)

Delete useless column

In [26]:
del raw_df["flags"]

shuffle dataset

In [27]:
raw_df = raw_df.sample(frac=1).reset_index(drop=True)

## Step 2.1: Add random dates

In [28]:
from datetime import datetime
from datetime import timedelta
import random

In [29]:
def generate_random_date(start_date: datetime, end_date: datetime, step: timedelta) -> datetime:
    """Generate a random date between start_date and end_date"""
    delta: timedelta = end_date - start_date
    offset: timedelta = timedelta(days=random.randint(0, delta.days))
    new_date: datetime = start_date + offset
    for _ in range(random.randint(0, int(timedelta(hours=24) / step))):
        new_date += step
    return new_date

In [30]:
start = datetime(year=2020, month=1, day=1)
end = datetime(year=2020, month=3, day=31)
step = timedelta(minutes=15) # every 15 minutes

In [31]:
raw_df[DATE_COL] = raw_df[TEXT_COL].apply(lambda x: generate_random_date(start, end, step))

## Step 3: Preprocess data

In [32]:
import nltk
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import contractions

In [33]:
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package wordnet to /Users/fabian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/fabian/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

Define usefull cleaning functions

In [34]:
lemmatizer = WordNetLemmatizer()
TAGS: Dict[str, str] = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV,
}


def unicode(sentence: str) -> str:
    """Apply casefold on str"""
    return sentence.casefold()


def tag_words(words: List[str]) -> List[Tuple[str, str]]:
    """Tag word type"""
    tagged_words: List[Tuple[str, str]] = pos_tag(words)
    return list(map(lambda word: (word[0], TAGS.get(word[1][0], wordnet.NOUN)), tagged_words))


def lemmatize(words: List[str]) -> List[str]:
    """Lemmatize all words sentence"""
    word_tags = tag_words(words)
    return [lemmatizer.lemmatize(word, pos=tag) for word, tag in word_tags]

In [35]:
decoded_df = lemmatized_df = const_fixed_df = raw_df
decoded_df[TEXT_COL] = raw_df[TEXT_COL].apply(unicode) # casefold + lowercase
const_fixed_df[TEXT_COL] = decoded_df[TEXT_COL].apply(contractions.fix) # casefold + lowercase
lemmatized_df[TEXT_COL] = decoded_df[TEXT_COL].apply(lambda sentence: " ".join(lemmatize(word_tokenize(sentence)))) # lemmatize

In [36]:
cleaned_df = lemmatized_df

In [37]:
cleaned_df.to_csv("preprocessed.csv")

## Step 4: Find Topics in Data
Find the topics using BERTopic and convert the topic names using OpenAI-GPT3

In [38]:
from bertopic import BERTopic

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [39]:
topic_model = BERTopic(
    nr_topics="auto",
    verbose=True
)

In [40]:
topics, probs = topic_model.fit_transform(cleaned_df[TEXT_COL])

Batches: 100%|██████████| 673/673 [01:33<00:00,  7.20it/s]
2023-05-05 23:49:09,426 - BERTopic - Transformed documents to Embeddings
2023-05-05 23:49:46,220 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2023-05-05 23:49:48,127 - BERTopic - Clustered reduced embeddings
2023-05-05 23:50:04,821 - BERTopic - Reduced number of topics from 720 to 217


In [41]:
topic_model.save("ticket_topic.model")

Visulize model perfomance

In [42]:
topic_model.visualize_barchart()

In [43]:
topic_model.visualize_term_rank()

In [44]:
topic_model.visualize_term_rank(log_scale=True)

## Step 5: Prepare for anomaly detection

In [45]:
to_clean_df = cleaned_df
del to_clean_df[TEXT_COL]
del to_clean_df["intent"]
del to_clean_df["category"]

In [46]:
GROUP: str = "group"

In [47]:
to_clean_df[GROUP] = topics

Count the occurence

In [48]:
counted_df = to_clean_df.groupby([GROUP, DATE_COL]).count().reset_index().assign(COUNT=lambda x: x.iloc[:,0])

In [49]:
duplicates_df = to_clean_df.groupby([GROUP, DATE_COL]).apply(lambda x: x.duplicated().sum()).reset_index(name='DUPLICATES')

In [50]:
duplicates_df["DUPLICATES"] = duplicates_df["DUPLICATES"].apply(lambda count: count+1)

Fill missing values

In [51]:
def fill_missing_dates(df: DataFrame, date_col: str, freq: str="15min") -> DataFrame:
    """Fill missing dates of group"""
    min_date = df[date_col].min()
    max_date = df[date_col].max()
    new_dates = pd.date_range(min_date, max_date, freq=freq)
    all_dates = pd.DataFrame({date_col: new_dates})
    
    # merge the new dataframe with the original df
    df = pd.merge(df, all_dates, on=date_col, how='outer')
    
    # sort by date_col and fill missing values with new dates
    df = df.sort_values(by=date_col)
    df[date_col] = df[date_col].fillna(method='ffill')
    df = df.ffill()
    return df

In [52]:
filled_df: DataFrame = duplicates_df.groupby(GROUP).apply(lambda df: fill_missing_dates(df, DATE_COL))

## Step 6: Anomaly Detection

Convert df for tad

In [53]:
filled_df = filled_df.set_index(DATE_COL)

Find anomalies

In [63]:
import numpy as np
import scipy as sp
import pandas as pd
import datetime
import statsmodels.api as sm
import logging


logger = logging.getLogger(__name__)


def _handle_granularity_error(level):
    """
    Raises ValueError with detailed error message if one of the two situations is true:
      1. calculated granularity is less than minute (sec or ms)
      2. resampling is not enabled for situations where calculated granularity < min
      level : String
        the granularity that is below the min threshold
    """
    e_message = '%s granularity is not supported. Ensure granularity => minute or enable resampling' % level
    raise ValueError(e_message)


def _resample_to_min(data, period_override=None):
    """
    Resamples a data set to the min level of granularity
      data : pandas DataFrame
        input Pandas DataFrame
      period_override : int
        indicates whether resampling should be done with overridden value instead of min (1440)
    """
    data = data.resample('60s', label='right').sum()
    if _override_period(period_override):
        period = period_override
    else:
        period = 1440
    return (data, period)


def _override_period(period_override):
    """
    Indicates whether period can be overridden if the period derived from granularity does
    not match the generated period.
      period_override : int
        the user-specified period that overrides the value calculated from granularity
    """
    return period_override is not None


def _get_period(gran_period, period_arg=None):
    """
    Returns the generated period or overridden period depending upon the period_arg
      gran_period : int
        the period generated from the granularity
      period_arg : the period override value that is either None or an int
        the period to override the period generated from granularity
    """
    if _override_period(period_arg):
        return period_arg
    else:
        return gran_period


def _get_data_tuple(raw_data, period_override, resampling=False):
    """
    Generates a tuple consisting of processed input data, a calculated or overridden period, and granularity
      raw_data : pandas DataFrame
        input data
      period_override : int
        period specified in the anomaly_detect_ts parameter list, None if it is not provided
      resampling : True | False
        indicates whether the raw_data should be resampled to a supporting granularity, if applicable
    """
    data = raw_data.sort_index()
    timediff = _get_time_diff(data)

    if timediff.days > 0:
        period = _get_period(7, period_override)
        granularity = 'day'
    elif timediff.seconds / 60 / 60 >= 1:
        granularity = 'hr'
        period = _get_period(24, period_override)
    elif timediff.seconds / 60 >= 1:
        granularity = 'min'
        period = _get_period(1440, period_override)
    elif timediff.seconds > 0:
        granularity = 'sec'
    elif timediff.seconds > 0:
        granularity = 'sec'

        '''
           Aggregate data to minute level of granularity if data stream granularity is sec and
           resampling=True. If resampling=False, raise ValueError
        '''
        if resampling is True:
            period = _resample_to_min(data, period_override)
        else:
            _handle_granularity_error('sec')
    else:
        '''
           Aggregate data to minute level of granularity if data stream granularity is ms and
           resampling=True. If resampling=False, raise ValueError
        '''
        if resampling is True:
            data, period = _resample_to_min(data, period_override)
            granularity = None
        else:
            _handle_granularity_error('ms')

    return (data, period, granularity)


def _get_time_diff(data):
    """
    Generates the time difference used to determine granularity and
    to generate the period
      data : pandas DataFrame
        composed of input data
    """
    return data.index[1] - data.index[0]


def _get_max_anoms(data, max_anoms):
    """
    Returns the max_anoms parameter used for S-H-ESD time series anomaly detection
      data : pandas DataFrame
        composed of input data
      max_anoms : float
        the input max_anoms
    """
    if max_anoms == 0:
        logger.warning('0 max_anoms results in max_outliers being 0.')
    return 1 / data.size if max_anoms < 1 / data.size else max_anoms


def _process_long_term_data(data, period, granularity, piecewise_median_period_weeks):
    """
    Processes result set when longterm is set to true
      data : list of floats
        the result set of anoms
      period : int
        the calculated or overridden period value
      granularity : string
        the calculated or overridden granularity
      piecewise_median_period_weeks : int
        used to determine days and observations per period
    """
    # Pre-allocate list with size equal to the number of piecewise_median_period_weeks chunks in x + any left over chunk
    # handle edge cases for daily and single column data period lengths
    num_obs_in_period = period * piecewise_median_period_weeks + \
        1 if granularity == 'day' else period * 7 * piecewise_median_period_weeks
    num_days_in_period = (7 * piecewise_median_period_weeks) + \
        1 if granularity == 'day' else (7 * piecewise_median_period_weeks)

    all_data = []
    # Subset x into piecewise_median_period_weeks chunks
    for i in range(1, data.size + 1, num_obs_in_period):
        start_date = data.index[i]
        # if there is at least 14 days left, subset it, otherwise subset last_date - 14 days
        end_date = start_date + datetime.timedelta(days=num_days_in_period)
        if end_date < data.index[-1]:
            all_data.append(
                data.loc[lambda x: (x.index >= start_date) & (x.index <= end_date)])
        else:
            all_data.append(
                data.loc[lambda x: x.index >= data.index[-1] - datetime.timedelta(days=num_days_in_period)])
    return all_data


def _get_only_last_results(data, all_anoms, granularity, only_last):
    """
    Returns the results from the last day or hour only
      data : pandas DataFrame
        input data set
      all_anoms : list of floats
        all of the anomalies returned by the algorithm
      granularity : string day | hr | min
        The supported granularity value
      only_last : string day | hr
        The subset of anomalies to be returned
    """
    start_date = data.index[-1] - datetime.timedelta(days=7)
    start_anoms = data.index[-1] - datetime.timedelta(days=1)

    if only_last == 'hr':
        # We need to change start_date and start_anoms for the hourly only_last option
        start_date = datetime.datetime.combine(
            (data.index[-1] - datetime.timedelta(days=2)).date(), datetime.time.min)
        start_anoms = data.index[-1] - datetime.timedelta(hours=1)

    # subset the last days worth of data
    x_subset_single_day = data.loc[data.index > start_anoms]
    # When plotting anoms for the last day only we only show the previous weeks data
    x_subset_week = data.loc[lambda df: (
        df.index <= start_anoms) & (df.index > start_date)]
    return all_anoms.loc[all_anoms.index >= x_subset_single_day.index[0]]


def _get_plot_breaks(granularity, only_last):
    """
    Generates the breaks used in plotting
      granularity : string
        the supported granularity value
      only_last : True | False
        indicates whether only the last day or hour is returned and to be plotted
    """
    if granularity == 'day':
        breaks = 3 * 12
    elif only_last == 'day':
        breaks = 12
    else:
        breaks = 3
    return breaks


def _perform_threshold_filter(anoms, periodic_max, threshold):
    """
    Filters the list of anomalies per the threshold filter
      anoms : list of floats
        the anoms returned by the algorithm
      periodic_max : float
        calculated daily max value
      threshold : med_max" | "p95" | "p99"
        user-specified threshold value used to filter anoms
    """
    if threshold == 'med_max':
        thresh = periodic_max.median()
    elif threshold == 'p95':
        thresh = periodic_max.quantile(0.95)
    elif threshold == 'p99':
        thresh = periodic_max.quantile(0.99)
    else:
        raise AttributeError(
            'Invalid threshold, threshold options are None | med_max | p95 | p99')

    return anoms.loc[anoms.values >= thresh]


def _get_max_outliers(data, max_percent_anomalies):
    """
    Calculates the max_outliers for an input data set
      data : pandas DataFrame
        the input data set
      max_percent_anomalies : float
        the input maximum number of anomalies per percent of data set values
    """
    max_outliers = int(np.trunc(data.size * max_percent_anomalies))
    assert max_outliers, 'With longterm=True, AnomalyDetection splits the data into 2 week periods by default. You have {0} observations in a period, which is too few. Set a higher piecewise_median_period_weeks.'.format(
        data.size)
    return max_outliers


def _get_decomposed_data_tuple(data, num_obs_per_period):
    """
    Returns a tuple consisting of two versions of the input data set: seasonally-decomposed and smoothed
      data : pandas DataFrame
        the input data set
      num_obs_per_period : int
        the number of observations in each period
    """
    decomposed = sm.tsa.seasonal_decompose(
        data, period=num_obs_per_period, two_sided=False)
    smoothed = data - decomposed.resid.fillna(0)
    data = data - decomposed.seasonal - data.mean()
    return (data, smoothed)


def anomaly_detect_ts(x, max_anoms=0.1, direction="pos", alpha=0.05, only_last=None,
                      threshold=None, e_value=False, longterm=False, piecewise_median_period_weeks=2,
                      plot=False, y_log=False, xlabel="", ylabel="count", title='shesd output: ', verbose=False,
                      dropna=False, resampling=False, period_override=None):

    if verbose:
        logger.setLevel(logging.DEBUG)
        logger.debug("The debug logs will be logged because verbose=%s", verbose)

    # validation
    assert isinstance(x, pd.Series), 'Data must be a series(Pandas.Series)'
    assert x.values.dtype in [int, float], 'Values of the series must be number'
    assert x.index.dtype == np.dtype('datetime64[ns]'), 'Index of the series must be datetime'
    assert max_anoms <= 0.49 and max_anoms >= 0, 'max_anoms must be non-negative and less than 50% '
    assert direction in ['pos', 'neg', 'both'], 'direction options: pos | neg | both'
    assert only_last in [None, 'day', 'hr'], 'only_last options: None | day | hr'
    assert threshold in [None, 'med_max', 'p95', 'p99'], 'threshold options: None | med_max | p95 | p99'
    assert piecewise_median_period_weeks >= 2, 'piecewise_median_period_weeks must be greater than 2 weeks'
    logger.debug('Completed validation of input parameters')

    if alpha < 0.01 or alpha > 0.1:
        logger.warning('alpha is the statistical significance and is usually between 0.01 and 0.1')

    data, period, granularity = _get_data_tuple(x, period_override, resampling)
    if granularity is 'day':
        num_days_per_line = 7
        only_last = 'day' if only_last == 'hr' else only_last

    max_anoms = _get_max_anoms(data, max_anoms)

    # If longterm is enabled, break the data into subset data frames and store in all_data
    all_data = _process_long_term_data(data, period, granularity, piecewise_median_period_weeks) if longterm else [data] 
    all_anoms = pd.Series()
    seasonal_plus_trend = pd.Series()

    # Detect anomalies on all data (either entire data in one-pass, or in 2 week blocks if longterm=True)
    for series in all_data:
        shesd = _detect_anoms(series, k=max_anoms, alpha=alpha, num_obs_per_period=period, use_decomp=True,
                              use_esd=False, direction=direction, verbose=verbose)
        shesd_anoms = shesd['anoms']
        shesd_stl = shesd['stl']

        # -- Step 3: Use detected anomaly timestamps to extract the actual anomalies (timestamp and value) from the data
        anoms = pd.Series() if shesd_anoms.empty else series.loc[shesd_anoms.index]

        # Filter the anomalies using one of the thresholding functions if applicable
        if threshold:
            # Calculate daily max values
            periodic_max = data.resample('1D').max()
            anoms = _perform_threshold_filter(anoms, periodic_max, threshold)

        all_anoms = all_anoms.append(anoms)
        seasonal_plus_trend = seasonal_plus_trend.append(shesd_stl)

    # De-dupe
    all_anoms.drop_duplicates(inplace=True)
    seasonal_plus_trend.drop_duplicates(inplace=True)

    # If only_last is specified, create a subset of the data corresponding to the most recent day or hour
    if only_last:
        all_anoms = _get_only_last_results(
            data, all_anoms, granularity, only_last)

    # If there are no anoms, log it and return an empty anoms result
    if all_anoms.empty:
        if verbose:
            logger.info('No anomalies detected.')

        return {
            'anoms': pd.Series(),
            'plot': None
        }

    if plot:
        # TODO additional refactoring and logic needed to support plotting
        num_days_per_line
        #breaks = _get_plot_breaks(granularity, only_last)
        # x_subset_week
        raise Exception('TODO: Unsupported now')

    return {
        'anoms': all_anoms,
        'expected': seasonal_plus_trend if e_value else None,
        'plot': 'TODO' if plot else None
    }


def _detect_anoms(data, k=0.49, alpha=0.05, num_obs_per_period=None,
                  use_decomp=True, use_esd=False, direction="pos", verbose=False):
    """
    Detects anomalies in a time series using S-H-ESD.
    Args:
         data: Time series to perform anomaly detection on.
         k: Maximum number of anomalies that S-H-ESD will detect as a percentage of the data.
         alpha: The level of statistical significance with which to accept or reject anomalies.
         num_obs_per_period: Defines the number of observations in a single period, and used during seasonal decomposition.
         use_decomp: Use seasonal decomposition during anomaly detection.
         use_esd: Uses regular ESD instead of hybrid-ESD. Note hybrid-ESD is more statistically robust.
         one_tail: If TRUE only positive or negative going anomalies are detected depending on if upper_tail is TRUE or FALSE.
         upper_tail: If TRUE and one_tail is also TRUE, detect only positive going (right-tailed) anomalies. If FALSE and one_tail is TRUE, only detect negative (left-tailed) anomalies.
         verbose: Additionally printing for debugging.
    Returns:
       A list containing the anomalies (anoms) and decomposition components (stl).
    """

    # validation
    assert num_obs_per_period, "must supply period length for time series decomposition"
    assert direction in ['pos', 'neg',
                         'both'], 'direction options: pos | neg | both'
    assert data.size >= num_obs_per_period * \
        2, 'Anomaly detection needs at least 2 periods worth of data'
    assert data[data.isnull(
    )].empty, 'Data contains NA. We suggest replacing NA with interpolated values before detecting anomaly'

    # conversion
    one_tail = True if direction in ['pos', 'neg'] else False
    upper_tail = True if direction in ['pos', 'both'] else False

    # -- Step 1: Decompose data. This returns a univariate remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
    # Note: R use stl, but here we will use MA, the result may be different TODO.. Here need improvement
    #decomposed = sm.tsa.seasonal_decompose(data, freq=num_obs_per_period, two_sided=False)
    #smoothed = data - decomposed.resid.fillna(0)
    #data = data - decomposed.seasonal - data.mean()

    data, smoothed = _get_decomposed_data_tuple(data, num_obs_per_period)

    max_outliers = _get_max_outliers(data, k)

    R_idx = pd.Series()

    n = data.size
    # Compute test statistic until r=max_outliers values have been
    # removed from the sample.
    for i in range(1, max_outliers + 1):
        if verbose:
            logger.info(i, '/', max_outliers, ' completed')

        if not data.mad():
            break

        if not one_tail:
            ares = abs(data - data.median())
        elif upper_tail:
            ares = data - data.median()
        else:
            ares = data.median() - data

        ares = ares / data.mad()

        tmp_anom_index = ares[ares.values == ares.max()].index
        cand = pd.Series(data.loc[tmp_anom_index], index=tmp_anom_index)

        data.drop(tmp_anom_index, inplace=True)

        # Compute critical value.
        p = 1 - alpha / (n - i + 1) if one_tail else (1 -
                                                      alpha / (2 * (n - i + 1)))
        t = sp.stats.t.ppf(p, n - i - 1)
        lam = t * (n - i) / np.sqrt((n - i - 1 + t ** 2) * (n - i + 1))
        if ares.max() > lam:
            R_idx = R_idx.append(cand)

    return {
        'anoms': R_idx,
        'stl': smoothed
    }


"is" with a literal. Did you mean "=="?


"is" with a literal. Did you mean "=="?


"is" with a literal. Did you mean "=="?



In [65]:
filled_df.groupby(GROUP)["DUPLICATES"].apply(anomaly_detect_ts)

AttributeError: 'Series' object has no attribute 'mad'