## Anomaly detection

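This notebook is an attempt to perform [anomaly detection][] on contract data: it models the hourly prices within each education level and experience bucket as a Gaussian distribution, so that prices with a very low probability under that model can be flagged as anomalies.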

Note that unlike other notebooks in this repository, this one requires Anaconda and runs on Python 3.6.

[anomaly detection]: https://www.coursera.org/learn/machine-learning/lecture/C8IJp/algorithm

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

sns.set(color_codes=True)

%matplotlib inline

In [2]:
# Experience (in years) is capped at MAX_XP and grouped into buckets that are
# XP_BUCKET_SIZE years wide; the capped value itself lands in the last bucket,
# hence the "+ 1".
MAX_XP = 20.0
XP_BUCKET_SIZE = 5
NUM_XP_BUCKETS = int(MAX_XP / XP_BUCKET_SIZE + 1)

# Maps the CSV column names we load to the short names used in this notebook.
COLUMN_NAMES = {
    'Labor Category': 'name',
    'CurrentYearPricing': 'price',
    'Education': 'edu',
    'MinExpAct': 'minxp',
    'Schedule': 'schedule',
}

# Load, rename, drop incomplete rows, cap experience and bucket it, all as a
# single chain so that re-running this cell always starts from the CSV.
hourly_prices = (
    pd.read_csv(
        '../data/hourly_prices.csv',
        index_col=False,
        thousands=',',
        usecols=list(COLUMN_NAMES),
    )
    .rename(columns=COLUMN_NAMES)
    .dropna(axis=0, how='any')
    .assign(minxp=lambda df: df.minxp.clip(upper=MAX_XP))
    .assign(xp_bucket=lambda df: (df.minxp / XP_BUCKET_SIZE).astype(int))
)

hourly_prices.head(10)

Unnamed: 0,name,edu,minxp,schedule,price,xp_bucket
0,Project Manager,Bachelors,8.0,MOBIS,1013.38,1
1,Program Coordinator,Bachelors,1.0,MOBIS,992.56,0
3,Administrative Support,Bachelors,1.0,MOBIS,981.44,0
4,Associate,Masters,6.0,MOBIS,990.82,1
9,"AcquCenter Customer Site Installation, Additio...",,0.0,Consolidated,952.14,0
10,AcquTrak Remote Access Archival Functions Onlu...,,0.0,Consolidated,952.14,0
11,AcquTrak Remote Access Pre-Source Selection Ph...,,0.0,Consolidated,952.14,0
12,AcquTrak Remote Access Source Selection/Post A...,,0.0,Consolidated,952.14,0
13,Producer,High School,7.0,Consolidated,941.58,1
14,Video Editor,High School,7.0,Consolidated,910.19,1


In [3]:
# Names of the education levels, as they are spelled in the `edu` column.
NONE, HIGH_SCHOOL, ASSOCIATES = 'None', 'High School', 'Associates'
BACHELORS, MASTERS, PHD = 'Bachelors', 'Masters', 'Ph.D.'

# All education levels, in the order they are listed above.
EDU_LEVELS = [
    NONE,
    HIGH_SCHOOL,
    ASSOCIATES,
    BACHELORS,
    MASTERS,
    PHD,
]

def filtr(rows=None, edu=None, xp_bucket=None, minxp=None, schedule=None):
    """
    Return the rows that satisfy every criterion that is not None.

    Filters are applied in the order edu, xp_bucket, minxp, schedule, each
    one narrowing the result of the previous one.

    Parameters:
        rows: DataFrame to filter; must have the columns used by whichever
            criteria are given (`edu`, `xp_bucket`, `minxp`, `schedule`).
            Defaults to the notebook's current `hourly_prices`.
        edu: keep only rows whose `edu` equals this value.
        xp_bucket: keep only rows whose `xp_bucket` equals this value.
        minxp: keep only rows whose `minxp` is at least this value.
        schedule: keep only rows whose `schedule` equals this value.

    Raises:
        ValueError: if no row has the given `edu`; if `xp_bucket` is outside
            `range(NUM_XP_BUCKETS)`; or if no row is left after the
            `schedule` filter. The schedule check runs after the other
            filters, so it also fires when the schedule exists in `rows`
            but not in combination with the other criteria.
    """
    # Resolve the default at call time. A `rows=hourly_prices` default would
    # be evaluated only once, when this cell runs, and would keep pointing at
    # a stale frame if `hourly_prices` is rebound later (e.g. by re-running
    # the loading cell).
    r = hourly_prices if rows is None else rows
    if edu is not None:
        r = r[r.edu == edu]
        if r.empty:
            raise ValueError(f"invalid edu: {edu}")
    if xp_bucket is not None:
        if xp_bucket < 0 or xp_bucket >= NUM_XP_BUCKETS:
            raise ValueError(f"invalid xp_bucket: {xp_bucket}")
        r = r[r.xp_bucket == xp_bucket]
    if minxp is not None:
        r = r[r.minxp >= minxp]
    if schedule is not None:
        r = r[r.schedule == schedule]
        if r.empty:
            raise ValueError(f"invalid schedule: {schedule}")
    return r


In [4]:
# TODO: Can we transform the dataset to make it more of a normal distribution?
# sns.distplot(filtr(edu=HIGH_SCHOOL, xp_bucket=0).price)

In [5]:
# sns.distplot(filtr(edu=PHD).price)

In [6]:
# https://pandas.pydata.org/pandas-docs/stable/advanced.html
# Full Cartesian product of education levels and experience buckets, used to
# index the per-group aggregates below.
# See https://pandas.pydata.org/pandas-docs/stable/advanced.html
edu_xp_index = pd.MultiIndex.from_product(
    [EDU_LEVELS, range(NUM_XP_BUCKETS)],
    names=['edu', 'xp_bucket'],
)

In [7]:
def edu_xp_agg(func):
    """
    Aggregate prices with `func` for every entry of `edu_xp_index`.

    Returns a list with one aggregated value per (edu, xp_bucket) pair, in
    the same order as `edu_xp_index`. `func` is passed straight to
    `Series.agg`, e.g. 'mean' or 'std'.
    """
    def aggregate_group(key):
        # Each index entry is an (edu, xp_bucket) tuple.
        edu, xp_bucket = key
        return filtr(edu=edu, xp_bucket=xp_bucket).price.agg(func)

    return list(edu_xp_index.map(aggregate_group))

In [8]:
# Mean and standard deviation of the price for each (edu, xp_bucket) pair.
edu_xp_aggs = pd.DataFrame(
    {
        'mean': edu_xp_agg('mean'),
        'std': edu_xp_agg('std'),
    },
    index=edu_xp_index,
)

edu_xp_aggs

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
edu,xp_bucket,Unnamed: 2_level_1,Unnamed: 3_level_1
,0,75.316285,117.029634
,1,78.572105,32.11644
,2,101.463333,28.809071
,3,138.421304,57.215975
,4,167.786,58.796919
High School,0,51.273426,25.665587
High School,1,73.448609,42.439284
High School,2,96.704766,39.218168
High School,3,124.680714,51.678122
High School,4,131.720052,49.826468


In [9]:
def gaussian(values, stds, means):
    """
    Evaluate the Gaussian (normal) probability density at `values`.

    `stds` and `means` are the standard deviations and means of the
    distributions. All arguments are combined elementwise with numpy
    operations, so scalars and array-likes can be passed.
    """
    normalizer = 1 / ((np.sqrt(2 * np.pi)) * stds)
    exponent = -np.square(values - means) / (2 * np.square(stds))
    return normalizer * np.exp(exponent)

def probabilities():
    """
    Return the Gaussian density of each price in `hourly_prices`.

    Every row is evaluated against the mean and standard deviation that
    `edu_xp_aggs` holds for the row's (edu, xp_bucket) pair.
    """
    # Left join so that every row of hourly_prices is kept.
    with_stats = hourly_prices.merge(
        edu_xp_aggs,
        how='left',
        left_on=['edu', 'xp_bucket'],
        right_index=True,
    )
    group_stds = with_stats['std']
    group_means = with_stats['mean']
    return gaussian(hourly_prices.price, group_stds, group_means)

In [10]:
# hourly_prices[probabilities() < 0.00001]