# Anomaly Detection

In [1]:
import  pandas as pd
import numpy as np
import scipy.stats as ss

In [2]:
pd.options.mode.chained_assignment = None

In [3]:
data = pd.read_excel(r"E:\0001_Studies\Undercutting\AnomalyDetection_CorrectDataset.xlsx")

In [4]:
data.shape

(13353, 34)

In [5]:
data.head(2)

Unnamed: 0,Distribution Channel,Division,Sold-To Party,Customer Group 3,Price List Type,Region Order,Plant,Material,Date,Sales,...,Gross,Gross.1,Sales cost,Sales cost.1,Net,Net.1,RtnsQty,RtnsQty.1,Rtns (cst),Rtns (cst).1
0,GT General Trade,PC Personal Care,2000001 N.CT. AGENCIES,006 GT,02 Super Stockist,TN TN,PYC1 CAVINKARE PVT LTD,KH05D8HWP02R KARTHIKA HERBAL POWDER 5.8 ...,10.07.2019,573752.23,...,5040.0,KG,272536.0,INR,4640.0,KG,0.0,CV,0.0,INR
1,GT General Trade,PC Personal Care,2000001 N.CT. AGENCIES,006 GT,02 Super Stockist,TN TN,PYC1 CAVINKARE PVT LTD,KH05D8HWP02R KARTHIKA HERBAL POWDER 5.8 ...,16.07.2019,215157.01,...,1890.0,KG,102201.0,INR,1740.0,KG,0.0,CV,0.0,INR


In [6]:
data.columns

Index(['Distribution Channel', 'Division', 'Sold-To Party', 'Customer Group 3',
       'Price List Type', 'Region Order', 'Plant', 'Material', 'Date', 'Sales',
       'Sales.1', 'Returns', 'Cred.Memos', 'Cred.Memos.1', 'CM: net 1',
       'CM: net 1.1', 'Net Value', 'Net Value.1', 'Tax', 'Tax.1', 'Bill. Qty',
       'Bill. Qty.1', 'CredMemQty', 'CredMemQty.1', 'Gross', 'Gross.1',
       'Sales cost', 'Sales cost.1', 'Net', 'Net.1', 'RtnsQty', 'RtnsQty.1',
       'Rtns (cst)', 'Rtns (cst).1'],
      dtype='object')

In [7]:
data["Date"] = pd.to_datetime(data["Date"],format='%d.%m.%Y')

In [8]:
temp                   =  data["Material"].str.split("       ",expand=True)
data["Prod_Code"]      = temp[0]
data["Prod_Name"]      = temp[1]
data["New date"]       = data["Date"].astype("datetime64[ns]")
data["Month"]          = data["New date"].dt.strftime('%B')

In [9]:
dist_data = data[["Sold-To Party","New date","Prod_Name","Bill. Qty"]]

In [10]:
dist_data.shape

(13353, 4)

In [11]:
dist_data.head(2)

Unnamed: 0,Sold-To Party,New date,Prod_Name,Bill. Qty
0,2000001 N.CT. AGENCIES,2019-07-10,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,400.0
1,2000001 N.CT. AGENCIES,2019-07-16,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,150.0


In [12]:
rmv_single_rows = dist_data.groupby(["Sold-To Party","Prod_Name"]).agg(Count = ("Prod_Name","count")).reset_index()
rmv_single_rows

Unnamed: 0,Sold-To Party,Prod_Name,Count
0,2000001 N.CT. AGENCIES,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,31
1,2000001 N.CT. AGENCIES,KARTHIKA HERBAL POWDER 50GM 180 PCS,10
2,2000001 N.CT. AGENCIES,KARTHIKA HERBAL POWDER CONT 180G 80PCS,19
3,2000001 N.CT. AGENCIES,MEERA ADVANCE HW POWDER 120GM CONT 60 PC,9
4,2000001 N.CT. AGENCIES,MEERA ADVANCE HW POWDER 5.6 GM 1350 PCS,11
...,...,...,...
2224,2007340 SVP AGENCIES,MEERA ADVANCE HW POWDER 5.6 GM 1350 PCS,1
2225,2007340 SVP AGENCIES,MEERA ADVANCE HW POWDER 80GM REFILL 50 P,1
2226,2007345 MAHESH AGENCIES,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,1
2227,2007345 MAHESH AGENCIES,KARTHIKA HERBAL POWDER CONT 180G 80PCS,1


In [13]:
len(rmv_single_rows[rmv_single_rows["Count"] == 1])

387

In [14]:
dist_data_clean = pd.merge(dist_data,rmv_single_rows,on = ["Sold-To Party","Prod_Name"],how = "left")
dist_data_clean = dist_data_clean[dist_data_clean["Count"] != 1]
dist_data_clean.shape

(12966, 5)

In [15]:
print("So totally {a} rows will be dropped from total of {b} rows. Because it has only 1 data".format(a = len(rmv_single_rows[rmv_single_rows["Count"] == 1]), b= len(dist_data)))

So totally 387 rows will be dropped from total of 13353 rows. Because it has only 1 data


In [16]:
dist_data_clean.head(2)

Unnamed: 0,Sold-To Party,New date,Prod_Name,Bill. Qty,Count
0,2000001 N.CT. AGENCIES,2019-07-10,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,400.0,31
1,2000001 N.CT. AGENCIES,2019-07-16,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,150.0,31


In [17]:
# dist_data_groupby1.to_excel(r"D:\Analytics\Undercutting\Sample data\Outfiles\dist_data_groupby1.xlsx",index = False)

In [17]:
dist_data_descriptive = dist_data_clean.groupby(["Sold-To Party","Prod_Name"]).agg(Mean = ("Bill. Qty","mean"), 
                                           stddev = ("Bill. Qty","std"),Median =("Bill. Qty","median")).reset_index()
dist_data_descriptive.isna().sum()

Sold-To Party    0
Prod_Name        0
Mean             0
stddev           0
Median           0
dtype: int64

In [18]:
dist_data_descriptive

Unnamed: 0,Sold-To Party,Prod_Name,Mean,stddev,Median
0,2000001 N.CT. AGENCIES,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,232.322581,233.914427,192.0
1,2000001 N.CT. AGENCIES,KARTHIKA HERBAL POWDER 50GM 180 PCS,6.100000,11.994906,2.0
2,2000001 N.CT. AGENCIES,KARTHIKA HERBAL POWDER CONT 180G 80PCS,3.789474,3.536774,3.0
3,2000001 N.CT. AGENCIES,MEERA ADVANCE HW POWDER 120GM CONT 60 PC,24.444444,32.152415,11.0
4,2000001 N.CT. AGENCIES,MEERA ADVANCE HW POWDER 5.6 GM 1350 PCS,73.909091,53.380623,60.0
...,...,...,...,...,...
1837,2007226 ADITHYAA ENTERPRISES,KARTHIKA HERBAL POWDER CONT 180G 80PCS,2.500000,2.121320,2.5
1838,2007226 ADITHYAA ENTERPRISES,MEERA ADVANCE HW POWDER 120GM CONT 60 PC,2.000000,0.000000,2.0
1839,2007226 ADITHYAA ENTERPRISES,MEERA ADVANCE HW POWDER 5.6 GM 1350 PCS,8.250000,4.787136,7.0
1840,2007226 ADITHYAA ENTERPRISES,MEERA ADVANCE HW POWDER 80GM REFILL 50 P,2.500000,1.914854,2.0


In [19]:
zscore_data = pd.merge(dist_data_clean, dist_data_descriptive, on = ["Sold-To Party","Prod_Name"], how = 'left')

In [20]:
# Test Data
zscore_data[(zscore_data["Sold-To Party"] == "2000001    N.CT. AGENCIES") & (zscore_data["Prod_Name"] == "KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS")].head(2)

Unnamed: 0,Sold-To Party,New date,Prod_Name,Bill. Qty,Count,Mean,stddev,Median
0,2000001 N.CT. AGENCIES,2019-07-10,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,400.0,31,232.322581,233.914427,192.0
1,2000001 N.CT. AGENCIES,2019-07-16,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,150.0,31,232.322581,233.914427,192.0


In [21]:
zscore_data.isna().sum()
# zscore_data.dropna(axis = 0, inplace = True)

Sold-To Party    0
New date         0
Prod_Name        0
Bill. Qty        0
Count            0
Mean             0
stddev           0
Median           0
dtype: int64

# Z Score Calculation

In [22]:
zscore_data['z-score']= (zscore_data['Bill. Qty'] - zscore_data['Mean'])/zscore_data['stddev']

In [23]:
# Test Data
zscore_data[(zscore_data["Sold-To Party"] == "2000001    N.CT. AGENCIES") & (zscore_data["Prod_Name"] == "KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS")].tail(2)

Unnamed: 0,Sold-To Party,New date,Prod_Name,Bill. Qty,Count,Mean,stddev,Median,z-score
76,2000001 N.CT. AGENCIES,2019-09-10,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,30.0,31,232.322581,233.914427,192.0,-0.864943
77,2000001 N.CT. AGENCIES,2019-09-30,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,439.0,31,232.322581,233.914427,192.0,0.88356


In [24]:
zscore_data.sort_values(by=['Sold-To Party','Prod_Name'], ascending=False,inplace=True)
zscore_data

Unnamed: 0,Sold-To Party,New date,Prod_Name,Bill. Qty,Count,Mean,stddev,Median,z-score
12964,2007255 SRI NARASIMHA AGENCIES,2019-08-31,MEERA ADVANCE HW POWDER 120GM CONT 60 PC,2.0,2,1.000000,1.414214,1.0,0.707107
12965,2007255 SRI NARASIMHA AGENCIES,2019-09-18,MEERA ADVANCE HW POWDER 120GM CONT 60 PC,0.0,2,1.000000,1.414214,1.0,-0.707107
12953,2007226 ADITHYAA ENTERPRISES,2019-08-28,MEERA ADVANCE HW POWDER 80GM REFILL 50 P,1.0,4,2.500000,1.914854,2.0,-0.783349
12954,2007226 ADITHYAA ENTERPRISES,2019-09-10,MEERA ADVANCE HW POWDER 80GM REFILL 50 P,1.0,4,2.500000,1.914854,2.0,-0.783349
12955,2007226 ADITHYAA ENTERPRISES,2019-09-17,MEERA ADVANCE HW POWDER 80GM REFILL 50 P,3.0,4,2.500000,1.914854,2.0,0.261116
...,...,...,...,...,...,...,...,...,...
73,2000001 N.CT. AGENCIES,2019-08-14,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,50.0,31,232.322581,233.914427,192.0,-0.779441
74,2000001 N.CT. AGENCIES,2019-09-03,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,104.0,31,232.322581,233.914427,192.0,-0.548588
75,2000001 N.CT. AGENCIES,2019-09-09,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,150.0,31,232.322581,233.914427,192.0,-0.351935
76,2000001 N.CT. AGENCIES,2019-09-10,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,30.0,31,232.322581,233.914427,192.0,-0.864943


In [26]:
zscore_data

Unnamed: 0,Sold-To Party,New date,Prod_Name,Bill. Qty,Count,Mean,stddev,z-score
12964,2007255 SRI NARASIMHA AGENCIES,2019-08-31,MEERA ADVANCE HW POWDER 120GM CONT 60 PC,2.0,2,1.000000,1.414214,0.707107
12965,2007255 SRI NARASIMHA AGENCIES,2019-09-18,MEERA ADVANCE HW POWDER 120GM CONT 60 PC,0.0,2,1.000000,1.414214,-0.707107
12953,2007226 ADITHYAA ENTERPRISES,2019-08-28,MEERA ADVANCE HW POWDER 80GM REFILL 50 P,1.0,4,2.500000,1.914854,-0.783349
12954,2007226 ADITHYAA ENTERPRISES,2019-09-10,MEERA ADVANCE HW POWDER 80GM REFILL 50 P,1.0,4,2.500000,1.914854,-0.783349
12955,2007226 ADITHYAA ENTERPRISES,2019-09-17,MEERA ADVANCE HW POWDER 80GM REFILL 50 P,3.0,4,2.500000,1.914854,0.261116
...,...,...,...,...,...,...,...,...
73,2000001 N.CT. AGENCIES,2019-08-14,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,50.0,31,232.322581,233.914427,-0.779441
74,2000001 N.CT. AGENCIES,2019-09-03,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,104.0,31,232.322581,233.914427,-0.548588
75,2000001 N.CT. AGENCIES,2019-09-09,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,150.0,31,232.322581,233.914427,-0.351935
76,2000001 N.CT. AGENCIES,2019-09-10,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,30.0,31,232.322581,233.914427,-0.864943


In [25]:
zscore_data['MAD Bill. Qty'] = np.absolute(zscore_data['Bill. Qty'] - zscore_data['Median'])

In [27]:
zscore_data['Mod Z score'] = 0.6745 * zscore_data['MAD Bill. Qty']/zscore_data['Median']

In [28]:
zscore_data.sort_values(by=['Sold-To Party','Prod_Name'], ascending=False,inplace=True)
zscore_data

Unnamed: 0,Sold-To Party,New date,Prod_Name,Bill. Qty,Count,Mean,stddev,Median,z-score,MAD Bill. Qty,Mod Z score
12964,2007255 SRI NARASIMHA AGENCIES,2019-08-31,MEERA ADVANCE HW POWDER 120GM CONT 60 PC,2.0,2,1.000000,1.414214,1.0,0.707107,1.0,0.674500
12965,2007255 SRI NARASIMHA AGENCIES,2019-09-18,MEERA ADVANCE HW POWDER 120GM CONT 60 PC,0.0,2,1.000000,1.414214,1.0,-0.707107,1.0,0.674500
12953,2007226 ADITHYAA ENTERPRISES,2019-08-28,MEERA ADVANCE HW POWDER 80GM REFILL 50 P,1.0,4,2.500000,1.914854,2.0,-0.783349,1.0,0.337250
12954,2007226 ADITHYAA ENTERPRISES,2019-09-10,MEERA ADVANCE HW POWDER 80GM REFILL 50 P,1.0,4,2.500000,1.914854,2.0,-0.783349,1.0,0.337250
12955,2007226 ADITHYAA ENTERPRISES,2019-09-17,MEERA ADVANCE HW POWDER 80GM REFILL 50 P,3.0,4,2.500000,1.914854,2.0,0.261116,1.0,0.337250
...,...,...,...,...,...,...,...,...,...,...,...
73,2000001 N.CT. AGENCIES,2019-08-14,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,50.0,31,232.322581,233.914427,192.0,-0.779441,142.0,0.498849
74,2000001 N.CT. AGENCIES,2019-09-03,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,104.0,31,232.322581,233.914427,192.0,-0.548588,88.0,0.309146
75,2000001 N.CT. AGENCIES,2019-09-09,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,150.0,31,232.322581,233.914427,192.0,-0.351935,42.0,0.147547
76,2000001 N.CT. AGENCIES,2019-09-10,KARTHIKA HERBAL POWDER 5.8 GM 2000 PCS,30.0,31,232.322581,233.914427,192.0,-0.864943,162.0,0.569109


In [31]:
all_zscores = zscore_data[['Sold-To Party', 'New date', 'Prod_Name', 'Bill. Qty', 'Count','z-score','Mod Z score']]

In [32]:
all_zscores.to_csv(r'E:\0001_Studies\Undercutting\output\all_zscores.csv')