# Secondary data - Anomaly detection (Scenario 2)

In [1]:
from ckpackages import azsql         #Custom Package for Cavinkare
from fbprophet import Prophet
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
from dateutil.relativedelta import relativedelta
%matplotlib inline

# Data Import

### Azure DW

Extracting data for **top 20 distributors** of raw data with **interval width is 80%** and Anomaly importance is greater than 40%

In [2]:
Query1 = """
            select
                *
                ,substring(Product_Hierarchy,1,8) as 'Code'
                ,right(distcode,10) as 'Primary_Distcode'
            from
                [dbo].[V_AN_PC_UNDERCUTTING_SEC]
            where
                -- right(distcode,10) in
                --    ('0002003037','0002002987','0002002748','0002003025','0002003023',
                --     '0002002833','0002002838','0002002742','0002002898','0002002771',
                --     '0002002835','0002002712','0002002745','0002002968','0002002909',
                --     '0002002992','0002002795','0002002908','0002002973','0002002654')
            -- and
                substring(Product_Hierarchy,1,8) in ('06131253');
         """

### Secondary Data Input

In [None]:
sec_data = azsql.callstatement(Query1)

In [None]:
sec_data.head(2)

### Primary Data Output

In [152]:
primary_output = pd.read_excel(r"D:\Analytics\Undercutting\Azure\Output\Raw_indica_east_80,90,95.xlsx"
                               ,sheet_name = "Sheet1"
                               ,converters ={"Distributor_Code" : str,
                                             "Code"             : str})

In [153]:
primary_output.head(2)

Unnamed: 0,Distributor_Code,Code,Invoice date,Quantity,Yhat_80,Yhat_lower_80,Yhat_upper_80,Anomaly_80,Importance_80,Yhat_90,Yhat_lower_90,Yhat_upper_90,Anomaly_90,Importance_90,Yhat_95,Yhat_lower_95,Yhat_upper_95,Anomaly_95,Importance_95
0,2002711,6131253,2020-03-01,1,31.501735,9.124488,56.076439,-1,0.890405,31.501735,4.666851,59.75571,-1,0.785723,31.501735,-2.950335,66.34216,0,0.0
1,2002976,6131253,2018-08-01,1,21.938231,5.84842,36.956259,-1,0.829014,21.938231,1.518948,44.035301,-1,0.34165,21.938231,-3.431265,47.273501,0,0.0


In [289]:
anomaly_dist             = primary_output[  (primary_output["Importance_80"] >= 0.4) 
                                          & (primary_output["Anomaly_80"]    == 1)
                                          & (primary_output["Invoice date"]   > '2019-04-01')]

anomaly_dist.loc[:,"Quantity"] = anomaly_dist.loc[:,"Quantity"]*384

anomaly_dist = anomaly_dist.loc[:,("Distributor_Code"
                                    ,"Code"
                                    ,"Invoice date"
                                    ,"Quantity"
                                    ,"Anomaly_80"
                                    ,"Importance_80")].reset_index(drop = True).copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [290]:
anomaly_dist

Unnamed: 0,Distributor_Code,Code,Invoice date,Quantity,Anomaly_80,Importance_80
0,2003037,6131253,2020-05-01,24960,1,0.527845
1,2002987,6131253,2020-01-01,4224,1,0.521954
2,2003025,6131253,2020-06-01,65280,1,0.502965
3,2002742,6131253,2020-06-01,76800,1,0.469845
4,2002898,6131253,2019-06-01,8448,1,0.464479
5,2002835,6131253,2020-06-01,48384,1,0.460172
6,2002968,6131253,2020-05-01,26880,1,0.43432
7,2002909,6131253,2019-06-01,4224,1,0.434015
8,2002992,6131253,2019-07-01,5760,1,0.412889
9,2002973,6131253,2019-05-01,6528,1,0.406886


In [173]:
Dist_Retailer     = sec_data[["Primary_Distcode","Actdte","totqty"]]
Dist_Retailer     = Dist_Retailer.groupby(['Primary_Distcode','Actdte'])['totqty'].sum().reset_index()

In [174]:
print("We are analysing the top {} distributors anomaly points with secondary data"
      .format(Dist_Retailer["Primary_Distcode"].nunique()))

We are analysing the top 20 distributors anomaly points with secondary data


In [245]:
Dist_Retailer[(Dist_Retailer["Primary_Distcode"] == anomaly_dist.iloc[0,0]) 
              & (Dist_Retailer["Actdte"] >= anomaly_dist.iloc[0,2]) 
              & (Dist_Retailer["Actdte"] <= anomaly_dist.iloc[0,2] + relativedelta(months=1))
             ]["totqty"].sum()

20976.0

In [291]:
for index,rows in anomaly_dist.iterrows():
#     print (index) #Index
#     print(rows[0]) #distcode
#     print(rows[2]) #invoice date
    anomaly_dist.loc[index,"Sum of Quantity sold to retailers in next 60 days"] = \
                Dist_Retailer[(Dist_Retailer["Primary_Distcode"] == anomaly_dist.iloc[index,0]) \
                            & (Dist_Retailer["Actdte"]           >= anomaly_dist.iloc[index,2]) \
                            & (Dist_Retailer["Actdte"]           <= anomaly_dist.iloc[index,2] + relativedelta(months=1))]\
                    ["totqty"].sum()

In [294]:
anomaly_dist["Difference"] = np.where((anomaly_dist["Quantity"] > anomaly_dist["Sum of Quantity sold to retailers in next 60 days"])
                                                 ,anomaly_dist["Quantity"] - anomaly_dist["Sum of Quantity sold to retailers in next 60 days"]
                                                 ,'NaN')

In [306]:
anomaly_dist["Percent of goods sold to retailers from distributors"] = np.where((anomaly_dist["Quantity"] > anomaly_dist["Sum of Quantity sold to retailers in next 60 days"])
                ,1-((anomaly_dist["Quantity"] - anomaly_dist["Sum of Quantity sold to retailers in next 60 days"])/anomaly_dist["Quantity"])
                ,'NaN')

In [307]:
anomaly_dist

Unnamed: 0,Distributor_Code,Code,Invoice date,Quantity,Anomaly_80,Importance_80,Sum of Quantity sold to retailers in next 60 days,Difference,Percent of goods sold to retailers from distributors
0,2003037,6131253,2020-05-01,24960,1,0.527845,20976.0,3984.0,0.8403846153846154
1,2002987,6131253,2020-01-01,4224,1,0.521954,5376.0,Nan,
2,2003025,6131253,2020-06-01,65280,1,0.502965,25344.0,39936.0,0.388235294117647
3,2002742,6131253,2020-06-01,76800,1,0.469845,81408.0,Nan,
4,2002898,6131253,2019-06-01,8448,1,0.464479,11136.0,Nan,
5,2002835,6131253,2020-06-01,48384,1,0.460172,40704.0,7680.0,0.8412698412698413
6,2002968,6131253,2020-05-01,26880,1,0.43432,38016.0,Nan,
7,2002909,6131253,2019-06-01,4224,1,0.434015,4470.0,Nan,
8,2002992,6131253,2019-07-01,5760,1,0.412889,5760.0,Nan,
9,2002973,6131253,2019-05-01,6528,1,0.406886,6912.0,Nan,
