In [8]:
import pandas as pd
import numpy as np
import datetime
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from Trax.Cloud.Services.Connector.Logger import LoggerInitializer
from Trax.Utils.Conf.Configuration import Config
from Trax.Data.Projects.Connector import ProjectConnector
from Trax.Cloud.Services.Connector.Keys import DbUsers
from Trax.Cloud.Services.Connector.Factory import BigQueryFactory


LoggerInitializer.init('Diageo Accuracy Test')
Config.set_access_mode(Config.CROSS_CLOUD_ACCESS_MODE)
#Config.set_env_and_cloud(Config.PROD, Config.AWS)
Config.set_env_and_cloud(Config.PROD, Config.GCP)

pd.options.display.max_columns = None
pd.options.display.max_rows = None



In [12]:
bq_client = BigQueryFactory.get_bigquery_client('trax-ortal-prod')

In [3]:
sql_probes = """ SELECT *
                        from (SELECT timestamp,event_name,session_uid,wave_type,wave_uid,
                                    project_name,probe_id,data_subject,flavor,accuracy_logic_type,
                                    fr.masking_box,fr.majority_product_fk,
                                    fr.voting_result,fr.voting_result_linkage, fr.majority_value,
                                    fr.majority_is_promotion, fr.production_tag_source_group,
                                    fr.production_tag_source, fr.production_tag_identified,
                                    fr.production_product_fk, fr.TP_value, fr.FP_value, fr.FN_value,
                                    fr.INCONCLUSIVE_value, fr.TP_promotion,fr.FP_promotion,fr.FN_promotion,
                                    fr.TP_linkage,fr.FP_linkage,fr.FN_linkage,fr.INCONCLUSIVE_linkage,
                                    dense_rank() over(partition by event_name, project_name, probe_id, 
                                                                   fr.production_tag_source_group order by timestamp desc) as rownum
                                FROM `trax-ortal-prod.raw.factory_accuracy_price`
                                join unnest (additional_event_data) fr
                                WHERE DATE(timestamp) > "2021-11-01")
                    where rownum = 1
                    """
df = bq_client.run_query(sql_probes).to_dataframe()

[34m2021-11-28 13:23:26,788 - Diageo Accuracy Test - 2003555 - INFO - Run query with job_id=e3f4e2f8-84fd-44d2-a288-fb8314945a62 [0m
[34m2021-11-28 13:23:36,308 - Diageo Accuracy Test - 2003555 - INFO - Query job=e3f4e2f8-84fd-44d2-a288-fb8314945a62 finished {'total_mb_billed': 252, 'total_mb_processed': 252}[0m


In [None]:
df.head()

In [7]:
df.accuracy_logic_type.unique()

array([u'internal'], dtype=object)

In [8]:
df.data_subject.unique()

array([u'Price'], dtype=object)

In [9]:
df.flavor.unique()

array([u'default'], dtype=object)

In [10]:
df.wave_type.unique()

array([u'voting', u'pricing_voting'], dtype=object)

In [11]:
t1 = df.groupby(['timestamp','project_name','probe_id','majority_product_fk']).agg({'voting_result':'count'}).reset_index()

In [12]:
t1[t1['voting_result']>1]

Unnamed: 0,timestamp,project_name,probe_id,majority_product_fk,voting_result
39,2021-11-02 01:04:41.745355+00:00,jnjanz,664187,5510.0,3
41,2021-11-02 01:04:41.745355+00:00,jnjanz,664187,6750.0,11
349,2021-11-03 05:46:28.618424+00:00,danonear,38552,349.0,2
420,2021-11-03 06:24:40.297582+00:00,danonear,40848,410.0,2
441,2021-11-03 06:27:56.353324+00:00,danonear,41332,365.0,2
455,2021-11-03 08:49:42.941671+00:00,diageobenelux,836946,498.0,14
456,2021-11-03 08:49:42.941671+00:00,diageobenelux,836946,524.0,6
458,2021-11-03 08:49:42.941671+00:00,diageobenelux,836946,631.0,2
462,2021-11-03 08:49:42.941671+00:00,diageobenelux,836946,2897.0,3
612,2021-11-04 00:57:56.722732+00:00,nestleus,10896076,67300.0,2


In [13]:
#df[(df['probe_id']==38552) & (df['project_name']=='danonear')&(df['majority_product_fk']==349.0)]
df[(df['probe_id']==38552) & (df['project_name']=='danonear')]

Unnamed: 0,timestamp,event_name,session_uid,wave_type,wave_uid,project_name,probe_id,data_subject,flavor,accuracy_logic_type,masking_box,majority_product_fk,voting_result,voting_result_linkage,majority_value,majority_is_promotion,production_tag_source_group,production_tag_source,production_tag_identified,production_product_fk,TP_value,FP_value,FN_value,INCONCLUSIVE_value,TP_promotion,FP_promotion,FN_promotion,TP_linkage,FP_linkage,FN_linkage,INCONCLUSIVE_linkage,rownum
1041008,2021-11-03 05:46:28.618424+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,6b58be2b-2a56-45aa-b533-22b2be91d6b4,voting,f6866b2f-0645-432b-b780-f653d2d07abd,danonear,38552,Price,default,internal,752320,,majority,,114.0,0.0,QAT,QAT,1,349.0,1,0,0,0,1,0,0,0,0,0,0,1
1041009,2021-11-03 05:46:28.618424+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,6b58be2b-2a56-45aa-b533-22b2be91d6b4,voting,f6866b2f-0645-432b-b780-f653d2d07abd,danonear,38552,Price,default,internal,752329,,majority,,1525.5,0.0,QAT,QAT,1,353.0,1,0,0,0,1,0,0,0,0,0,0,1
1041010,2021-11-03 05:46:28.618424+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,6b58be2b-2a56-45aa-b533-22b2be91d6b4,voting,f6866b2f-0645-432b-b780-f653d2d07abd,danonear,38552,Price,default,internal,752321,349.0,majority,majority,114.0,0.0,QAT,QAT,1,,1,0,0,0,1,0,0,0,0,1,0,1
1041011,2021-11-03 05:46:28.618424+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,6b58be2b-2a56-45aa-b533-22b2be91d6b4,voting,f6866b2f-0645-432b-b780-f653d2d07abd,danonear,38552,Price,default,internal,752322,349.0,majority,majority,114.0,0.0,QAT,QAT,1,,1,0,0,0,1,0,0,0,0,1,0,1
1041012,2021-11-03 05:46:28.618424+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,6b58be2b-2a56-45aa-b533-22b2be91d6b4,voting,f6866b2f-0645-432b-b780-f653d2d07abd,danonear,38552,Price,default,internal,752325,353.0,majority,majority,1525.5,0.0,QAT,QAT,1,,1,0,0,0,1,0,0,0,0,1,0,1
1041013,2021-11-03 05:46:28.618424+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,6b58be2b-2a56-45aa-b533-22b2be91d6b4,voting,f6866b2f-0645-432b-b780-f653d2d07abd,danonear,38552,Price,default,internal,752324,,majority,,1525.5,0.0,QAT,QAT,1,,1,0,0,0,1,0,0,1,0,0,0,1
1041014,2021-11-03 05:46:28.618424+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,6b58be2b-2a56-45aa-b533-22b2be91d6b4,voting,f6866b2f-0645-432b-b780-f653d2d07abd,danonear,38552,Price,default,internal,752323,,majority,,1525.5,0.0,QAT,QAT,1,,1,0,0,0,1,0,0,1,0,0,0,1
1041015,2021-11-03 05:46:28.618424+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,6b58be2b-2a56-45aa-b533-22b2be91d6b4,voting,f6866b2f-0645-432b-b780-f653d2d07abd,danonear,38552,Price,default,internal,752328,,majority,,1525.5,0.0,QAT,QAT,1,,1,0,0,0,1,0,0,1,0,0,0,1
1041016,2021-11-03 05:46:28.618424+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,6b58be2b-2a56-45aa-b533-22b2be91d6b4,voting,f6866b2f-0645-432b-b780-f653d2d07abd,danonear,38552,Price,default,internal,752327,,majority,,1525.5,0.0,QAT,QAT,1,,1,0,0,0,1,0,0,1,0,0,0,1
1041017,2021-11-03 05:46:28.618424+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,6b58be2b-2a56-45aa-b533-22b2be91d6b4,voting,f6866b2f-0645-432b-b780-f653d2d07abd,danonear,38552,Price,default,internal,752326,,majority,,1525.5,0.0,QAT,QAT,1,,1,0,0,0,1,0,0,1,0,0,0,1


In [14]:
df.TP_linkage.sum()

30342

In [15]:
t0=df.groupby(['majority_is_promotion','majority_product_fk']).agg({'masking_box':'count'}).reset_index()

In [16]:
t3 = df[(df['majority_is_promotion']==1)]
t3.probe_id.count()

149304

In [17]:
t3.TP_linkage.sum()

6534

In [18]:
t3.TP_promotion.sum()

133139

In [19]:
t1 = df[(df['majority_is_promotion']==1)&(df['majority_product_fk'].isnull())]
t1.probe_id.count()

142994

In [20]:
t2 = df[(df['majority_is_promotion']==1)] 
t2.head()

Unnamed: 0,timestamp,event_name,session_uid,wave_type,wave_uid,project_name,probe_id,data_subject,flavor,accuracy_logic_type,masking_box,majority_product_fk,voting_result,voting_result_linkage,majority_value,majority_is_promotion,production_tag_source_group,production_tag_source,production_tag_identified,production_product_fk,TP_value,FP_value,FN_value,INCONCLUSIVE_value,TP_promotion,FP_promotion,FN_promotion,TP_linkage,FP_linkage,FN_linkage,INCONCLUSIVE_linkage,rownum
382,2021-11-02 02:40:05.895112+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,2dac7280-7a89-4ae6-ba65-d7db13a4b091,pricing_voting,9e8859f6-1c19-470a-9f16-4008e0b9e523,beiersdorfchl,533706,Price,default,internal,2070844,,majority,inconclusive,1995.0,1.0,tag_source_not_relevant,tag_source_not_relevant,0,,0,0,1,0,0,0,1,0,0,0,1,1
501,2021-11-06 11:53:03.051643+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,1bca30fe-3c80-44e9-b69f-e18adf0e24c4,pricing_voting,18b46c73-7172-4d2d-a006-68428babde11,beiersdorfchl,536997,Price,default,internal,2109076,,majority,inconclusive,1990.0,1.0,QAT,QAT,1,1181.0,1,0,0,0,1,0,0,0,0,0,1,1
546,2021-11-09 07:05:20.804784+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,38d43762-1aef-4c60-90d1-81a17b89067b,pricing_voting,a281ddf8-fc71-4353-a271-3db828c4320b,beiersdorfchl,543104,Price,default,internal,2121609,,majority,inconclusive,1645.0,1.0,QAT,QAT,1,1190.0,1,0,0,0,1,0,0,0,0,0,1,1
547,2021-11-09 07:05:20.804784+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,38d43762-1aef-4c60-90d1-81a17b89067b,pricing_voting,a281ddf8-fc71-4353-a271-3db828c4320b,beiersdorfchl,543104,Price,default,internal,2121613,,majority,inconclusive,1645.0,1.0,QAT,QAT,1,1190.0,1,0,0,0,1,0,0,0,0,0,1,1
585,2021-11-11 03:14:54.568492+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,bc1f19d3-1961-4860-9ffa-3f473b0a9302,pricing_voting,a281ddf8-fc71-4353-a271-3db828c4320b,beiersdorfchl,547652,Price,default,internal,2122116,,majority,inconclusive,3450.0,1.0,Engine,Engine,1,1014.0,1,0,0,0,1,0,0,0,0,0,1,1


In [21]:
df[(df['probe_id']==8731043) & (df['project_name']=='diageoru')]

Unnamed: 0,timestamp,event_name,session_uid,wave_type,wave_uid,project_name,probe_id,data_subject,flavor,accuracy_logic_type,masking_box,majority_product_fk,voting_result,voting_result_linkage,majority_value,majority_is_promotion,production_tag_source_group,production_tag_source,production_tag_identified,production_product_fk,TP_value,FP_value,FN_value,INCONCLUSIVE_value,TP_promotion,FP_promotion,FN_promotion,TP_linkage,FP_linkage,FN_linkage,INCONCLUSIVE_linkage,rownum
219907,2021-11-23 20:10:22.831549+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,70534a02-fd35-4087-bcdd-4d234f219de0,pricing_voting,6423d92c-9235-4927-b11f-8dea4e71170a,diageoru,8731043,Price,default,internal,8086181,,inconclusive,inconclusive,,,QAT,QAT,1,821.0,0,0,0,1,0,0,0,0,0,0,1,1
219908,2021-11-23 20:10:22.831549+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,70534a02-fd35-4087-bcdd-4d234f219de0,pricing_voting,6423d92c-9235-4927-b11f-8dea4e71170a,diageoru,8731043,Price,default,internal,8086177,,majority,inconclusive,1116.0,1.0,QAT,QAT,1,29.0,1,0,0,0,1,0,0,0,0,0,1,1
219909,2021-11-23 20:10:22.831549+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,70534a02-fd35-4087-bcdd-4d234f219de0,pricing_voting,6423d92c-9235-4927-b11f-8dea4e71170a,diageoru,8731043,Price,default,internal,8086180,,majority,inconclusive,999.0,1.0,QAT,QAT,1,2142.0,1,0,0,0,1,0,0,0,0,0,1,1
219910,2021-11-23 20:10:22.831549+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,70534a02-fd35-4087-bcdd-4d234f219de0,pricing_voting,6423d92c-9235-4927-b11f-8dea4e71170a,diageoru,8731043,Price,default,internal,8086206,,majority,inconclusive,2342.0,0.0,QAT,QAT,1,481.0,1,0,0,0,1,0,0,0,0,0,1,1
219911,2021-11-23 20:10:22.831549+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,70534a02-fd35-4087-bcdd-4d234f219de0,pricing_voting,6423d92c-9235-4927-b11f-8dea4e71170a,diageoru,8731043,Price,default,internal,8086213,,majority,inconclusive,1217.0,0.0,QAT,QAT,1,437.0,1,0,0,0,1,0,0,0,0,0,1,1
219912,2021-11-23 20:10:22.831549+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,70534a02-fd35-4087-bcdd-4d234f219de0,pricing_voting,6423d92c-9235-4927-b11f-8dea4e71170a,diageoru,8731043,Price,default,internal,8086196,,majority,inconclusive,1217.0,0.0,QAT,QAT,1,,1,0,0,0,1,0,0,0,0,0,1,1
597728,2021-11-23 20:10:22.831549+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,70534a02-fd35-4087-bcdd-4d234f219de0,pricing_voting,6423d92c-9235-4927-b11f-8dea4e71170a,diageoru,8731043,Price,default,internal,8086172,,majority,inconclusive,1107.99,0.0,Engine,Engine,1,881.0,1,0,0,0,1,0,0,0,0,0,1,1
597729,2021-11-23 20:10:22.831549+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,70534a02-fd35-4087-bcdd-4d234f219de0,pricing_voting,6423d92c-9235-4927-b11f-8dea4e71170a,diageoru,8731043,Price,default,internal,8086173,,majority,inconclusive,799.0,1.0,Engine,Engine,1,567.0,1,0,0,0,1,0,0,0,0,0,1,1
597730,2021-11-23 20:10:22.831549+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,70534a02-fd35-4087-bcdd-4d234f219de0,pricing_voting,6423d92c-9235-4927-b11f-8dea4e71170a,diageoru,8731043,Price,default,internal,8086174,,majority,inconclusive,889.0,1.0,Engine,Engine,1,882.0,1,0,0,0,1,0,0,0,0,0,1,1
597731,2021-11-23 20:10:22.831549+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,70534a02-fd35-4087-bcdd-4d234f219de0,pricing_voting,6423d92c-9235-4927-b11f-8dea4e71170a,diageoru,8731043,Price,default,internal,8086175,,majority,inconclusive,2433.99,0.0,Engine,Engine,1,567.0,1,0,0,0,1,0,0,0,0,0,1,1


In [None]:
sql = """ select *
from trax-ortal-prod.thelake.rds_product
"""
rds_product = bq_client.run_query(sql).to_dataframe()

In [None]:
rds_product.head()

In [None]:
rds_product[(rds_product['project_name']=='danonear')&(rds_product['product_pk']==349.0)] #5510

In [24]:
sql_reg = """select * 
        from
        (select
                event_name as recognition_event_name,
                timestamp as recognition_event_timestamp, 
                wave_type as recognition_wave_type,
                project_name,
                probe_id,
                probe_creation_time, 
                data_subject,
                if(data_subject = 'Price', true, false) as is_price_recognition_probe,
                e.tag_source_group,    
                e.tag_source,
                e.product_fk as product_pk,
                e.facings,--not same with sku level
                e.total_price_tags,
                e.price_value,
                e.is_promotion,
                dense_rank() over(partition by event_name, project_name, probe_id, e.tag_source_group order by timestamp desc) as rownum
            from `trax-ortal-prod.raw.factory_recognition`
            cross join unnest(additional_event_data) as e
            where 1=1
                and date(timestamp) > '2021-10-01'
                and wave_type in ('primary' , 'offline_pricing', 'category_expert')
                and e.tag_source_group in ('Engine', 'QAT') 
                and data_subject = 'Price'
                and flavor = 'default'
        ) 
        where 1=1
        and rownum = 1"""

In [None]:
df_recognition = bq_client.run_query(sql_reg).to_dataframe()

[34m2021-11-25 14:40:03,154 - Diageo Accuracy Test - 1353408 - INFO - Run query with job_id=5fcae00e-bbd5-45b4-ac89-d71aa27657f7 [0m
[34m2021-11-25 14:40:21,311 - Diageo Accuracy Test - 1353408 - INFO - Query job=5fcae00e-bbd5-45b4-ac89-d71aa27657f7 finished {'total_mb_billed': 25510, 'total_mb_processed': 25510}[0m


KeyboardInterrupt: 

In [None]:
df_recognition.head()

In [None]:
df_recognition[(df_recognition['probe_id']==664187) & (df_recognition['project_name']=='jnjanz')&(df_recognition['product_pk']==5510.0)]

In [8]:
sql_probes = """ SELECT *
                        from (SELECT timestamp,event_name,session_uid,wave_type,wave_uid,
                                    project_name,probe_id,data_subject,flavor,accuracy_logic_type,
                                    fr.masking_box,fr.majority_product_fk,
                                    fr.voting_result,fr.voting_result_linkage, fr.majority_value,
                                    fr.majority_is_promotion, fr.production_tag_source_group,
                                    fr.production_tag_source, fr.production_tag_identified,
                                    fr.production_product_fk, fr.TP_value, fr.FP_value, fr.FN_value,
                                    fr.INCONCLUSIVE_value, fr.TP_promotion,fr.FP_promotion,fr.FN_promotion,
                                    fr.TP_linkage,fr.FP_linkage,fr.FN_linkage,fr.INCONCLUSIVE_linkage,
                                    dense_rank() over(partition by event_name, project_name, probe_id, 
                                                                   fr.production_tag_source_group order by timestamp desc) as rownum
                                FROM `trax-ortal-prod.raw.factory_accuracy_price`
                                join unnest (additional_event_data) fr
                                WHERE DATE(timestamp) > "2021-11-01")
                    where rownum = 1
                    """
df = bq_client.run_query(sql_probes).to_dataframe()

[34m2021-11-25 14:53:48,827 - Diageo Accuracy Test - 1363868 - INFO - Run query with job_id=c040ea41-094f-491c-9662-dc29db3ce119 [0m
[34m2021-11-25 14:54:05,740 - Diageo Accuracy Test - 1363868 - INFO - Run query with job_id=012916ed-6d1e-4836-a3b7-339963f522fa [0m
[34m2021-11-25 14:54:17,831 - Diageo Accuracy Test - 1363868 - INFO - Run query with job_id=01430ca5-29d1-46ad-9775-e068ec526503 [0m
[34m2021-11-25 14:54:31,463 - Diageo Accuracy Test - 1363868 - INFO - Query job=01430ca5-29d1-46ad-9775-e068ec526503 finished {'total_mb_billed': 233, 'total_mb_processed': 232}[0m


In [27]:
fr_query = """
SELECT * FROM (
SELECT probe_id, wave_type, data_subject, session_uid, event_name, timestamp, project_name, 
additional_event_data.tag_source, additional_event_data.facings, additional_event_data.product_fk,
 additional_event_data.tag_source_group, additional_event_data.price_value, additional_event_data.is_promotion,
  flavor, dense_rank() over(partition by event_name, project_name, probe_id, 
                                                                   additional_event_data.tag_source_group order by timestamp desc) as rownum
   FROM `trax-ortal-prod.raw.factory_recognition`
   JOIN UNNEST(additional_event_data) additional_event_data
   WHERE DATE(timestamp) > "2021-11-01")
where 1=1
and rownum=1
LIMIT 5000

"""

fr = bq_client.run_query(fr_query).to_dataframe()

[34m2021-11-25 15:09:38,865 - Diageo Accuracy Test - 1363868 - INFO - Run query with job_id=248f44db-bf99-4c89-9d25-e2af323e342b [0m
[34m2021-11-25 15:09:48,901 - Diageo Accuracy Test - 1363868 - INFO - Query job=248f44db-bf99-4c89-9d25-e2af323e342b finished {'total_mb_billed': 12127, 'total_mb_processed': 12126}[0m


In [48]:
fr.shape

(5000, 15)

In [30]:
df.head()

Unnamed: 0,timestamp,event_name,session_uid,wave_type,wave_uid,project_name,probe_id,data_subject,flavor,accuracy_logic_type,masking_box,majority_product_fk,voting_result,voting_result_linkage,majority_value,majority_is_promotion,production_tag_source_group,production_tag_source,production_tag_identified,production_product_fk,TP_value,FP_value,FN_value,INCONCLUSIVE_value,TP_promotion,FP_promotion,FN_promotion,TP_linkage,FP_linkage,FN_linkage,INCONCLUSIVE_linkage,rownum
0,2021-11-08 05:05:32.641344+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,0332ab3d-df55-4699-83e7-8a93e8a7aaa6,voting,4be96aeb-1dfd-4dc9-b711-a196afb9b596,batuz,8664,Price,default,internal,101273,,inconclusive,inconclusive,25000.0,0.0,QAT,QAT,1,225.0,0,0,0,1,0,0,0,0,0,0,1,1
1,2021-11-08 05:05:32.641344+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,0332ab3d-df55-4699-83e7-8a93e8a7aaa6,voting,4be96aeb-1dfd-4dc9-b711-a196afb9b596,batuz,8664,Price,default,internal,101274,,inconclusive,inconclusive,7000.0,0.0,QAT,QAT,1,229.0,0,0,0,1,0,0,0,0,0,0,1,1
2,2021-11-08 05:05:32.641344+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,0332ab3d-df55-4699-83e7-8a93e8a7aaa6,voting,4be96aeb-1dfd-4dc9-b711-a196afb9b596,batuz,8664,Price,default,internal,101275,,inconclusive,inconclusive,6000.0,0.0,QAT,QAT,1,230.0,0,0,0,1,0,0,0,0,0,0,1,1
3,2021-11-08 05:05:32.641344+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,0332ab3d-df55-4699-83e7-8a93e8a7aaa6,voting,4be96aeb-1dfd-4dc9-b711-a196afb9b596,batuz,8664,Price,default,internal,101284,,inconclusive,inconclusive,,,QAT,QAT,1,160.0,0,0,0,1,0,0,0,0,0,0,1,1
4,2021-11-08 05:05:32.641344+00:00,PROBE_ACCURACY-DATA-SUMMARIZED,0332ab3d-df55-4699-83e7-8a93e8a7aaa6,voting,4be96aeb-1dfd-4dc9-b711-a196afb9b596,batuz,8664,Price,default,internal,101283,,inconclusive,inconclusive,,,QAT,QAT,1,156.0,0,0,0,1,0,0,0,0,0,0,1,1


In [35]:
fn_link = df[df.FN_linkage == 1] 
fn_link.shape

(3025, 32)

In [36]:
fp_link = df[df.FP_linkage == 1]
fp_link.shape

(1084, 32)

In [57]:
key_columns = ['probe_id', 'data_subject', 'event_name', 'project_name', 'flavor']

In [58]:
fn_merge = fn_link.merge(fr, on=key_columns, how='outer')

In [59]:
fn_merge.shape

(8025, 41)

In [5]:
fn_merge

NameError: name 'fn_merge' is not defined

In [49]:
fn_merge.columns

Index([                u'timestamp_x',                  u'event_name',
                       u'session_uid',                 u'wave_type_x',
                          u'wave_uid',                u'project_name',
                          u'probe_id',                u'data_subject',
                            u'flavor',         u'accuracy_logic_type',
                       u'masking_box',         u'majority_product_fk',
                     u'voting_result',       u'voting_result_linkage',
                    u'majority_value',       u'majority_is_promotion',
       u'production_tag_source_group',       u'production_tag_source',
         u'production_tag_identified',       u'production_product_fk',
                          u'TP_value',                    u'FP_value',
                          u'FN_value',          u'INCONCLUSIVE_value',
                      u'TP_promotion',                u'FP_promotion',
                      u'FN_promotion',                  u'TP_linkage',
      

In [60]:
x = fn_merge[['majority_value', 'voting_result', 'product_fk', 'majority_product_fk', 'production_product_fk']]

In [61]:
x[x.product_fk.notna()].sum()

majority_value                  0.0
voting_result                   0.0
product_fk               30837167.0
majority_product_fk             0.0
production_product_fk           0.0
dtype: float64

In [66]:
df.production_tag_source_group.unique()

array([u'QAT', u'tag_source_not_relevant', u'Engine'], dtype=object)

In [63]:
df.columns

Index([u'timestamp', u'event_name', u'session_uid', u'wave_type', u'wave_uid',
       u'project_name', u'probe_id', u'data_subject', u'flavor',
       u'accuracy_logic_type', u'masking_box', u'majority_product_fk',
       u'voting_result', u'voting_result_linkage', u'majority_value',
       u'majority_is_promotion', u'production_tag_source_group',
       u'production_tag_source', u'production_tag_identified',
       u'production_product_fk', u'TP_value', u'FP_value', u'FN_value',
       u'INCONCLUSIVE_value', u'TP_promotion', u'FP_promotion',
       u'FN_promotion', u'TP_linkage', u'FP_linkage', u'FN_linkage',
       u'INCONCLUSIVE_linkage', u'rownum'],
      dtype='object')

In [67]:
fr.tag_source_group.unique()

array([u'QAT', u'Engine', u'UNKNOWN'], dtype=object)

## JOIN BETWEEN factory_recognition and factory_accuracy_pricing

In [10]:
query = """

SELECT * FROM
(
SELECT *
            FROM (SELECT timestamp,event_name,session_uid,wave_type,wave_uid,
                        project_name,probe_id,data_subject,flavor,accuracy_logic_type,
                        data.masking_box,data.majority_product_fk,
                        data.voting_result,data.voting_result_linkage, data.majority_value,
                        data.majority_is_promotion, data.production_tag_source_group,
                        data.production_tag_source, data.production_tag_identified,
                        data.production_product_fk, data.TP_value, data.FP_value, data.FN_value,
                        data.INCONCLUSIVE_value, data.TP_promotion,data.FP_promotion,data.FN_promotion,
                        data.TP_linkage,data.FP_linkage,data.FN_linkage,data.INCONCLUSIVE_linkage,
                        dense_rank() over(partition by event_name, project_name, probe_id, 
                                                        data.production_tag_source_group order by timestamp desc) as rownum
                    FROM `trax-ortal-prod.raw.factory_accuracy_price`
                    join unnest (additional_event_data) data
                    WHERE DATE(timestamp) > "2021-11-15")
        where rownum = 1
) fp

full outer join

(
SELECT * FROM (
    SELECT 
        probe_id, wave_type, data_subject, session_uid, event_name, timestamp, project_name, 
        additional_event_data.tag_source, additional_event_data.facings, additional_event_data.product_fk,
        additional_event_data.tag_source_group, additional_event_data.price_value, additional_event_data.is_promotion,
        flavor, dense_rank() over(partition by event_name, project_name, probe_id, 
        additional_event_data.tag_source_group order by timestamp desc) as rownum
    FROM `trax-ortal-prod.raw.factory_recognition`
    JOIN UNNEST(additional_event_data) additional_event_data
    WHERE DATE(timestamp) > "2021-11-15")
    where 1=1
                and date(timestamp) > '2021-11-15'
                and wave_type in ('primary' , 'offline_pricing', 'category_expert')
                and tag_source_group in ('Engine', 'QAT') 
                and flavor = 'default'
                and rownum = 1
) fr

on 1=1
and fr.probe_id = fp.probe_id
and fr.project_name = fp.project_name
and fr.tag_source_group = fp.production_tag_source_group
and fr.product_fk = fp.production_product_fk
;
"""

In [None]:
joined = bq_client.query_to_dataframe(query)
joined.head()