# Data Exploration for Propensity Models

In this notebook, we connect to the query service on AEP, and examine the data that we have uploaded. 

This notebook is step 2 in the diagram below. We do the following steps:

- Connect to query service on AEP Staging (`targetpremiumqa6` tenant), using their PostgreSQL protocol
- Discover the schema of data, and explore a few rows
- Compute basic statistics
- Examine correlations among features, to inform feature creation

![exploration](../media/CME-PoC-Exploration.png)


## Start by running some test queries

In [None]:
!pip install aepp
!pip install psycopg2

In [None]:
%%time
# Smoke test: fetch five rows from the synthetic experience-events table and
# display them.
# NOTE(review): `cur` and `get_result_as_df` are not defined in the visible
# cells -- presumably a database cursor and a cursor-to-DataFrame helper set up
# in an earlier cell; confirm before running this cell on its own.
cur.execute('''SELECT * FROM analytics_experience_events_v2_synthetic LIMIT 5''')
get_result_as_df(cur)

# Let's compute some basic statistics

In [None]:
%%time
# Count the rows with a non-null `_id` and the number of distinct values of the
# first ECID identity in the synthetic experience-events table.
# NOTE(review): this cell only executes the query; the result is not fetched or
# displayed here (no get_result_as_df(cur) call as in the previous cell).
cur.execute("""
SELECT COUNT(_id) as "totalRows",  
       COUNT(DISTINCT identityMap['ECID'][0]['id']) as "distinctUsers" 
FROM analytics_experience_events_v2_synthetic""")

# Let's create a class that abstracts away the common operations performed against the query service to featurize data

In [3]:
import aepp
from aepp import queryservice
import pandas as pd
import json
from configparser import ConfigParser


class CMEPQSExplorer:
    """Run exploratory SQL against the query service to inform feature generation.

    Every query targets ``self.source_table`` (read from the configuration file)
    unless stated otherwise, and is executed through ``self.intQuery.query`` with
    ``output="dataframe"``.
    """

    def __init__(self, env: str):
        """Load the configuration for *env* and open an interactive query session.

        Args:
            env: Name of the section in ``./cme/config/cme_config.ini`` holding
                ``source_table``, ``ims_org_id``, ``tech_acct_id``,
                ``client_secret`` and ``client_id``. An optional
                ``private_key_path`` entry is used when present.

        Raises:
            configparser.NoSectionError: If *env* is not a section of the
                configuration (this is also what happens when the file is
                missing, because ``ConfigParser.read`` skips unreadable files).
            configparser.NoOptionError: If a required option is missing.
        """
        self.cme_config = ConfigParser()
        self.cme_config.read('./cme/config/cme_config.ini')
        self.source_table = self.cme_config.get(env, "source_table")
        aepp.configure(
            org_id=self.cme_config.get(env, 'ims_org_id'),
            tech_id=self.cme_config.get(env, "tech_acct_id"),
            secret=self.cme_config.get(env, "client_secret"),
            # Read the key path from the config when it is provided; otherwise
            # keep the original placeholder so existing setups behave the same.
            private_key_path=self.cme_config.get(
                env, "private_key_path", fallback="add private key path here"
            ),
            client_id=self.cme_config.get(env, "client_id"),
        )
        qs = queryservice.QueryService()
        conn = qs.connection()
        self.intQuery = queryservice.InteractiveQuery(conn)

    def _run_query(self, sql: str) -> pd.DataFrame:
        """Execute *sql* on the interactive session, requesting a dataframe result."""
        return self.intQuery.query(sql, output="dataframe")

    def get_schema_hierarchy_details(self) -> pd.DataFrame:
        """Return five sample rows with the nested fields rendered via ``to_json``."""
        schema_hierarchy_sql = f"""SELECT   to_json(_experience), 
                                           to_json(commerce), 
                                           to_json(application), 
                                           _id, 
                                           eventType, to_json(identityMap), timestamp 
                                  FROM {self.source_table} LIMIT 5"""
        return self._run_query(schema_hierarchy_sql)

    def get_schema(self) -> pd.DataFrame:
        """Return table name, column name and data type for the source table's columns."""
        # The table name is compared as a string literal, so it must be quoted;
        # unquoted it would be parsed as a column reference.
        schema_description_sql = f""" SELECT table_name, 
                                            column_name, 
                                            data_type 
                                     FROM information_schema.columns 
                                     WHERE table_name='{self.source_table}' """
        return self._run_query(schema_description_sql)

    def describe_statistics(self) -> pd.DataFrame:
        """Run ``ANALYZE TABLE`` on the source table and return the result."""
        analyze_table_sql = f"ANALYZE TABLE {self.source_table}"
        return self._run_query(analyze_table_sql)

    def get_basic_statistics(self) -> pd.DataFrame:
        """Return the row count (``totalRows``) and distinct ECID count (``distinctUsers``)."""
        # Aliases are double-quoted identifiers, matching the other queries in
        # this notebook; a single-quoted alias is a string literal.
        basic_statistics_sql = f"""SELECT COUNT(_id) as "totalRows", 
                                         COUNT(DISTINCT identityMap['ECID'][0]['id']) as "distinctUsers" 
                                  FROM {self.source_table}"""
        return self._run_query(basic_statistics_sql)

    def get_distinct_users(self) -> pd.DataFrame:
        """Return the number of distinct ECID values in the source table."""
        distinct_users_sql = f"""SELECT COUNT(DISTINCT identityMap['ECID'][0]['id']) 
                                FROM {self.source_table} LIMIT 5"""
        return self._run_query(distinct_users_sql)

    def get_sampled_metadata(
        self, table_name: str = "multichannel_experience_event_dataset_v2"
    ) -> pd.DataFrame:
        """Return the result of ``sample_meta`` for *table_name*.

        Args:
            table_name: Dataset passed to ``sample_meta``. The default is the
                name this method has always queried.
                NOTE(review): this differs from ``self.source_table`` --
                confirm which dataset is intended.
        """
        meta_sql = f"SELECT sample_meta('{table_name}')"
        return self._run_query(meta_sql)

    def get_funnel_analysis_data(self) -> pd.DataFrame:
        """Return, per ``eventType``, the distinct-user and event counts.

        Rows are ordered by the distinct-user count, largest first.
        """
        funnel_analysis_sql = f"""SELECT eventType, 
                                        COUNT(DISTINCT identityMap['ECID'][0]['id']) as "distinctUsers",
                                        COUNT(_id) as "distinctEvents" 
                                 FROM {self.source_table} 
                                 GROUP BY eventType 
                                 ORDER BY distinctUsers DESC"""
        return self._run_query(funnel_analysis_sql)

    def get_event_correlation(self, funnel_df: pd.DataFrame) -> pd.DataFrame:
        """Return how often one event type is followed by another for the same user.

        The query self-joins the source table on the ECID and keeps pairs where
        the first event's timestamp is not later than the second's, counting
        distinct users per (first, later) event-type pair.

        Args:
            funnel_df: Per-event-type counts with ``eventType`` and
                ``distinctUsers`` columns, e.g. the output of
                ``get_funnel_analysis_data``.

        Returns:
            The pair counts merged with *funnel_df* on the first event type,
            plus a ``probability`` column: users with the pair divided by users
            with the first event.
        """
        basic_event_correlation_sql = f"""SELECT  eventType_First, 
                                               eventType_Later,
                                               COUNT(DISTINCT userId) as "distinctUsers" 
                                         FROM 
                                                    (SELECT a.eventType as eventType_First, 
                                                            b.eventType as eventType_Later, 
                                                            a.identityMap['ECID'][0]['id'] as userID 
                                                     FROM {self.source_table} a
                                                     JOIN {self.source_table} b
                                                     ON a.identityMap['ECID'][0]['id'] = b.identityMap['ECID'][0]['id']
                                                     WHERE a.timestamp <= b.timestamp)
                                         GROUP BY eventType_First, eventType_Later
                                         ORDER BY distinctUsers DESC"""
        cooccurrence_df = self._run_query(basic_event_correlation_sql)
        # Both frames carry a "distinctUsers" column, so the merge suffixes them:
        # _x is the pair count, _y is the count for the first event on its own.
        merged = cooccurrence_df.merge(
            funnel_df, left_on="eventType_First", right_on="eventType"
        )
        merged["probability"] = merged["distinctUsers_x"] / merged["distinctUsers_y"]
        return merged

    def get_full_correlation(self) -> pd.DataFrame:
        """Return the correlation of each event count with web-form fills.

        The inner query counts events of each type per user; the outer query
        sums those counts and computes ``corr`` between the web-form count and
        every other count.

        Returns:
            A long-format dataframe with one row per ``webForms_*`` column:
            ``variable`` and ``value`` (from ``melt``), ``feature`` (the column
            name without the ``webForms_`` prefix) and ``pearsonCorrelation``.
            Missing correlations are reported as 0.
        """
        # NOTE(review): the emailOpens correlation is selected twice (once under
        # the misspelled alias webForms_EmalOpens) and no correlation is
        # computed for propositionDisplays. Left unchanged because fixing it
        # alters the rows returned to callers.
        full_correlation_sql = f"""SELECT SUM(webFormsFilled) as webFormsFilled_totalUsers,
                                         SUM(advertisingClicks) as advertisingClicks_totalUsers, 
                                         SUM(productViews) as productViews_totalUsers, 
                                         SUM(productPurchases) as productPurchases_totalUsers, 
                                         SUM(propositionDismisses) as propositionDismisses_totaUsers, 
                                         SUM(propositionDisplays) as propositionDisplays_totaUsers, 
                                         SUM(propositionInteracts) as propositionInteracts_totalUsers, 
                                         SUM(emailClicks) as emailClicks_totalUsers, 
                                         SUM(emailOpens) as emailOpens_totalUsers, 
                                         SUM(webLinkClicks) as webLinksClicks_totalUsers, 
                                         SUM(webPageViews) as webPageViews_totalusers, 
                                         corr(webFormsFilled, emailOpens) as webForms_EmalOpens, 
                                         corr(webFormsFilled, advertisingClicks) as webForms_advertisingClicks, 
                                         corr(webFormsFilled, productViews) as webForms_productViews, 
                                         corr(webFormsFilled, productPurchases) as webForms_productPurchases, 
                                         corr(webFormsFilled, propositionDismisses) as webForms_propositionDismisses, 
                                         corr(webFormsFilled, propositionInteracts) as webForms_propositionInteracts, 
                                         corr(webFormsFilled, emailClicks) as webForms_emailClicks, corr(webFormsFilled, emailOpens) as webForms_emailOpens, 
                                         corr(webFormsFilled, emailSends) as webForms_emailSends, corr(webFormsFilled, webLinkClicks) as webForms_webLinkClicks, 
                                         corr(webFormsFilled, webPageViews) as webForms_webPageViews FROM( SELECT identityMap['ECID'][0]['id'] as userID, 
                                         SUM(CASE WHEN eventType='web.formFilledOut' THEN 1 ELSE 0 END) as webFormsFilled, 
                                         SUM(CASE WHEN eventType='advertising.clicks' THEN 1 ELSE 0 END) as advertisingClicks, 
                                         SUM(CASE WHEN eventType='commerce.productViews' THEN 1 ELSE 0 END) as productViews, 
                                         SUM(CASE WHEN eventType='commerce.productPurchases' THEN 1 ELSE 0 END) as productPurchases, 
                                         SUM(CASE WHEN eventType='decisioning.propositionDismiss' THEN 1 ELSE 0 END) as propositionDismisses, 
                                         SUM(CASE WHEN eventType='decisioning.propositionDisplay' THEN 1 ELSE 0 END) as propositionDisplays, 
                                         SUM(CASE WHEN eventType='decisioning.propositionInteract' THEN 1 ELSE 0 END) as propositionInteracts, 
                                         SUM(CASE WHEN eventType='directMarketing.emailClicked' THEN 1 ELSE 0 END) as emailClicks, 
                                         SUM(CASE WHEN eventType='directMarketing.emailOpened' THEN 1 ELSE 0 END) as emailOpens, 
                                         SUM(CASE WHEN eventType='directMarketing.emailSent' THEN 1 ELSE 0 END) as emailSends, 
                                         SUM(CASE WHEN eventType='web.webinteraction.linkClicks' THEN 1 ELSE 0 END) as webLinkClicks, 
                                         SUM(CASE WHEN eventType='web.webinteraction.pageViews' THEN 1 ELSE 0 END) as webPageViews 
                                  FROM {self.source_table} GROUP BY userId)"""
        correlation_df = self._run_query(full_correlation_sql)
        # Keep only the correlation columns and pivot them into rows.
        correlation_cols = [col for col in correlation_df.columns if "webForms_" in col]
        corrdf = correlation_df[correlation_cols].melt()
        corrdf["feature"] = corrdf["variable"].apply(lambda x: x.replace("webForms_", ""))
        corrdf["pearsonCorrelation"] = corrdf["value"]
        # fillna returns a new frame; the result must be kept for the NaN
        # correlations to actually be replaced with 0.
        corrdf = corrdf.fillna(0)
        return corrdf


ModuleNotFoundError: No module named 'psycopg2'