# Data Journey Day 2: BigQuery ML

<table align="left">

  <td>
    <a href="https://github.com/AmritRaj23/data-journey/blob/main/day-1/ELT%20(Extract%20Load%20Transform)/DataJourney_elt.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://github.com/AmritRaj23/data-journey/blob/main/day-1/ELT%20(Extract%20Load%20Transform)/DataJourney_elt.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
     </a>
  </td>
</table>
<br/><br/><br/>


In this notebook we use GCP's BigQuery ML capabilities to train machine learning models with only a few lines of SQL.

### **Step 1:** Preparation and Authentication
Set the right parameters and authenticate your user. 

In [None]:
# Notebook-wide settings: replace every placeholder with your own values
# before running the cells below.
project_id = "<project-id>"
dataset_name = "<dataset-name>"
location = "<location>"  # This is currently necessary

In [None]:
from google.cloud import bigquery
from google.cloud.bigquery import Client, QueryJobConfig

# BigQuery client bound to the project configured above; every query in this
# notebook is submitted through it.
# NOTE(review): `location` is defined in the parameters cell but is not passed
# to the client here -- confirm whether that is intended.
client = Client(project=project_id)

### **Step 2:** Create ML training dataset

In [None]:
training_dataset = "cc_train_dataset"

# Fully qualified ID (project.dataset.view) of the view that will hold the
# training data.
view_id = f"{project_id}.{dataset_name}.{training_dataset}"
view = bigquery.Table(view_id)

# View definition: start from user_returninginfo rows with bounced = 0, attach
# demographics and aggregated behaviour via LEFT OUTER JOINs, default missing
# behaviour counters to 0, and cap the result at 2000 rows.
view.view_query = f"""
  
  SELECT
    dem.*,
    IFNULL(beh.cnt_user_engagement, 0) AS cnt_user_engagement,
    IFNULL(beh.cnt_level_start_quickplay, 0) AS cnt_level_start_quickplay,
    IFNULL(beh.cnt_level_end_quickplay, 0) AS cnt_level_end_quickplay,
    IFNULL(beh.cnt_level_complete_quickplay, 0) AS cnt_level_complete_quickplay,
    IFNULL(beh.cnt_level_reset_quickplay, 0) AS cnt_level_reset_quickplay,
    IFNULL(beh.cnt_post_score, 0) AS cnt_post_score,
    IFNULL(beh.cnt_spend_virtual_currency, 0) AS cnt_spend_virtual_currency,
    IFNULL(beh.cnt_ad_reward, 0) AS cnt_ad_reward,
    IFNULL(beh.cnt_challenge_a_friend, 0) AS cnt_challenge_a_friend,
    IFNULL(beh.cnt_completed_5_levels, 0) AS cnt_completed_5_levels,
    IFNULL(beh.cnt_use_extra_steps, 0) AS cnt_use_extra_steps,
    ret.user_first_engagement,
    ret.churned
  FROM
    {dataset_name}.user_returninginfo ret
  LEFT OUTER JOIN
    {dataset_name}.user_demographics dem
  ON 
    ret.user_pseudo_id = dem.user_pseudo_id
  LEFT OUTER JOIN 
    {dataset_name}.user_aggregate_behaviour beh
  ON
    ret.user_pseudo_id = beh.user_pseudo_id
  WHERE ret.bounced = 0
  LIMIT 2000 
"""

# Register the view in BigQuery. exists_ok=True means an "already exists"
# error is ignored, so the cell can be re-run without failing.
view = client.create_table(view, exists_ok=True)
print(f"Created {view.table_type}: {view.reference}")

### **Step 3:** Data Exploration
Get a feel for the data and explore its different facets and features.

As a reminder, this is what the final training dataset looks like:

| User Demographic Data | User Behavioral Data | Label |
| --- | --- | --- |
| country | cnt_user_engagement | churned |
| operating_system | cnt_level_start_quickplay |  |
| language | cnt_level_end_quickplay |  |
|  | cnt_level_complete_quickplay |  |
|  | cnt_level_reset_quickplay |  |
|  | cnt_post_score |  |
|  | cnt_spend_virtual_currency |  |
|  | cnt_ad_reward |  |
|  | cnt_challenge_a_friend |  |
|  | cnt_completed_5_levels |  |
|  | cnt_use_extra_steps |  |
|  | user_first_engagement |  |

In [None]:
# Read every row of the training view into a DataFrame for local exploration.
query = f"SELECT * FROM `{dataset_name}.{training_dataset}`"
job = client.query(query)
df = job.to_dataframe()

# Preview the first rows.
df.head()

In [None]:
import matplotlib.pyplot as plt

# Two pie charts next to each other; right=2.0 stretches the subplot area so
# the charts do not overlap.
fig, axes = plt.subplots(nrows=1, ncols=2)
fig.subplots_adjust(right=2.0)

# Left chart: the five countries with the most users.
df["country"].value_counts(sort=True).iloc[:5].plot.pie(ax=axes[0])
# Right chart: share of users per operating system.
df["operating_system"].value_counts().plot.pie(ax=axes[1])

In [None]:
# Share of users per value of the `churned` label.
df["churned"].value_counts().plot.pie()

### **Step 4:** Train Propensity Churn Model

In [None]:
model_name = 'model_3'

# Train a boosted-tree classifier on every column of the training view, with
# `churned` as the label, and register it in the Vertex AI Model Registry as
# "customer_churn_1".
# NOTE(review): SELECT * also passes user_pseudo_id and the raw
# user_first_engagement value to the model as features; the TRANSFORM-based
# model later in this notebook excludes both -- confirm this is intended here.
query = f"""
    CREATE OR REPLACE MODEL `{project_id}.{dataset_name}.{model_name}`
  
    OPTIONS(MODEL_TYPE='BOOSTED_TREE_CLASSIFIER',
           INPUT_LABEL_COLS = ['churned'],
           AUTO_CLASS_WEIGHTS=TRUE,
           DATA_SPLIT_METHOD='NO_SPLIT',
           ENABLE_GLOBAL_EXPLAIN=TRUE,
           model_registry="vertex_ai",
           vertex_ai_model_id="customer_churn_1",
           vertex_ai_model_version_aliases=["test-version"])
    AS
    SELECT *
    FROM `{project_id}.{dataset_name}.{training_dataset}`
    """

# client.query() only submits the job. Calling .result() blocks until training
# has finished and raises if the CREATE MODEL statement failed, so errors are
# not silently lost and later cells never run against a missing model.
client.query(query).result()

### **Feature Engineering**: How about we create new features from the **user_first_engagement** timestamp field?

Let's use the [TRANSFORM](https://cloud.google.com/bigquery-ml/docs/bigqueryml-transform) clause in BigQuery ML to extract month, day of year, day of week and hour from the **user_first_engagement** field. 

Using the TRANSFORM clause, you can specify all preprocessing during model creation. The preprocessing is **automatically** applied during the prediction and evaluation phases of machine learning 😎

In [None]:
# Name of the feature-engineered model. Replace the placeholder with a name
# that differs from the first model ('model_3'), otherwise CREATE OR REPLACE
# would overwrite it.
transform_model_name = "<model-name>"

# Same boosted-tree setup as the first model, plus a TRANSFORM clause that
# derives month, day of year, day of week and hour from user_first_engagement
# (interpreted as microseconds via TIMESTAMP_MICROS) and drops the raw
# timestamp and user_pseudo_id from the feature set. The project, dataset and
# training view come from the variables defined earlier in the notebook
# instead of being typed in again by hand.
query = f"""
    CREATE OR REPLACE MODEL `{project_id}.{dataset_name}.{transform_model_name}`
 
    TRANSFORM(
     EXTRACT(MONTH from TIMESTAMP_MICROS(user_first_engagement)) as month,
     EXTRACT(DAYOFYEAR from TIMESTAMP_MICROS(user_first_engagement)) as julianday,
     EXTRACT(DAYOFWEEK from TIMESTAMP_MICROS(user_first_engagement)) as dayofweek,
     EXTRACT(HOUR from TIMESTAMP_MICROS(user_first_engagement)) as hour,
     * EXCEPT(user_first_engagement, user_pseudo_id)
    )
 
    OPTIONS(MODEL_TYPE='BOOSTED_TREE_CLASSIFIER',
           INPUT_LABEL_COLS = ['churned'],
           AUTO_CLASS_WEIGHTS=TRUE,
           DATA_SPLIT_METHOD='NO_SPLIT',
           ENABLE_GLOBAL_EXPLAIN=TRUE,
           model_registry="vertex_ai",
           vertex_ai_model_id="customer_churn_2",
           vertex_ai_model_version_aliases=["test-version"]
           )
    AS
    SELECT *
    FROM `{project_id}.{dataset_name}.{training_dataset}`
    
    """

# client.query() only submits the job. Calling .result() blocks until training
# has finished and raises if the CREATE MODEL statement failed.
client.query(query).result()