## EDA Part 1

### Load credentials from the project root `.env` file

In [1]:
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# With no arguments, python-dotenv searches for a file named ".env" starting
# from the current working directory when run interactively (as in a notebook).
load_dotenv()

# NOTE: the two credential lookups use the exact spelling found in .env,
# including the mixed case and the "Acess" typo. Renaming them here without
# renaming the keys in .env would make both lookups return None.
AWS_ACCESS_KEY_ID = os.getenv("AWS_Acess_Key_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_Secret_Access_Key")
AWS_REGION = os.getenv("AWS_REGION", "us-east-1")  # falls back to us-east-1 when unset

# Athena settings; os.getenv returns None for either one if it is not defined.
ATHENA_S3_OUTPUT = os.getenv("ATHENA_S3_OUTPUT")
ATHENA_DATABASE = os.getenv("ATHENA_DATABASE")

# Fail fast with a readable message instead of an opaque
# "TypeError: 'NoneType' object is not subscriptable" on the slice below
# (or a signing error later on) when a credential variable is absent.
_missing_credentials = [
    env_name
    for env_name, env_value in (
        ("AWS_Acess_Key_ID", AWS_ACCESS_KEY_ID),
        ("AWS_Secret_Access_Key", AWS_SECRET_ACCESS_KEY),
    )
    if env_value is None
]
if _missing_credentials:
    raise RuntimeError(
        "Missing environment variable(s): "
        + ", ".join(_missing_credentials)
        + ". Check that .env exists and defines them."
    )

# Sanity check: show only the first 4 characters of the key id, never the secret.
AWS_ACCESS_KEY_ID[:4], AWS_REGION, ATHENA_DATABASE


('AKIA', 'us-east-1', 's3_atmos_stclair')

### Create a boto3 session for awswrangler to use when loading the data

In [2]:
import boto3
import awswrangler as wr
import pandas as pd

# Build a boto3 session from the explicitly loaded credentials and region.
# This object is handed to the awswrangler calls via `boto3_session=`.
session = boto3.Session(
    region_name=AWS_REGION,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)


### All the Tables in the Athena Database

In [3]:
# Fetch the catalog listing for the configured database; the "Table" column
# holds the table names.
df_tables = wr.catalog.tables(boto3_session=session, database=ATHENA_DATABASE)
table_names = df_tables["Table"].tolist()

print(f"Table names: {table_names}")
print(f"Number of tables: {len(table_names)}")

Table names: ['achievement_user_completions', 'achievements', 'banner_clicks', 'bdg_user_completions', 'certification_users', 'comment_likes', 'community_comments', 'country_states', 'instances', 'instances_v', 'li_attempts', 'li_stats_agg_all', 'li_stats_agg_all_v', 'mi_user_mission_step_completions', 'offboarded_instances', 'poll_user_completions', 'rank_user_completions', 'rcs_historic_users', 'rcs_historic_users_v', 'rcs_lifetime_stats_by_month_v', 'rcs_user_lifetimes_no_stats_v', 'reward_prizes', 'reward_user_claims', 'reward_user_claims_v', 'rewards', 'sale_user_submissions', 'snippet_view_logs', 'snippets', 'survey_user_completions', 'test2_v', 'test_v', 'users']
Number of tables: 32


In [4]:
# Display the catalog listing itself: database, table type, column names and
# partitions for each table.
df_tables

Unnamed: 0,Database,Table,Description,TableType,Columns,Partitions
0,s3_atmos_stclair,achievement_user_completions,,EXTERNAL_TABLE,"instance_id, generated_for_date, generated_ts,...",
1,s3_atmos_stclair,achievements,,EXTERNAL_TABLE,"instance_id, generated_for_date, generated_ts,...",
2,s3_atmos_stclair,banner_clicks,,EXTERNAL_TABLE,"instance_id, generated_for_date, generated_ts,...",
3,s3_atmos_stclair,bdg_user_completions,,EXTERNAL_TABLE,"instance_id, generated_for_date, generated_ts,...",
4,s3_atmos_stclair,certification_users,,EXTERNAL_TABLE,"instance_id, generated_for_date, generated_ts,...",
5,s3_atmos_stclair,comment_likes,,EXTERNAL_TABLE,"instance_id, generated_for_date, generated_ts,...",
6,s3_atmos_stclair,community_comments,,EXTERNAL_TABLE,"instance_id, generated_for_date, generated_ts,...",
7,s3_atmos_stclair,country_states,,EXTERNAL_TABLE,"country_short_code, country_name, state_name, ...",
8,s3_atmos_stclair,instances,,EXTERNAL_TABLE,"instance_id, client_name, site_name, site_url,...",
9,s3_atmos_stclair,instances_v,,VIRTUAL_VIEW,"instance_id, client_name, region_name, site_na...",


### Using rcs_lifetime_stats_by_month_v table for EDA

### Dataset Overview & Data Types Distribution

In [5]:
# Fetch up to 10,000 rows from the monthly lifetime-stats view.
# The source is a virtual view, so the query may take longer to execute.
# NOTE: despite its name, `query_count` holds a row-fetching SELECT, not a
# COUNT query. There is no ORDER BY, so row order is whatever Athena returns.
query_count = "SELECT * FROM rcs_lifetime_stats_by_month_v LIMIT 10000"

df_rcs_lifetime_stats_by_month_v = wr.athena.read_sql_query(
    sql=query_count,
    boto3_session=session,
    database=ATHENA_DATABASE,
    s3_output=ATHENA_S3_OUTPUT,
    use_threads=True,
    ctas_approach=False,  # run as a regular query rather than via a CTAS temp table
)

print("Shape:", df_rcs_lifetime_stats_by_month_v.shape)

Shape: (978, 30)


Shape: (978, 30)

In [8]:
# Display the fetched frame; the notebook rendering elides the middle rows and
# columns of a frame this size.
df_rcs_lifetime_stats_by_month_v

Unnamed: 0,instance_id,client_name,site_name,region_name,client_region_name,site_region_name,month_date,total_lifetimes,total_user_days_online_this_month,total_users_online_this_month,...,poll_completions,survey_completions,certification_completions,achievement_completions,community_comments,community_replies,community_likes,snippets_created,snippets_viewed,engagement_score
0,DFE7DF87F4419492DD4361FABAE99441,Fitbit,Fitbit Learn,,Fitbit NA,Fitbit Learn NA,2024-02-01,27782,1301,321,...,0,0,0,0,421,144,2264,0,142,3633
1,EA53680F4661F0DE3467171D00E5B6C8,Electrolux,Lära,,Electrolux,Lära,2025-09-01,1888,33,12,...,0,0,0,0,0,0,0,0,16,64
2,22616DD6B409CA4FE178CA8345599B84,UAG,UAG Training,,UAG,UAG Training,2025-06-01,9627,151,38,...,0,0,0,4,0,0,0,0,0,343
3,1625E189E00C17624201AB5919DC12F8,Fitbit,Fitbit Learn,EMEA,Fitbit EMEA,Fitbit Learn EMEA,2023-04-01,4168,296,76,...,4,0,0,6,4,0,10,0,0,379
4,EA53680F4661F0DE3467171D00E5B6C8,Electrolux,Lära,,Electrolux,Lära,2025-08-01,1898,43,16,...,1,0,0,0,0,0,0,0,14,332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
973,DFE7DF87F4419492DD4361FABAE99441,Fitbit,Fitbit Learn,,Fitbit NA,Fitbit Learn NA,2024-03-01,22028,1121,286,...,0,0,0,0,402,142,2057,0,77,3158
974,36410AC36064FFB21FD99D5E628A4D77,Otter,Otter University,,Otter,Otter University,2024-01-01,211286,13290,3593,...,795,184,271,1290,2161,3303,648,0,1787,48147
975,3E752476A44A50560CA4F982F6FF9975,College Athlete Payment System,College Athlete Payment System Learning Hub,,College Athlete Payment System,College Athlete Payment System Learning Hub,2025-09-01,24566,1035,380,...,0,0,0,0,0,0,0,0,0,76
976,A17B5D1C84AC7093025B72EB2653DA56,Lenovo,Lenovo Training Portal,,Lenovo,Lenovo Training Portal,2024-11-01,7716,379,148,...,16,0,0,0,0,0,0,0,0,814


In [15]:
# Fetch a 1,000-row sample of the historic user-status table.
# NOTE(review): `rcs_historic_users` has no `_v` suffix, so it is presumably a
# base table rather than a view (a separate `rcs_historic_users_v` exists) —
# confirm the table type before assuming view-level latency.
# This reassigns `query_count`, overwriting the query string defined earlier.
# There is no ORDER BY, so which 1,000 rows come back is up to Athena.
query_count = "SELECT * FROM rcs_historic_users LIMIT 1000"

df_historic_users = wr.athena.read_sql_query(
    sql=query_count,
    boto3_session=session,
    database=ATHENA_DATABASE,
    s3_output=ATHENA_S3_OUTPUT,
    use_threads=True,
    ctas_approach=False,  # run as a regular query rather than via a CTAS temp table
)

print("Shape:", df_historic_users.shape)


Shape: (1000, 9)


In [16]:
# Display the sampled rows. The output contains per-user records (user_id,
# status and login dates), so review it before sharing the notebook.
df_historic_users

Unnamed: 0,instance_id,aggregated_for_date,user_id,previous_status,current_status,route,last_login,prev_last_login,prev_prev_last_login
0,0D90AF0E048EDA389A6ABD0719057E17,2024-10-28,10129,Lost User,Lost User,No Change,2023-05-04,2023-05-03,NaT
1,0D90AF0E048EDA389A6ABD0719057E17,2024-10-28,10158,Lost User,Lost User,No Change,2023-12-18,2023-12-07,2023-07-18
2,0D90AF0E048EDA389A6ABD0719057E17,2024-10-28,10171,Lost User,Lost User,No Change,2023-04-21,NaT,NaT
3,0D90AF0E048EDA389A6ABD0719057E17,2024-10-28,10174,Lost User,Lost User,No Change,2023-06-21,2023-04-21,NaT
4,0D90AF0E048EDA389A6ABD0719057E17,2024-10-28,10187,At Risk MAU,At Risk MAU,No Change,2024-10-11,2023-12-07,2023-12-06
...,...,...,...,...,...,...,...,...,...
995,0D90AF0E048EDA389A6ABD0719057E17,2024-11-30,10304,Lost User,Lost User,No Change,2023-08-08,2023-08-07,NaT
996,0D90AF0E048EDA389A6ABD0719057E17,2024-11-30,10321,Lost User,Lost User,No Change,2024-03-12,2023-08-30,2023-08-18
997,0D90AF0E048EDA389A6ABD0719057E17,2024-11-30,10326,Lost User,Lost User,No Change,2023-08-21,2023-08-18,NaT
998,0D90AF0E048EDA389A6ABD0719057E17,2024-11-30,10331,Lost User,Lost User,No Change,2023-08-24,NaT,NaT
