------Preliminary Data Exploration Script-------

Connects to the community Google Sheet using gspread and environment-stored credentials,
loads the data into a Pandas DataFrame, and performs a preliminary column analysis.
Identifies empty columns, inspects the first 37 relevant columns for Silver-layer modeling,
and flags additional columns for exclusion. 

Note:
Raw data in all columns is still ingested into the Bronze layer as JSON, preserving full historical information while allowing targeted extraction for Silver-layer tables later.


In [1]:
#libraries
import os
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd


In [2]:

# Google Sheet connection settings, read from environment variables.
# A variable that is not set yields None (only the worksheet name has a fallback).
_env = os.environ
UGASHEET_CREDENTIALS = _env.get("UGASHEET_CRED_PATH")  # path to the service-account JSON key file
SHEET_KEY = _env.get("UGASHEET_KEY")                   # spreadsheet key
# NOTE(review): the fallback is spelled "atendees" - confirm it matches the real worksheet tab name.
WORKSHEET_NAME = _env.get("UGASHEET_NAME", "atendees")
# NOTE(review): DATABASE_URL is not referenced in the cells visible here - presumably used later.
DATABASE_URL = _env.get("DB_URL")



In [3]:
# Authenticate with the service-account key file and open the target worksheet.
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]
creds = ServiceAccountCredentials.from_json_keyfile_name(UGASHEET_CREDENTIALS, scope)

# Authorised gspread client -> spreadsheet (looked up by key) -> worksheet (looked up by name).
gc = gspread.authorize(creds)
sh = gc.open_by_key(SHEET_KEY)
ws = sh.worksheet(WORKSHEET_NAME)

In [4]:
# Pull every cell of the worksheet; the first row of the sheet holds the column headers.
sheet_rows = ws.get_all_values()
headers = sheet_rows[0]
data = sheet_rows[1:]  # everything after the header row is a record

# Build the DataFrame from the records, labelled with the header row.
df = pd.DataFrame(data, columns=headers)

In [None]:
# Summarise the frame (column names, non-null counts, dtypes).
# The sheet values arrive as strings and blank cells as '' rather than NaN, so a plain
# df.info() reports a full non-null count for every column. Treat whitespace-only cells
# as missing for this summary so that empty columns actually show up as empty.
# `df` itself is not modified.
df.replace(r'^\s*$', pd.NA, regex=True).info()

In [None]:
# Keep the header labels (a pandas Index) under a name of their own, then show them.
column_headers = df.columns
print(f"Column Headers: {column_headers}")

In [None]:
# Identify columns whose header is missing or blank.
# A positional boolean mask is used instead of selecting by label: several columns can
# share the same blank label, and label selection would return each of them once per
# repetition, inflating the result.
blank_header_mask = [col is None or str(col).strip() == '' for col in df.columns]
empty_col_headers = df.loc[:, blank_header_mask]

# Check whether these header-less columns hold any content.
# Blank sheet cells are '' (not NaN), so non-null counts cannot answer this; count the
# cells that contain non-whitespace text instead. (DataFrame.info() also returns None,
# so printing its result only ever showed "None".)
non_blank_cells = [
    int(empty_col_headers.iloc[:, pos].astype(str).str.strip().ne('').sum())
    for pos in range(empty_col_headers.shape[1])
]
print("Columns with empty headers:", empty_col_headers.shape[1])
print("Non-blank cells per empty-header column:", non_blank_cells)


In [8]:

# Keep only the columns that have a non-blank header.
# The mask is recomputed from df.columns rather than dropping by label, so this cell is
# safe to re-run: once the blank-header columns are gone the mask is all True and the
# frame is returned unchanged (a label-based drop would raise KeyError on the second run).
has_header = [not (col is None or str(col).strip() == '') for col in df.columns]
df = df.loc[:, has_header].copy()


In [None]:
# Summary statistics for the first 20 columns (positions 0-19).
first_block = df.iloc[:, 0:20]
first_block.describe(include='all')


In [None]:
# Summary statistics for the following 20 columns (positions 20-39).
second_block = df.iloc[:, 20:40]
second_block.describe(include='all')

In [None]:
# Summary statistics for the remaining columns (position 40 onwards).
# Starts at 40 so it continues where the 20:40 slice stops instead of repeating
# columns 35-39, and uses an open-ended slice so no trailing column is skipped if the
# sheet has more columns than expected.
df.iloc[:, 40:].describe(include='all')

Note: The last columns, from position 37 to 51, are completely empty. They will be dropped before loading to the
Silver layer of the database, along with the columns that have no headers.

In [None]:
# Flag the trailing block of columns (iloc[:, 37:51], i.e. positions 37-50) for
# exclusion from the Silver layer.
trailing_columns = df.iloc[:, 37:51]
columns_to_drop = trailing_columns.columns.tolist()
print("Columns to drop:", columns_to_drop)

# Safety check: confirm the flagged columns really are empty before anything is dropped.
# Blank sheet cells are '' (not NaN), so look for cells containing non-whitespace text.
# Columns are visited by position so repeated header labels are handled correctly.
columns_with_data = [
    trailing_columns.columns[pos]
    for pos in range(trailing_columns.shape[1])
    if trailing_columns.iloc[:, pos].astype(str).str.strip().ne('').any()
]
if columns_with_data:
    print("WARNING: flagged columns that still contain data:", columns_with_data)