In [None]:
import requests
import pandas as pd
import boto3
import os
import logging
from datetime import datetime, timedelta

# Configure logging
logging.basicConfig(level=logging.INFO)

# AWS S3 / local file configuration.
# Every value can be overridden through an environment variable so the notebook
# can run on another machine or against another bucket without editing code.
# The defaults are the values that were previously hard-coded here.
# NOTE: the Snowflake stage defined later in this notebook points at the same
# bucket/key, so keep the two in sync when overriding.
S3_BUCKET_NAME = os.environ.get("CO2_S3_BUCKET", "bigdata2025assignment3")
S3_FILE_NAME = os.environ.get("CO2_S3_KEY", "co2_daily.csv")  # The file name (object key) in S3
LOCAL_FILE_PATH = os.environ.get(
    "CO2_LOCAL_FILE_PATH",
    "/Users/macbookair/Desktop/Assignment_3/co2_daily.csv",
)  # Local file path

# NOAA CO2 Dataset URL
url = "https://gml.noaa.gov/webdata/ccgg/trends/co2/co2_daily_mlo.txt"

# Function to convert decimal year to actual date
def decimal_to_date(decimal_year):
    """Convert a decimal year (e.g. ``1974.3781``) to a ``'YYYY-MM-DD'`` string.

    The fractional part is treated as the fraction of the year that has elapsed
    at the start of the day, so ``2020.0`` maps to ``2020-01-01``. The real
    length of the given year (365 or 366 days) is used, so leap years convert
    correctly.

    Args:
        decimal_year: Numeric year with a fractional part.

    Returns:
        The date formatted as ``'%Y-%m-%d'``, or ``None`` when the value cannot
        be converted (for example NaN or a year outside ``datetime``'s range);
        the failure is logged.
    """
    try:
        # Split into the whole year and the elapsed fraction of that year
        year = int(decimal_year)
        decimal_part = decimal_year - year

        # January 1st of the given year and the real number of days in it
        start_date = datetime(year, 1, 1)
        days_in_year = (datetime(year + 1, 1, 1) - start_date).days

        # Zero-based offset from January 1st. Round instead of truncating:
        # published decimal years are rounded to a few digits, so the product
        # can land slightly below the intended whole number of days.
        day_offset = round(decimal_part * days_in_year)

        # Keep the result inside the stated year even if the fraction is ~1.0
        day_offset = min(max(day_offset, 0), days_in_year - 1)

        actual_date = start_date + timedelta(days=day_offset)
        return actual_date.strftime('%Y-%m-%d')
    except Exception as e:
        logging.error(f"Error converting decimal year {decimal_year} to date: {e}")
        return None

# Fetch data from the URL with error handling
try:
    # A timeout prevents the cell from hanging forever if the server stalls
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise an exception for HTTP errors
    data_lines = response.text.splitlines()  # Handles both \n and \r\n endings
except requests.exceptions.RequestException as e:
    logging.error(f"Error fetching data from the URL: {e}")
    raise

# Process the data and extract meaningful information.
# Each data line is expected to start with: year month day decimal_year co2_value
data = []
for line in data_lines:
    stripped = line.strip()
    if not stripped or stripped.startswith("#"):  # Ignore comments and empty lines
        continue

    parts = stripped.split()
    if len(parts) < 5:
        continue

    try:
        year, month, day, _decimal_year, co2_value = parts[:5]

        # Build the date from the explicit year/month/day columns; this is exact,
        # unlike reconstructing it from the rounded decimal-year column.
        date = datetime(int(year), int(month), int(day)).strftime('%Y-%m-%d')

        data.append([date, float(co2_value)])
    except ValueError:
        # Raised by int()/float() on non-numeric fields or by datetime() on an
        # impossible calendar date.
        logging.warning(f"Skipping invalid data: {line}")

# Convert to Pandas DataFrame
df = pd.DataFrame(data, columns=["date", "co2_ppm"])

# Refuse to continue with an empty dataset: writing it out would lead to the
# existing object in S3 being overwritten with a header-only file.
if df.empty:
    logging.error("No CO2 records could be parsed from the downloaded data")
    raise ValueError("No CO2 records could be parsed from the downloaded data")

# Save DataFrame as a CSV file
df.to_csv(LOCAL_FILE_PATH, index=False)
logging.info(f"Data saved locally as {LOCAL_FILE_PATH}")

# Upload CSV to S3 using boto3's default session
def upload_to_s3(local_file, bucket, s3_file):
    """Upload a local file to an S3 bucket.

    Credentials are resolved by boto3's default credential chain (environment
    variables, shared credentials/config file, or an attached IAM role). They
    must never be embedded in the notebook; any key that was previously
    committed here should be considered compromised and rotated.

    Args:
        local_file: Path of the file on local disk.
        bucket: Name of the destination S3 bucket.
        s3_file: Object key to write inside the bucket.

    Raises:
        Exception: Any error raised while creating the client or uploading is
            logged and re-raised unchanged.
    """
    try:
        # Region can be overridden with AWS_DEFAULT_REGION; the fallback is the
        # region this notebook was originally written for.
        s3 = boto3.client(
            "s3",
            region_name=os.environ.get("AWS_DEFAULT_REGION", "us-east-2"),
        )
        # Upload the file to S3
        s3.upload_file(local_file, bucket, s3_file)
        logging.info(f"File uploaded successfully to s3://{bucket}/{s3_file}")
    except Exception as e:
        logging.error(f"Error uploading file to S3: {e}")
        raise

# Push the locally saved CSV to the configured bucket and object key
upload_to_s3(
    local_file=LOCAL_FILE_PATH,
    bucket=S3_BUCKET_NAME,
    s3_file=S3_FILE_NAME,
)

In [18]:

-- Session variables used by the setup statements below.
SET MY_USER = CURRENT_USER();

-- SECURITY: never commit a real GitHub token. Replace the placeholder at run
-- time only. A token that was previously committed in this cell must be treated
-- as leaked and revoked in GitHub.
SET GITHUB_SECRET_USERNAME = 'Bigdata2025Team5';
SET GITHUB_SECRET_PASSWORD = '<GITHUB_PERSONAL_ACCESS_TOKEN>';
SET GITHUB_URL_PREFIX = 'https://github.com/Bigdata2025Team5';
SET GITHUB_REPO_ORIGIN = 'https://github.com/Bigdata2025Team5/Assignment_3.git';


-- ----------------------------------------------------------------------------
-- Account level objects (run as ACCOUNTADMIN): role, database, warehouse
-- ----------------------------------------------------------------------------

USE ROLE ACCOUNTADMIN;

-- Role: (re)create CO2_ROLE and attach it to SYSADMIN and to the current user
CREATE OR REPLACE ROLE CO2_ROLE;
GRANT ROLE CO2_ROLE TO ROLE SYSADMIN;
GRANT ROLE CO2_ROLE TO USER IDENTIFIER($MY_USER);

-- Account-wide privileges for CO2_ROLE
GRANT CREATE INTEGRATION,
      EXECUTE TASK,
      EXECUTE MANAGED TASK,
      MONITOR EXECUTION
  ON ACCOUNT TO ROLE CO2_ROLE;
GRANT IMPORTED PRIVILEGES ON DATABASE SNOWFLAKE TO ROLE CO2_ROLE;

-- Database: (re)create and hand ownership to CO2_ROLE
CREATE OR REPLACE DATABASE CO2_DB;
GRANT OWNERSHIP ON DATABASE CO2_DB TO ROLE CO2_ROLE;

-- Warehouse: (re)create and hand ownership to CO2_ROLE
CREATE OR REPLACE WAREHOUSE CO2_WH
  WITH WAREHOUSE_SIZE = XSMALL
       AUTO_SUSPEND = 300
       AUTO_RESUME = TRUE;
GRANT OWNERSHIP ON WAREHOUSE CO2_WH TO ROLE CO2_ROLE;

-- ----------------------------------------------------------------------------
-- Create the database level objects
-- ----------------------------------------------------------------------------
USE ROLE CO2_ROLE;
USE WAREHOUSE CO2_WH;
USE DATABASE CO2_DB;

-- Schemas
CREATE OR REPLACE SCHEMA INTEGRATIONS;
CREATE OR REPLACE SCHEMA RAW_CO2;
CREATE OR REPLACE SCHEMA Harmonized_CO2;
CREATE OR REPLACE SCHEMA Analytics_CO2;

CREATE OR REPLACE SCHEMA DEV_SCHEMA;
CREATE OR REPLACE SCHEMA PROD_SCHEMA;

-- External stage over the CSV uploaded to S3.
-- SECURITY: never commit real AWS keys. Fill in the placeholders at run time
-- only, or preferably replace CREDENTIALS with a STORAGE_INTEGRATION so that no
-- long-lived key is stored in SQL. Keys that were previously committed in this
-- cell must be treated as leaked and rotated in AWS IAM.
CREATE OR REPLACE STAGE RAW_CO2.CO2_EXTERNAL_STAGE
  URL = 's3://bigdata2025assignment3/co2_daily.csv'
  CREDENTIALS = (
    AWS_KEY_ID = '<AWS_ACCESS_KEY_ID>'
    AWS_SECRET_KEY = '<AWS_SECRET_ACCESS_KEY>'
  );
 
-- Secrets (schema level)
-- Stores the GitHub username/token held in the session variables
-- $GITHUB_SECRET_USERNAME / $GITHUB_SECRET_PASSWORD.
-- NOTE(review): the secret name is unqualified and no USE SCHEMA precedes it,
-- so it is created in whichever schema is current for the session -- confirm
-- that this is the intended schema.
CREATE OR REPLACE SECRET DEMO_GITHUB_SECRET
  TYPE = password
  USERNAME = $GITHUB_SECRET_USERNAME
  PASSWORD = $GITHUB_SECRET_PASSWORD;

-- API Integration (account level)
-- The role is switched to ACCOUNTADMIN here and stays in effect for the
-- statements that follow until another USE ROLE is issued.
USE ROLE ACCOUNTADMIN;

-- Allows HTTPS Git access to URLs under $GITHUB_URL_PREFIX, authenticating
-- with the secret created above.
CREATE OR REPLACE API INTEGRATION DEMO_GITHUB_API_INTEGRATION
  API_PROVIDER = GIT_HTTPS_API
  API_ALLOWED_PREFIXES = ($GITHUB_URL_PREFIX)
  ALLOWED_AUTHENTICATION_SECRETS = (DEMO_GITHUB_SECRET)
  ENABLED = TRUE;

-- Git Repository
-- Registers the repository at $GITHUB_REPO_ORIGIN through the integration and
-- secret above. The name is unqualified, so it is also created in the current
-- schema.
CREATE OR REPLACE GIT REPOSITORY DEMO_GIT_REPO
  API_INTEGRATION = DEMO_GITHUB_API_INTEGRATION
  GIT_CREDENTIALS = DEMO_GITHUB_SECRET
  ORIGIN = $GITHUB_REPO_ORIGIN;


-- Create the raw objects as CO2_ROLE. Without this switch the statements below
-- run under the ACCOUNTADMIN role selected earlier in this cell, and the table
-- and stream would be owned by ACCOUNTADMIN rather than by the role that owns
-- the database and stage.
USE ROLE CO2_ROLE;

-- Raw landing table: one row per line of the staged CSV (date kept as text).
CREATE OR REPLACE TABLE RAW_CO2.Daily_Measurements (
    date STRING,
    co2_ppm FLOAT
);

-- Load the staged CSV. The first line is the header written by the export.
-- ON_ERROR = CONTINUE skips rows that fail to parse instead of aborting the
-- load, so check the COPY result for rejected rows.
COPY INTO RAW_CO2.Daily_Measurements
    FROM @RAW_CO2.CO2_EXTERNAL_STAGE
    FILE_FORMAT = (
        TYPE = CSV
        SKIP_HEADER = 1
        FIELD_OPTIONALLY_ENCLOSED_BY = '"'
    )
    ON_ERROR = CONTINUE;

-- Change-tracking stream on the raw table.
CREATE OR REPLACE STREAM RAW_CO2.DAILY_MEASUREMENTS_STREAM ON TABLE RAW_CO2.DAILY_MEASUREMENTS;

-- Event table and logging level (run as ACCOUNTADMIN)
USE ROLE ACCOUNTADMIN;

-- Create the event table and let CO2_ROLE read from and write to it
CREATE EVENT TABLE CO2_DB.INTEGRATIONS.DEMO_EVENTS;
GRANT SELECT, INSERT
  ON EVENT TABLE CO2_DB.INTEGRATIONS.DEMO_EVENTS
  TO ROLE CO2_ROLE;

-- Make it the account's event table and set the log level for CO2_DB to INFO
ALTER ACCOUNT SET EVENT_TABLE = CO2_DB.INTEGRATIONS.DEMO_EVENTS;
ALTER DATABASE CO2_DB SET LOG_LEVEL = INFO;

SyntaxError: invalid syntax (1166821928.py, line 1)

In [None]:
from snowflake.snowpark import Session
from snowflake.snowpark.functions import col, lag, when
from snowflake.snowpark.window import Window
 
def calculate_co2_percentage_change(session: Session):
    """
    Add the row-over-row percentage change of CO2_PPM to the harmonized data.

    Reads CO2_DB.HARMONIZED_CO2.CO2_EMISSIONS_HARMONIZED, adds two columns:
      * PREVIOUS_CO2: CO2_PPM of the preceding row when ordered by DATE
      * PERCENTAGE_CHANGE: (CO2_PPM - PREVIOUS_CO2) / PREVIOUS_CO2 * 100,
        NULL where there is no preceding row
    then displays the result and overwrites the table
    CO2_DB.HARMONIZED_CO2.CO2_EMISSIONS_HARMONIZED_WITH_PERCENTAGE_CHANGE.

    Returns the resulting DataFrame, or None if any step raised (the error is
    printed rather than propagated).
    """
    source_table = "CO2_DB.HARMONIZED_CO2.CO2_EMISSIONS_HARMONIZED"
    target_table = "CO2_DB.HARMONIZED_CO2.CO2_EMISSIONS_HARMONIZED_WITH_PERCENTAGE_CHANGE"

    try:
        # Value of CO2_PPM on the preceding row, ordering the whole table by DATE
        by_date = Window.orderBy(col("DATE"))
        previous_ppm = lag(col("CO2_PPM")).over(by_date)

        # Relative change in percent; left NULL for the first row
        pct_change = when(
            col("PREVIOUS_CO2").isNotNull(),
            ((col("CO2_PPM") - col("PREVIOUS_CO2")) / col("PREVIOUS_CO2")) * 100,
        ).otherwise(None)

        result_df = (
            session.table(source_table)
            .with_column("PREVIOUS_CO2", previous_ppm)
            .with_column("PERCENTAGE_CHANGE", pct_change)
        )

        # Display a preview, then persist the enriched data
        result_df.show()
        result_df.write.mode("overwrite").save_as_table(target_table)

        return result_df

    except Exception as e:
        print(f"Error in calculate_co2_percentage_change: {e}")
        return None
 
# Entry point: obtain a Snowpark session and run the percentage-change step
if __name__ == "__main__":
    try:
        # Reuse an existing session if there is one, otherwise create it
        session_builder = Session.builder.appName("CO2_Percentage_Change")
        session = session_builder.getOrCreate()

        # Run the transformation; its return value is not used here
        calculate_co2_percentage_change(session)

    except Exception as exc:
        # Report anything raised while creating the session or running the step
        print(f"Error in main execution: {exc}")
 

In [None]:
-- Table function returning, for each calendar month, the average CO2_PPM over
-- all rows of CO2_EMISSIONS_HARMONIZED that fall in that month, and how far
-- that average lies from the mean of CO2_PPM over the whole table.
-- Note: the reference mean is computed across every row (all years together),
-- not separately per year.
CREATE OR REPLACE FUNCTION CO2_DB.HARMONIZED_CO2.CALCULATE_SEASONAL_VARIATION()
RETURNS TABLE (
    MONTH INT,
    AVG_CO2_PPM FLOAT,
    DEVIATION_FROM_ANNUAL_MEAN FLOAT
)
AS
$$
    WITH overall_mean AS (
        -- Single-row CTE: mean CO2_PPM across the whole table
        SELECT AVG(CO2_PPM) AS mean_ppm
        FROM CO2_DB.HARMONIZED_CO2.CO2_EMISSIONS_HARMONIZED
    ),
    per_month AS (
        -- One row per calendar month number
        SELECT
            EXTRACT(MONTH FROM DATE)::INT AS MONTH,
            AVG(CO2_PPM) AS AVG_CO2_PPM
        FROM CO2_DB.HARMONIZED_CO2.CO2_EMISSIONS_HARMONIZED
        GROUP BY EXTRACT(MONTH FROM DATE)
    )
    SELECT
        pm.MONTH,
        pm.AVG_CO2_PPM,
        pm.AVG_CO2_PPM - om.mean_ppm AS DEVIATION_FROM_ANNUAL_MEAN
    FROM per_month AS pm
    CROSS JOIN overall_mean AS om
    ORDER BY pm.MONTH
$$;

