# In [1]:  (notebook cell marker left over from export)
import boto3
import pandas as pd
from io import StringIO

def _time_of_day(hour):
    """Map an hour of the day to a coarse time-of-day label.

    06-11 -> 'Morning', 12-17 -> 'Afternoon', 18-21 -> 'Evening',
    everything else -> 'Night'.
    """
    if 6 <= hour < 12:
        return 'Morning'
    if 12 <= hour < 18:
        return 'Afternoon'
    if 18 <= hour < 22:
        return 'Evening'
    return 'Night'


def _read_csv_from_s3(s3, bucket, key):
    """Download ``key`` from ``bucket`` and parse it as a UTF-8 encoded CSV."""
    response = s3.get_object(Bucket=bucket, Key=key)
    content = response['Body'].read().decode('utf-8')
    return pd.read_csv(StringIO(content))


def _prepare_taxi(taxi_df):
    """Clean the green-taxi frame and add the ``source_date``/``hour`` merge keys."""
    # 'ehail_fee' is removed before dropna(); the original code did the same,
    # presumably because the column is mostly empty and would otherwise cause
    # every row to be discarded -- TODO confirm against the real data.
    # Nulls are dropped BEFORE the astype(str) casts below: casting first
    # turns NaN into the literal string 'nan', which dropna() no longer sees.
    taxi_df = taxi_df.drop(columns='ehail_fee').dropna().copy()

    taxi_df['lpep_pickup_datetime'] = pd.to_datetime(taxi_df['lpep_pickup_datetime'])
    taxi_df['lpep_dropoff_datetime'] = pd.to_datetime(taxi_df['lpep_dropoff_datetime'])

    # These columns are cast to strings so they are not treated as numbers.
    for column in ('RatecodeID', 'payment_type', 'trip_type'):
        taxi_df[column] = taxi_df[column].astype(str)

    # Merge keys. The hour must come from the full pickup timestamp: taking
    # it from the midnight-normalised 'source_date' would always yield 0.
    pickup = taxi_df['lpep_pickup_datetime']
    taxi_df['source_date'] = pickup.dt.normalize()
    taxi_df['hour'] = pickup.dt.hour
    return taxi_df


def _prepare_weather(weather_df):
    """Select the weather columns of interest and add merge keys and ``time_of_day``."""
    wanted = ['datetime', 'temp', 'feelslike', 'dew', 'humidity',
              'precip', 'snow', 'windspeed', 'conditions', 'icon']
    # .copy() so the assignments below write to an independent frame rather
    # than a slice of the input (avoids SettingWithCopyWarning).
    weather_df = weather_df[wanted].copy()
    weather_df['datetime'] = pd.to_datetime(weather_df['datetime'])

    weather_df['source_date'] = weather_df['datetime'].dt.normalize()
    weather_df['hour'] = weather_df['datetime'].dt.hour
    weather_df['time_of_day'] = weather_df['hour'].apply(_time_of_day)
    return weather_df


def lambda_handler(event, context, *, s3_client=None):
    """Merge the taxi and weather CSVs from the landing bucket into the staging bucket.

    Reads ``green_taxi.csv`` and ``weather.csv`` from the
    ``group-6-landing-zone`` bucket, cleans both, inner-joins them on the
    pickup date and hour, and writes the result as
    ``taxi_weather_merged.csv`` to the ``group-6-staging-zone`` bucket.

    Args:
        event: Invocation event; not used.
        context: Invocation context; not used.
        s3_client: Optional object exposing ``get_object`` and ``put_object``
            with the boto3 S3 client signatures. When omitted, a client is
            created with ``boto3.client('s3')``.

    Returns:
        A dict with ``statusCode`` 200 and a success message in ``body``.

    Raises:
        KeyError: If an expected column is missing from either input file.
        Any exception raised by the S3 client or by pandas parsing is
        propagated unchanged.
    """
    # Source and destination S3 bucket names
    source_bucket_name = 'group-6-landing-zone'
    destination_bucket_name = 'group-6-staging-zone'

    # Keys of the input files and of the merged output
    taxi_key = 'green_taxi.csv'
    weather_key = 'weather.csv'
    destination_key = 'taxi_weather_merged.csv'

    s3 = s3_client if s3_client is not None else boto3.client('s3')

    # Fetch and parse both input files
    taxi_df = _prepare_taxi(_read_csv_from_s3(s3, source_bucket_name, taxi_key))
    weather_df = _prepare_weather(_read_csv_from_s3(s3, source_bucket_name, weather_key))

    # Attach the weather row for the same date and hour to every taxi trip
    merged_df = pd.merge(taxi_df, weather_df, on=['source_date', 'hour'], how='inner')
    # 'Unnamed: 0' is the name pandas gives a header-less leading column
    # (typically a saved index); it is dropped if present and ignored if not.
    merged_df = merged_df.drop(columns=['Unnamed: 0'], errors='ignore')

    # Serialise and upload the merged data
    transformed_csv = merged_df.to_csv(index=False)
    s3.put_object(Body=transformed_csv, Bucket=destination_bucket_name, Key=destination_key)

    return {
        'statusCode': 200,
        'body': 'Transformation completed successfully!'
    }