## Import 

In [1]:
import os
import boto3
import pandas as pd
from datetime import datetime
from botocore.exceptions import NoCredentialsError

from dotenv import load_dotenv

load_dotenv()

S3_ACCESSPOINT = os.getenv('S3_ACCESSPOINT')
AWS_PROFILE= os.getenv('AWS_PROFILE')
apiKey = os.getenv('API_KEY')

## Extracrt data

In [2]:
demographic = pd.read_csv('Demographic_Data.csv')
geographic = pd.read_csv('Geographic_Data.csv')

In [3]:
demographic

Unnamed: 0,Location,Population,Density (people/km²),Urbanization (%),Average Income (USD),Education Level (% with Bachelor's or higher)
0,Los Angeles,10039107,3276,89,60000,32
1,Paris,2140526,21383,98,45000,29
2,Tokyo,13929286,6169,100,55000,37
3,Antananarivo,1391433,3097,69,1000,10
4,Nairobi,4397073,6000,61,1500,12
5,Lima,9674755,3220,81,4500,15


In [4]:
geographic

Unnamed: 0,Location,Altitude (m),Proximity to Industry (km)
0,Los Angeles,89,5.0
1,Paris,35,3.0
2,Tokyo,40,2.0
3,Antananarivo,1276,0.5
4,Nairobi,1795,1.0
5,Lima,1540,1.5


## Transform

In [5]:
location_data = pd.merge(geographic, demographic, on='Location')
location_data 

Unnamed: 0,Location,Altitude (m),Proximity to Industry (km),Population,Density (people/km²),Urbanization (%),Average Income (USD),Education Level (% with Bachelor's or higher)
0,Los Angeles,89,5.0,10039107,3276,89,60000,32
1,Paris,35,3.0,2140526,21383,98,45000,29
2,Tokyo,40,2.0,13929286,6169,100,55000,37
3,Antananarivo,1276,0.5,1391433,3097,69,1000,10
4,Nairobi,1795,1.0,4397073,6000,61,1500,12
5,Lima,1540,1.5,9674755,3220,81,4500,15


In [6]:
coord = pd.DataFrame(
  {
    'lat': [  34.052235, 48.866667,  34.886306, -18.777192,  0.170945,  -9.181352],
    'lon': [-118.243683,  2.333333, 134.379711,  46.854328, 37.903969, -75.002365],
    'Location': ['Los Angeles', 'Paris', 'Tokyo', 'Antananarivo', 'Nairobi', 'Lima']
  })
coord

Unnamed: 0,lat,lon,Location
0,34.052235,-118.243683,Los Angeles
1,48.866667,2.333333,Paris
2,34.886306,134.379711,Tokyo
3,-18.777192,46.854328,Antananarivo
4,0.170945,37.903969,Nairobi
5,-9.181352,-75.002365,Lima


In [7]:
location_data = pd.merge(location_data, coord, on='Location')
location_data

Unnamed: 0,Location,Altitude (m),Proximity to Industry (km),Population,Density (people/km²),Urbanization (%),Average Income (USD),Education Level (% with Bachelor's or higher),lat,lon
0,Los Angeles,89,5.0,10039107,3276,89,60000,32,34.052235,-118.243683
1,Paris,35,3.0,2140526,21383,98,45000,29,48.866667,2.333333
2,Tokyo,40,2.0,13929286,6169,100,55000,37,34.886306,134.379711
3,Antananarivo,1276,0.5,1391433,3097,69,1000,10,-18.777192,46.854328
4,Nairobi,1795,1.0,4397073,6000,61,1500,12,0.170945,37.903969
5,Lima,1540,1.5,9674755,3220,81,4500,15,-9.181352,-75.002365


In [8]:
def get_polution_aqi_data(row, apiKey):
    lat = row['lat']
    lon = row['lon']
    response = requests.get(f'http://api.openweathermap.org/data/2.5/air_pollution?lat={lat}&lon={lon}&appid={apiKey}')
    data = response.json()
    return data['list'][0]['main']['aqi']

def get_polution_components_data(row, apiKey):
    lat = row['lat']
    lon = row['lon']
    response = requests.get(f'http://api.openweathermap.org/data/2.5/air_pollution?lat={lat}&lon={lon}&appid={apiKey}')
    data = response.json()
    return data['list'][0]['components']

location_data['AQI Pollution'] = location_data.apply(lambda row: get_polution_aqi_data(row, apiKey), axis=1)
components_df = location_data.apply(lambda row: pd.Series(get_polution_components_data(row, apiKey)), axis=1)

location_data = location_data.join(components_df)

location_data.drop(columns='lat', axis=1, inplace=True)
location_data.drop(columns='lon', axis=1, inplace=True)
location_data['date'] = datetime.now().date()
location_data.set_index('Location', inplace=True)

location_data.head(5)

Unnamed: 0_level_0,Altitude (m),Proximity to Industry (km),Population,Density (people/km²),Urbanization (%),Average Income (USD),Education Level (% with Bachelor's or higher),AQI Pollution,co,no,no2,o3,so2,pm2_5,pm10,nh3,date
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Los Angeles,89,5.0,10039107,3276,89,60000,32,3,514.03,0.2,89.11,20.03,9.78,12.97,22.22,3.9,2024-08-10
Paris,35,3.0,2140526,21383,98,45000,29,2,317.1,1.29,20.22,7.51,2.06,10.7,18.77,1.6,2024-08-10
Tokyo,40,2.0,13929286,6169,100,55000,37,2,303.75,0.03,0.25,87.26,0.08,2.81,3.62,0.27,2024-08-10
Antananarivo,1276,0.5,1391433,3097,69,1000,10,1,293.73,0.04,0.53,49.35,0.4,2.01,4.48,0.78,2024-08-10
Nairobi,1795,1.0,4397073,6000,61,1500,12,1,263.69,0.02,0.9,50.78,0.29,3.5,8.22,1.54,2024-08-10


## Load data

In [None]:
def upload_to_aws_s3(local_file, bucket, s3_file, profile_name):
    if profile_name:
        session = boto3.Session(profile_name=profile_name)
    else:
        session = boto3.Session()
    s3 = session.client('s3')
    try:
        s3.upload_file(local_file, bucket, s3_file)
        print(f"Upload Successful: {s3_file}")
        return True
    except FileNotFoundError:
        print("The file was not found")
        return False
    except NoCredentialsError:
        print("Credentials not available")
        return False


file_name = f'air_pollution_{datetime.now().date()}.csv'
location_data.to_csv(file_name)
upload_to_aws_s3(
    file_name,
    S3_ACCESSPOINT,
    f'AirPollutionData/{file_name}',
    AWS_PROFILE
)