In [1]:
import requests
from datetime import datetime
from datetime import date
import pandas as pd
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so the
# lookup below can see them.
load_dotenv()

# Key sent as the `appid` query parameter by get_air_pollution_data; this is
# None when the variable is not set.
API_KEY = os.environ.get('API_KEY')

# City table read from the Excel workbook that sits next to the notebook.
cities_list = pd.read_excel('cities_list.xlsx')

def get_air_pollution_data(API_KEY, lat, lon, timeout=10):
    """Fetch the air-pollution forecast for one coordinate pair.

    Parameters
    ----------
    API_KEY : str
        Value sent as the ``appid`` query parameter.
    lat, lon : float
        Latitude and longitude sent as the ``lat`` / ``lon`` query parameters.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    dict or None
        The decoded JSON body on HTTP 200. ``None`` when the request fails,
        the status is not 200, or the body is not valid JSON; a message is
        printed in each of those cases.
    """
    # HTTPS keeps the API key out of clear text on the wire.
    url = 'https://api.openweathermap.org/data/2.5/air_pollution/forecast'
    # Let requests build and URL-encode the query string.
    params = {'lat': lat, 'lon': lon, 'appid': API_KEY}

    try:
        # Without a timeout a single stalled connection would block forever.
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report connection/timeout problems the same way as a bad status so
        # one failing city does not abort a whole batch.
        print(f'Error requisition: {exc}')
        return None

    if response.status_code != 200:
        print(f'Error requisition: {response.status_code}')
        return None

    try:
        return response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        print('Error requisition: response body is not valid JSON')
        return None

In [3]:
# Display the city table loaded from 'cities_list.xlsx'.
cities_list

Unnamed: 0,name,lon,lat,country
0,Abidjan,-4.00167,5.35444,Ivory Coast
1,Abu Dhabi,54.39696,24.45118,United Arab Emirates
2,Abuja,7.49508,9.05785,Nigeria
3,Accra,-0.19690,5.55602,Ghana
4,Addis Ababa,38.74689,9.02497,Ethiopia
...,...,...,...,...
366,Yaounde,11.51667,3.86667,Cameroon
367,Yekaterinburg,60.61220,56.85190,Russia
368,Yerevan,44.51361,40.18111,Armenia
369,Yokohama,139.65000,35.43333,Japan


In [4]:
def process_pollution_data(data, city, country):
    """Flatten one air-pollution API payload into a list of row dicts.

    Parameters
    ----------
    data : dict or None
        Decoded response; its ``'list'`` entry holds one item per timestamp,
        each with ``'dt'``, ``'main'`` and ``'components'``.
    city, country : str
        Labels copied unchanged into every row.

    Returns
    -------
    list of dict or None
        ``None`` when ``data`` is not a dict or has no ``'list'`` key.
        Otherwise one dict per item with the keys ``dt``, ``aqi``, ``city``,
        ``country`` and the eight pollutant keys, plus any additional keys
        found in ``components``, plus ``lon`` / ``lat`` when the item carries
        a ``'coord'`` dict. Fields missing from an item are set to ``None``
        rather than raising.
    """
    if not isinstance(data, dict) or 'list' not in data:
        return None

    pollutant_keys = ('co', 'no', 'no2', 'o3', 'so2', 'pm2_5', 'pm10', 'nh3')

    formatted_data = []
    for item in data['list'] or []:
        if not isinstance(item, dict):
            # A malformed entry cannot be turned into a row; skip it.
            continue

        # Normalise the nested sections so a missing or malformed one yields
        # None values instead of a KeyError/TypeError.
        main = item.get('main')
        if not isinstance(main, dict):
            main = {}
        components = item.get('components')
        if not isinstance(components, dict):
            components = {}

        row = {
            'dt': item.get('dt'),
            'aqi': main.get('aqi'),
            'city': city,
            'country': country,
        }
        for key in pollutant_keys:
            row[key] = components.get(key)
        # Pass through any pollutant not in the fixed list above.
        row.update(components)

        # NOTE(review): the API payload may carry 'coord' at the top level of
        # the response rather than per item - confirm; if so this branch never
        # fires and lon/lat must come from the caller.
        coord = item.get('coord')
        if isinstance(coord, dict):
            row['lon'] = coord.get('lon', None)
            row['lat'] = coord.get('lat', None)

        formatted_data.append(row)

    return formatted_data

In [7]:
def format_date(timestamp):
    """Convert a Unix timestamp in seconds to a pandas ``Timestamp``.

    Parameters
    ----------
    timestamp : int, float, NumPy integer/float scalar, or missing value
        Seconds since the Unix epoch.

    Returns
    -------
    pandas.Timestamp or None
        The converted value, or ``None`` when ``timestamp`` is missing
        (``None``/NaN/NaT) or is not an integer or float.
    """
    if pd.isna(timestamp):
        return None
    # is_integer / is_float accept both built-in numbers and NumPy scalars
    # (e.g. the np.int64 returned by df['dt'].iloc[0]), which a plain
    # isinstance(..., (int, float)) check rejects. Booleans are not treated
    # as timestamps.
    if pd.api.types.is_integer(timestamp) or pd.api.types.is_float(timestamp):
        return pd.to_datetime(timestamp, unit='s')
    return None

In [9]:
# Collect one list of forecast rows per city; cities whose request or parsing
# yields nothing are left out.
all_data = []
for _, city_row in cities_list.iterrows():
    city, country = city_row['name'], city_row['country']
    lat, lon = city_row['lat'], city_row['lon']

    data = get_air_pollution_data(API_KEY, lat, lon)
    if not data:
        continue

    city_data = process_pollution_data(data, city, country)
    if city_data:
        # Kept as a nested list (one entry per city); it is flattened later.
        all_data.append(city_data)

In [13]:
# Flatten the per-city lists into a single list of row dicts, keeping only
# entries that are dicts, then build the DataFrame in one call.
correction_data = []
for city_rows in all_data:
    correction_data.extend(entry for entry in city_rows if isinstance(entry, dict))
df = pd.DataFrame(correction_data)

In [12]:
# Preview the first ten rows of the flattened forecast table.
df.head(10)

Unnamed: 0,dt,aqi,city,country,co,no,no2,o3,so2,pm2_5,pm10,nh3
0,1749063600,1,Abidjan,Ivory Coast,108.96,0.0,0.16,41.02,0.09,1.27,2.01,0.01
1,1749067200,1,Abidjan,Ivory Coast,107.96,0.0,0.15,40.47,0.1,1.3,2.11,0.0
2,1749070800,1,Abidjan,Ivory Coast,106.95,0.0,0.15,39.92,0.1,1.34,2.27,0.0
3,1749074400,1,Abidjan,Ivory Coast,105.65,0.0,0.15,39.36,0.11,1.38,2.46,0.0
4,1749078000,1,Abidjan,Ivory Coast,104.56,0.0,0.14,38.67,0.11,1.38,2.49,0.0
5,1749081600,1,Abidjan,Ivory Coast,103.22,0.0,0.14,38.09,0.11,1.36,2.41,0.0
6,1749085200,1,Abidjan,Ivory Coast,102.12,0.0,0.13,37.51,0.1,1.31,2.23,0.0
7,1749088800,1,Abidjan,Ivory Coast,100.74,0.0,0.13,37.04,0.09,1.24,2.04,0.0
8,1749092400,1,Abidjan,Ivory Coast,99.63,0.0,0.13,36.57,0.09,1.2,1.9,0.0
9,1749096000,1,Abidjan,Ivory Coast,98.64,0.0,0.13,36.14,0.08,1.16,1.78,0.0
