In [1]:
import requests
from datetime import datetime
from datetime import date
import pandas as pd
import os
from dotenv import load_dotenv

load_dotenv()

cities_list = pd.read_excel('cities_list.xlsx')

API_KEY = os.getenv('API_KEY')

def get_air_pollution_data(API_KEY, lat, lon):
    url = f'http://api.openweathermap.org/data/2.5/air_pollution/forecast?lat={lat}&lon={lon}&appid={API_KEY}'
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f'Error requisition: {response.status_code}')
        return None

In [2]:
cities_list

Unnamed: 0,name,lon,lat,country
0,Abidjan,-4.00167,5.35444,Ivory Coast
1,Abu Dhabi,54.39696,24.45118,United Arab Emirates
2,Abuja,7.49508,9.05785,Nigeria
3,Accra,-0.19690,5.55602,Ghana
4,Addis Ababa,38.74689,9.02497,Ethiopia
...,...,...,...,...
366,Yaounde,11.51667,3.86667,Cameroon
367,Yekaterinburg,60.61220,56.85190,Russia
368,Yerevan,44.51361,40.18111,Armenia
369,Yokohama,139.65000,35.43333,Japan


In [3]:
def process_pollution_data(data, city, country):
    if not data or 'list'not in data:
        return None
    
    formatted_data = []
    for item in data['list']:
        row = {
            'dt': item['dt'],
            'aqi': item['main']['aqi'],
            'city': city,
            'country': country,
            'co': item['components']['co'],
            'no': item['components']['no'],
            'no2': item['components']['no2'],
            'o3': item['components']['o3'],
            'so2': item['components']['so2'],
            'pm2_5': item['components']['pm2_5'],
            'pm10': item['components']['pm10'],
            'nh3': item['components']['nh3']

        }
        if isinstance(item.get('components'), dict):
            row.update(item['components'])
            if 'coord' in item:
                row['lon'] = item['coord'].get('lon', None)
                row['lat'] = item['coord'].get('lat', None)
        formatted_data.append(row)
        
    return formatted_data

In [4]:
all_data = []

for index, row in cities_list.iterrows():
    lon = row.lon
    lat = row.lat
    country = row.country
    city = row.name
    data = get_air_pollution_data(API_KEY, lat, lon)

    if data:
        city_data = process_pollution_data(data, city, country)
        if city_data:
            all_data.append(city_data)

In [12]:
df = pd.DataFrame(all_data)

In [13]:
columns = ['dt', 'city', 'country', 'co', 'no', 'no2', 'o3', 'so2', 'pm2_5', 'pm10', 'nh3', 'lon', 'lat']

In [14]:
correction_data = [item for sublist in all_data for item in sublist]
df = pd.DataFrame(correction_data)

In [15]:
def format_date(timestamp):
    if pd.isna(timestamp):
        return None
    elif isinstance(timestamp, (int, float)):
        return pd.to_datetime(timestamp, unit='s')
    return None

In [16]:
for index, row in cities_list.iterrows():
    lon = row.lon
    lat = row.lat
    country = row.country
    city = row.name
    
    data = get_air_pollution_data(API_KEY, lat, lon)
    
    if data:
        city_data = process_pollution_data(data, city, country)
        if city_data:
            all_data.extend(city_data)

In [17]:
df['dt'] = df['dt'].apply(format_date)

In [18]:
df.head(10)

Unnamed: 0,dt,aqi,city,country,co,no,no2,o3,so2,pm2_5,pm10,nh3
0,2025-06-04 16:00:00,1,0,Ivory Coast,109.91,0.01,0.14,42.55,0.1,1.19,1.96,0.01
1,2025-06-04 17:00:00,1,0,Ivory Coast,110.1,0.0,0.15,42.07,0.1,1.21,2.0,0.01
2,2025-06-04 18:00:00,1,0,Ivory Coast,109.49,0.0,0.16,41.47,0.1,1.24,2.03,0.01
3,2025-06-04 19:00:00,1,0,Ivory Coast,108.96,0.0,0.16,41.02,0.09,1.27,2.01,0.01
4,2025-06-04 20:00:00,1,0,Ivory Coast,107.96,0.0,0.15,40.47,0.1,1.3,2.11,0.0
5,2025-06-04 21:00:00,1,0,Ivory Coast,106.95,0.0,0.15,39.92,0.1,1.34,2.27,0.0
6,2025-06-04 22:00:00,1,0,Ivory Coast,105.65,0.0,0.15,39.36,0.11,1.38,2.46,0.0
7,2025-06-04 23:00:00,1,0,Ivory Coast,104.56,0.0,0.14,38.67,0.11,1.38,2.49,0.0
8,2025-06-05 00:00:00,1,0,Ivory Coast,103.22,0.0,0.14,38.09,0.11,1.36,2.41,0.0
9,2025-06-05 01:00:00,1,0,Ivory Coast,102.12,0.0,0.13,37.51,0.1,1.31,2.23,0.0
