In [1]:
import requests
import json
from datetime import datetime
import time
import os

#function to get whather contintion for each city
def get_weather(city, api_key):
    url = f"http://api.weatherapi.com/v1/current.json?key={api_key}&q={city}&aqi=no"
    response = requests.get(url)

    if response.status_code == 200:   #code for successfully request
        data = response.json()
        return {
            'City': city,
            'Temperature': data['current']['temp_c'],  #celsius degree
            'Condition': data['current']['condition']['text'], #weather condition descricption
            'Humidity': data['current']['humidity'], #humidity (%)
            'Wind Speed': data['current']['wind_kph'] #wind speed (km/h)
        }
    else:
        print(f"Failed to retrieve data for {city}. Status code and status text:{response.status_code} - {response.text}")    
        return None

# Function to get air quality data for each city
def get_air_quality_data(city, state, country, api_key):
    url = f"https://api.airvisual.com/v2/city?city={city}&state={state}&country={country}&key={api_key}"
    response = requests.get(url)

    if response.status_code == 200:  # Successful request
        data = response.json()
        
        if data['status'] == "success":
            city_data = data['data']   #access to data
            return {
                "city": city_data['city'],  
                "state": city_data['state'],
                "country": city_data['country'],
                "AQI": city_data['current']['pollution']['aqius'],  # US measurement for air quality index
                "pollutants": city_data['current']['pollution']     # other pollution measurement 
                
            }
    else:
        print(f"Error retrieving data for {city}: {response.status_code} - {response.text}")

    return None

# API Keys
weather_api_key = 'INSERT_API_KEY_1'
air_quality_api_key = "INSERT_API_KEY_2"


cities = [
    ("Amsterdam", "North Holland", "Netherlands"),
    ("Athens", "Attica", "Greece"),
    ("Belgrade", "Central Serbia", "Serbia"),
    ("Berlin", "Berlin", "Germany"),
    ("Brussels", "Brussels Capital", "Belgium"),
    ("Budapest", "Central Hungary", "Hungary"),
    ("Copenhagen", "Capital Region", "Denmark"),
    ("Dublin", "Leinster", "Ireland"),
    ("Helsinki", "Uusimaa", "Finland"),
    ("Lisbon", "Lisbon", "Portugal"),
    ("London", "England", "United Kingdom"),
    ("Madrid", "Madrid", "Spain"),
    ("Moscow", "Moscow", "Russia"),
    ("Oslo", "Oslo", "Norway"),
    ("Paris", "Île-de-France", "France"),
    ("Prague", "Praha", "Czech Republic"),
    ("Riga", "Riga", "Latvia"),
    ("Rome", "Latium", "Italy"),
    ("Stockholm", "Stockholm", "Sweden"),
    ("Vienna", "Vienna", "Austria"),
    ("Zagreb", "Zagreb", "Croatia")
]

# dictionary for storing city data
documents = {}
list_doc=[] #use a list due to have for each city one document
# today's date
today_date = datetime.now().strftime('%d-%m-%Y')  

weather_fields = ['City', 'Temperature', 'Condition', 'Humidity', 'Wind Speed']
air_quality_fields = ['city', 'state', 'country', 'AQI', 'pollutants']


for capitale, stato, paese in cities:
    print(f"Retrieving data for {capitale}...")
    
    air_quality_info = get_air_quality_data(capitale, stato, paese, air_quality_api_key)
    weather_info = get_weather(capitale, weather_api_key)
    
    # Data validation
    if air_quality_info and weather_info:    # check if there are non-None values
#####################################################################################################
# DATA QUALITY CHECKS        
#####################################################################################################
# Check for required fields
            if (all(field in weather_info for field in weather_fields) and
                all(field in air_quality_info for field in air_quality_fields)):
            
                # Check data types and values consistency
                if (isinstance(weather_info['Temperature'], (int, float)) and 
                    isinstance(weather_info['Humidity'], (int, float)) and 
                    isinstance(weather_info['Wind Speed'], (int, float)) and 
                    isinstance(air_quality_info['AQI'], (int, float)) and
                    weather_info['Humidity'] >= 0 and 
                    weather_info['Wind Speed'] >= 0 and
                    air_quality_info['AQI'] >= 0):
                    print("All checks passed.")
                    documents[capitale] = {
                        capitale: {
                            today_date: {
                                'Air Quality': air_quality_info,
                                'Weather': weather_info
                            }
                        }
                    }
                    list_doc.append(documents[capitale])
                else:
                    print(f"Invalid data types or data inconsistency for {capitale}.")
            else:
                print(f"Missing fields in data for {capitale}.")
#####################################################################################
# END DATA QUALITY CHECKS
#####################################################################################

    time.sleep(13) # Minimum delay to avoid exceeding request limit
#folder path where the data will store
# to save the file in the folder of interest, simply call the file with the path where you want to save the file
output_directory = ".../file_jason_aqi/" 

#os.path.join to join the folder path and file name
output_filename = os.path.join(output_directory, f"{today_date}_data.json")


with open(output_filename, 'w') as json_file:
    json.dump(list_doc, json_file, indent=4)

print(f"The integrated data '{today_date}_data.json' has been saved in the 'file_jason_aqi' folder.")


Retrieving data for Amsterdam...
All checks passed.
Retrieving data for Athens...
All checks passed.
Retrieving data for Belgrade...
All checks passed.
Retrieving data for Berlin...
All checks passed.
Retrieving data for Brussels...
All checks passed.
Retrieving data for Budapest...
All checks passed.
Retrieving data for Copenhagen...
All checks passed.
Retrieving data for Dublin...
All checks passed.
Retrieving data for Helsinki...
All checks passed.
Retrieving data for Lisbon...
All checks passed.
Retrieving data for London...
All checks passed.
Retrieving data for Madrid...
All checks passed.
Retrieving data for Moscow...
All checks passed.
Retrieving data for Oslo...
All checks passed.
Retrieving data for Paris...
All checks passed.
Retrieving data for Prague...
All checks passed.
Retrieving data for Riga...
All checks passed.
Retrieving data for Rome...
All checks passed.
Retrieving data for Stockholm...
All checks passed.
Retrieving data for Vienna...
All checks passed.
Retrievin

## Merge the historical JSON files in one single JSON file

In [2]:
import json
import os

def combine_json_files_from_folder(folder_path, output_file):
    combined_data = []  # list due to have for each day one document

    # Iteration for each JSON file in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):  
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r') as file:
                data = json.load(file)

                
                for element in data:
                    for city, daily_data in element.items():
                        for date, details in daily_data.items():
                            # Check if data already exists
                            date_entry = next((item for item in combined_data if date in item), None)

                            if date_entry is None:
                                # Create a new entry for the date if it does not exist
                                date_entry = {date: {}}
                                combined_data.append(date_entry)
                            ##########################################################################
                            #deleting redundant data
                            ###########################################################################                            
                            air_quality = details['Air Quality']
                            if 'city' in air_quality:
                                del air_quality['city']
                               
                            weather = details['Weather']
                            if 'City' in weather:
                                del weather['City']
                            pollutants = air_quality['pollutants']   
                            if 'ts' in pollutants:
                                del pollutants['ts']

                            # Add the city and details as a key in the date dictionary
                            date_entry[date][city] = {
                                "details": details
                            }

    # final JSON file
    with open(output_file, 'w') as f:
        json.dump(combined_data, f, indent=4)

    print(f"Combined data saved as 'city_weather_aqi.json'")

folder_path = '.../file_jason_aqi'  
output_file = '.../city_weather_aqi.json'
combine_json_files_from_folder(folder_path, output_file)

Combined data saved as 'city_weather_aqi.json'


# Data Enrichment

In [3]:
import json

combined_data_path = ".../city_weather_aqi.json"

with open(combined_data_path, 'r') as file:
    data = json.load(file)  

def enrich_data(data):
    for date_record in data:
        for date, cities in date_record.items():
            for city, details in cities.items():
                aqi = details['details']['Air Quality']['AQI']
                # the following descriptions are taken from https://www.airnow.gov/aqi/aqi-basics/
                if aqi <= 50:
                    description = 'Good'
                elif aqi <= 100:
                    description = 'Moderate'
                elif aqi <= 150:
                    description = 'Unhealthy for Sensitive Groups'
                elif aqi <= 200:
                    description = 'Unhealthy'
                elif aqi <= 300:
                    description = 'Very Unhealthy'
                else:
                    description = 'Hazardous'
                details['details']['Air Quality']['Description'] = description
    return data

enriched_data = enrich_data(data)

enriched_data_json = json.dumps(enriched_data, indent=4)
output_file_path = ".../enriched_data.json" #download file
with open(output_file_path, 'w') as output_file:
    json.dump(enriched_data, output_file, indent=4)
print(f"Saved enriched data as 'enriched_data.json'.")

Saved enriched data as 'enriched_data.json'.


# Data Storage

In [4]:
pip install pymongo

Note: you may need to restart the kernel to use updated packages.


In [5]:
import json
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['weather_db']  #create db
collection = db['weather_data']  # creat collection

# saving data
with open(".../enriched_data.json") as file:
    data = json.load(file)

# insert dato into collection
collection.insert_many(data) 

InsertManyResult([ObjectId('6825c8300e3b6220b77b7432'), ObjectId('6825c8300e3b6220b77b7433'), ObjectId('6825c8300e3b6220b77b7434'), ObjectId('6825c8300e3b6220b77b7435'), ObjectId('6825c8300e3b6220b77b7436'), ObjectId('6825c8300e3b6220b77b7437'), ObjectId('6825c8300e3b6220b77b7438'), ObjectId('6825c8300e3b6220b77b7439'), ObjectId('6825c8300e3b6220b77b743a'), ObjectId('6825c8300e3b6220b77b743b'), ObjectId('6825c8300e3b6220b77b743c'), ObjectId('6825c8300e3b6220b77b743d'), ObjectId('6825c8300e3b6220b77b743e'), ObjectId('6825c8300e3b6220b77b743f'), ObjectId('6825c8300e3b6220b77b7440'), ObjectId('6825c8300e3b6220b77b7441'), ObjectId('6825c8300e3b6220b77b7442'), ObjectId('6825c8300e3b6220b77b7443'), ObjectId('6825c8300e3b6220b77b7444'), ObjectId('6825c8300e3b6220b77b7445'), ObjectId('6825c8300e3b6220b77b7446'), ObjectId('6825c8300e3b6220b77b7447'), ObjectId('6825c8300e3b6220b77b7448'), ObjectId('6825c8300e3b6220b77b7449'), ObjectId('6825c8300e3b6220b77b744a'), ObjectId('6825c8300e3b6220b77b74

# Query 

In [6]:
city = "Amsterdam"
date = "05-03-2025"
result = collection.find_one({date: {"$exists": True}})
if result:
    air_quality = result[date][city]['details']['Air Quality']
    print(f"Air Quality in {city} on {date}: {air_quality}")

Air Quality in Amsterdam on 05-03-2025: {'state': 'North Holland', 'country': 'Netherlands', 'AQI': 149, 'pollutants': {'aqius': 149, 'mainus': 'p2', 'aqicn': 75, 'maincn': 'p2'}, 'Description': 'Unhealthy for Sensitive Groups'}


In [7]:
import numpy as np
import sys
def temp_aqi_correlation(date_str):
    query = {date_str: {"$exists": True}}
    result = collection.find_one(query)
    
    if result:
        l_temp = []
        l_aqi = []
        
        for city_name, data in result[date_str].items():
                temp = data['details']['Weather']['Temperature']
                aqi = data['details']['Air Quality']['AQI']
                l_temp.append(temp)
                l_aqi.append(aqi)

        
        if l_temp and l_aqi:
            correlation = np.corrcoef(l_temp, l_aqi)[0, 1]
            cor_3 = round(correlation,3)
            print(f"Correlation beteween °C and AQI: {cor_3}")
            return correlation


day=input("Which day do you want analyze? ")
print(f"Day: {day}")
if day not in [f"{i:02d}" for i in range(1, 32)]:
    print("Invalid input. Please enter a valid month between 01 and 31.") 
    sys.exit()

month= input("Which month do you want analyze? ")
print(f"Month: {month}")
if month not in [f"{i:02d}" for i in range(1, 13)]:
    print("Invalid input. Please enter a valid day between 01 and 12.")
    sys.exit()
date = f"{day}-{month}-2025"
correlation = temp_aqi_correlation(date)

Which day do you want analyze?  05


Day: 05


Which month do you want analyze?  03


Month: 03
Correlation beteween °C and AQI: 0.153


In [8]:
import sys

def windiest_cities(date_str):
    query = {date_str: {"$exists": True}}
    result = collection.find_one(query)
    
    if result:
        cities_data = []
        for city_name, data in result[date_str].items():

                wind_speed = data['details']['Weather']['Wind Speed']
                cities_data.append({
                    'city': city_name,
                    'wind_speed': wind_speed,
                    'aqi': data['details']['Air Quality']['AQI']
                })

        
        sorted_cities = sorted(cities_data, key=lambda x: x['wind_speed'], reverse=True)
        return sorted_cities[:5] #first 5 rows 



day = input("Which day do you want to analyze? ")
#print(f"Day: {day}")
if day not in [f"{i:02d}" for i in range(1, 32)]:
    print("Invalid input. Please enter a valid day between 01 and 31.") 
    sys.exit()

month = input("Which month do you want to analyze? ")
#print(f"Month: {month}")
if month not in [f"{i:02d}" for i in range(1, 13)]:
    print("Invalid input. Please enter a valid month between 01 and 12.")
    sys.exit()

date = f"{day}-{month}-2025"
windy_cities = windiest_cities(date)
print("\n")
print("Windest cities:")
for city in windy_cities:
    print(f"{city['city']}: {city['wind_speed']} km/h, AQI: {city['aqi']}")

Which day do you want to analyze?  05
Which month do you want to analyze?  05




Windest cities:
Amsterdam: 24.5 km/h, AQI: 29
Lisbon: 23.0 km/h, AQI: 28
Helsinki: 22.0 km/h, AQI: 30
London: 21.2 km/h, AQI: 27
Brussels: 20.5 km/h, AQI: 36


In [9]:
def worst_aqi_cities(date_str, limit=5):
    query = {date_str: {"$exists": True}}
    result = collection.find_one(query)
    
    if result:
        cities_data = []
        for city_name, data in result[date_str].items():
            aqi = data['details']['Air Quality']['AQI']
            cities_data.append({
                'city': city_name,
                'aqi': aqi,
                'temperature': data['details']['Weather']['Temperature']
            })
        sorted_cities = sorted(cities_data, key=lambda x: x['aqi'], reverse=True)
        return sorted_cities[:limit]

day = input("Which day do you want to analyze? ")
print(f"Day: {day}")
if day not in [f"{i:02d}" for i in range(1, 32)]:
    print("Invalid input. Please enter a valid day between 01 and 31.") 
    sys.exit()

month = input("Which month do you want to analyze? ")
print(f"Month: {month}")
if month not in [f"{i:02d}" for i in range(1, 13)]:
    print("Invalid input. Please enter a valid month between 01 and 12.")
    sys.exit()

print("\n")
date = f"{day}-{month}-2025"
worst_cities = worst_aqi_cities(date)

if worst_cities:  # Check if the list is not empty
    print(f"Cities with the worst air quality index on {date}:")
    for city in worst_cities:
        print(f"{city['city']}: AQI {city['aqi']}, Temperature {city['temperature']}°C")


Which day do you want to analyze?  13


Day: 13


Which month do you want to analyze?  04


Month: 04


Cities with the worst air quality index on 13-04-2025:
Amsterdam: AQI 86, Temperature 14.2°C
London: AQI 73, Temperature 16.1°C
Copenhagen: AQI 63, Temperature 9.0°C
Paris: AQI 57, Temperature 15.3°C
Prague: AQI 57, Temperature 18.3°C
