In [1]:
# imports 

import requests
import time
import pandas as pd

In [2]:
API_KEY = "991315e1063647a409ef16a4cca0b499e2838f620ab178ebc97ad9024b3ac5ca"
URL = "https://api.openaq.org/v3/locations?coordinates=12.9716,77.5946&radius=20000&limit=1000"

headers = {"X-API-Key": API_KEY}

response = requests.get(URL, headers=headers)
if response.status_code == 200:
    data = response.json()
    results = data.get("results", [])
    if results:
        all_rows = []
        for location in results:
            base_data = {
                "id": location.get("id"),
                "name": location.get("name"),
                "country_name": location.get("country", {}).get("name"),
                "latitude": location.get("coordinates", {}).get("latitude"),
                "longitude": location.get("coordinates", {}).get("longitude"),
                "datetime_first_utc": location.get("datetimeFirst", {}).get("utc") if location.get("datetimeFirst") else None,
                "datetime_last_utc": location.get("datetimeLast", {}).get("utc") if location.get("datetimeLast") else None
            }

            sensors = location.get("sensors", [])
            for sensor in sensors:
                sensor_data = {
                    "sensor_id": sensor.get("id"),
                    "sensor_name": sensor.get("name"),
                    "parameter": sensor.get("parameter", {}).get("name"),
                    "units": sensor.get("parameter", {}).get("units"),
                }
                all_rows.append({**base_data, **sensor_data})

        df = pd.DataFrame(all_rows)
    else:
        print("No results found.")
else:
    print("Failed to fetch data.")

In [3]:
df['datetime_first_utc'] = pd.to_datetime(df['datetime_first_utc'])
df['datetime_last_utc'] = pd.to_datetime(df['datetime_last_utc'])

# Extract date and time into new columns
df['date_first_utc'] = df['datetime_first_utc'].dt.date
df['date_last_utc'] = df['datetime_last_utc'].dt.date

df.drop(columns=['datetime_first_utc','datetime_last_utc'], inplace=True)

In [4]:
df.head()

Unnamed: 0,id,name,country_name,latitude,longitude,sensor_id,sensor_name,parameter,units,date_first_utc,date_last_utc
0,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,722,co µg/m³,co,µg/m³,2016-03-22,2018-02-22
1,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,1518,no2 µg/m³,no2,µg/m³,2016-03-22,2018-02-22
2,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,13852,pm25 µg/m³,pm25,µg/m³,2016-03-22,2018-02-22
3,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,13853,so2 µg/m³,so2,µg/m³,2016-03-22,2018-02-22
4,594,"BTM Layout, Bengaluru - KSPCB",India,12.912811,77.609219,1048,co µg/m³,co,µg/m³,2016-03-22,2018-02-22


In [None]:
API_KEY = "991315e1063647a409ef16a4cca0b499e2838f620ab178ebc97ad9024b3ac5ca"
HEADERS = {"X-API-Key": API_KEY}
BASE_URL = "https://api.openaq.org/v3/sensors/{sensor_id}/days"

all_records = []

for _, row in df.iterrows():
    sid = row["sensor_id"]
    param = row["parameter"]
    start_date = row["date_first_utc"]
    end_date = row["date_last_utc"]
    name = row["name"]
    country_name = row["country_name"]
    latitude = row["latitude"]
    longitude = row["longitude"]
    units = row["units"]

    page = 1

    while True:
        url = BASE_URL.format(sensor_id=sid)
        params = {
            "limit": 1000,
            "page": page,
            "date_from": start_date,
            "date_to": end_date,
        }
        resp = requests.get(url, headers=HEADERS, params=params, timeout=10)

        if resp.status_code == 200:
            data = resp.json()
            results = data.get("results", [])
            if not results:
                break  

            for rec in results:
                all_records.append({
                    "id": row["id"],
                    "name": name,
                    "country_name": country_name,
                    "latitude": latitude,
                    "longitude": longitude,
                    "sensor_id": sid,
                    "sensor_name": f"{param} {units}",
                    "parameter": param,
                    "units": units,
                    "value": rec["value"],
                    "date": rec["period"]["datetimeFrom"]["utc"][:10],
                })

            page += 1
            time.sleep(0.2)  # Space out requests
        else:
            break

final_df = pd.DataFrame(all_records)
final_df.to_csv(r"C:\Users\Cashapona\Desktop\openAQ.csv")

In [4]:
final_df = pd.read_csv(r"C:\Users\Cashapona\Desktop\openAQ.csv")
final_df.head()

Unnamed: 0.1,Unnamed: 0,id,name,country_name,latitude,longitude,sensor_id,sensor_name,parameter,units,value,date
0,0,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,722,co µg/m³,co,µg/m³,751.0,2016-03-21
1,1,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,722,co µg/m³,co,µg/m³,942.0,2016-03-22
2,2,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,722,co µg/m³,co,µg/m³,371.0,2016-03-23
3,3,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,722,co µg/m³,co,µg/m³,927.0,2016-03-27
4,4,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,722,co µg/m³,co,µg/m³,930.0,2016-03-28


In [5]:
selected_parameters = ['co', 'no2', 'pm25', 'so2']

filtered_df = final_df[final_df['parameter'].isin(selected_parameters)]

valid_names = (
    filtered_df.groupby('name')['parameter']
    .apply(lambda x: set(selected_parameters).issubset(set(x))))

names_with_all_params = valid_names[valid_names].index.tolist()

new_df = filtered_df[filtered_df['name'].isin(names_with_all_params)]

In [6]:
new_df.to_csv(r"C:\Users\Cashapona\Desktop\NewopenAQ.csv")

In [7]:
new_df.head()

Unnamed: 0.1,Unnamed: 0,id,name,country_name,latitude,longitude,sensor_id,sensor_name,parameter,units,value,date
0,0,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,722,co µg/m³,co,µg/m³,751.0,2016-03-21
1,1,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,722,co µg/m³,co,µg/m³,942.0,2016-03-22
2,2,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,722,co µg/m³,co,µg/m³,371.0,2016-03-23
3,3,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,722,co µg/m³,co,µg/m³,927.0,2016-03-27
4,4,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,722,co µg/m³,co,µg/m³,930.0,2016-03-28


In [8]:
new_df['date'] = pd.to_datetime(new_df['date'], errors='coerce')
new_df['year'] = new_df['date'].dt.year
area_years = new_df.groupby('name')['year'].apply(set).reset_index()

# Define required years
required_years = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2025]

# Check for missing years
area_years['missing_years'] = area_years['year'].apply(lambda years: set(required_years) - years)

# Display areas and their missing years
area_years[['name', 'missing_years']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['date'] = pd.to_datetime(new_df['date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['year'] = new_df['date'].dt.year


Unnamed: 0,name,missing_years
0,"BTM Layout, Bengaluru - CPCB","{2016, 2017}"
1,"BTM Layout, Bengaluru - KSPCB","{2019, 2020, 2021, 2022, 2025}"
2,"BWSSB Kadabesanahalli, Bengaluru - CPCB","{2016, 2017, 2025}"
3,"BWSSB Kadabesanahalli, Bengaluru - KSPCB","{2019, 2020, 2021, 2022, 2025}"
4,"Bapuji Nagar, Bengaluru - KSPCB","{2016, 2017}"
5,"Hebbal, Bengaluru - KSPCB","{2016, 2017}"
6,"Hombegowda Nagar, Bengaluru - KSPCB","{2016, 2017}"
7,"Jayanagar 5th Block, Bengaluru - KSPCB","{2016, 2017}"
8,"Kasturi Nagar, Bengaluru - KSPCB","{2016, 2017, 2018, 2019, 2020, 2021, 2022}"
9,"Peenya, Bengaluru - CPCB","{2016, 2017}"


Only finding the quality label for areas that have the subset data.

In [9]:
# Creating a subset of required years
required_years_subset = [2018, 2019, 2020, 2021, 2022, 2025]

# Filter areas with complete data for the subset
complete_areas_subset = area_years[
    area_years['year'].apply(lambda years: set(required_years_subset).issubset(years))]['name']

consistent_df = new_df[new_df['name'].isin(complete_areas_subset)]
print(f"Consistent dataset shape: {consistent_df.shape}")

Consistent dataset shape: (30492, 13)


In [10]:
consistent_df = consistent_df.drop(columns=['Unnamed: 0'])

In [11]:
consistent_df.head()

Unnamed: 0,id,name,country_name,latitude,longitude,sensor_id,sensor_name,parameter,units,value,date,year
14980,5548,"BTM Layout, Bengaluru - CPCB",India,12.913522,77.59508,14657,co µg/m³,co,µg/m³,45000.0,2018-03-08,2018
14981,5548,"BTM Layout, Bengaluru - CPCB",India,12.913522,77.59508,14657,co µg/m³,co,µg/m³,43000.0,2018-03-09,2018
14982,5548,"BTM Layout, Bengaluru - CPCB",India,12.913522,77.59508,14657,co µg/m³,co,µg/m³,45600.0,2018-03-10,2018
14983,5548,"BTM Layout, Bengaluru - CPCB",India,12.913522,77.59508,14657,co µg/m³,co,µg/m³,41000.0,2018-03-11,2018
14984,5548,"BTM Layout, Bengaluru - CPCB",India,12.913522,77.59508,14657,co µg/m³,co,µg/m³,24300.0,2018-03-12,2018


In [13]:
# the areas that we will evaluate the quality of 
print(consistent_df['name'].unique())

['BTM Layout, Bengaluru - CPCB' 'Peenya, Bengaluru - CPCB'
 'Jayanagar 5th Block, Bengaluru - KSPCB'
 'Bapuji Nagar, Bengaluru - KSPCB' 'Silk Board, Bengaluru - KSPCB'
 'Hombegowda Nagar, Bengaluru - KSPCB' 'Hebbal, Bengaluru - KSPCB']


In [14]:
pivot_df = consistent_df.pivot_table(index='name', columns='parameter', values='value', aggfunc='mean')
pivot_df.columns.name = None 
pivot_df = pivot_df.reset_index()  

pivot_df.head()

Unnamed: 0,name,co,no2,pm25,so2
0,"BTM Layout, Bengaluru - CPCB",1435.847382,21.672243,29.863616,9.117986
1,"Bapuji Nagar, Bengaluru - KSPCB",0.7127,23.223241,35.394241,8.654211
2,"Hebbal, Bengaluru - KSPCB",512.737843,19.365472,25.699791,6.814042
3,"Hombegowda Nagar, Bengaluru - KSPCB",0.633955,18.143111,23.220064,8.232142
4,"Jayanagar 5th Block, Bengaluru - KSPCB",0.674487,25.172626,29.862435,5.750722


In [15]:
# Check for NaN values 
nan_summary = pivot_df.isna().sum()

if pivot_df.isna().values.any():
    print("\nThere are NaN values in the DataFrame.")
else:
    print("\nThere are no NaN values in the DataFrame.")


There are no NaN values in the DataFrame.


In [16]:
# Weighted score function
def compute_air_quality_score(row):
    weights = {'co': 1.0, 'no2': 1.2, 'so2': 1.2, 'pm25': 1.5}
    limits = {'co': 34, 'no2': 400, 'so2': 1600, 'pm25': 250}
    
    total_weight = sum(weights.values())
    score = 0
    for param, weight in weights.items():
        norm_value = row[param] / limits[param]
        score += norm_value * weight
    
    return round(score / total_weight, 3)

# Apply weighted score calculation
pivot_df['weighted_score'] = pivot_df.apply(compute_air_quality_score, axis=1)

# Assign quality labels
def label_air_quality(score):
    if score <= 0.3:
        return 'Good'
    elif score <= 0.6:
        return 'Moderate'
    elif score <= 0.9:
        return 'Unhealthy'
    elif score <= 1.2:
        return 'Very Unhealthy'
    else:
        return 'Hazardous'

pivot_df['quality_label'] = pivot_df['weighted_score'].apply(label_air_quality)

# Sort by score (best to worst)
sorted_df = pivot_df.sort_values('weighted_score', ascending=True).reset_index(drop=True)

In [17]:
# Final output
output_df = sorted_df[['name', 'weighted_score', 'quality_label']]
output_df.head(13)

Unnamed: 0,name,weighted_score,quality_label
0,"Hombegowda Nagar, Bengaluru - KSPCB",0.045,Good
1,"Jayanagar 5th Block, Bengaluru - KSPCB",0.057,Good
2,"Bapuji Nagar, Bengaluru - KSPCB",0.063,Good
3,"Hebbal, Bengaluru - KSPCB",3.122,Hazardous
4,"Silk Board, Bengaluru - KSPCB",4.516,Hazardous
5,"BTM Layout, Bengaluru - CPCB",8.67,Hazardous
6,"Peenya, Bengaluru - CPCB",11.119,Hazardous


Imputed Values

In [None]:
new_df['year'] = pd.to_datetime(new_df['date']).dt.year

# Define the required years
required_years = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2025]

# Check missing years for each area
area_years = new_df.groupby('name')['year'].apply(set).reset_index()
area_years['missing_years'] = area_years['year'].apply(lambda years: set(required_years) - years)

# Impute missing years for each area
imputed_rows = []
for _, row in area_years.iterrows():
    area_name = row['name']
    missing_years = row['missing_years']
    
    # Get the mean parameter values for the current area
    area_data = new_df[new_df['name'] == area_name]
    mean_values = area_data.groupby('parameter')['value'].mean().to_dict()
    
    # Create rows for the missing years
    for year in missing_years:
        for parameter, mean_value in mean_values.items():
            # Use the mean value for imputation
            imputed_rows.append({
                'id': None,  # Placeholder
                'name': area_name,
                'country_name': area_data['country_name'].iloc[0],
                'latitude': area_data['latitude'].iloc[0],
                'longitude': area_data['longitude'].iloc[0],
                'sensor_id': None,  # Placeholder
                'sensor_name': None,  # Placeholder
                'parameter': parameter,
                'units': area_data[area_data['parameter'] == parameter]['units'].iloc[0],
                'value': mean_value,
                'date': f'{year}-01-01',  # Placeholder date
                'year': year
            })

# Create a DataFrame for imputed rows
imputed_df = pd.DataFrame(imputed_rows)

# Combine original and imputed data
combined_df = pd.concat([new_df, imputed_df], ignore_index=True)

# Verify the final dataset
print(f"Final dataset shape: {combined_df.shape}")
combined_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['year'] = pd.to_datetime(new_df['date']).dt.year


Final dataset shape: (40465, 13)


Unnamed: 0.1,Unnamed: 0,id,name,country_name,latitude,longitude,sensor_id,sensor_name,parameter,units,value,date,year
0,0.0,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,722,co µg/m³,co,µg/m³,751.0,2016-03-21 00:00:00,2016
1,1.0,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,722,co µg/m³,co,µg/m³,942.0,2016-03-22 00:00:00,2016
2,2.0,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,722,co µg/m³,co,µg/m³,371.0,2016-03-23 00:00:00,2016
3,3.0,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,722,co µg/m³,co,µg/m³,927.0,2016-03-27 00:00:00,2016
4,4.0,412,"Peenya, Bengaluru - KSPCB",India,13.0339,77.513211,722,co µg/m³,co,µg/m³,930.0,2016-03-28 00:00:00,2016


In [19]:
combined_df = combined_df.drop(columns=['Unnamed: 0'])

In [20]:
pivot_df = combined_df.pivot_table(index='name', columns='parameter', values='value', aggfunc='mean')
pivot_df.columns.name = None 
pivot_df = pivot_df.reset_index()  

pivot_df.head()

Unnamed: 0,name,co,no2,pm25,so2
0,"BTM Layout, Bengaluru - CPCB",1435.847382,21.672243,29.863616,9.117986
1,"BTM Layout, Bengaluru - KSPCB",620.052292,20.117715,28.113742,7.67534
2,"BWSSB Kadabesanahalli, Bengaluru - CPCB",2144.067828,24.851579,37.977543,5.154623
3,"BWSSB Kadabesanahalli, Bengaluru - KSPCB",674.539851,30.09757,22.289515,9.410132
4,"Bapuji Nagar, Bengaluru - KSPCB",0.7127,23.223241,35.394241,8.654211


In [21]:
# Weighted score function
def compute_air_quality_score(row):
    weights = {'co': 1.0, 'no2': 1.2, 'so2': 1.2, 'pm25': 1.5}
    limits = {'co': 34, 'no2': 400, 'so2': 1600, 'pm25': 250}
    
    total_weight = sum(weights.values())
    score = 0
    for param, weight in weights.items():
        norm_value = row[param] / limits[param]
        score += norm_value * weight
    
    return round(score / total_weight, 3)

# Apply weighted score calculation
pivot_df['weighted_score'] = pivot_df.apply(compute_air_quality_score, axis=1)

# Assign quality labels
def label_air_quality(score):
    if score <= 0.3:
        return 'Good'
    elif score <= 0.6:
        return 'Moderate'
    elif score <= 0.9:
        return 'Unhealthy'
    elif score <= 1.2:
        return 'Very Unhealthy'
    else:
        return 'Hazardous'

pivot_df['quality_label'] = pivot_df['weighted_score'].apply(label_air_quality)

# Sort by score (best to worst)
sorted_df = pivot_df.sort_values('weighted_score', ascending=True).reset_index(drop=True)

In [22]:
# Final output
output_df = sorted_df[['name', 'weighted_score', 'quality_label']]
output_df.head(13)

Unnamed: 0,name,weighted_score,quality_label
0,"Hombegowda Nagar, Bengaluru - KSPCB",0.045,Good
1,"Shivapura_Peenya, Bengaluru - KSPCB",0.049,Good
2,"Jayanagar 5th Block, Bengaluru - KSPCB",0.057,Good
3,"Bapuji Nagar, Bengaluru - KSPCB",0.063,Good
4,"Hebbal, Bengaluru - KSPCB",3.122,Hazardous
5,"BTM Layout, Bengaluru - KSPCB",3.77,Hazardous
6,"BWSSB Kadabesanahalli, Bengaluru - KSPCB",4.096,Hazardous
7,"Peenya, Bengaluru - KSPCB",4.341,Hazardous
8,"Silk Board, Bengaluru - KSPCB",4.516,Hazardous
9,"BTM Layout, Bengaluru - CPCB",8.67,Hazardous
