In [3]:
import pandas as pd 
import requests
from io import StringIO
 
# API details for the first dataset (microclimate sensor readings)
base_url_1 = 'https://data.melbourne.vic.gov.au/api/explore/v2.1/catalog/datasets/'
dataset_id_1 = 'microclimate-sensors-data'
url_1 = f"{base_url_1}{dataset_id_1}/exports/csv"

# API details for the second dataset (2021 tree canopies)
base_url_2 = 'https://data.melbourne.vic.gov.au/api/explore/v2.1/catalog/datasets/'
dataset_id_2 = 'tree-canopies-2021-urban-forest'
# Build the second URL from its own base so changing base_url_2 actually takes effect.
url_2 = f"{base_url_2}{dataset_id_2}/exports/csv"

# Download each CSV export. raise_for_status() stops the cell on an HTTP error
# instead of letting an error page be parsed as CSV, and the timeout keeps a
# stalled connection from hanging the kernel indefinitely.
response_1 = requests.get(url_1, timeout=120)
response_1.raise_for_status()
# The export is parsed as semicolon-delimited text.
df_1 = pd.read_csv(StringIO(response_1.text), delimiter=';')

response_2 = requests.get(url_2, timeout=120)
response_2.raise_for_status()
df_2 = pd.read_csv(StringIO(response_2.text), delimiter=';')

# Show which columns each dataset provides.
print("Column Names for first dataset")
print(df_1.columns.tolist())

print("Column Names for second dataset")
print(df_2.columns.tolist())

Column Names for first dataset
['device_id', 'received_at', 'sensorlocation', 'latlong', 'minimumwinddirection', 'averagewinddirection', 'maximumwinddirection', 'minimumwindspeed', 'averagewindspeed', 'gustwindspeed', 'airtemperature', 'relativehumidity', 'atmosphericpressure', 'pm25', 'pm10', 'noise']
Column Names for second dataset
['geo_point_2d', 'geo_shape']


In [3]:
# Preview the first rows of each dataset.
print(df_1.head())
# head must be *called*: printing the bound method `df_2.head` emits the
# method's repr (which embeds the whole frame) rather than the first five rows.
print(df_2.head())

            device_id                received_at  \
0  ICTMicroclimate-10  2024-11-03T02:17:17+00:00   
1  ICTMicroclimate-08  2024-11-03T01:56:45+00:00   
2  ICTMicroclimate-06  2024-11-03T02:06:04+00:00   
3  ICTMicroclimate-02  2024-11-03T01:59:01+00:00   
4  ICTMicroclimate-11  2024-11-03T01:55:37+00:00   

                                      sensorlocation  \
0                                   1 Treasury Place   
1  Swanston St - Tram Stop 13 adjacent Federation...   
2  Tram Stop 7B - Melbourne Tennis Centre Precinc...   
3                         101 Collins St L11 Rooftop   
4                                   1 Treasury Place   

                    latlong  minimumwinddirection  averagewinddirection  \
0  -37.8128595, 144.9745395                  87.0                 186.0   
1  -37.8184515, 144.9678474                   0.0                 194.0   
2  -37.8194993, 144.9787211                   0.0                 148.0   
3   -37.814604, 144.9702991                 297.0 

In [1]:
# Data Cleaning for Microclimate Sensors Data (df_1)
def clean_microclimate_data(df):
    """Return a cleaned copy of the microclimate sensor readings.

    Steps, in order:
      1. Drop columns in which fewer than half of the rows are non-null.
      2. Normalise column names: strip surrounding whitespace, lower-case,
         and replace spaces with underscores.
      3. Fill missing values in float64/int64 columns with that column's mean
         (the mean is computed before duplicates are removed).
      4. Drop fully duplicated rows.

    The input frame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with string column labels.

    Returns
    -------
    pandas.DataFrame
        The cleaned data. The index is not reset after duplicates are dropped.
    """
    # A column survives when its non-null count is >= len(df) * 0.5; rounding
    # the threshold up to an int keeps that rule while passing the integer
    # that dropna documents for `thresh`.
    min_non_null = (len(df) + 1) // 2

    # dropna returns a new frame, so renaming its columns below does not touch
    # the caller's DataFrame.
    cleaned = df.dropna(axis=1, thresh=min_non_null)
    cleaned.columns = (
        cleaned.columns.str.strip().str.lower().str.replace(' ', '_')
    )

    # Fill via a single DataFrame.fillna call keyed by column name. The
    # chained form `frame[col].fillna(..., inplace=True)` operates on a
    # temporary Series and has no effect under pandas copy-on-write.
    numeric_cols = cleaned.select_dtypes(include=['float64', 'int64']).columns
    column_means = cleaned[numeric_cols].mean()
    cleaned = cleaned.fillna(column_means)

    return cleaned.drop_duplicates()

# Data Cleaning for Tree Canopy Data (df_2)
def clean_tree_canopy_data(df):
    """Return a cleaned copy of the tree canopy data.

    Steps, in order:
      1. Drop columns in which fewer than half of the rows are non-null.
      2. Normalise column names: strip surrounding whitespace, lower-case,
         and replace spaces with underscores.
      3. If a 'tree_canopy_percentage' column exists (after normalisation),
         fill its missing values with 0.
      4. Drop fully duplicated rows.

    The input frame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with string column labels.

    Returns
    -------
    pandas.DataFrame
        The cleaned data. The index is not reset after duplicates are dropped.
    """
    # A column survives when its non-null count is >= len(df) * 0.5; rounding
    # the threshold up to an int keeps that rule while passing the integer
    # that dropna documents for `thresh`.
    min_non_null = (len(df) + 1) // 2

    # dropna returns a new frame, so renaming its columns below does not touch
    # the caller's DataFrame.
    cleaned = df.dropna(axis=1, thresh=min_non_null)

    # Normalise names before the lookup so a header that differs only in case
    # or spacing (e.g. "Tree Canopy Percentage") is still recognised.
    cleaned.columns = (
        cleaned.columns.str.strip().str.lower().str.replace(' ', '_')
    )

    if 'tree_canopy_percentage' in cleaned.columns:
        # A dict-keyed fillna returns a new frame. The chained form
        # `frame[col].fillna(0, inplace=True)` operates on a temporary Series
        # and has no effect under pandas copy-on-write.
        cleaned = cleaned.fillna({'tree_canopy_percentage': 0})

    return cleaned.drop_duplicates()


In [4]:
# Apply the cleaning functions to the raw frames
df_1_cleaned = clean_microclimate_data(df_1)
df_2_cleaned = clean_tree_canopy_data(df_2)

# Summarise the cleaned data. DataFrame.info() writes its report itself and
# returns None, so it is called directly; wrapping it in print() would add a
# stray "None" line after each summary.
print("Microclimate Data After Cleaning:")
df_1_cleaned.info()

print("\nTree Canopy Data After Cleaning:")
df_2_cleaned.info()


Microclimate Data After Cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155423 entries, 0 to 155422
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   device_id             155423 non-null  object 
 1   received_at           155423 non-null  object 
 2   sensorlocation        149280 non-null  object 
 3   latlong               143940 non-null  object 
 4   minimumwinddirection  155423 non-null  float64
 5   averagewinddirection  155423 non-null  float64
 6   maximumwinddirection  155423 non-null  float64
 7   minimumwindspeed      155423 non-null  float64
 8   averagewindspeed      155423 non-null  float64
 9   gustwindspeed         155423 non-null  float64
 10  airtemperature        155423 non-null  float64
 11  relativehumidity      155423 non-null  float64
 12  atmosphericpressure   155423 non-null  float64
 13  pm25                  155423 non-null  float64
 14  pm10              