In [1]:
import requests
import pandas as pd 
from secrets_config import api_key # https://home.openweathermap.org/ 

In [2]:
# how to extract data from multiple cities? e.g. canberra, sydney, etc

# 1. create a list of cities (csv)
# 2. read list of cities (csv) 
# 3. request data for each city (json) and push to a list 
# 4. convert list into dataframe 

In [3]:
# read list of cities
df_cities = pd.read_csv("data/australian_capital_cities.csv")
df_cities.head()

Unnamed: 0,city_name
0,canberra
1,sydney
2,darwin
3,brisbane
4,adelaide


In [4]:
# request data for each city (json) and push to a list 
weather_data = []
for city_name in df_cities["city_name"]:
    params = {
        "q": city_name,
        "units": "metric",
        "appid": api_key
    }
    response = requests.get(f"http://api.openweathermap.org/data/2.5/weather", params=params)
    if response.status_code == 200: 
        weather_data.append(response.json())
    else: 
        raise Exception("Extracting weather api data failed. Please check if API limits have been reached.")

In [5]:
# convert list into dataframe 
df_weather_cities = pd.json_normalize(weather_data)
df_weather_cities.head()

Unnamed: 0,weather,base,visibility,dt,timezone,id,name,cod,coord.lon,coord.lat,...,wind.speed,wind.deg,clouds.all,sys.type,sys.id,sys.country,sys.sunrise,sys.sunset,wind.gust,rain.1h
0,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1690290139,36000,2172517,Canberra,200,149.1281,-35.2835,...,0.0,0,0,2,2081752,AU,1690232666,1690269314,,
1,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1690290007,36000,2147714,Sydney,200,151.2073,-33.8679,...,2.57,300,0,2,2010638,AU,1690231986,1690268996,,
2,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1690290459,34200,2073124,Darwin,200,130.8418,-12.4611,...,1.54,40,0,1,9574,AU,1690234657,1690276101,,
3,"[{'id': 802, 'main': 'Clouds', 'description': ...",stations,10000,1690289924,36000,2174003,Brisbane,200,153.0281,-27.4679,...,2.57,200,40,2,2012892,AU,1690230802,1690269306,,
4,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1690290182,34200,2078025,Adelaide,200,138.6,-34.9333,...,3.09,50,0,2,2001763,AU,1690235146,1690271887,,


### Transforming data

In [6]:
# set city names to lowercase 
df_weather_cities["city_name"] = df_weather_cities["name"].str.lower()

In [7]:
df_population = pd.read_csv("data/australian_city_population.csv")
df_population.head()

Unnamed: 0,city_name,population
0,canberra,431611
1,sydney,5361466
2,darwin,146982
3,brisbane,2582007
4,adelaide,1378413


In [8]:
df_merged = pd.merge(left=df_weather_cities, right=df_population, on=["city_name"])
df_merged.head()

Unnamed: 0,weather,base,visibility,dt,timezone,id,name,cod,coord.lon,coord.lat,...,clouds.all,sys.type,sys.id,sys.country,sys.sunrise,sys.sunset,wind.gust,rain.1h,city_name,population
0,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1690290139,36000,2172517,Canberra,200,149.1281,-35.2835,...,0,2,2081752,AU,1690232666,1690269314,,,canberra,431611
1,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1690290007,36000,2147714,Sydney,200,151.2073,-33.8679,...,0,2,2010638,AU,1690231986,1690268996,,,sydney,5361466
2,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1690290459,34200,2073124,Darwin,200,130.8418,-12.4611,...,0,1,9574,AU,1690234657,1690276101,,,darwin,146982
3,"[{'id': 802, 'main': 'Clouds', 'description': ...",stations,10000,1690289924,36000,2174003,Brisbane,200,153.0281,-27.4679,...,40,2,2012892,AU,1690230802,1690269306,,,brisbane,2582007
4,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1690290182,34200,2078025,Adelaide,200,138.6,-34.9333,...,0,2,2001763,AU,1690235146,1690271887,,,adelaide,1378413


In [9]:
# print out all columns and pick only relevant ones 
df_merged.columns

Index(['weather', 'base', 'visibility', 'dt', 'timezone', 'id', 'name', 'cod',
       'coord.lon', 'coord.lat', 'main.temp', 'main.feels_like',
       'main.temp_min', 'main.temp_max', 'main.pressure', 'main.humidity',
       'wind.speed', 'wind.deg', 'clouds.all', 'sys.type', 'sys.id',
       'sys.country', 'sys.sunrise', 'sys.sunset', 'wind.gust', 'rain.1h',
       'city_name', 'population'],
      dtype='object')

In [10]:
df_selected = df_merged[["dt", "id", "name", "main.temp", "population"]] 
df_selected.head()

Unnamed: 0,dt,id,name,main.temp,population
0,1690290139,2172517,Canberra,2.48,431611
1,1690290007,2147714,Sydney,10.31,5361466
2,1690290459,2073124,Darwin,15.99,146982
3,1690289924,2174003,Brisbane,14.2,2582007
4,1690290182,2078025,Adelaide,9.62,1378413


In [11]:
# convert unix timestamp column to datetime 
df_selected["dt"] = pd.to_datetime(df_selected["dt"], unit="s")
df_selected.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected["dt"] = pd.to_datetime(df_selected["dt"], unit="s")


Unnamed: 0,dt,id,name,main.temp,population
0,2023-07-25 13:02:19,2172517,Canberra,2.48,431611
1,2023-07-25 13:00:07,2147714,Sydney,10.31,5361466
2,2023-07-25 13:07:39,2073124,Darwin,15.99,146982
3,2023-07-25 12:58:44,2174003,Brisbane,14.2,2582007
4,2023-07-25 13:03:02,2078025,Adelaide,9.62,1378413


In [12]:
# rename colum names to more meaningful names
df_selected = df_selected.rename(columns={
    "dt": "datetime",
    "main.temp": "temperature"
})
df_selected.head()

Unnamed: 0,datetime,id,name,temperature,population
0,2023-07-25 13:02:19,2172517,Canberra,2.48,431611
1,2023-07-25 13:00:07,2147714,Sydney,10.31,5361466
2,2023-07-25 13:07:39,2073124,Darwin,15.99,146982
3,2023-07-25 12:58:44,2174003,Brisbane,14.2,2582007
4,2023-07-25 13:03:02,2078025,Adelaide,9.62,1378413


### Aggregations and group bys 

In [13]:
# get average temperature of all cities 
df_selected["temperature"].mean()

13.2425

In [14]:
# get total population of all cities 
df_selected["population"].sum()

17376986

In [15]:
# get the average temperature for each city 
df_selected.groupby(["name"]).agg({
    "temperature":"mean"
}).reset_index()

Unnamed: 0,name,temperature
0,Adelaide,9.62
1,Brisbane,14.2
2,Canberra,2.48
3,Darwin,15.99
4,Hobart,11.72
5,Melbourne,28.47
6,Perth,13.15
7,Sydney,10.31
