In [1]:
import requests
import pandas as pd 
from secrets_config import api_key # https://home.openweathermap.org/ 

In [9]:
# how to extract data from multiple cities? e.g. canberra, sydney, etc

# 1. create a list of cities (csv)
# 2. read list of cities (csv) 
# 3. request data for each city (json) and push to a list 
# 4. convert list into dataframe 

In [2]:
# read list of cities
df_cities = pd.read_csv("data/australian_capital_cities.csv")
df_cities.head()

Unnamed: 0,city_name
0,canberra
1,sydney
2,darwin
3,brisbane
4,adelaide


In [3]:
# request data for each city (json) and push to a list 
weather_data = []
for city_name in df_cities["city_name"]:
    params = {
        "q": city_name,
        "units": "metric",
        "appid": api_key
    }
    response = requests.get(f"http://api.openweathermap.org/data/2.5/weather", params=params)
    if response.status_code == 200: 
        weather_data.append(response.json())
    else: 
        raise Exception("Extracting weather api data failed. Please check if API limits have been reached.")

In [4]:
# show requested data 
print(weather_data)

[{'coord': {'lon': 149.1281, 'lat': -35.2835}, 'weather': [{'id': 800, 'main': 'Clear', 'description': 'clear sky', 'icon': '01n'}], 'base': 'stations', 'main': {'temp': 1.83, 'feels_like': 1.83, 'temp_min': -0.01, 'temp_max': 5.37, 'pressure': 1036, 'humidity': 91}, 'visibility': 10000, 'wind': {'speed': 0, 'deg': 0}, 'clouds': {'all': 0}, 'dt': 1690292742, 'sys': {'type': 2, 'id': 2081752, 'country': 'AU', 'sunrise': 1690232666, 'sunset': 1690269314}, 'timezone': 36000, 'id': 2172517, 'name': 'Canberra', 'cod': 200}, {'coord': {'lon': 151.2073, 'lat': -33.8679}, 'weather': [{'id': 800, 'main': 'Clear', 'description': 'clear sky', 'icon': '01n'}], 'base': 'stations', 'main': {'temp': 9.28, 'feels_like': 7.59, 'temp_min': 5.69, 'temp_max': 11.24, 'pressure': 1035, 'humidity': 87}, 'visibility': 10000, 'wind': {'speed': 3.09, 'deg': 320}, 'clouds': {'all': 0}, 'dt': 1690293155, 'sys': {'type': 2, 'id': 2010638, 'country': 'AU', 'sunrise': 1690231986, 'sunset': 1690268996}, 'timezone': 3

In [5]:
# convert list into dataframe 
df_weather_cities = pd.json_normalize(weather_data)
df_weather_cities.head()

Unnamed: 0,weather,base,visibility,dt,timezone,id,name,cod,coord.lon,coord.lat,...,wind.speed,wind.deg,clouds.all,sys.type,sys.id,sys.country,sys.sunrise,sys.sunset,wind.gust,rain.1h
0,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1690292742,36000,2172517,Canberra,200,149.1281,-35.2835,...,0.0,0,0,2,2081752,AU,1690232666,1690269314,,
1,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1690293155,36000,2147714,Sydney,200,151.2073,-33.8679,...,3.09,320,0,2,2010638,AU,1690231986,1690268996,,
2,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1690293367,34200,2073124,Darwin,200,130.8418,-12.4611,...,1.54,80,0,1,9574,AU,1690234657,1690276101,,
3,"[{'id': 801, 'main': 'Clouds', 'description': ...",stations,10000,1690293080,36000,2174003,Brisbane,200,153.0281,-27.4679,...,3.09,210,20,2,2012892,AU,1690230802,1690269306,,
4,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1690292877,34200,2078025,Adelaide,200,138.6,-34.9333,...,1.79,106,0,2,2001763,AU,1690235146,1690271887,3.58,


### Transforming data

In [6]:
# set city names to lowercase 
df_weather_cities["city_name"] = df_weather_cities["name"].str.lower()

In [7]:
df_population = pd.read_csv("data/australian_city_population.csv")
df_population.head()

Unnamed: 0,city_name,population
0,canberra,431611
1,sydney,5361466
2,darwin,146982
3,brisbane,2582007
4,adelaide,1378413


In [8]:
df_merged = pd.merge(left=df_weather_cities, right=df_population, on=["city_name"])
df_merged.head()

Unnamed: 0,weather,base,visibility,dt,timezone,id,name,cod,coord.lon,coord.lat,...,clouds.all,sys.type,sys.id,sys.country,sys.sunrise,sys.sunset,wind.gust,rain.1h,city_name,population
0,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1690292742,36000,2172517,Canberra,200,149.1281,-35.2835,...,0,2,2081752,AU,1690232666,1690269314,,,canberra,431611
1,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1690293155,36000,2147714,Sydney,200,151.2073,-33.8679,...,0,2,2010638,AU,1690231986,1690268996,,,sydney,5361466
2,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1690293367,34200,2073124,Darwin,200,130.8418,-12.4611,...,0,1,9574,AU,1690234657,1690276101,,,darwin,146982
3,"[{'id': 801, 'main': 'Clouds', 'description': ...",stations,10000,1690293080,36000,2174003,Brisbane,200,153.0281,-27.4679,...,20,2,2012892,AU,1690230802,1690269306,,,brisbane,2582007
4,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1690292877,34200,2078025,Adelaide,200,138.6,-34.9333,...,0,2,2001763,AU,1690235146,1690271887,3.58,,adelaide,1378413


In [9]:
# print out all columns and pick only relevant ones 
df_merged.columns

Index(['weather', 'base', 'visibility', 'dt', 'timezone', 'id', 'name', 'cod',
       'coord.lon', 'coord.lat', 'main.temp', 'main.feels_like',
       'main.temp_min', 'main.temp_max', 'main.pressure', 'main.humidity',
       'wind.speed', 'wind.deg', 'clouds.all', 'sys.type', 'sys.id',
       'sys.country', 'sys.sunrise', 'sys.sunset', 'wind.gust', 'rain.1h',
       'city_name', 'population'],
      dtype='object')

In [10]:
df_selected = df_merged[["dt", "id", "name", "main.temp", "population"]] 
df_selected.head()

Unnamed: 0,dt,id,name,main.temp,population
0,1690292742,2172517,Canberra,1.83,431611
1,1690293155,2147714,Sydney,9.28,5361466
2,1690293367,2073124,Darwin,14.99,146982
3,1690293080,2174003,Brisbane,12.72,2582007
4,1690292877,2078025,Adelaide,9.58,1378413


In [11]:
df_selected.dtypes

dt              int64
id              int64
name           object
main.temp     float64
population      int64
dtype: object

In [12]:
df_selected["unique_id"] = df_selected["dt"].astype(str) + df_selected["id"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected["unique_id"] = df_selected["dt"].astype(str) + df_selected["id"].astype(str)


In [13]:
# convert unix timestamp column to datetime 
df_selected["dt"] = pd.to_datetime(df_selected["dt"], unit="s")
df_selected.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected["dt"] = pd.to_datetime(df_selected["dt"], unit="s")


Unnamed: 0,dt,id,name,main.temp,population,unique_id
0,2023-07-25 13:45:42,2172517,Canberra,1.83,431611,16902927422172517
1,2023-07-25 13:52:35,2147714,Sydney,9.28,5361466,16902931552147714
2,2023-07-25 13:56:07,2073124,Darwin,14.99,146982,16902933672073124
3,2023-07-25 13:51:20,2174003,Brisbane,12.72,2582007,16902930802174003
4,2023-07-25 13:47:57,2078025,Adelaide,9.58,1378413,16902928772078025


In [14]:
# rename colum names to more meaningful names
df_selected = df_selected.rename(columns={
    "dt": "datetime",
    "main.temp": "temperature"
})
df_selected.head()

Unnamed: 0,datetime,id,name,temperature,population,unique_id
0,2023-07-25 13:45:42,2172517,Canberra,1.83,431611,16902927422172517
1,2023-07-25 13:52:35,2147714,Sydney,9.28,5361466,16902931552147714
2,2023-07-25 13:56:07,2073124,Darwin,14.99,146982,16902933672073124
3,2023-07-25 13:51:20,2174003,Brisbane,12.72,2582007,16902930802174003
4,2023-07-25 13:47:57,2078025,Adelaide,9.58,1378413,16902928772078025


In [15]:
df_selected = df_selected.set_index(["unique_id"])

### Load data to file (parquet)

Overwrite

In [16]:
# load (overwrite) data to a csv file 
df_selected.to_parquet("data/weather.parquet")

Append new files

In [18]:
import datetime as dt 
current_timestamp = dt.datetime.now().isoformat().replace(":","-")
df_selected.to_parquet(f"data/weather_{current_timestamp}.parquet")

### Load data to SQL 

In [16]:
from sqlalchemy import create_engine, Table, Column, Integer, String, MetaData, Float # https://www.tutorialspoint.com/sqlalchemy/sqlalchemy_core_creating_table.htm
from sqlalchemy.engine import URL
from sqlalchemy.dialects import postgresql
from secrets_config import db_user, db_password, db_server_name, db_database_name
from sqlalchemy.schema import CreateTable 

In [18]:
# create connection to database 
connection_url = URL.create(
    drivername = "postgresql+pg8000", 
    username = db_user,
    password = db_password,
    host = db_server_name, 
    port = 5432,
    database = db_database_name, 
)

engine = create_engine(connection_url)


Append

In [19]:
# using pandas 
df_selected.to_sql("weather_ins", engine, if_exists="append")

8

Overwrite

In [None]:
# using pandas 
df_selected.to_sql("weather_ins", engine, if_exists="replace")

Upsert

In [20]:
meta = MetaData()
weather_table = Table(
    "weather", meta, 
    Column("datetime", String, primary_key=True),
    Column("id", Integer, primary_key=True),
    Column("name", String),
    Column("temperature", Float),
    Column("population", Integer)
)
meta.create_all(engine) # creates table if it does not exist 


In [21]:
insert_statement = postgresql.insert(weather_table).values(df_selected.to_dict(orient='records'))
upsert_statement = insert_statement.on_conflict_do_update(
    index_elements=['id', 'datetime'],
    set_={c.key: c for c in insert_statement.excluded if c.key not in ['id', 'datetime']})
engine.execute(upsert_statement)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fc0177394c0>