In [None]:
import pandas as pd
import os
import urllib

# Download the data

If `wget` is not installed on your machine, you can download each file with `curl URL > file.txt` or with Python's `urllib` package instead.

In [None]:
%%bash
# Download the GHCN-Daily station list and the 2021 observations into
# data_ghcn/. Written so the cell can be re-run on an existing directory.
set -e                  # abort on the first failing step (e.g. a failed cd or download)
mkdir -p data_ghcn      # -p: succeed even when the directory already exists
cd data_ghcn
# -nc: skip files that are already present instead of saving "name.1" copies
# -nv: print one summary line per file instead of the full progress log
wget -nc -nv https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt
wget -nc -nv https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2021.csv.gz
gzip -df 2021.csv.gz    # -f: overwrite a 2021.csv left over from an earlier run

--2022-11-09 21:46:06--  https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt
Resolving www1.ncdc.noaa.gov (www1.ncdc.noaa.gov)... 205.167.25.177, 205.167.25.172, 205.167.25.168, ...
Connecting to www1.ncdc.noaa.gov (www1.ncdc.noaa.gov)|205.167.25.177|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10593738 (10M) [text/plain]
Saving to: ‘ghcnd-stations.txt’

     0K .......... .......... .......... .......... ..........  0% 1.40M 7s
    50K .......... .......... .......... .......... ..........  0% 2.80M 5s
   100K .......... .......... .......... .......... ..........  1% 53.6M 4s
   150K .......... .......... .......... .......... ..........  1% 2.83M 4s
   200K .......... .......... .......... .......... ..........  2%  147M 3s
   250K .......... .......... .......... .......... ..........  2%  175M 2s
   300K .......... .......... .......... .......... ..........  3% 2.99M 2s
   350K .......... .......... .......... .......... ..........  3%  

# Data Preprocessing

In [None]:
def get_vals(line):
    """Pick the station, time and numeric value out of one comma-separated record.

    Fields are read by position: the first is returned as the station, the
    second as the time, and the fourth is converted to ``float``. The third
    field is not read here.

    Parameters
    ----------
    line : str
        One comma-separated record.

    Returns
    -------
    list
        ``[station, time, value]`` with ``value`` as a float.

    Raises
    ------
    IndexError
        If the record has fewer than four fields.
    ValueError
        If the fourth field cannot be converted to a float.
    """
    fields = line.split(',')
    return [fields[0], fields[1], float(fields[3])]

def get_stations(filename='data_ghcn/ghcnd-stations.txt'):
    """Load the first four whitespace-separated fields of every station record.

    Each non-blank line of ``filename`` is split on runs of whitespace and
    its first four tokens are kept as the station ID, latitude, longitude
    and elevation. Any further tokens on the line are ignored.

    The file is tokenised directly rather than through ``pd.read_csv`` with
    a separator: the records are not delimiter-separated, so a delimiter
    only adds parser warnings and quoting side effects without changing the
    four leading fields.

    Parameters
    ----------
    filename : str
        Path to the station list.

    Returns
    -------
    pandas.DataFrame
        Columns ``Station``, ``Latitude``, ``Longitude`` and ``Elevation``,
        one row per non-blank line. All values are left as strings; a line
        with fewer than four tokens is padded with ``None``.
    """
    columns = ['Station', 'Latitude', 'Longitude', 'Elevation']
    width = len(columns)
    rows = []
    with open(filename, encoding='utf-8') as handle:
        for line in handle:
            tokens = line.split()
            if not tokens:
                # Blank line: nothing to record.
                continue
            # Pad before slicing so every row has exactly `width` entries.
            rows.append((tokens + [None] * width)[:width])
    return pd.DataFrame(rows, columns=columns)

def process_year(year, stations, col='TAVG', basedir='data_ghcn'):
    """Collect one element's records for a year and attach station metadata.

    Reads ``<basedir>/<year>.csv`` line by line and keeps the records whose
    third comma-separated field equals ``col`` and whose first field is one
    of the IDs in ``stations['Station']``. The kept records are left-merged
    with ``stations`` on ``Station``.

    Parameters
    ----------
    year : str or int
        Used to build the file name ``"<year>.csv"``.
    stations : pandas.DataFrame
        Must provide the columns ``Station``, ``Latitude``, ``Longitude``
        and ``Elevation``.
    col : str
        Element name to extract; also the name of the value column in the
        result.
    basedir : str
        Directory containing the yearly CSV file.

    Returns
    -------
    pandas.DataFrame
        Columns ``Station``, ``Date``, ``col``, ``Latitude``, ``Longitude``
        and ``Elevation``, in file order. ``Date`` holds ``pd.Timestamp``
        values and the numeric columns are floats.

    Raises
    ------
    ValueError
        If the value field of a kept record cannot be converted to a float.
    """
    # Build the ID lookup once. A set gives exact, constant-time membership,
    # whereas scanning the Station column per record costs
    # O(records x stations) and would also accept partial-ID matches.
    known_stations = set(stations['Station'])

    records = []
    with open(os.path.join(basedir, "%s.csv" % year)) as handle:
        for line in handle:
            fields = line.split(',')
            # Compare the element field itself so that `col` appearing in
            # some other field cannot select the record. Lines too short to
            # carry a value (e.g. blank lines) are skipped.
            if len(fields) < 4 or fields[2] != col:
                continue
            station = fields[0]
            if station in known_stations:
                records.append([station, fields[1], float(fields[3])])

    df_tavg = pd.DataFrame(records, columns=['Station', 'Date', col])
    df_merged = df_tavg.merge(stations, left_on='Station', right_on='Station', how='left')
    df_merged['Date'] = df_merged['Date'].apply(pd.Timestamp)
    for c in ['Latitude', 'Longitude', col, 'Elevation']:
        df_merged[c] = df_merged[c].astype(float)
    return df_merged[['Station', 'Date', col, 'Latitude', 'Longitude', 'Elevation']]

In [None]:
# Load every station, extract the 2021 TAVG records, then narrow the station
# table to the stations that appear in those records.
stations = get_stations()
df1 = process_year('2021', stations, col='TAVG')
stations = stations.loc[stations['Station'].isin(df1['Station'])]

  """Entry point for launching an IPython kernel.
  return func(*args, **kwargs)


In [None]:
# Extract the 2021 PRCP records for the stations kept so far.
df2 = process_year(year='2021', stations=stations, col='PRCP')

In [None]:
# Inner-join the two frames on (Station, Date); only the PRCP column is taken
# from df2, so the station metadata columns come from df1 alone.
df = pd.merge(df1, df2.loc[:, ['Station', 'Date', 'PRCP']], on=['Station', 'Date'])
# NOTE(review): the file name below says 2020 while the cells above load 2021 - confirm before enabling.
# df.to_csv('data_ghcn/daily_global_weather_2020.csv')
df