In [7]:
import pandas as pd
import requests
from tqdm import tqdm
from pathlib import Path
import os

In [8]:
os.getcwd()

'/Users/colleenking/Documents/college/programming_for_data_analytics/project'

In [10]:
data_dir = Path("./data")

if not data_dir.exists():
    os.mkdir(data_dir)
    os.mkdir(data_dir / "weather_data")

Read in the details of all weather stations - some lines aren't parsed correctly so these are skipped

We get the county, station name, the station id (station name), the height above sea level, location both in easting/northing and latitude/longitude, the open and close years

In [24]:
station_details_url = "http://cli.fusio.net/cli/climate_data/webdata/StationDetails.csv"

station_df = pd.read_csv(station_details_url, on_bad_lines='skip')

station_df.head()

Unnamed: 0,county,station name,name,height(m),easting,northing,latitude,longitude,open year,close year
0,Antrim,5880,LH_RATHLIN_WEST,10,309200,451800,55.30083,-6.28028,2000,(null)
1,Carlow,4415,TULLOW (Waterworks),76,284700,173400,52.80528,-6.74306,1985,(null)
2,Carlow,2414,BORRIS G.S.,85,272400,150700,52.60278,-6.93056,1944,1991
3,Carlow,1214,CARLOW (SUGAR FACTORY),58,272200,178400,52.85139,-6.92778,1953,1960
4,Carlow,115,HACKETSTOWN RECTORY,182,297600,180500,52.86667,-6.55,1910,1944


In [14]:
station_df.dtypes

county           object
station name      int64
name             object
height(m)         int64
easting           int64
northing          int64
latitude        float64
longitude       float64
open year        object
close year       object
dtype: object

The open and close years are stored as stings, but we want them as ints

In [26]:
station_df["close year"] = station_df["close year"].replace('(null)', None).astype('Int64')
station_df["open year"] = station_df["open year"].replace('(null)', None).astype('Int64')
station_df.head()

Unnamed: 0,county,station name,name,height(m),easting,northing,latitude,longitude,open year,close year
0,Antrim,5880,LH_RATHLIN_WEST,10,309200,451800,55.30083,-6.28028,2000,
1,Carlow,4415,TULLOW (Waterworks),76,284700,173400,52.80528,-6.74306,1985,
2,Carlow,2414,BORRIS G.S.,85,272400,150700,52.60278,-6.93056,1944,1991.0
3,Carlow,1214,CARLOW (SUGAR FACTORY),58,272200,178400,52.85139,-6.92778,1953,1960.0
4,Carlow,115,HACKETSTOWN RECTORY,182,297600,180500,52.86667,-6.55,1910,1944.0


In [27]:
station_df.dtypes

county           object
station name      int64
name             object
height(m)         int64
easting           int64
northing          int64
latitude        float64
longitude       float64
open year         Int64
close year        Int64
dtype: object

In [28]:
station_df.to_csv(data_dir / "stations.csv", index=False)

We can use the station id to download daily data automatically because the download url is predictable - replacing the station id gives the filename we're looking for (https://cli.fusio.net/cli/climate_data/webdata/dly[station_id].csv)

We also don't care about closed stations, so we only need the rows where close year is null

In [34]:
open_stations = list(station_df[~pd.isna(station_df["close year"])]["station name"])
len(open_stations)

1550

1550 is a lot of stations - each file is about 2mb, so in total this would be around 3Gb if we naively downloaded it all. We should just get a sample a few from each county instead.

5 stations * 26 counties * 2mb gives around 260mb, which is much more reasonable

In [39]:
# https://stackoverflow.com/questions/22472213/python-random-selection-per-group

sample_stations = station_df[~pd.isna(station_df["close year"])].groupby('county').apply(lambda x: x.sample(5)).reset_index(drop=True)

print(len(sample_stations))
sample_stations.head()


130


  sample_stations = station_df[~pd.isna(station_df["close year"])].groupby('county').apply(lambda x: x.sample(5)).reset_index(drop=True)


Unnamed: 0,county,station name,name,height(m),easting,northing,latitude,longitude,open year,close year
0,Carlow,6014,CLASHGANNA MILLS,27,273800,146200,52.5625,-6.91111,1987,1989
1,Carlow,4814,CARLOW (Oak Park),61,273000,179500,52.86111,-6.91528,1967,1996
2,Carlow,515,TULLOW (MT.ST.JOSEPH'S),79,285800,171800,52.79028,-6.72778,1949,1960
3,Carlow,5314,BAGENALSTOWN (FENAGH II),104,277700,161500,52.69917,-6.85111,1982,1989
4,Carlow,6914,GARRYHILL (MILLTOWN),107,278600,158700,52.67389,-6.8375,2003,2005


In [41]:
# url format: https://cli.fusio.net/cli/climate_data/webdata/dly875.csv
for station in tqdm(list(sample_stations['station name'])):
    try:
        url = f"https://cli.fusio.net/cli/climate_data/webdata/dly{station}.csv"
        response = requests.get(url)
        assert response.status_code == 200
        with open(data_dir / f"weather_data/{station}.csv", "wb") as f:
            f.write(response.content)
    except:
        print(f"Couldn't retrieve data for station {station}")

  2%|▋                                          | 2/130 [00:00<00:23,  5.44it/s]

Couldn't retrieve data for station 6014
Couldn't retrieve data for station 4814


  3%|█▎                                         | 4/130 [00:00<00:21,  5.92it/s]

Couldn't retrieve data for station 515
Couldn't retrieve data for station 5314


  5%|█▉                                         | 6/130 [00:01<00:21,  5.80it/s]

Couldn't retrieve data for station 6914
Couldn't retrieve data for station 1437


  6%|██▋                                        | 8/130 [00:01<00:21,  5.81it/s]

Couldn't retrieve data for station 2637
Couldn't retrieve data for station 1330


  8%|███▏                                      | 10/130 [00:01<00:19,  6.20it/s]

Couldn't retrieve data for station 2431
Couldn't retrieve data for station 1737


  9%|███▉                                      | 12/130 [00:02<00:19,  6.16it/s]

Couldn't retrieve data for station 1518
Couldn't retrieve data for station 1618


 11%|████▌                                     | 14/130 [00:02<00:20,  5.53it/s]

Couldn't retrieve data for station 1711
Couldn't retrieve data for station 1611


 12%|████▊                                     | 15/130 [00:02<00:20,  5.71it/s]

Couldn't retrieve data for station 118


 13%|█████▍                                    | 17/130 [00:02<00:20,  5.46it/s]

Couldn't retrieve data for station 1801
Couldn't retrieve data for station 3202


 15%|██████▍                                   | 20/130 [00:03<00:22,  4.97it/s]

Couldn't retrieve data for station 4406


 17%|███████                                   | 22/130 [00:04<00:20,  5.34it/s]

Couldn't retrieve data for station 1940
Couldn't retrieve data for station 1640


 18%|███████▍                                  | 23/130 [00:04<00:19,  5.63it/s]

Couldn't retrieve data for station 2142


 19%|████████                                  | 25/130 [00:04<00:18,  5.71it/s]

Couldn't retrieve data for station 641
Couldn't retrieve data for station 1840


 20%|████████▍                                 | 26/130 [00:04<00:17,  6.06it/s]

Couldn't retrieve data for station 1623


 21%|████████▋                                 | 27/130 [00:04<00:19,  5.16it/s]

Couldn't retrieve data for station 232


 22%|█████████▎                                | 29/130 [00:05<00:19,  5.15it/s]

Couldn't retrieve data for station 9623
Couldn't retrieve data for station 2923


 24%|██████████                                | 31/130 [00:05<00:19,  5.16it/s]

Couldn't retrieve data for station 10223
Couldn't retrieve data for station 1626


 25%|██████████▋                               | 33/130 [00:06<00:18,  5.30it/s]

Couldn't retrieve data for station 1025
Couldn't retrieve data for station 1028


 27%|███████████▎                              | 35/130 [00:06<00:16,  5.65it/s]

Couldn't retrieve data for station 1828
Couldn't retrieve data for station 325


 28%|███████████▉                              | 37/130 [00:06<00:15,  6.14it/s]

Couldn't retrieve data for station 709
Couldn't retrieve data for station 9705


 30%|████████████▌                             | 39/130 [00:07<00:13,  6.71it/s]

Couldn't retrieve data for station 3110
Couldn't retrieve data for station 4005


 32%|█████████████▏                            | 41/130 [00:07<00:14,  6.02it/s]

Couldn't retrieve data for station 2210
Couldn't retrieve data for station 114


 33%|█████████████▉                            | 43/130 [00:07<00:15,  5.53it/s]

Couldn't retrieve data for station 8323
Couldn't retrieve data for station 3714


 35%|██████████████▌                           | 45/130 [00:08<00:14,  5.87it/s]

Couldn't retrieve data for station 5114
Couldn't retrieve data for station 2914


 36%|███████████████▏                          | 47/130 [00:08<00:15,  5.44it/s]

Couldn't retrieve data for station 4013
Couldn't retrieve data for station 713


 38%|███████████████▊                          | 49/130 [00:08<00:14,  5.41it/s]

Couldn't retrieve data for station 2913
Couldn't retrieve data for station 7912


 39%|████████████████▍                         | 51/130 [00:09<00:13,  5.91it/s]

Couldn't retrieve data for station 2413
Couldn't retrieve data for station 3413


 41%|█████████████████                         | 53/130 [00:09<00:12,  6.14it/s]

Couldn't retrieve data for station 3313
Couldn't retrieve data for station 113


 42%|█████████████████▍                        | 54/130 [00:09<00:11,  6.46it/s]

Couldn't retrieve data for station 613


 43%|██████████████████                        | 56/130 [00:09<00:12,  5.93it/s]

Couldn't retrieve data for station 4614
Couldn't retrieve data for station 529


 44%|██████████████████▍                       | 57/130 [00:10<00:11,  6.16it/s]

Couldn't retrieve data for station 9929


 45%|██████████████████▋                       | 58/130 [00:10<00:12,  5.65it/s]

Couldn't retrieve data for station 2137


 46%|███████████████████▍                      | 60/130 [00:10<00:12,  5.44it/s]

Couldn't retrieve data for station 1436
Couldn't retrieve data for station 1129


 48%|████████████████████                      | 62/130 [00:11<00:11,  5.86it/s]

Couldn't retrieve data for station 110
Couldn't retrieve data for station 811


 49%|████████████████████▋                     | 64/130 [00:11<00:12,  5.45it/s]

Couldn't retrieve data for station 2411
Couldn't retrieve data for station 4411


 51%|█████████████████████▎                    | 66/130 [00:11<00:10,  6.05it/s]

Couldn't retrieve data for station 3811
Couldn't retrieve data for station 4429


 52%|█████████████████████▋                    | 67/130 [00:12<00:12,  5.17it/s]

Couldn't retrieve data for station 1629


 53%|██████████████████████▎                   | 69/130 [00:12<00:11,  5.27it/s]

Couldn't retrieve data for station 1230
Couldn't retrieve data for station 2329


 55%|██████████████████████▉                   | 71/130 [00:12<00:10,  5.57it/s]

Couldn't retrieve data for station 2029
Couldn't retrieve data for station 1138


 56%|███████████████████████▌                  | 73/130 [00:12<00:08,  6.40it/s]

Couldn't retrieve data for station 2838
Couldn't retrieve data for station 2538


 57%|███████████████████████▉                  | 74/130 [00:13<00:08,  6.28it/s]

Couldn't retrieve data for station 1738


 59%|████████████████████████▉                 | 77/130 [00:14<00:12,  4.41it/s]

Couldn't retrieve data for station 3835
Couldn't retrieve data for station 934


 61%|█████████████████████████▌                | 79/130 [00:14<00:09,  5.12it/s]

Couldn't retrieve data for station 626
Couldn't retrieve data for station 1833


 62%|█████████████████████████▊                | 80/130 [00:14<00:09,  5.09it/s]

Couldn't retrieve data for station 434


 63%|██████████████████████████▍               | 82/130 [00:15<00:09,  4.83it/s]

Couldn't retrieve data for station 132
Couldn't retrieve data for station 4431


 65%|███████████████████████████▏              | 84/130 [00:15<00:09,  4.81it/s]

Couldn't retrieve data for station 2531
Couldn't retrieve data for station 638


 65%|███████████████████████████▍              | 85/130 [00:15<00:08,  5.19it/s]

Couldn't retrieve data for station 1432


 67%|████████████████████████████              | 87/130 [00:16<00:08,  5.34it/s]

Couldn't retrieve data for station 938
Couldn't retrieve data for station 3138


 68%|████████████████████████████▊             | 89/130 [00:16<00:09,  4.51it/s]

Couldn't retrieve data for station 2437


 70%|█████████████████████████████▍            | 91/130 [00:16<00:08,  4.70it/s]

Couldn't retrieve data for station 239
Couldn't retrieve data for station 3122


 72%|██████████████████████████████▎           | 94/130 [00:17<00:07,  5.07it/s]

Couldn't retrieve data for station 1422
Couldn't retrieve data for station 6319


 74%|███████████████████████████████           | 96/130 [00:17<00:05,  5.74it/s]

Couldn't retrieve data for station 2322
Couldn't retrieve data for station 2729


 75%|███████████████████████████████▎          | 97/130 [00:18<00:05,  6.18it/s]

Couldn't retrieve data for station 6429


 75%|███████████████████████████████▋          | 98/130 [00:18<00:05,  5.60it/s]

Couldn't retrieve data for station 4829


 77%|███████████████████████████████▌         | 100/130 [00:18<00:05,  5.83it/s]

Couldn't retrieve data for station 3429
Couldn't retrieve data for station 6629


 78%|███████████████████████████████▊         | 101/130 [00:18<00:04,  6.31it/s]

Couldn't retrieve data for station 936


 79%|████████████████████████████████▍        | 103/130 [00:19<00:04,  6.08it/s]

Couldn't retrieve data for station 2235
Couldn't retrieve data for station 1536


 82%|█████████████████████████████████▍       | 106/130 [00:19<00:04,  5.26it/s]

Couldn't retrieve data for station 1035
Couldn't retrieve data for station 6612


 82%|█████████████████████████████████▋       | 107/130 [00:20<00:04,  4.81it/s]

Couldn't retrieve data for station 9812


 83%|██████████████████████████████████       | 108/130 [00:20<00:05,  4.39it/s]

Couldn't retrieve data for station 7212


 84%|██████████████████████████████████▍      | 109/130 [00:20<00:04,  4.43it/s]

Couldn't retrieve data for station 1013


 85%|██████████████████████████████████▋      | 110/130 [00:20<00:04,  4.48it/s]

Couldn't retrieve data for station 912


 87%|███████████████████████████████████▋     | 113/130 [00:21<00:03,  4.43it/s]

Couldn't retrieve data for station 907
Couldn't retrieve data for station 9712


 88%|███████████████████████████████████▉     | 114/130 [00:21<00:03,  4.93it/s]

Couldn't retrieve data for station 3306


 89%|████████████████████████████████████▌    | 116/130 [00:22<00:02,  5.02it/s]

Couldn't retrieve data for station 8512
Couldn't retrieve data for station 2722


 91%|█████████████████████████████████████▏   | 118/130 [00:22<00:02,  5.40it/s]

Couldn't retrieve data for station 2130
Couldn't retrieve data for station 1122


 92%|█████████████████████████████████████▊   | 120/130 [00:22<00:01,  5.89it/s]

Couldn't retrieve data for station 1922
Couldn't retrieve data for station 2122


 94%|██████████████████████████████████████▍  | 122/130 [00:23<00:01,  5.81it/s]

Couldn't retrieve data for station 3815
Couldn't retrieve data for station 1115


 95%|███████████████████████████████████████  | 124/130 [00:23<00:01,  5.45it/s]

Couldn't retrieve data for station 308
Couldn't retrieve data for station 1215


 97%|███████████████████████████████████████▋ | 126/130 [00:23<00:00,  6.30it/s]

Couldn't retrieve data for station 508
Couldn't retrieve data for station 1520


 98%|████████████████████████████████████████▎| 128/130 [00:24<00:00,  6.19it/s]

Couldn't retrieve data for station 2515
Couldn't retrieve data for station 1824


100%|█████████████████████████████████████████| 130/130 [00:24<00:00,  5.32it/s]

Couldn't retrieve data for station 3424
Couldn't retrieve data for station 815





It looks like a lot of the stations' data isn't available via these urls - of the 130 expected, only 7 were actually downloaded, or a little over 5%. If that holds true across the whole dataset, 5% of the 1550 stations should give us an expected 83 files, totalling ~160mb, which would be manageable.

In [44]:
1550*(7/130)

83.46153846153847

In [42]:
# this station gives a 404 Not Found status code - the file doesn't exist at this url
pd.read_csv(f"https://cli.fusio.net/cli/climate_data/webdata/dly6014.csv")

HTTPError: HTTP Error 404: Not Found

In [45]:
files_downloaded = 0
for station in tqdm(open_stations):
    try:
        url = f"https://cli.fusio.net/cli/climate_data/webdata/dly{station}.csv"
        response = requests.get(url)
        assert response.status_code == 200
        with open(data_dir / f"weather_data/{station}.csv", "wb") as f:
            f.write(response.content)
        files_downloaded += 1
    except:
        pass
print(f'Downloaded {files_downloaded} files')

100%|███████████████████████████████████████| 1550/1550 [03:15<00:00,  7.94it/s]

Downloaded 38 files





We only got 38 files in the end, we may have gotten lucky with the sample that had a hit rate of 5%.

Now that we have all our files downloaded, we need to read them into pandas so we can analyse them.
Each file has a header that we need to skip, however the number of rows changes from file to file. The row we want always starts with "date", so we can use this 
We also need to keep track of the station id so 