# `create_weather_all.ipynb`

### Author: Anthony Hein

#### Last updated: 10/18/2021

# Overview:

Concatenates `hlyXXX.csv` files downloaded from [https://cli.fusio.net/cli/climate_data/showdata.php](https://cli.fusio.net/cli/climate_data/showdata.php) into one file `weather_all.csv`.

---

## Setup

In [15]:
from datetime import datetime
import git
import os
import glob
from tqdm import tqdm
import pandas as pd
import numpy as np

In [16]:
BASE_DIR = git.Repo(os.getcwd(), search_parent_directories=True).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Find Files

In [17]:
hly_x_csvs = glob.glob(f"{BASE_DIR}/raw/csv/weather/hly*.csv")
hly_x_csvs

['/Users/anthonyhein/Desktop/SML310/project/raw/csv/weather/hly375.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/weather/hly2175.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/weather/hly4935.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/weather/hly175.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/weather/hly575.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/weather/hly2615.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/weather/hly4919.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/weather/hly3904.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/weather/hly3723.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/weather/hly775.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/weather/hly2275.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/weather/hly275.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/weather/hly2075.csv',
 '/Users/anthonyhein/Desktop/SML310/project

In [18]:
len(hly_x_csvs)

24

---

## Extract Dataframes

In [19]:
dfs = [pd.read_csv(e, low_memory=False) for e in hly_x_csvs]
dfs[0].head()

Unnamed: 0,date,ind,rain,ind.1,temp,ind.2,wetb,dewpt,vappr,rhum,msl,ind.3,wdsp,ind.4,wddir
0,8/1/03 1:00,-1,,4,,4,,,,,,7,,7,
1,8/1/03 2:00,-1,,4,,4,,,,,,7,,7,
2,8/1/03 3:00,-1,,4,,4,,,,,,7,,7,
3,8/1/03 4:00,-1,,4,,4,,,,,,7,,7,
4,8/1/03 5:00,-1,,4,,4,,,,,,7,,7,


---

## Get Intersection of Available Features

In [20]:
intr = set.intersection(*[set(df.columns) for df in dfs])
intr

{'date',
 'dewpt',
 'ind',
 'ind.1',
 'ind.2',
 'msl',
 'rain',
 'rhum',
 'temp',
 'vappr',
 'wetb'}

In [21]:
dfs = [df[intr] for df in dfs]

---

## Prune Uneeded Columns

In [22]:
keep = ['date', 'temp', 'msl', 'rain', 'rhum']

In [23]:
dfs = [df[keep] for df in dfs]

---

## Drop Entries Before 1990 or After 2020

In [24]:
def entry_in_range(row: pd.core.series.Series) -> bool:
    return datetime.strptime(row['date'], '%m/%d/%y %H:%M') > datetime(1990,1,1) and \
           datetime.strptime(row['date'], '%m/%d/%y %H:%M') < datetime(2021,1,1)

In [25]:
dfs = [
    df[[entry_in_range(row) for _, row in df.iterrows()]]
    for df
    in tqdm(dfs)
]
dfs[0].head()


  0%|                                                                                                                                                                | 0/24 [00:00<?, ?it/s][A
  4%|██████▎                                                                                                                                                 | 1/24 [00:10<04:03, 10.60s/it][A
  8%|████████████▋                                                                                                                                           | 2/24 [00:28<05:33, 15.15s/it][A
 12%|███████████████████                                                                                                                                     | 3/24 [00:43<05:12, 14.87s/it][A
 17%|█████████████████████████▎                                                                                                                              | 4/24 [00:54<04:27, 13.39s/it][A
 21%|███████████████████████████████▋  

Unnamed: 0,date,temp,msl,rain,rhum
0,8/1/03 1:00,,,,
1,8/1/03 2:00,,,,
2,8/1/03 3:00,,,,
3,8/1/03 4:00,,,,
4,8/1/03 5:00,,,,


---

## Add Station Number

In [26]:
station_numbers = [
    int(e[len('/Users/anthonyhein/Desktop/SML310/project/raw/csv/weather/hly'):][:-len('.csv')])
    for e
    in hly_x_csvs
]

In [27]:
station_numbers

[375,
 2175,
 4935,
 175,
 575,
 2615,
 4919,
 3904,
 3723,
 775,
 2275,
 275,
 2075,
 1975,
 675,
 518,
 875,
 532,
 3613,
 1175,
 1375,
 1475,
 2437,
 1275]

In [28]:
for idx, df in enumerate(dfs):
    df['Station number'] = station_numbers[idx]

In [29]:
dfs[0]

Unnamed: 0,date,temp,msl,rain,rhum,Station number
0,8/1/03 1:00,,,,,375
1,8/1/03 2:00,,,,,375
2,8/1/03 3:00,,,,,375
3,8/1/03 4:00,,,,,375
4,8/1/03 5:00,,,,,375
...,...,...,...,...,...,...
152706,12/31/20 19:00,2.9,1010.2,0.2,94,375
152707,12/31/20 20:00,3.4,1010.8,0.4,95,375
152708,12/31/20 21:00,3.8,1011.3,0.6,96,375
152709,12/31/20 22:00,4.2,1011.9,0,93,375


---

## Concatenate Dataframes

In [30]:
weather_all = pd.concat(dfs)
weather_all

Unnamed: 0,date,temp,msl,rain,rhum,Station number
0,8/1/03 1:00,,,,,375
1,8/1/03 2:00,,,,,375
2,8/1/03 3:00,,,,,375
3,8/1/03 4:00,,,,,375
4,8/1/03 5:00,,,,,375
...,...,...,...,...,...,...
137250,12/31/20 19:00,5.6,1013.7,0,75,1275
137251,12/31/20 20:00,5.4,1014.3,0,74,1275
137252,12/31/20 21:00,4.8,1014.9,0,74,1275
137253,12/31/20 22:00,5,1015.4,0,74,1275


In [31]:
weather_all.shape

(4473041, 6)

In [32]:
assert len(weather_all) == sum([len(df) for df in dfs])

---

## Clean Dataframes (Remove Rows w/ Missing Values)

In [33]:
weather_all = weather_all.replace(' ', pd.NA)
weather_all

Unnamed: 0,date,temp,msl,rain,rhum,Station number
0,8/1/03 1:00,,,,,375
1,8/1/03 2:00,,,,,375
2,8/1/03 3:00,,,,,375
3,8/1/03 4:00,,,,,375
4,8/1/03 5:00,,,,,375
...,...,...,...,...,...,...
137250,12/31/20 19:00,5.6,1013.7,0,75,1275
137251,12/31/20 20:00,5.4,1014.3,0,74,1275
137252,12/31/20 21:00,4.8,1014.9,0,74,1275
137253,12/31/20 22:00,5,1015.4,0,74,1275


In [34]:
weather_all = weather_all.dropna()
weather_all

Unnamed: 0,date,temp,msl,rain,rhum,Station number
296,8/13/03 9:00,17.3,1023.6,0,72,375
297,8/13/03 10:00,18.5,1023.7,0,65,375
298,8/13/03 11:00,19.3,1023.7,0,56,375
299,8/13/03 12:00,20.4,1023.3,0,53,375
300,8/13/03 13:00,20.9,1023.3,0,54,375
...,...,...,...,...,...,...
137250,12/31/20 19:00,5.6,1013.7,0,75,1275
137251,12/31/20 20:00,5.4,1014.3,0,74,1275
137252,12/31/20 21:00,4.8,1014.9,0,74,1275
137253,12/31/20 22:00,5,1015.4,0,74,1275


---

## Save Dataframe

In [35]:
weather_all.to_csv(f"{BASE_DIR}/data/csv/weather_all.csv", index=False)

---