In [2]:
import pandas as pd, os, dill as pickle, csv
from tqdm import tqdm, trange
from utils.transform import getVedurLonLatInISN93
from datetime import date
from itertools import islice

In [None]:
def createStationsLonLatXY(stodTxtPath: str = 'D:/Skóli/lokaverkefni_vel/data/Vedurstofa/stod.txt', outputPath: str = 'D:/Skóli/lokaverkefni_vel/data/Vedurstofa/stationsLonLatXY.pkl', encoding: str = 'ISO-8859-1'):
    stationsDict = {}
    with open(stodTxtPath, 'r', encoding = encoding) as f:
        stations = [a.strip().split(',') for a in f.readlines()][1:]
        stations = [[int(a[0]), a[1], float(a[2]), float(a[3]), float(a[4]) if a[4].isnumeric() else a[4], a[5]] for a in stations]
    for station in stations:
        latitude, longitude = station[2], station[3]
        x, y = getVedurLonLatInISN93(longitude, latitude)
        stationsDict[station[0]] = (-longitude, latitude, x, y)

    with open(outputPath, 'wb') as f:
        pickle.dump(stationsDict, f)

In [None]:
def tooClose(dt1, dt2, threshold):
    return abs((dt1 - dt2)) < pd.Timedelta(threshold, 's')

In [None]:
#def combine10min(path_10min: str = 'E:/Skóli/HÍ/Vélaverkfræði Master HÍ/Lokaverkefni/Data/Vedurstofa/10min/', outputpath: str = 'E:/Skóli/HÍ/Vélaverkfræði Master HÍ/Lokaverkefni/Data/Vedurstofa/combined_10min_20ms.feather') -> None:
def combine10min(path_10min: str = 'D:/Skoli/Mastersverkefni/lokaverkefni_vel/data/Measured/10min/', outputpath: str = 'D:/Skóli/lokaverkefni_vel/data/Vedurstofa/combined_10min_20ms_25_3_24.feather'):
    files = [os.path.join(path_10min, file) for file in os.listdir(path_10min)]
    columns, df = None, None

    for file in tqdm(files, total = len(files)):
        with open(file, 'r') as f:
            reader = list(csv.reader(f))
            
        if not columns:
            columns = reader[0]
        
        lines = reader[1:]

        if df is None:
            df = pd.DataFrame(lines, columns = columns)
        else:
            df = pd.concat([df, pd.DataFrame(lines, columns = columns)])
        
        df.f = pd.to_numeric(df.f, errors = 'coerce')
        #df = df[df.f >= 20]

    df.timi = pd.to_datetime(df.timi)
    df.stod = df.stod.astype(int)
    df.f = pd.to_numeric(df.f, errors = 'coerce')
    df.fg = pd.to_numeric(df.fg, errors = 'coerce')
    df.fsdev = pd.to_numeric(df.fsdev, errors = 'coerce')
    df.d = pd.to_numeric(df.d, errors = 'coerce')
    df.dsdev = pd.to_numeric(df.dsdev, errors = 'coerce')

    df.to_feather(outputpath)

In [1]:
#def combine10min(path_10min: str = 'E:/Skóli/HÍ/Vélaverkfræði Master HÍ/Lokaverkefni/Data/Vedurstofa/10min/', outputpath: str = 'E:/Skóli/HÍ/Vélaverkfræði Master HÍ/Lokaverkefni/Data/Vedurstofa/combined_10min_20ms.feather') -> None:
def combine10minText(path_10min: str = 'D:/Skoli/Mastersverkefni/lokaverkefni_vel/data/Measured/10min/', outputpath: str = 'D:/Skoli/Mastersverkefni/lokaverkefni_vel/data/Measured/combined_10min/combined_10min.txt'):
    files = [os.path.join(path_10min, file) for file in os.listdir(path_10min)]

    for file in tqdm(files, total = len(files)):
        with open(file, 'r') as f:
            lines = f.readlines()
        
        lines = lines[1:]
        print(lines[0])
        with open(outputpath, 'a+') as f:
            f.writelines(lines)

In [4]:
# https://stackoverflow.com/questions/6335839/python-how-to-read-n-number-of-lines-at-a-time
def convertTimiF_Combined10min_toFeather(text_path: str = 'D:/Skoli/Mastersverkefni/lokaverkefni_vel/data/Measured/combined_10min/combined_10min.txt', 
                                         outputpath: str = 'D:/Skoli/Mastersverkefni/lokaverkefni_vel/data/Measured/combined_10min/combined_10min'):
    chunk_size = int(1e6)
    times, windspeeds, j = [], [], 0
    to_break = False
    with open(text_path, 'r') as f:
        while True:
            next_chunk = [f.readline() for _ in range(chunk_size)]
            if '' in next_chunk:
                next_chunk = [item for item in next_chunk if item]
                to_break = True
            next_chunk = [item.rstrip().split(',') for item in next_chunk]
            try:
                timi = [item[0] for item in next_chunk]
                windspeed = [item[2] for item in next_chunk]
            except:
                print(f'Something went wrong in the {j}th section:')
                print(next_chunk)
                return

            times.extend(timi)
            windspeeds.extend(windspeed)
            if to_break:
                break
            j += 1

    print(f"Finished all {j-1} sections")
    time_df = pd.DataFrame(times, columns = ['timi'])
    time_df.timi = pd.to_datetime(time_df.timi, errors = 'coerce')
    fs_df = pd.DataFrame(windspeeds, columns = ['f'])
    fs_df.f = pd.to_numeric(fs_df.f, errors = 'coerce')

    time_df.to_feather(outputpath + '-timi-' + date.today().strftime("%Y-%m-%d") + '.feather')
    fs_df.to_feather(outputpath + '-fs-' + date.today().strftime("%Y-%m-%d") + '.feather')

    print(f"Done writing all times and wind speeds")




In [None]:
def combineKLST(directory: str = 'D:/Skoli/Mastersverkefni/lokaverkefni_vel/data/Vedurstofa/', outputpath: str = 'D:/Skoli/Mastersverkefni/lokaverkefni_vel/data/Vedurstofa/combined_klst.feather'):
    files = []
    for folder in ['klst/', 'vg/']:
        files.extend([directory + folder + file for file in os.listdir(directory + folder)])

    columns = ['timi','stod','f','fx','fg','d']
    data = []
    for file in tqdm(files, total = len(files)):
        with open(file, 'r') as f:
            lines = [line for line in csv.reader(f)]
        if 'dsdev' in lines[0]:
            lines = [line[:-1] for line in lines]
        lines = lines[1:]
        data.extend(lines)

    df = pd.DataFrame(data, columns = columns)
    df.stod = pd.to_numeric(df.stod, errors = 'coerce')
    df.timi = pd.to_datetime(df.timi, errors = 'coerce')
    df.fx = pd.to_numeric(df.fx, errors = 'coerce')
    df.f = pd.to_numeric(df.f, errors = 'coerce')
    df.fg = pd.to_numeric(df.fg, errors = 'coerce')
    df.d = pd.to_numeric(df.d, errors = 'coerce')
    df.to_feather(outputpath)

In [None]:
#def filterWithThreshold(vedurPath: str = 'E:/Skóli/HÍ/Vélaverkfræði Master HÍ/Lokaverkefni/Data/Vedurstofa/combined_10min_20ms.feather', outputpath: str = 'E:/Skóli/HÍ/Vélaverkfræði Master HÍ/Lokaverkefni/Data/Vedurstofa/combined_10min_20ms_24hr.feather', threshold: str = '1 day'):
def filterWithThreshold(vedurPath: str = 'D:/Skoli/lokaverkefni_vel/data/Vedurstofa/combined_10min_20ms_25_3_24.feather', outputpath: str = 'D:/Skóli/lokaverkefni_vel/data/Vedurstofa/combined_10min_20ms_25_3_24_24hr.feather', threshold: str = '1 day'):
    vedur_df = pd.read_feather(vedurPath)
    filtered_data, columns, stations = [], vedur_df.columns, vedur_df.stod.unique()

    print(f'The shape of the unfiltered dataframe is {vedur_df.shape}')

    for station in tqdm(stations, total = len(stations)):
        subset_df = vedur_df[station == vedur_df.stod]
        subset_df = subset_df.reset_index(drop = True)

        while not subset_df.empty:
            idx = subset_df.f.idxmax()
            time_of_max = subset_df.iloc[idx].timi

            filtered_data.append(subset_df.iloc[idx])

            subset_df = subset_df[abs(subset_df.timi - time_of_max) >= pd.Timedelta(threshold)]

            subset_df = subset_df.reset_index(drop = True)

    filtered_df = pd.DataFrame(filtered_data, columns=columns)

    filtered_df = filtered_df.sort_values(by=['stod', 'timi'])

    filtered_df = filtered_df.reset_index(drop=True)

    print(f'The shape of the filtered dataframe is {filtered_df.shape}')

    filtered_df.to_feather(outputpath)

In [None]:
combine10min()

In [None]:
combineKLST()

In [None]:
filterWithThreshold()

In [3]:
combine10minText()

  0%|          | 0/327 [00:00<?, ?it/s]

2005-02-09 16:00:00,1350,1.81,6.33,2.17,197.6,26.1



  0%|          | 1/327 [00:02<12:20,  2.27s/it]

2008-09-12 18:20:00,1361,8.91,13.46,1.68,142.2,6.7



  1%|          | 3/327 [00:04<06:48,  1.26s/it]

2005-08-24 10:40:00,1362,10.5,14.7,,17,14.9



  1%|          | 4/327 [00:04<04:36,  1.17it/s]

2005-08-22 22:10:00,1368,5.61,7.15,0.7,153.5,6.9

2005-08-22 21:10:00,1370,3.04,4.73,0.52,152.1,7.2



  2%|▏         | 5/327 [00:05<04:18,  1.24it/s]

2005-08-24 10:10:00,1391,8.9,13.2,,6,8.9



  2%|▏         | 6/327 [00:05<03:44,  1.43it/s]

2005-10-06 17:40:00,1395,5.85,7.57,1,205.6,6.1



  2%|▏         | 7/327 [00:07<05:21,  1.00s/it]

2005-08-22 22:10:00,1453,3.51,5.09,,125.9,7



  3%|▎         | 9/327 [00:09<05:18,  1.00s/it]

2022-08-22 11:00:00,1469,1.7,3.8,,83,

2021-05-19 16:30:00,1470,0,0,0,0,0



  3%|▎         | 10/327 [00:10<04:05,  1.29it/s]

2019-12-10 11:40:00,1471,9.3,11.6,,330,



  4%|▎         | 12/327 [00:10<02:43,  1.93it/s]

2021-11-02 15:50:00,1472,4.8,8.89,1.67,357.6,21.6

2005-08-22 22:10:00,1473,5.15,8.59,0.99,156.4,7.4



  4%|▍         | 13/327 [00:12<04:46,  1.09it/s]

2019-02-11 09:00:00,1474,5.1,9.8,,59,



  4%|▍         | 14/327 [00:12<03:57,  1.32it/s]

2005-08-22 22:10:00,1475,2.8,4.44,0.5,137.5,10.7



  5%|▍         | 15/327 [00:15<06:16,  1.21s/it]

2005-08-22 19:20:00,1477,3.24,4.31,0.4,143.7,6.2



  5%|▍         | 16/327 [00:16<07:00,  1.35s/it]

2019-09-27 12:00:00,1478,1.56,2.05,0.26,168.5,11



  5%|▌         | 17/327 [00:17<05:28,  1.06s/it]

2005-08-22 22:10:00,1479,3.15,4.24,,129.4,8.3



  6%|▌         | 18/327 [00:19<06:56,  1.35s/it]

2005-08-22 22:10:00,1480,4.61,6.01,0.46,101.9,5.3



  6%|▌         | 19/327 [00:21<07:43,  1.51s/it]

2006-01-10 18:00:00,1481,6.11,7.87,0.64,88.4,3.6



  6%|▌         | 20/327 [00:23<08:49,  1.72s/it]

2018-11-28 16:40:00,1482,7.7,13.46,2.19,33.3,11.6



  6%|▋         | 21/327 [00:23<06:58,  1.37s/it]

2005-08-22 21:10:00,1483,4.04,5.25,0.51,128.7,5.9



  7%|▋         | 22/327 [00:24<05:46,  1.14s/it]

2005-05-01 00:00:00,1486,6.66,8.59,,40.2,4.9



  7%|▋         | 23/327 [00:26<07:15,  1.43s/it]

2005-08-22 22:10:00,1487,8.8,11.4,1.13,157.8,8.2



  7%|▋         | 24/327 [00:28<08:21,  1.66s/it]

2005-08-22 21:10:00,1490,7.07,12.35,1.08,162.6,10.4



  8%|▊         | 25/327 [00:30<08:15,  1.64s/it]

2005-08-22 21:10:00,1493,5.95,8.17,0.91,180.4,6



  8%|▊         | 26/327 [00:32<08:16,  1.65s/it]

2006-06-21 14:30:00,1496,10.44,12.64,1.26,30.8,8.3



  9%|▊         | 28/327 [00:33<06:12,  1.25s/it]

2005-08-24 11:00:00,1570,9.8,13.2,,24,8.8

2005-08-22 22:10:00,1578,7.61,10.78,1.07,113.7,7



  9%|▉         | 29/327 [00:35<07:10,  1.45s/it]

2005-08-22 22:10:00,1590,10.94,12.32,0.47,0,0



  9%|▉         | 30/327 [00:37<07:15,  1.46s/it]

2005-08-22 22:10:00,1596,2.57,5.42,1.29,150.3,13.1



  9%|▉         | 31/327 [00:39<06:17,  1.28s/it]


KeyboardInterrupt: 

In [90]:
convertTimiF_Combined10min_toFeather()

In [22]:
timi_df = pd.read_feather('D:/Skoli/Mastersverkefni/lokaverkefni_vel/data/Measured/combined_10min/combined_10min' + '-timi-' + date.today().strftime("%Y-%m-%d") + '.feather')

In [6]:
df = pd.read_feather('D:/Skoli/Mastersverkefni/lokaverkefni_vel/data/Measured/combined_10min_20ms_25_3_24.feather')

In [10]:
yr = df.timi.dt.year

In [20]:
mt = df[df.timi.dt.year == 2005].timi.dt.month
# September 2005 - September 2023
for i in range(1, 13):
    print(i, sum(mt == i))

1 742
2 554
3 372
4 151
5 76
6 44
7 129
8 1607
9 5268
10 5338
11 5391
12 8276


In [21]:
mt = df[df.timi.dt.year == 2006].timi.dt.month

for i in range(1, 13):
    print(i, sum(mt == i))

1 10757
2 6363
3 5004
4 4661
5 1815
6 877
7 674
8 517
9 1951
10 5719
11 25330
12 16186


In [22]:
mt = df[df.timi.dt.year == 2007].timi.dt.month

for i in range(1, 13):
    print(i, sum(mt == i))

1 5688
2 6352
3 16162
4 5649
5 1249
6 1470
7 156
8 855
9 8929
10 7632
11 16048
12 23608


In [23]:
mt = df[df.timi.dt.year == 2023].timi.dt.month

for i in range(1, 13):
    print(i, sum(mt == i))

1 18990
2 24849
3 11414
4 4310
5 7748
6 641
7 631
8 218
9 8035
10 5224
11 0
12 0


In [19]:
from matplotlib import pyplot as plt
import numpy as np

# Birta þetta 2006-2023 sem sú

a = np.bincount(yr)

for i in range(1999, 2024):
    print(i, sum(yr == i)) 

1999 1
2000 0
2001 0
2002 0
2003 0
2004 4426
2005 27948
2006 79854
2007 93798
2008 94950
2009 70620
2010 60451
2011 93325
2012 102673
2013 98368
2014 119997
2015 129890
2016 83973
2017 81094
2018 114891
2019 105737
2020 184632
2021 87488
2022 143982
2023 82060


In [15]:
sum(yr < 2004)

1

In [13]:
len(yr)

1860158