In [1]:
import numpy as np
import pandas as pd
import re
import datetime 

from pathlib import Path

DATA_PATH = Path("__file__").absolute().parent.parent / 'artur 2' / 'data'

In [2]:
# Reading data and dropping usles or almost empty cols
df = pd.read_csv(DATA_PATH / 'lol.csv', low_memory=False).drop(['Po', 'Pa', 'ff10', 'ff3', 'N', 'W1', 'W2', 'Tn', 'Tx', 'Cl', 'Nh', 'Cm', 'Ch', 'VV', 'RRR', 'tR', 'E', 'Tg', 'E\'', 'sss', 'Td'], axis=1)

In [3]:
# reverse the df to go from oldes to the newest
df = df.iloc[::-1]

<h2>Relevant Cols</h2>
- T is temperature in Celsius<br>
- P is atmospheric pressure<br>
- U is relative humidity (%) at a hight of 2 meters<br>
- DD is a mean wind direction<br>
- Ff is a mean wind speed<br>
- WW is a weather phenomena<br>
- H is the clouds hight<br>

In [4]:
df.head(10)

Unnamed: 0,date,T,P,U,DD,Ff,WW,H
2255,25.01.2021 23:00,1.5,757.3,85,Wind blowing from the west,2.0,,600-1000
2254,26.01.2021 00:00,1.8,757.6,85,Wind blowing from the west-northwest,5.0,,300-600
2253,26.01.2021 01:00,2.0,757.7,87,Wind blowing from the west-northwest,5.0,,1000-1500
2252,26.01.2021 02:00,1.1,758.2,89,Wind blowing from the west,3.0,,"2500 or more, or no clouds."
2251,26.01.2021 03:00,0.9,758.2,90,Wind blowing from the north-west,2.0,,"2500 or more, or no clouds."
2250,26.01.2021 04:00,-1.3,758.5,93,Wind blowing from the north,1.0,,"2500 or more, or no clouds."
2249,26.01.2021 05:00,-1.8,758.7,94,Wind blowing from the north,2.0,,"2500 or more, or no clouds."
2248,26.01.2021 06:00,-2.2,758.9,96,Wind blowing from the north-east,1.0,,"2500 or more, or no clouds."
2247,26.01.2021 07:00,-2.5,759.0,96,,,,"2500 or more, or no clouds."
2246,26.01.2021 08:00,-1.9,758.9,95,Wind blowing from the south-west,1.0,,"2500 or more, or no clouds."


In [5]:
df.tail(10)

Unnamed: 0,date,T,P,U,DD,Ff,WW,H
9,29.04.2021 14:00,6.9,761.2,64,,,,600-1000
8,29.04.2021 15:00,7.8,761.2,59,,,,1000-1500
7,29.04.2021 16:00,6.4,761.2,74,,,Rain (not freezing).,600-1000
6,29.04.2021 17:00,7.7,761.1,64,,,Rain (not freezing).,600-1000
5,29.04.2021 18:00,6.9,761.4,64,,,,600-1000
4,29.04.2021 19:00,5.8,761.7,72,,,,600-1000
3,29.04.2021 20:00,5.9,762.1,67,,,,600-1000
2,29.04.2021 21:00,4.7,762.6,77,,,,600-1000
1,29.04.2021 22:00,4.0,763.0,84,,,"Rain shower(s) or intermittent rain, moderate.",300-600
0,29.04.2021 23:00,4.0,763.2,88,,,Rain (not freezing).,600-1000


In [6]:
# check the unique in DD col
df['DD'].value_counts()

Wind blowing from the south              285
Wind blowing from the north              206
Wind blowing from the north-northwest    201
Wind blowing from the north-west         198
Wind blowing from the south-southeast    165
Wind blowing from the east               142
Wind blowing from the south-southwest    125
Wind blowing from the west-northwest     117
Wind blowing from the west               110
Wind blowing from the north-northeast     91
Wind blowing from the south-west          90
Wind blowing from the east-southeast      68
Wind blowing from the south-east          61
Wind blowing from the east-northeast      54
Wind blowing from the west-southwest      53
Wind blowing from the north-east          47
Calm, no wind                             23
Name: DD, dtype: int64

In [7]:
# replace unique values and NaN values in a DD col
my_map = {
    None: 0,
    "Wind blowing from the south": 1,
    "Wind blowing from the west": 2,
    "Wind blowing from the south-southwest": 3,
    "Wind blowing from the north-west": 4,
    "Wind blowing from the north": 5,
    "Wind blowing from the south-southeast": 6,
    "Wind blowing from the west-northwest": 7,
    "Wind blowing from the south-west": 8,
    "Wind blowing from the north-northwest": 9,
    "Wind blowing from the west-southwest": 10,
    "Wind blowing from the south-east": 11,
    "Wind blowing from the east-southeast": 12,
    "Wind blowing from the east": 13,
    "Wind blowing from the north-northeast": 14,
    "Wind blowing from the north-east": 15,
    "Wind blowing from the east-northeast": 16,
    "Calm, no wind": 17,
}

df.replace(my_map, inplace=True)

df.head(10)

Unnamed: 0,date,T,P,U,DD,Ff,WW,H
2255,25.01.2021 23:00,1.5,757.3,85,2,2.0,,600-1000
2254,26.01.2021 00:00,1.8,757.6,85,7,5.0,,300-600
2253,26.01.2021 01:00,2.0,757.7,87,7,5.0,,1000-1500
2252,26.01.2021 02:00,1.1,758.2,89,2,3.0,,"2500 or more, or no clouds."
2251,26.01.2021 03:00,0.9,758.2,90,4,2.0,,"2500 or more, or no clouds."
2250,26.01.2021 04:00,-1.3,758.5,93,5,1.0,,"2500 or more, or no clouds."
2249,26.01.2021 05:00,-1.8,758.7,94,5,2.0,,"2500 or more, or no clouds."
2248,26.01.2021 06:00,-2.2,758.9,96,15,1.0,,"2500 or more, or no clouds."
2247,26.01.2021 07:00,-2.5,759.0,96,0,0.0,,"2500 or more, or no clouds."
2246,26.01.2021 08:00,-1.9,758.9,95,8,1.0,,"2500 or more, or no clouds."


In [8]:
# check the unique in H col
print(df['H'].value_counts())

2500 or more, or no clouds.    809
1000-1500                      516
600-1000                       284
300-600                        237
200-300                        163
1500-2000                      143
100-200                         69
2000-2500                       21
50-100                          11
Less than  50                    3
Name: H, dtype: int64


In [9]:
# replace unique values and NaN values in a H col
my_map = {
    "2500 or more, or no clouds.": 10,
    "1000-1500": 9,
    "300-600": 8,
    "1500-2000": 7,
    "600-1000": 6,
    "200-300": 5,
    "100-200": 4,
    "2000-2500": 3,
    "50-100": 2,
    "Less than  50": 1,
    "0": 0,
}

df.replace(my_map, inplace=True)

df.head(10)

Unnamed: 0,date,T,P,U,DD,Ff,WW,H
2255,25.01.2021 23:00,1.5,757.3,85,2,2.0,,6
2254,26.01.2021 00:00,1.8,757.6,85,7,5.0,,8
2253,26.01.2021 01:00,2.0,757.7,87,7,5.0,,9
2252,26.01.2021 02:00,1.1,758.2,89,2,3.0,,10
2251,26.01.2021 03:00,0.9,758.2,90,4,2.0,,10
2250,26.01.2021 04:00,-1.3,758.5,93,5,1.0,,10
2249,26.01.2021 05:00,-1.8,758.7,94,5,2.0,,10
2248,26.01.2021 06:00,-2.2,758.9,96,15,1.0,,10
2247,26.01.2021 07:00,-2.5,759.0,96,0,0.0,,10
2246,26.01.2021 08:00,-1.9,758.9,95,8,1.0,,10


In [10]:
print(df.isna().sum())

date    0
T       0
P       0
U       0
DD      0
Ff      0
WW      0
H       0
dtype: int64


Then we have to deal with col WW which represent some weather states. This is a bit more complicated because there is some analysis required. Bellow we can see all of the unique values. By examining them we can observe that there are some key words that repeat such as (rain, snow, fog, etc)

In [11]:
print(df['WW'].value_counts().index.to_list())

[' ', 'Rain (not freezing). ', 'Rain, not freezing, slight. ', 'Rain shower(s) or intermittent rain, slight. ', 'Snow ', 'Rain, not freezing, moderate. ', 'Mist. ', 'Snow shower(s) or intermittent snow, slight. ', 'Haze or smoke, or dust in suspension in the air, visibility equal to, or greater than, 1 km. ', 'Precipitation, slight or moderate. ', 'Drizzle, not freezing, slight. ', 'Precipitation ', 'Drizzle (not freezing) or snow grains. ', 'Drizzle and rain, slight. ', 'Snow, slight. ', 'Fog ', 'Snow, moderate. ', 'Snow shower(s) or intermittent snow, moderate. ', 'Snow grains ', 'Rain shower(s) or intermittent rain, moderate. ', 'Rain (or drizzle) and snow, slight. ', 'Ice pellets, slight. ', 'Freezing precipitation, slight or moderate. ', 'Ice pellets, moderate. ', 'Rain, not freezing, heavy. ', 'Drizzle and rain, moderate or heavy. ', 'Rain shower(s) or intermittent rain, heavy. ', 'Fog or ice fog, has begun or become thicker during the past hour. ']


In [12]:
# this was a testing code block that just shows all of the different subgroups I chose
# there is a problem that in some cases it can have both for example 'Shower(s) of rain and snow mixed, slight. '
# thus we need somehow deal with this problem
# the solution I have is devide col WW to 9 separete cols that will have bool value 1 or 0 for each of the states
rain = []
snow = []
fog = []
hail = []
drizzle = []
precipitation = []
ice_pellets = []
other = []

for st in df['WW'].value_counts().index.to_list():
    if 'rain' in st.lower():
        rain.append(st)
    elif 'snow' in st.lower():
        snow.append(st)
    elif 'fog' in st.lower():
        fog.append(st)
    elif 'hail' in st.lower():
        hail.append(st)
    elif 'drizzle' in st.lower():
        drizzle.append(st)
    elif 'precipitation' in st.lower():
        drizzle.append(st)
    elif 'ice pellets' in st.lower():
        ice_pellets.append(st)
    elif st != ' ':
        other.append(st)
    else:
        print(st)

 


In [13]:
def helper(row):
    """Matches re pattern in other words looks for some of the key words"""
    my_re = "(rain|snow|fog|hail|drizzle|precipitation|ice_pellets)"
    if not bool(re.search(my_re, row['WW'].lower())) and row['WW'] != ' ':
        return 1
    return 0 

# split the cols and drop the WW col
df['rain'] = df.apply(lambda row: 1 if 'rain' in row['WW'].lower() else 0, axis=1)
df['snow'] = df.apply(lambda row: 1 if 'snow' in row['WW'].lower() else 0, axis=1)
df['fog'] = df.apply(lambda row: 1 if 'fog' in row['WW'].lower() else 0, axis=1)
df['hail'] = df.apply(lambda row: 1 if 'hail' in row['WW'].lower() else 0, axis=1)
df['drizzle'] = df.apply(lambda row: 1 if 'drizzle' in row['WW'].lower() else 0, axis=1)
df['precipitation'] = df.apply(lambda row: 1 if 'precipitation' in row['WW'].lower() else 0, axis=1)
df['ice_pellets'] = df.apply(lambda row: 1 if 'ice pellets' in row['WW'].lower() else 0, axis=1)
df['other'] = df.apply(helper, axis=1)

df.drop(['WW'], axis=1, inplace=True)

In [14]:
# rename colnames to soething reasonable
names = {
    "T": "temperature",
    "P": "pressure",
    "U": "humidity",
    "DD": "wind_dir",
    "Ff": "wind_speed",
    "H": "coluds_hight",
}
df.rename(columns=names, inplace=True)

In [15]:
# # colapse the df to days and order them by year -> mont -> day
# df['date'] = pd.to_datetime(df['date'])
# df.index = df['date']
# df = df.loc[ df['date'].dt.year < 2021 ]
# df.drop(['date'], axis=1, inplace=True)

In [16]:
# df = df.groupby(by=[df.index.year, df.index.month, df.index.day]).mean()

df['rain'] = np.ceil(df['rain'])
df['snow'] = np.ceil(df['snow']) 
df['fog'] = np.ceil(df['fog']) 
df['hail'] = np.ceil(df['hail']) 
df['drizzle'] = np.ceil(df['drizzle']) 
df['precipitation'] = np.ceil(df['precipitation']) 
df['ice_pellets'] = np.ceil(df['ice_pellets']) 
df['rain'] = np.ceil(df['other'])
df['coluds_hight'] = np.round(df['coluds_hight'])
df['wind_dir'] = np.round(df['wind_dir'])


df = df.astype({"rain":'int', "snow":'int', "fog":'int', "hail":'int', "drizzle":'int', "precipitation":'int', "ice_pellets":'int'})

In [17]:
# (2009 // 2) + (2009 // 2)

In [18]:
# df = df.iloc[1314:]
# df

In [19]:
# we will forward fill the missing data scince it is most apropriate in this case
df.ffill(axis=0, inplace=True)

print(df.isna().sum())

date             0
temperature      0
pressure         0
humidity         0
wind_dir         0
wind_speed       0
coluds_hight     0
rain             0
snow             0
fog              0
hail             0
drizzle          0
precipitation    0
ice_pellets      0
other            0
dtype: int64


In [21]:
# save the result to new csv for later use
save_to = DATA_PATH / 'data_cleaned_lol.csv'

df.to_csv(save_to, date_format='%Y%m%d', index=False)