In [57]:
import pandas as pd

# Szeged (HU) weather history

In [69]:
szeged = pd.read_csv("../datasets/weatherHistory_original.csv")
szeged.info()
szeged.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Formatted Date            96453 non-null  object 
 1   Summary                   96453 non-null  object 
 2   Precip Type               95936 non-null  object 
 3   Temperature (C)           96453 non-null  float64
 4   Apparent Temperature (C)  96453 non-null  float64
 5   Humidity                  96453 non-null  float64
 6   Wind Speed (km/h)         96453 non-null  float64
 7   Wind Bearing (degrees)    96453 non-null  float64
 8   Visibility (km)           96453 non-null  float64
 9   Loud Cover                96453 non-null  float64
 10  Pressure (millibars)      96453 non-null  float64
 11  Daily Summary             96453 non-null  object 
dtypes: float64(8), object(4)
memory usage: 8.8+ MB


Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


Drop useless columns

In [70]:
szeged.drop('Summary', axis=1, inplace=True)
szeged.drop('Wind Bearing (degrees)', axis=1, inplace=True)
szeged.drop('Visibility (km)', axis=1, inplace=True)
szeged.drop('Loud Cover', axis=1, inplace=True)
szeged.drop('Daily Summary', axis=1, inplace=True)

Convert _Formatted Date_ into
a column containing the day of the month (1st, 2nd, 3rd...),
one containing the month
and one containing the hour of the day (measurements are taken once per hour)

In [71]:
szeged['Formatted Date'] = pd.to_datetime(szeged['Formatted Date'])
months, days, hours = [], [], []
for i in range(len(szeged)):
    months.append(szeged['Formatted Date'][i].date().month)
    days.append(szeged['Formatted Date'][i].date().day)
    hours.append(szeged['Formatted Date'][i].time().hour)
szeged['Month'] = months
szeged['Day'] = days
szeged['Hour'] = hours
szeged.drop('Formatted Date', axis=1, inplace=True)
del months, days, hours

Transform the field _Precip Type_ into 2 binary columns: one for rain and one for snow

In [72]:
precip_types = pd.get_dummies(szeged['Precip Type'], prefix='pt', dummy_na=True)
szeged.drop('Precip Type', axis=1, inplace=True)
szeged['Rain'] = precip_types['pt_rain']
szeged['Snow'] = precip_types['pt_snow']
del precip_types
szeged.info()
szeged.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Temperature (C)           96453 non-null  float64
 1   Apparent Temperature (C)  96453 non-null  float64
 2   Humidity                  96453 non-null  float64
 3   Wind Speed (km/h)         96453 non-null  float64
 4   Pressure (millibars)      96453 non-null  float64
 5   Month                     96453 non-null  int64  
 6   Day                       96453 non-null  int64  
 7   Hour                      96453 non-null  int64  
 8   Rain                      96453 non-null  uint8  
 9   Snow                      96453 non-null  uint8  
dtypes: float64(5), int64(3), uint8(2)
memory usage: 6.1 MB


Unnamed: 0,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Pressure (millibars),Month,Day,Hour,Rain,Snow
count,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0
mean,11.932678,10.855029,0.734899,10.81064,1003.235956,6.523799,15.72708,11.500327,0.883581,0.111059
std,9.551546,10.696847,0.195473,6.913571,116.969906,3.448495,8.80216,6.922081,0.320729,0.314207
min,-21.822222,-27.716667,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,4.688889,2.311111,0.6,5.8282,1011.9,4.0,8.0,6.0,1.0,0.0
50%,12.0,12.0,0.78,9.9659,1016.45,7.0,16.0,12.0,1.0,0.0
75%,18.838889,18.838889,0.89,14.1358,1021.09,10.0,23.0,18.0,1.0,0.0
max,39.905556,39.344444,1.0,63.8526,1046.38,12.0,31.0,23.0,1.0,1.0


Normalize columns (except of target column _Temperature (C)_) and round every value to the 4th decimal

In [73]:
temps = szeged.pop('Temperature (C)')

In [74]:
from sklearn.preprocessing import normalize
data = normalize(szeged, axis=0, norm='max')

In [75]:
szeged_norm = pd.DataFrame(data)
szeged_norm['Temperature (C)'] = temps
szeged_norm = szeged_norm.round(decimals=4)
szeged_norm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,Temperature (C)
0,0.1878,0.89,0.2211,0.9701,0.3333,0.0323,0.0,1.0,0.0,9.4722
1,0.1837,0.86,0.2234,0.9706,0.3333,0.0323,0.0435,1.0,0.0,9.3556
2,0.2384,0.89,0.0615,0.9709,0.3333,0.0323,0.087,1.0,0.0,9.3778
3,0.1511,0.83,0.2209,0.9714,0.3333,0.0323,0.1304,1.0,0.0,8.2889
4,0.1774,0.83,0.173,0.9715,0.3333,0.0323,0.1739,1.0,0.0,8.7556


## Save modified dataset

In [76]:
szeged_norm.to_csv('../datasets/weatherHistory_preprocessed.csv')
szeged_norm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,Temperature (C)
0,0.1878,0.89,0.2211,0.9701,0.3333,0.0323,0.0,1.0,0.0,9.4722
1,0.1837,0.86,0.2234,0.9706,0.3333,0.0323,0.0435,1.0,0.0,9.3556
2,0.2384,0.89,0.0615,0.9709,0.3333,0.0323,0.087,1.0,0.0,9.3778
3,0.1511,0.83,0.2209,0.9714,0.3333,0.0323,0.1304,1.0,0.0,8.2889
4,0.1774,0.83,0.173,0.9715,0.3333,0.0323,0.1739,1.0,0.0,8.7556


### Create simple neural network to do a preliminary test

In [77]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(100, activation='sigmoid'),
    Dense(50, activation='sigmoid'),
    Dense(1, activation='linear'),
])

model.compile(optimizer='SGD', loss='mse')

In [67]:
temps = szeged_norm.pop('Temperature (C)')

In [68]:
model.fit(
    x=szeged_norm.to_numpy(copy=True),
    y=temps.to_numpy(copy=True),
    epochs=20
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x22f06806580>