In [2]:
import numpy as np
import pandas as pd
import math
import statistics
from scipy import stats

import warnings
warnings.filterwarnings("ignore")

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import calplot
import matplotlib.cm as cm

# **Weather Forecasting**

**Brief description of the dataset**\
This dataset provides data on weather from 1st January 2009 to 31st December 2016. This dataset contains 14 different features such as air temperature, atmospheric pressure, and humidity, collected every ten minutes.

**Project steps**\
The main aim of this project is to perform weather forecasting for the period January-December 2016.\
Following what we have seen during the lessons, a comprehensive pipeline should be devised, including:
1. Loading, converting and cleaning of the data (note: aggregate the data in order to have weekly frequency).
Divide your dataset into training and testing;
2. Exploring the dataset with descriptive statistics and frequency analysis. Use appropriate graphs to visualise
the data at hand;
3. Describing the time series patterns (visually and numerically);
4. Focusing on temperature (degC, dependent variable), build the most appropriate models to forecast the
data for the specified period. Evaluate the model performance using different metrics. Visualise the results with appropriate graphs.

For the different steps, comment on the main results and any relevant observation/finding you have noticed.

## **Point 1**
Loading, converting and cleaning of the data (note: aggregate the data in order to have weekly frequency).
Divide your dataset into training and testing.

### Load the data

In [3]:
weather_df = pd.read_csv('Weather_ts.csv', sep=',', parse_dates=['Date Time'],  dayfirst=True, index_col='Date Time')
weather_df

Unnamed: 0_level_0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2009-01-01 00:10:00,996.52,-8.02,265.40,-8.90,93.30,3.33,3.11,0.22,1.94,3.12,1307.75,1.03,1.75,152.3
2009-01-01 00:20:00,996.57,-8.41,265.01,-9.28,93.40,3.23,3.02,0.21,1.89,3.03,1309.80,0.72,1.50,136.1
2009-01-01 00:30:00,996.53,-8.51,264.91,-9.31,93.90,3.21,3.01,0.20,1.88,3.02,1310.24,0.19,0.63,171.6
2009-01-01 00:40:00,996.51,-8.31,265.12,-9.07,94.20,3.26,3.07,0.19,1.92,3.08,1309.19,0.34,0.50,198.0
2009-01-01 00:50:00,996.51,-8.27,265.15,-9.04,94.10,3.27,3.08,0.19,1.92,3.09,1309.00,0.32,0.63,214.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-31 23:10:00,1000.11,-3.93,269.23,-8.09,72.60,4.56,3.31,1.25,2.06,3.31,1292.41,0.56,1.00,202.6
2016-12-31 23:20:00,1000.07,-4.05,269.10,-8.13,73.10,4.52,3.30,1.22,2.06,3.30,1292.98,0.67,1.52,240.0
2016-12-31 23:30:00,999.93,-3.35,269.81,-8.06,69.71,4.77,3.32,1.44,2.07,3.32,1289.44,1.14,1.92,234.3
2016-12-31 23:40:00,999.82,-3.16,270.01,-8.21,67.91,4.84,3.28,1.55,2.05,3.28,1288.39,1.08,2.00,215.2


The columns are:
1.	Date Time		- Date-time reference 
2.	p (mbar)		- Pressure used to quantify internal pressure (in millibars) 
3.	T (degC)		- Temperature in Celsius
4.	Tpot (K)	    - Temperature in Kelvin
5.	Tdew (degC)		- Temperature in Celsius relative to humidity
6.	rh (%)		    - Relative Humidity 
7.	VPmax (mbar)	- Saturation vapor pressure
8.	VPact (mbar)	- Vapor pressure
9.	VPdef (mbar)	- Vapor pressure deficit 
10.	sh (g/kg)		- Specific humidity
11.	H2OC (mmol/mol)	- Water vapor concentration
12.	rho (g/m**3)	- Airtight 
13.	wv (m/s)		- Wind speed
14.	max. wv (m/s)	- Maximum wind speed
15.	wd (deg)		- Wind direction in degrees

### Study and clean the data

In [4]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 420550 entries, 2009-01-01 00:10:00 to 2016-12-31 23:50:00
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   p (mbar)         420550 non-null  float64
 1   T (degC)         420550 non-null  float64
 2   Tpot (K)         420550 non-null  float64
 3   Tdew (degC)      420550 non-null  float64
 4   rh (%)           420550 non-null  float64
 5   VPmax (mbar)     420550 non-null  float64
 6   VPact (mbar)     420550 non-null  float64
 7   VPdef (mbar)     420550 non-null  float64
 8   sh (g/kg)        420550 non-null  float64
 9   H2OC (mmol/mol)  420550 non-null  float64
 10  rho (g/m**3)     420550 non-null  float64
 11  wv (m/s)         420550 non-null  float64
 12  max. wv (m/s)    420550 non-null  float64
 13  wd (deg)         420550 non-null  float64
dtypes: float64(14)
memory usage: 48.1 MB


There are no null values in any of the columns, so I don't have to deal with them. All the columns are numeric, so I can use the function .describe() to get some further information about them.

In [5]:
weather_df.describe()

Unnamed: 0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
count,420550.0,420550.0,420550.0,420550.0,420550.0,420550.0,420550.0,420550.0,420550.0,420550.0,420550.0,420550.0,420550.0,420550.0
mean,989.212751,9.450181,283.492779,4.955886,76.00826,13.576273,9.533771,4.042419,6.022418,9.640238,1216.062557,1.702225,3.056558,174.743714
std,8.358475,8.423346,8.504449,6.730651,16.476195,7.739016,4.184158,4.896855,2.656135,4.235388,39.975064,65.446792,69.017014,86.681794
min,913.6,-23.01,250.6,-25.01,12.95,0.95,0.79,0.0,0.5,0.8,1059.45,-9999.0,-9999.0,0.0
25%,984.2,3.36,277.43,0.24,65.21,7.78,6.21,0.87,3.92,6.29,1187.49,0.99,1.76,124.9
50%,989.58,9.42,283.47,5.22,79.3,11.82,8.86,2.19,5.59,8.96,1213.79,1.76,2.96,198.1
75%,994.72,15.47,289.53,10.07,89.4,17.6,12.35,5.3,7.8,12.49,1242.77,2.86,4.74,234.1
max,1015.35,37.28,311.34,23.11,100.0,63.77,28.32,46.01,18.13,28.82,1393.54,28.49,23.5,360.0


In [6]:
weather_df.shape

(420550, 14)

### Weekly aggregation of the data

The dataframe has daily frequency, but I aggregate it to have a weekly frequency.

In [7]:
weather_df_weekly = weather_df.resample('W').mean()
weather_df_weekly.head()

Unnamed: 0_level_0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2009-01-04,996.446696,-4.29207,269.146087,-6.185148,87.255409,4.511496,3.918348,0.593009,2.451287,3.934887,1289.293165,1.293252,2.310504,177.589652
2009-01-11,999.146161,-11.057847,262.168393,-13.171438,84.714841,2.831081,2.369058,0.462044,1.477421,2.372718,1327.335734,1.312153,2.256984,158.967391
2009-01-18,991.30876,-1.717867,272.126657,-4.664692,81.310595,5.561796,4.483313,1.07872,2.820536,4.526012,1270.352133,1.879524,2.970417,178.882599
2009-01-25,970.486319,1.151438,276.68122,-1.678264,82.035218,6.752004,5.444335,1.307619,3.499306,5.61372,1230.040417,2.553998,3.973105,188.601022
2009-02-01,990.295337,-2.464573,271.454127,-3.848323,90.406944,5.104276,4.617619,0.486429,2.905952,4.66375,1272.179296,1.981885,3.170645,46.615129


I used the .mean() function to aggregate due to the fact that the dataframe is made up of continuous numerical values, so it makes sense.

In [8]:
weather_df_weekly.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 418 entries, 2009-01-04 to 2017-01-01
Freq: W-SUN
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   p (mbar)         418 non-null    float64
 1   T (degC)         418 non-null    float64
 2   Tpot (K)         418 non-null    float64
 3   Tdew (degC)      418 non-null    float64
 4   rh (%)           418 non-null    float64
 5   VPmax (mbar)     418 non-null    float64
 6   VPact (mbar)     418 non-null    float64
 7   VPdef (mbar)     418 non-null    float64
 8   sh (g/kg)        418 non-null    float64
 9   H2OC (mmol/mol)  418 non-null    float64
 10  rho (g/m**3)     418 non-null    float64
 11  wv (m/s)         418 non-null    float64
 12  max. wv (m/s)    418 non-null    float64
 13  wd (deg)         418 non-null    float64
dtypes: float64(14)
memory usage: 49.0 KB


In [9]:
weather_df_weekly.describe()

Unnamed: 0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,989.238368,9.42849,283.468976,4.94322,76.047001,13.557678,9.526417,4.031179,6.017641,9.632617,1216.192035,1.701207,3.054848,174.800532
std,6.217979,7.431565,7.454327,6.135823,8.98143,6.400325,3.770776,3.033723,2.38949,3.811023,34.602324,8.760265,9.746778,35.462175
min,967.816796,-12.64,259.938155,-15.437232,54.132083,2.419841,1.911012,0.202073,1.180605,1.89625,1151.149742,-176.614435,-194.960387,46.615129
25%,985.893214,3.707639,277.995749,0.534705,69.596037,8.117391,6.436109,1.544459,4.040732,6.480258,1188.048755,1.73473,2.908113,153.013038
50%,989.290461,9.483542,283.648353,5.367331,76.297877,12.371086,9.073214,3.243462,5.703398,9.136746,1212.522202,2.021007,3.39812,182.85123
75%,992.865556,15.301004,289.378093,10.106562,83.067956,17.964365,12.528566,5.945732,7.91184,12.657299,1239.563063,2.411503,4.009271,201.698418
max,1007.889549,24.770526,298.443155,16.751081,95.171131,33.085079,19.144921,16.535179,12.155169,19.398016,1347.082302,4.619732,7.043562,240.663601


In [10]:
weather_df_weekly.shape

(418, 14)

### Train and test split

The main aim of this project is to perform weather forecasting for the period January-December 2016, so I split the datafram accordingly

In [11]:
split = '2016-01-01'
train = weather_df_weekly[weather_df_weekly.index < split]
test = weather_df_weekly[weather_df_weekly.index >= split]

In [12]:
test.tail()

Unnamed: 0_level_0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2016-12-04,999.956121,-0.467679,272.701944,-3.387222,81.552579,6.156409,4.900109,1.256071,3.058046,4.906677,1275.45372,1.437123,2.741706,190.730853
2016-12-11,1000.134891,1.667212,274.823413,-1.334246,81.225665,7.381587,5.828373,1.553492,3.637927,5.834554,1265.589107,1.437609,2.601359,182.925089
2016-12-18,1002.300288,2.645923,275.628373,1.012192,89.1638,7.50745,6.704841,0.80249,4.174643,6.694365,1262.940823,1.418085,2.399752,185.361032
2016-12-25,1000.907817,1.969593,275.060228,-0.343909,85.042927,7.248353,6.080327,1.168135,3.78871,6.076379,1264.651042,1.912788,3.262143,184.388889
2017-01-01,1007.889549,2.68956,275.233866,-0.238472,81.632986,7.702361,6.20375,1.498484,3.838715,6.15625,1270.206458,1.97228,3.63213,222.820949


The test dataframe has one row with value for the year 2017, I keep it anyway cause it gives me information about the last week of 2016.