# ASSIGNMENT 1: MEAN WIND SPEED AND THE WEIBULL DISTRIBUTION

## Imports

In [71]:
import pandas as pd
import numpy as np

## Read Data

In [72]:
# Column names for the Sprog file, which is read without a header row.
headers = [
    'timestamp',
    'wind_speed',
    'wind_direction_67.5m',
    'wind_direction_70m',
]

In [73]:
# Load the tab-separated Sprog measurements. The file has no header row,
# so the column names are supplied through `headers`.
sprog = pd.read_csv(
    "sprog.tsv",
    sep='\t',
    header=None,
    names=headers,
)

In [74]:
sprog

Unnamed: 0,timestamp,wind_speed,wind_direction_67.5m,wind_direction_70m
0,197709131435,9.76,317.0,999.0
1,197709131445,10.58,309.0,999.0
2,197709131455,10.19,329.0,999.0
3,197709131505,8.71,323.0,999.0
4,197709131515,8.85,309.0,999.0
...,...,...,...,...
1156272,199909080635,4.20,999.0,222.0
1156273,199909080645,4.77,999.0,226.0
1156274,199909080655,5.37,999.0,233.0
1156275,199909080705,5.07,999.0,230.0


In [75]:
# Column names for the Hovsore file, which is read without a header row.
headers_2 = [
    'timestamp',
    'wind_speed',
]

In [76]:
# Load the Hovsore measurements (default comma separator). The file has no
# header row, so the column names are supplied through `headers_2`.
hovsore_1 = pd.read_csv(
    'hovsore_1.txt',
    header=None,
    names=headers_2,
)

In [77]:
hovsore_1

Unnamed: 0,timestamp,wind_speed
0,201603012000,9.49
1,201603012000,10.16
2,201603012000,9.71
3,201603012000,10.79
4,201603012000,10.84
...,...,...
287995,201603012350,12.60
287996,201603012350,13.54
287997,201603012350,13.00
287998,201603012350,12.90


## Data Cleaning

Both datasets are cleaned to remove all erroneous measurements.

### Sprog

First, we convert all non-numerical values to NaN; any non-numerical value must correspond to an error flag.

In [78]:
# Coerce every column to a numeric dtype; entries that cannot be parsed
# as numbers become NaN.
sprog = sprog.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [79]:
sprog

Unnamed: 0,timestamp,wind_speed,wind_direction_67.5m,wind_direction_70m
0,197709131435,9.76,317.0,999.0
1,197709131445,10.58,309.0,999.0
2,197709131455,10.19,329.0,999.0
3,197709131505,8.71,323.0,999.0
4,197709131515,8.85,309.0,999.0
...,...,...,...,...
1156272,199909080635,4.20,999.0,222.0
1156273,199909080645,4.77,999.0,226.0
1156274,199909080655,5.37,999.0,233.0
1156275,199909080705,5.07,999.0,230.0


In [80]:
# Drop every row that still contains a NaN after the numeric coercion.
# `dropna()` returns a new frame instead of modifying in place, so the
# result has to be bound back to `sprog` for the filtering to take effect.
sprog = sprog.dropna()

In [81]:
sprog

Unnamed: 0,timestamp,wind_speed,wind_direction_67.5m,wind_direction_70m
0,197709131435,9.76,317.0,999.0
1,197709131445,10.58,309.0,999.0
2,197709131455,10.19,329.0,999.0
3,197709131505,8.71,323.0,999.0
4,197709131515,8.85,309.0,999.0
...,...,...,...,...
1156272,199909080635,4.20,999.0,222.0
1156273,199909080645,4.77,999.0,226.0
1156274,199909080655,5.37,999.0,233.0
1156275,199909080705,5.07,999.0,230.0


#### Wind Direction

Apart from the non-numerical values, the only other wind direction values that cannot be valid are those above 360 or below 0. These could come from an error flag (999 in this case), or from a wind direction measurement that is not normalised to 0–360 and therefore reports values shifted by multiples of 360. Let's find the values above 360 and below 0.

In [82]:
# Distinct direction readings below 0 degrees, per sensor height.
unique_below_0_wind_67_5 = sprog.loc[
    sprog['wind_direction_67.5m'].lt(0), 'wind_direction_67.5m'
].unique()
unique_below_0_wind_70 = sprog.loc[
    sprog['wind_direction_70m'].lt(0), 'wind_direction_70m'
].unique()

# Distinct direction readings above 360 degrees, per sensor height.
unique_over_360_wind_67_5 = sprog.loc[
    sprog['wind_direction_67.5m'].gt(360), 'wind_direction_67.5m'
].unique()
unique_over_360_wind_70 = sprog.loc[
    sprog['wind_direction_70m'].gt(360), 'wind_direction_70m'
].unique()

# Merge the four result arrays into one set of out-of-range values.
unique_values_over_360_and_below_0 = (
    set(unique_below_0_wind_67_5)
    | set(unique_below_0_wind_70)
    | set(unique_over_360_wind_67_5)
    | set(unique_over_360_wind_70)
)


In [83]:
unique_values_over_360_and_below_0

{999.0}

It turns out that the measurements are already normalised to the range 0 to 360 degrees; however, we find 999 values, which are classified as errors. These values are converted to NaN.

In [51]:
# Turn the 999 error flag into NaN, in place. The replacement is applied to
# the whole frame, i.e. to every column.
sprog.replace(to_replace=999, value=np.nan, inplace=True)

In [52]:
sprog

Unnamed: 0,timestamp,wind_speed,wind_direction_67.5m,wind_direction_70m
0,197709131435,9.76,317.0,
1,197709131445,10.58,309.0,
2,197709131455,10.19,329.0,
3,197709131505,8.71,323.0,
4,197709131515,8.85,309.0,
...,...,...,...,...
1156272,199909080635,4.20,,222.0
1156273,199909080645,4.77,,226.0
1156274,199909080655,5.37,,233.0
1156275,199909080705,5.07,,230.0


As we want to work with a single wind direction value from now on, we can either use one of the two wind direction measurements or take a mean of both. Since there is only a 2.5 m difference in height between them, let's look at the timestamps where both heights have a valid value.

In [54]:
# Keep only the rows in which both direction columns hold a value
# (a row is dropped as soon as either of the two is NaN).
filtered_df = sprog.dropna(
    subset=['wind_direction_67.5m', 'wind_direction_70m'],
    how='any',
)

In [55]:
filtered_df

Unnamed: 0,timestamp,wind_speed,wind_direction_67.5m,wind_direction_70m


It is seen that there are no timestamps at which the two measurements (67.5 and 70 metres) are both valid, so a single wind direction column is created from whichever measurement is available.

In [57]:
# Build a single direction column: take the 67.5 m reading and fall back to
# the 70 m reading wherever the 67.5 m one is NaN.
sprog['wind_direction'] = sprog['wind_direction_67.5m'].fillna(
    sprog['wind_direction_70m']
)

In [59]:
sprog

Unnamed: 0,timestamp,wind_speed,wind_direction_67.5m,wind_direction_70m,wind_direction
0,197709131435,9.76,317.0,,317.0
1,197709131445,10.58,309.0,,309.0
2,197709131455,10.19,329.0,,329.0
3,197709131505,8.71,323.0,,323.0
4,197709131515,8.85,309.0,,309.0
...,...,...,...,...,...
1156272,199909080635,4.20,,222.0,222.0
1156273,199909080645,4.77,,226.0,226.0
1156274,199909080655,5.37,,233.0,233.0
1156275,199909080705,5.07,,230.0,230.0


In [68]:
# Show the rows where only the 70 m direction reading is available.
sprog.loc[
    sprog['wind_direction_67.5m'].isna() & sprog['wind_direction_70m'].notna()
]

Unnamed: 0,timestamp,wind_speed,wind_direction_67.5m,wind_direction_70m,wind_direction
595497,198901090005,9.74,,264.0,264.0
595498,198901090015,9.87,,261.0,261.0
595499,198901090025,9.13,,258.0,258.0
595500,198901090035,8.90,,260.0,260.0
595501,198901090045,9.06,,262.0,262.0
...,...,...,...,...,...
1156272,199909080635,4.20,,222.0,222.0
1156273,199909080645,4.77,,226.0,226.0
1156274,199909080655,5.37,,233.0,233.0
1156275,199909080705,5.07,,230.0,230.0


In [65]:
# Show every row whose 67.5 m direction reading is missing.
sprog.loc[sprog['wind_direction_67.5m'].isna()]

Unnamed: 0,timestamp,wind_speed,wind_direction_67.5m,wind_direction_70m,wind_direction
490,197709170015,99.99,,,
491,197709170025,99.99,,,
492,197709170035,99.99,,,
493,197709170045,99.99,,,
494,197709170055,99.99,,,
...,...,...,...,...,...
1156272,199909080635,4.20,,222.0,222.0
1156273,199909080645,4.77,,226.0,226.0
1156274,199909080655,5.37,,233.0,233.0
1156275,199909080705,5.07,,230.0,230.0


#### Wind Speed

Now the wind speed measurements are checked. We look for incorrect wind speed measurements by searching for wind_speed values above 40 m/s and for negative values.

In [93]:
# Distinct Sprog wind speeds outside the accepted range: negative, or above 40.
unique_below_0_wind = sprog.loc[sprog['wind_speed'].lt(0), 'wind_speed'].unique()
unique_over_40_wind = sprog.loc[sprog['wind_speed'].gt(40), 'wind_speed'].unique()

# Merge both result arrays into one set of suspicious values.
unique_values_over_40_and_below_0 = set(unique_below_0_wind) | set(unique_over_40_wind)

In [95]:
unique_values_over_40_and_below_0

{99.99}

Again, the only invalid data are the 99.99 values, which we convert to NaN.

In [96]:
# Turn the 99.99 error flag into NaN, in place. The replacement is applied to
# the whole frame, i.e. to every column.
sprog.replace(to_replace=99.99, value=np.nan, inplace=True)

### Hovsore

This second dataset needs to be checked as well. We apply the same filter as before, although in this case only wind speed data is available.

In [102]:
# Coerce every column to a numeric dtype; entries that cannot be parsed
# as numbers become NaN.
hovsore_1 = hovsore_1.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [103]:
# Distinct Hovsore wind speeds outside the accepted range: negative, or above 40.
# NOTE: these names are the same ones used for the Sprog check, so they are overwritten here.
unique_below_0_wind = hovsore_1.loc[hovsore_1['wind_speed'].lt(0), 'wind_speed'].unique()
unique_over_40_wind = hovsore_1.loc[hovsore_1['wind_speed'].gt(40), 'wind_speed'].unique()

# Merge both result arrays into one set of suspicious values.
unique_values_over_40_and_below_0 = set(unique_below_0_wind) | set(unique_over_40_wind)

In [104]:
unique_values_over_40_and_below_0

set()

No invalid data is found in the hovsore_1 dataset.

## Inspect the time-series

### 1)

Find the mean and standard deviation of wind speed from each data set.

#### Sprog

Mean Wind Speed

In [105]:
# Mean Sprog wind speed. NaN entries are excluded from the average
# (skipna=True is the pandas default, spelled out here).
sprog['wind_speed'].mean(skipna=True)

8.235566499680502

Wind Speed Standard deviation

In [107]:
# Sample standard deviation of the Sprog wind speed
# (ddof=1 is the pandas default, spelled out here).
sprog['wind_speed'].std(ddof=1)

3.907913492060975

#### Hovsore 1

Mean Wind Speed

In [106]:
# Mean Hovsore wind speed. NaN entries are excluded from the average
# (skipna=True is the pandas default, spelled out here).
hovsore_1['wind_speed'].mean(skipna=True)

13.295274895833332

Wind Speed Standard deviation

In [108]:
# Sample standard deviation of the Hovsore wind speed
# (ddof=1 is the pandas default, spelled out here).
hovsore_1['wind_speed'].std(ddof=1)

1.5435718228827797

### 2)

Appropriately calculate the mean and standard deviation of wind direction for the Sprogø
data, over the entire period.

### 3)