# Exploring the data for Buoy JAKI2 for the year 2005 - 2018.

#### Station JAKI2 - 63rd St., Chicago, IL

Owned and maintained by Chicago Park District  
41.781 N 87.573 W (41°46'50" N 87°34'23" W)

### Table of contents

Exploring the data:  
  
[2015](#2015),
[2016](#2016), 
[2017](#2017), 
[2018](#2018), 
[2019](#2019)  
[2015-2018](#2015-2018)

### Data

 - (ATMP) air temperature 
 - (WDIR) Wind Direction 
 - (WSPD) Wind Speed 
 - (GST) "Peak 5 or 8 second gust speed (m/s) measured during the eight-minute or two-minute period. The 5 or 8 second period can be determined by payload, See the Sensor Reporting, Sampling, and Accuracy section." 
 
[Data Dictionary](https://www.ndbc.noaa.gov/measdes.shtml)
 
#### Missing data
 - Lot's from 2015
 - [April 5-9, 2016](#april5-9,2016)
 - [January 27-31, 2018](#Jan27-31,2018)
 - [February 1-23, 25-28, 2018](#feb1-23,25-28,2-18)
 - [March 1-29, 2018](#mar1-29,2018)

In [1]:
import pandas as pd
import time

In [2]:
features = ['#YY', 'MM', 'DD', 'hh', 'mm', 'WDIR', 'WSPD', 'GST', 'ATMP', 'DEWP']

<a id='2015'></a>
### Exploring the data for 2015.

#### Missing Data
 - Only complete for September-December.

In [3]:
data_15 = pd.read_csv('../data/buoy_JAKI2/jaki2h2015.csv')
print(data_15.shape)
data_15.head()

(4157, 18)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSP,D GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,#yr,mo,dy,hr,mn,degT,m/s,m/s,m,sec,sec,deg,T hPa,degC,degC,degC,mi,ft
1,2015,3,23,20,0,79,8.5,99,99,99,99,999,9999,-0.6,999,999,99,99
2,2015,3,23,21,0,78,7.9,99,99,99,99,999,9999,-0.4,999,999,99,99
3,2015,3,23,22,0,82,7.3,99,99,99,99,999,9999,-0.2,999,999,99,99
4,2015,3,23,23,0,83,6.3,99,99,99,99,999,9999,0,999,999,99,99


In [4]:
data_15.rename(mapper={
    'YYYY': '#YY',
    'WD': 'WDIR',
    'WSP': 'WSPD',
    'D GST': 'GST'
}, axis=1, inplace=True)
data_15.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,#yr,mo,dy,hr,mn,degT,m/s,m/s,m,sec,sec,deg,T hPa,degC,degC,degC,mi,ft
1,2015,3,23,20,0,79,8.5,99,99,99,99,999,9999,-0.6,999,999,99,99
2,2015,3,23,21,0,78,7.9,99,99,99,99,999,9999,-0.4,999,999,99,99
3,2015,3,23,22,0,82,7.3,99,99,99,99,999,9999,-0.2,999,999,99,99
4,2015,3,23,23,0,83,6.3,99,99,99,99,999,9999,0,999,999,99,99


In [5]:
data_15.isnull().sum()

#YY     0
MM      0
DD      0
hh      0
mm      0
WDIR    0
WSPD    0
GST     0
WVHT    0
DPD     0
APD     0
MWD     0
PRES    0
ATMP    0
WTMP    0
DEWP    0
VIS     0
TIDE    0
dtype: int64

In [6]:
data_15.dtypes

#YY     object
MM      object
DD      object
hh      object
mm      object
WDIR    object
WSPD    object
GST     object
WVHT    object
DPD     object
APD     object
MWD     object
PRES    object
ATMP    object
WTMP    object
DEWP    object
VIS     object
TIDE    object
dtype: object

In [7]:
data_15 = data_15[features]
data_15.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,#yr,mo,dy,hr,mn,degT,m/s,m/s,degC,degC
1,2015,3,23,20,0,79,8.5,99,-0.6,999
2,2015,3,23,21,0,78,7.9,99,-0.4,999
3,2015,3,23,22,0,82,7.3,99,-0.2,999
4,2015,3,23,23,0,83,6.3,99,0,999


In [8]:
data_15 = data_15.drop(0, axis=0)
data_15.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
1,2015,3,23,20,0,79,8.5,99,-0.6,999
2,2015,3,23,21,0,78,7.9,99,-0.4,999
3,2015,3,23,22,0,82,7.3,99,-0.2,999
4,2015,3,23,23,0,83,6.3,99,0.0,999
5,2015,3,24,0,0,78,7.7,99,0.1,999


In [9]:
data_15.reset_index(inplace=True)
data_15.head()

Unnamed: 0,index,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,1,2015,3,23,20,0,79,8.5,99,-0.6,999
1,2,2015,3,23,21,0,78,7.9,99,-0.4,999
2,3,2015,3,23,22,0,82,7.3,99,-0.2,999
3,4,2015,3,23,23,0,83,6.3,99,0.0,999
4,5,2015,3,24,0,0,78,7.7,99,0.1,999


In [10]:
data_15 = data_15.drop(columns='index')
data_15.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2015,3,23,20,0,79,8.5,99,-0.6,999
1,2015,3,23,21,0,78,7.9,99,-0.4,999
2,2015,3,23,22,0,82,7.3,99,-0.2,999
3,2015,3,23,23,0,83,6.3,99,0.0,999
4,2015,3,24,0,0,78,7.7,99,0.1,999


In [11]:
data_15['WSPD'] = data_15['WSPD'].astype('float')
data_15.dtypes

#YY      object
MM       object
DD       object
hh       object
mm       object
WDIR     object
WSPD    float64
GST      object
ATMP     object
DEWP     object
dtype: object

In [12]:
data_15['ATMP'] = data_15['ATMP'].astype('float')
data_15.dtypes

#YY      object
MM       object
DD       object
hh       object
mm       object
WDIR     object
WSPD    float64
GST      object
ATMP    float64
DEWP     object
dtype: object

In [13]:
int_features = ['#YY', 'MM', 'DD', 'hh', 'mm', 'GST', 'DEWP']

In [14]:
data_15[int_features] = data_15[int_features].astype('int')
data_15.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR     object
WSPD    float64
GST       int64
ATMP    float64
DEWP      int64
dtype: object

In [15]:
data_15['WDIR'].astype('int')

ValueError: invalid literal for int() with base 10: '27 1'

### Clean the 'WDIR' feature

In [16]:
start_time = time.time()

wdir = data_15['WDIR']
data0 = []
j = 0 
for i in range(len(wdir)):
    if i % 1000 == 0:
        j += i
        print(f'{100 - 100 * i/ len(wdir)}% left until complete')
    
    if len(wdir.iloc[i].split()) == 1:
        pass
    else:
        data0 = wdir.iloc[i]
        data1 = data0.split()
        wdir.iloc[i] = data1[0]
        
end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)  

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')

100.0% left until complete


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


75.93840230991339% left until complete
51.87680461982676% left until complete
27.81520692974013% left until complete
3.753609239653514% left until complete
time: 3.815 seconds
time: 0 minutes, 3.815 seconds


In [17]:
data_15['WDIR'] = wdir
data_15['WDIR'].head()

0    79
1    78
2    82
3    83
4    78
Name: WDIR, dtype: object

In [18]:
for i in range(len(data_15['WDIR'])):
    if len(data_15['WDIR'].loc[i].split()) == 1:
        pass
    else:
        print(i, data_15['WDIR'].loc[i].split())
print('Everything checks out.')

Everything checks out.


In [19]:
data_15['WDIR'] = data_15['WDIR'].astype('int')

In [20]:
data_15.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR      int64
WSPD    float64
GST       int64
ATMP    float64
DEWP      int64
dtype: object

In [21]:
# saving the data
data_15.to_csv('../data/buoy_JAKI2/jaki2_buoy_data_2015.csv', index=False)

In [22]:
data_2015 = pd.read_csv('../data/buoy_JAKI2/jaki2_buoy_data_2015.csv')
print(data_2015.shape)
print(data_2015.dtypes)
data_2015.head()

(4156, 10)
#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR      int64
WSPD    float64
GST       int64
ATMP    float64
DEWP      int64
dtype: object


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2015,3,23,20,0,79,8.5,99,-0.6,999
1,2015,3,23,21,0,78,7.9,99,-0.4,999
2,2015,3,23,22,0,82,7.3,99,-0.2,999
3,2015,3,23,23,0,83,6.3,99,0.0,999
4,2015,3,24,0,0,78,7.7,99,0.1,999


In [23]:
data_2015.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015,3,23,80.5,7.5,99.0,-0.3,999.0
2015,3,24,107.25,5.258333,99.0,0.2625,999.0
2015,3,25,218.782609,5.891304,99.0,4.995652,999.0
2015,3,26,232.173913,2.76087,99.0,3.734783,999.0
2015,3,27,51.318182,7.209091,99.0,-0.995455,999.0


In [24]:
# checking to see which months are missing days
for i in range(1, 13):
    print(i, len(data_2015[data_2015['MM'] == i]['DD'].value_counts()))

1 0
2 0
3 9
4 25
5 0
6 0
7 0
8 26
9 30
10 31
11 30
12 31


<font color = blue>There is a fair amount of missing data in this year.  Let's see what else we can get from the next year.  But, this might not be the buoy to use.</font>

<a id='2016'></a>
### Exploring the data for 2016.

#### Missing data
 - [April 5-9, 2016](#april5-9,2016)

In [25]:
data_16 = pd.read_csv('../data/buoy_JAKI2/jaki2h2016.csv')
print(data_16.shape)
data_16.head()

(8436, 18)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSP,D GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,#yr,mo,dy,hr,mn,degT,m/s,m/s,m,sec,sec,deg,T hPa,degC,degC,degC,mi,ft
1,2016,1,1,0,0,256,6.9,99,99,99,99,999,9999,-2.7,999,999,99,99
2,2016,1,1,1,0,259,5.5,99,99,99,99,999,9999,-3.2,999,999,99,99
3,2016,1,1,2,0,269,4.9,99,99,99,99,999,9999,-3.4,999,999,99,99
4,2016,1,1,3,0,257,5.1,99,99,99,99,999,9999,-3.3,999,999,99,99


In [26]:
data_16.rename(mapper={
    'YYYY': '#YY',
    'WD': 'WDIR',
    'WSP': 'WSPD',
    'D GST': 'GST'
}, axis=1, inplace=True)
data_16.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,#yr,mo,dy,hr,mn,degT,m/s,m/s,m,sec,sec,deg,T hPa,degC,degC,degC,mi,ft
1,2016,1,1,0,0,256,6.9,99,99,99,99,999,9999,-2.7,999,999,99,99
2,2016,1,1,1,0,259,5.5,99,99,99,99,999,9999,-3.2,999,999,99,99
3,2016,1,1,2,0,269,4.9,99,99,99,99,999,9999,-3.4,999,999,99,99
4,2016,1,1,3,0,257,5.1,99,99,99,99,999,9999,-3.3,999,999,99,99


In [27]:
data_16.dtypes

#YY     object
MM      object
DD      object
hh      object
mm      object
WDIR    object
WSPD    object
GST     object
WVHT    object
DPD     object
APD     object
MWD     object
PRES    object
ATMP    object
WTMP    object
DEWP    object
VIS     object
TIDE    object
dtype: object

In [28]:
data_16 = data_16[features]
data_16.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,#yr,mo,dy,hr,mn,degT,m/s,m/s,degC,degC
1,2016,1,1,0,0,256,6.9,99,-2.7,999
2,2016,1,1,1,0,259,5.5,99,-3.2,999
3,2016,1,1,2,0,269,4.9,99,-3.4,999
4,2016,1,1,3,0,257,5.1,99,-3.3,999


In [29]:
data_16 = data_16.drop(0, axis=0)
data_16.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
1,2016,1,1,0,0,256,6.9,99,-2.7,999
2,2016,1,1,1,0,259,5.5,99,-3.2,999
3,2016,1,1,2,0,269,4.9,99,-3.4,999
4,2016,1,1,3,0,257,5.1,99,-3.3,999
5,2016,1,1,4,0,254,3.8,99,-3.0,999


In [30]:
data_16.reset_index(inplace=True)
data_16.head()

Unnamed: 0,index,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,1,2016,1,1,0,0,256,6.9,99,-2.7,999
1,2,2016,1,1,1,0,259,5.5,99,-3.2,999
2,3,2016,1,1,2,0,269,4.9,99,-3.4,999
3,4,2016,1,1,3,0,257,5.1,99,-3.3,999
4,5,2016,1,1,4,0,254,3.8,99,-3.0,999


In [31]:
data_16 = data_16.drop(columns='index')
data_16.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2016,1,1,0,0,256,6.9,99,-2.7,999
1,2016,1,1,1,0,259,5.5,99,-3.2,999
2,2016,1,1,2,0,269,4.9,99,-3.4,999
3,2016,1,1,3,0,257,5.1,99,-3.3,999
4,2016,1,1,4,0,254,3.8,99,-3.0,999


In [32]:
data_16[['WSPD', 'ATMP']] = data_16[['WSPD', 'ATMP']].astype('float')
data_16.dtypes

#YY      object
MM       object
DD       object
hh       object
mm       object
WDIR     object
WSPD    float64
GST      object
ATMP    float64
DEWP     object
dtype: object

In [33]:
int_features = ['#YY', 'MM', 'DD', 'hh', 'mm', 'GST', 'DEWP']

In [34]:
data_16[int_features] = data_16[int_features].astype('int')
data_16.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR     object
WSPD    float64
GST       int64
ATMP    float64
DEWP      int64
dtype: object

In [35]:
data_16['WDIR'].astype('int')

ValueError: invalid literal for int() with base 10: '41 1'

### Clean the 'WDIR' feature

In [36]:
start_time = time.time()

wdir = data_16['WDIR']
data0 = []
j = 0 
for i in range(len(wdir)):
    if i % 1000 == 0:
        j += i
        print(f'{100 - 100 * i/ len(wdir)}% left until complete')
    
    if len(wdir.iloc[i].split()) == 1:
        pass
    else:
        data0 = wdir.iloc[i]
        data1 = data0.split()
        wdir.iloc[i] = data1[0]
        
end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)  

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')

100.0% left until complete
88.14463544754001% left until complete
76.28927089508002% left until complete
64.43390634262003% left until complete
52.578541790160045% left until complete
40.72317723770006% left until complete
28.867812685240068% left until complete
17.01244813278008% left until complete
5.157083580320091% left until complete
time: 2.611 seconds
time: 0 minutes, 2.611 seconds


In [37]:
data_16['WDIR'] = wdir
data_16['WDIR'].head()

0    256
1    259
2    269
3    257
4    254
Name: WDIR, dtype: object

In [38]:
for i in range(len(data_16['WDIR'])):
    if len(data_16['WDIR'].loc[i].split()) == 1:
        pass
    else:
        print(i, data_16['WDIR'].loc[i].split())
print('Everything checks out.')

Everything checks out.


In [39]:
data_16['WDIR'] = data_16['WDIR'].astype('int')
data_16.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR      int64
WSPD    float64
GST       int64
ATMP    float64
DEWP      int64
dtype: object

In [40]:
# saving the data
data_16.to_csv('../data/buoy_JAKI2/jaki2_buoy_data_2016.csv', index=False)

In [41]:
data_2016 = pd.read_csv('../data/buoy_JAKI2/jaki2_buoy_data_2016.csv')
print(data_2016.shape)
print(data_2016.dtypes)
data_2016.head()

(8435, 10)
#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR      int64
WSPD    float64
GST       int64
ATMP    float64
DEWP      int64
dtype: object


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2016,1,1,0,0,256,6.9,99,-2.7,999
1,2016,1,1,1,0,259,5.5,99,-3.2,999
2,2016,1,1,2,0,269,4.9,99,-3.4,999
3,2016,1,1,3,0,257,5.1,99,-3.3,999
4,2016,1,1,4,0,254,3.8,99,-3.0,999


In [42]:
data_2016.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016,1,1,263.708333,6.058333,99.0,-3.358333,999.0
2016,1,2,261.73913,4.456522,99.0,-1.343478,999.0
2016,1,3,307.083333,4.275,99.0,-1.108333,999.0
2016,1,4,226.791667,4.716667,99.0,-1.329167,999.0
2016,1,5,179.625,3.775,99.0,-2.1375,999.0


In [43]:
# checking to see which months are missing days
for i in range(1, 13):
    print(i, len(data_2016[data_2016['MM'] == i]['DD'].value_counts()))

1 31
2 29
3 31
4 25
5 31
6 30
7 31
8 31
9 30
10 31
11 30
12 31


<a id='april5-9,2016'></a>

In [44]:
# missing data, April 5-9, 2016
data_2016[data_2016['MM'] == 4].groupby('DD').mean()

Unnamed: 0_level_0,#YY,MM,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
DD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2016.0,4.0,11.5,0.0,156.583333,3.620833,99.0,6.708333,999.0
2,2016.0,4.0,11.5,0.0,287.083333,5.475,99.0,2.6625,999.0
3,2016.0,4.0,11.5,0.0,251.916667,4.625,99.0,6.445833,999.0
4,2016.0,4.0,6.5,0.0,89.285714,6.321429,99.0,7.228571,999.0
10,2016.0,4.0,13.631579,0.0,194.315789,5.478947,99.0,4.994737,999.0
11,2016.0,4.0,11.5,0.0,272.75,3.591667,99.0,10.033333,999.0
12,2016.0,4.0,11.5,0.0,129.833333,3.629167,99.0,4.295833,999.0
13,2016.0,4.0,11.5,0.0,113.625,3.7375,99.0,5.9,999.0
14,2016.0,4.0,11.5,0.0,96.25,3.05,99.0,7.279167,999.0
15,2016.0,4.0,11.391304,0.0,70.304348,3.03913,99.0,7.991304,999.0


<a id='2017'></a>
### Exploring the data for 2017.

In [45]:
data_17 = pd.read_csv('../data/buoy_JAKI2/jaki2h2017.csv')
print(data_17.shape)
data_17.head()

(8621, 18)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSP,D GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,#yr,mo,dy,hr,mn,degT,m/s,m/s,m,sec,sec,deg,T hPa,degC,degC,degC,mi,ft
1,2017,1,1,0,0,282,3.5,99,99,99,99,999,9999,0.7,999,999,99,99
2,2017,1,1,1,0,267,1.9,99,99,99,99,999,9999,0.2,999,999,99,99
3,2017,1,1,2,0,266,1.9,99,99,99,99,999,9999,-0.2,999,999,99,99
4,2017,1,1,3,0,251,3.6,99,99,99,99,999,9999,-0.7,999,999,99,99


In [46]:
data_17.rename(mapper={
    'YYYY': '#YY',
    'WD': 'WDIR',
    'WSP': 'WSPD',
    'D GST': 'GST'
}, axis=1, inplace=True)
data_17.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,#yr,mo,dy,hr,mn,degT,m/s,m/s,m,sec,sec,deg,T hPa,degC,degC,degC,mi,ft
1,2017,1,1,0,0,282,3.5,99,99,99,99,999,9999,0.7,999,999,99,99
2,2017,1,1,1,0,267,1.9,99,99,99,99,999,9999,0.2,999,999,99,99
3,2017,1,1,2,0,266,1.9,99,99,99,99,999,9999,-0.2,999,999,99,99
4,2017,1,1,3,0,251,3.6,99,99,99,99,999,9999,-0.7,999,999,99,99


In [47]:
data_17.isnull().sum()

#YY     0
MM      0
DD      0
hh      0
mm      0
WDIR    0
WSPD    0
GST     0
WVHT    0
DPD     0
APD     0
MWD     0
PRES    0
ATMP    0
WTMP    0
DEWP    0
VIS     0
TIDE    0
dtype: int64

In [48]:
data_17.dtypes

#YY     object
MM      object
DD      object
hh      object
mm      object
WDIR    object
WSPD    object
GST     object
WVHT    object
DPD     object
APD     object
MWD     object
PRES    object
ATMP    object
WTMP    object
DEWP    object
VIS     object
TIDE    object
dtype: object

In [49]:
data_17 = data_17[features]
data_17.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,#yr,mo,dy,hr,mn,degT,m/s,m/s,degC,degC
1,2017,1,1,0,0,282,3.5,99,0.7,999
2,2017,1,1,1,0,267,1.9,99,0.2,999
3,2017,1,1,2,0,266,1.9,99,-0.2,999
4,2017,1,1,3,0,251,3.6,99,-0.7,999


In [50]:
data_17 = data_17.drop(0, axis=0)
data_17.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
1,2017,1,1,0,0,282,3.5,99,0.7,999
2,2017,1,1,1,0,267,1.9,99,0.2,999
3,2017,1,1,2,0,266,1.9,99,-0.2,999
4,2017,1,1,3,0,251,3.6,99,-0.7,999
5,2017,1,1,4,0,264,2.2,99,-1.3,999


In [51]:
data_17 = data_17.reset_index()
data_17.head()

Unnamed: 0,index,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,1,2017,1,1,0,0,282,3.5,99,0.7,999
1,2,2017,1,1,1,0,267,1.9,99,0.2,999
2,3,2017,1,1,2,0,266,1.9,99,-0.2,999
3,4,2017,1,1,3,0,251,3.6,99,-0.7,999
4,5,2017,1,1,4,0,264,2.2,99,-1.3,999


In [52]:
data_17 = data_17.drop(columns='index')
data_17.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2017,1,1,0,0,282,3.5,99,0.7,999
1,2017,1,1,1,0,267,1.9,99,0.2,999
2,2017,1,1,2,0,266,1.9,99,-0.2,999
3,2017,1,1,3,0,251,3.6,99,-0.7,999
4,2017,1,1,4,0,264,2.2,99,-1.3,999


In [53]:
data_17[['WSPD', 'ATMP']] = data_17[['WSPD', 'ATMP']].astype('float')
data_17.dtypes

#YY      object
MM       object
DD       object
hh       object
mm       object
WDIR     object
WSPD    float64
GST      object
ATMP    float64
DEWP     object
dtype: object

In [54]:
int_features = ['#YY', 'MM', 'DD', 'hh', 'mm', 'GST', 'DEWP']

data_17[int_features] = data_17[int_features].astype('int')
data_17.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR     object
WSPD    float64
GST       int64
ATMP    float64
DEWP      int64
dtype: object

In [55]:
data_17['WDIR'].astype('int')

ValueError: invalid literal for int() with base 10: '275 1'

### Clean the 'WDIR' feature

In [56]:
start_time = time.time()

wdir = data_17['WDIR']
data0 = []
j = 0 
for i in range(len(wdir)):
    if i % 1000 == 0:
        j += i
        print(f'{100 - 100 * i/ len(wdir)}% left until complete')
    
    if len(wdir.iloc[i].split()) == 1:
        pass
    else:
        data0 = wdir.iloc[i]
        data1 = data0.split()
        wdir.iloc[i] = data1[0]
        
end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)  

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')

100.0% left until complete
88.39907192575406% left until complete
76.79814385150812% left until complete
65.19721577726219% left until complete
53.596287703016245% left until complete
41.9953596287703% left until complete
30.39443155452436% left until complete
18.793503480278417% left until complete
7.192575406032489% left until complete
time: 2.983 seconds
time: 0 minutes, 2.983 seconds


In [57]:
data_17['WDIR'] = wdir
data_17['WDIR'].head()

0    282
1    267
2    266
3    251
4    264
Name: WDIR, dtype: object

In [58]:
for i in range(len(data_17['WDIR'])):
    if len(data_17['WDIR'].loc[i].split()) == 1:
        pass
    else:
        print(i, data_17['WDIR'].loc[i].split())
print('Everything checks out.')

Everything checks out.


In [59]:
data_17['WDIR'] = data_17['WDIR'].astype('int')
data_17.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR      int64
WSPD    float64
GST       int64
ATMP    float64
DEWP      int64
dtype: object

In [60]:
# saving the data
data_17.to_csv('../data/buoy_JAKI2/jaki2_buoy_data_2017.csv', index=False)

In [61]:
data_2017 = pd.read_csv('../data/buoy_JAKI2/jaki2_buoy_data_2017.csv')
print(data_2017.shape)
print(data_2017.dtypes)
data_2017.head()

(8620, 10)
#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR      int64
WSPD    float64
GST       int64
ATMP    float64
DEWP      int64
dtype: object


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2017,1,1,0,0,282,3.5,99,0.7,999
1,2017,1,1,1,0,267,1.9,99,0.2,999
2,2017,1,1,2,0,266,1.9,99,-0.2,999
3,2017,1,1,3,0,251,3.6,99,-0.7,999
4,2017,1,1,4,0,264,2.2,99,-1.3,999


In [62]:
data_2017.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017,1,1,234.333333,2.391667,99.0,-0.408333,999.0
2017,1,2,77.666667,2.191667,99.0,2.133333,999.0
2017,1,3,131.708333,2.629167,99.0,3.466667,999.0
2017,1,4,285.041667,5.429167,99.0,-6.670833,999.0
2017,1,5,282.75,3.866667,99.0,-11.175,999.0


In [63]:
# checking to see which months are missing days
for i in range(1, 13):
    print(i, len(data_2017[data_2017['MM'] == i]['DD'].value_counts()))

1 31
2 28
3 31
4 30
5 31
6 30
7 31
8 31
9 30
10 31
11 30
12 31


<font color = blue>No Missing DATA!

<a id='2018'></a>
### Exploring the data for 2018.

#### Missing data
 - [January 27-31, 2018](#Jan27-31,2018)
 - [February 1-23, 25-28, 2018](#feb1-23,25-28,2-18)
 - [March 1-29, 2018](#mar1-29,2018)

In [64]:
data_18 = pd.read_csv('../data/buoy_JAKI2/jaki2h2018.csv')
print(data_18.shape)
data_18.head()

(7193, 18)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSP,D GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,#yr,mo,dy,hr,mn,degT,m/s,m/s,m,sec,sec,deg,T hPa,degC,degC,degC,mi,ft
1,2018,1,1,0,0,311,3.5,99,99,99,99,999,9999,-12,999,999,99,99
2,2018,1,1,1,0,329,4.7,99,99,99,99,999,9999,-13.2,999,999,99,99
3,2018,1,1,2,0,321,3.8,99,99,99,99,999,9999,-13.8,999,999,99,99
4,2018,1,1,3,0,316,2,99,99,99,99,999,9999,-14.6,999,999,99,99


In [65]:
data_18.rename(mapper={
    'YYYY': '#YY',
    'WD': 'WDIR',
    'WSP': 'WSPD',
    'D GST': 'GST'
}, axis=1, inplace=True)
data_18.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,#yr,mo,dy,hr,mn,degT,m/s,m/s,m,sec,sec,deg,T hPa,degC,degC,degC,mi,ft
1,2018,1,1,0,0,311,3.5,99,99,99,99,999,9999,-12,999,999,99,99
2,2018,1,1,1,0,329,4.7,99,99,99,99,999,9999,-13.2,999,999,99,99
3,2018,1,1,2,0,321,3.8,99,99,99,99,999,9999,-13.8,999,999,99,99
4,2018,1,1,3,0,316,2,99,99,99,99,999,9999,-14.6,999,999,99,99


In [66]:
data_18.isnull().sum()

#YY     0
MM      0
DD      0
hh      0
mm      0
WDIR    0
WSPD    0
GST     0
WVHT    0
DPD     0
APD     0
MWD     0
PRES    0
ATMP    0
WTMP    0
DEWP    0
VIS     0
TIDE    0
dtype: int64

In [67]:
data_18.dtypes

#YY     object
MM      object
DD      object
hh      object
mm      object
WDIR    object
WSPD    object
GST     object
WVHT    object
DPD     object
APD     object
MWD     object
PRES    object
ATMP    object
WTMP    object
DEWP    object
VIS     object
TIDE    object
dtype: object

In [68]:
data_18 = data_18[features]
data_18.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,#yr,mo,dy,hr,mn,degT,m/s,m/s,degC,degC
1,2018,1,1,0,0,311,3.5,99,-12,999
2,2018,1,1,1,0,329,4.7,99,-13.2,999
3,2018,1,1,2,0,321,3.8,99,-13.8,999
4,2018,1,1,3,0,316,2,99,-14.6,999


In [69]:
data_18 = data_18.drop(0, axis=0)
data_18.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
1,2018,1,1,0,0,311,3.5,99,-12.0,999
2,2018,1,1,1,0,329,4.7,99,-13.2,999
3,2018,1,1,2,0,321,3.8,99,-13.8,999
4,2018,1,1,3,0,316,2.0,99,-14.6,999
5,2018,1,1,4,0,293,2.8,99,-15.5,999


In [70]:
data_18.reset_index(inplace=True)
data_18.head()

Unnamed: 0,index,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,1,2018,1,1,0,0,311,3.5,99,-12.0,999
1,2,2018,1,1,1,0,329,4.7,99,-13.2,999
2,3,2018,1,1,2,0,321,3.8,99,-13.8,999
3,4,2018,1,1,3,0,316,2.0,99,-14.6,999
4,5,2018,1,1,4,0,293,2.8,99,-15.5,999


In [71]:
data_18 = data_18.drop(columns='index')
data_18.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2018,1,1,0,0,311,3.5,99,-12.0,999
1,2018,1,1,1,0,329,4.7,99,-13.2,999
2,2018,1,1,2,0,321,3.8,99,-13.8,999
3,2018,1,1,3,0,316,2.0,99,-14.6,999
4,2018,1,1,4,0,293,2.8,99,-15.5,999


In [72]:
data_18[['WSPD', 'ATMP']] = data_18[['WSPD', 'ATMP']].astype('float')
data_18.dtypes

#YY      object
MM       object
DD       object
hh       object
mm       object
WDIR     object
WSPD    float64
GST      object
ATMP    float64
DEWP     object
dtype: object

In [73]:
int_features = ['#YY', 'MM', 'DD', 'hh', 'mm', 'DEWP']

data_18[int_features] = data_18[int_features].astype('int')
data_18.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR     object
WSPD    float64
GST      object
ATMP    float64
DEWP      int64
dtype: object

In [74]:
data_18['GST'] = data_18['GST'].astype('float')
data_18.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR     object
WSPD    float64
GST     float64
ATMP    float64
DEWP      int64
dtype: object

In [75]:
data_18['WDIR'].astype('float')

ValueError: could not convert string to float: '17 1'

### Clean the 'WDIR' feature

In [76]:
start_time = time.time()

wdir = data_18['WDIR']
data0 = []
j = 0 
for i in range(len(wdir)):
    if i % 1000 == 0:
        j += i
        print(f'{100 - 100 * i/ len(wdir)}% left until complete')
    
    if len(wdir.iloc[i].split()) == 1:
        pass
    else:
        data0 = wdir.iloc[i]
        data1 = data0.split()
        wdir.iloc[i] = data1[0]
        
end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)  

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')

100.0% left until complete
86.09566184649611% left until complete
72.19132369299221% left until complete
58.28698553948832% left until complete
44.38264738598443% left until complete
30.47830923248054% left until complete
16.57397107897664% left until complete
2.6696329254727544% left until complete
time: 2.537 seconds
time: 0 minutes, 2.537 seconds


In [77]:
data_18['WDIR'] = wdir
data_18['WDIR'].head()

0    311
1    329
2    321
3    316
4    293
Name: WDIR, dtype: object

In [78]:
for i in range(len(data_18['WDIR'])):
    if len(data_18['WDIR'].loc[i].split()) == 1:
        pass
    else:
        print(i, data_18['WDIR'].loc[i].split())
print('Everything checks out.')

Everything checks out.


In [79]:
data_18['WDIR'] = data_18['WDIR'].astype('int')
data_18.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR      int64
WSPD    float64
GST     float64
ATMP    float64
DEWP      int64
dtype: object

In [80]:
# saving the data
data_18.to_csv('../data/buoy_JAKI2/jaki2_buoy_data_2018.csv', index=False)

In [81]:
data_2018 = pd.read_csv('../data/buoy_JAKI2/jaki2_buoy_data_2018.csv')
print(data_2018.shape)
print(data_2018.dtypes)
data_2018.head()

(7192, 10)
#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR      int64
WSPD    float64
GST     float64
ATMP    float64
DEWP      int64
dtype: object


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2018,1,1,0,0,311,3.5,99.0,-12.0,999
1,2018,1,1,1,0,329,4.7,99.0,-13.2,999
2,2018,1,1,2,0,321,3.8,99.0,-13.8,999
3,2018,1,1,3,0,316,2.0,99.0,-14.6,999
4,2018,1,1,4,0,293,2.8,99.0,-15.5,999


In [82]:
data_2018.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018,1,1,291.083333,3.8625,99.0,-16.879167,999.0
2018,1,2,259.666667,4.775,99.0,-17.991667,999.0
2018,1,3,270.333333,4.479167,99.0,-10.416667,999.0
2018,1,4,319.875,3.041667,99.0,-12.1,999.0
2018,1,5,315.666667,3.095833,99.0,-13.241667,999.0


In [83]:
# checking to see which months are missing days
for i in range(1, 13):
    print(i, len(data_2018[data_2018['MM'] == i]['DD'].value_counts()))

1 26
2 1
3 2
4 30
5 31
6 30
7 31
8 31
9 30
10 31
11 30
12 31


<a id='Jan27-31,2018'></a>

In [84]:
# missing January 27-31, 2018
data_2018[data_2018['MM'] == 1].groupby('DD').mean()

Unnamed: 0_level_0,#YY,MM,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
DD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2018.0,1.0,11.5,0.0,291.083333,3.8625,99.0,-16.879167,999.0
2,2018.0,1.0,11.5,0.0,259.666667,4.775,99.0,-17.991667,999.0
3,2018.0,1.0,11.5,0.0,270.333333,4.479167,99.0,-10.416667,999.0
4,2018.0,1.0,11.5,0.0,319.875,3.041667,99.0,-12.1,999.0
5,2018.0,1.0,11.5,0.0,315.666667,3.095833,99.0,-13.241667,999.0
6,2018.0,1.0,11.5,0.0,283.083333,2.545833,99.0,-12.283333,999.0
7,2018.0,1.0,11.173913,0.0,213.173913,4.956522,99.0,-7.46087,999.0
8,2018.0,1.0,11.5,0.0,255.166667,5.245833,99.0,1.65,999.0
9,2018.0,1.0,11.5,0.0,227.041667,2.8875,99.0,-0.258333,999.0
10,2018.0,1.0,11.5,0.0,194.0,4.8625,99.0,4.266667,999.0


<a id='feb1-23,25-28,2-18'></a>

In [85]:
# missing February 1-23, 25-28, 2018
data_2018[data_2018['MM'] == 2].groupby('DD').mean()

Unnamed: 0_level_0,#YY,MM,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
DD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
24,2018,2,17,0,280,9.0,99.0,22.2,999


<a id='mar1-29,2018'></a>

In [86]:
# missing March 1-29, 2018
data_2018[data_2018['MM'] == 3].groupby('DD').mean()

Unnamed: 0_level_0,#YY,MM,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
DD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
30,2018.0,3.0,17.636364,0.0,97.181818,3.263636,38.481818,2.736364,999.0
31,2018.0,3.0,11.5,0.0,235.041667,4.270833,7.7625,7.883333,999.0


<a id='2015-2018'></a>
### Combining all of the data for 2015-2018.

#### Missing data
 - Lot's from 2015
 - [April 5-9, 2016](#april5-9,2016)
 - [January 27-31, 2018](#Jan27-31,2018)
 - [February 1-23, 25-28, 2018](#feb1-23,25-28,2-18)
 - [March 1-29, 2018](#mar1-29,2018)

In [109]:
data_2015 = pd.read_csv('../data/buoy_JAKI2/jaki2_buoy_data_2015.csv')
data_2016 = pd.read_csv('../data/buoy_JAKI2/jaki2_buoy_data_2016.csv')
data_2017 = pd.read_csv('../data/buoy_JAKI2/jaki2_buoy_data_2017.csv')
data_2018 = pd.read_csv('../data/buoy_JAKI2/jaki2_buoy_data_2018.csv')

In [110]:
data = pd.concat([data_2015, data_2016, data_2017, data_2018], axis=0)
print(data.shape)
data.head()

(28403, 10)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2015,3,23,20,0,79,8.5,99.0,-0.6,999
1,2015,3,23,21,0,78,7.9,99.0,-0.4,999
2,2015,3,23,22,0,82,7.3,99.0,-0.2,999
3,2015,3,23,23,0,83,6.3,99.0,0.0,999
4,2015,3,24,0,0,78,7.7,99.0,0.1,999


In [111]:
# saving the full dataframe
data.to_csv('../data/buoy_JAKI2/jaki2_buoy_data_2015_2018.csv', index=False)

In [112]:
data = pd.read_csv('../data/buoy_JAKI2/jaki2_buoy_data_2015_2018.csv')
print(data.shape)
data.head()

(28403, 10)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2015,3,23,20,0,79,8.5,99.0,-0.6,999
1,2015,3,23,21,0,78,7.9,99.0,-0.4,999
2,2015,3,23,22,0,82,7.3,99.0,-0.2,999
3,2015,3,23,23,0,83,6.3,99.0,0.0,999
4,2015,3,24,0,0,78,7.7,99.0,0.1,999


In [113]:
daily_averages = data.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1)
print(daily_averages.shape)
daily_averages.head()

(1212, 5)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015,3,23,80.5,7.5,99.0,-0.3,999.0
2015,3,24,107.25,5.258333,99.0,0.2625,999.0
2015,3,25,218.782609,5.891304,99.0,4.995652,999.0
2015,3,26,232.173913,2.76087,99.0,3.734783,999.0
2015,3,27,51.318182,7.209091,99.0,-0.995455,999.0


In [114]:
daily_averages.to_csv('../data/buoy_JAKI2/jaki2_buoy_data_2015_2018_averages.csv')

In [115]:
daily_data = pd.read_csv('../data/buoy_JAKI2/jaki2_buoy_data_2015_2018_averages.csv')
print(daily_data.shape)
daily_data.head()

(1212, 8)


Unnamed: 0,#YY,MM,DD,WDIR,WSPD,GST,ATMP,DEWP
0,2015,3,23,80.5,7.5,99.0,-0.3,999.0
1,2015,3,24,107.25,5.258333,99.0,0.2625,999.0
2,2015,3,25,218.782609,5.891304,99.0,4.995652,999.0
3,2015,3,26,232.173913,2.76087,99.0,3.734783,999.0
4,2015,3,27,51.318182,7.209091,99.0,-0.995455,999.0
