# Exploring the data for Buoy CHII2 for the year 2005 - 2018.

GLERL  
Location: 41.916N 87.572W  

East of Chicago

### Table of contents

Exploring the data:  
[2005](#2005), 
[2006](#2006), 
[2007](#2007),  
[2008](#2008)  
[2009](#2009)  
[2010](#2010)  
[2011](#2011)  
[2012](#2012)  
[2013](#2013)  
[2014](#2014)  
[2015](#2015),
[2016](#2016), 
[2017](#2017), 
[2018](#2018), 
[2019](#2019)  

<a id='2005'></a>
### Exploring the data for 2005.

### CHII2
#### Starting Oct 1, 2006, there are daily averages for the following
 - (ATMP) air temperature 
 - (WDIR) Wind Direction 
 - (WSPD) Wind Speed 
 - (GST) "Peak 5 or 8 second gust speed (m/s) measured during the eight-minute or two-minute period. The 5 or 8 second period can be determined by payload, See the Sensor Reporting, Sampling, and Accuracy section." 
 
#### Missing data
 - Some in 2005
 - Some in 2006 (before 10)
 - Nov. 10-12, 2007
 
[Data Dictionary](https://www.ndbc.noaa.gov/measdes.shtml)

In [3]:
import pandas as pd
import time

In [2]:
data_05 = pd.read_csv('../data/buoy_CHII2/chii2h2005.csv')
print(data_05.shape)
data_05.head()

(6804, 18)


Unnamed: 0,YYYY,MM,DD,hh,mm,WD,WSP,D GST,WVHT,DPD,APD,MWD,BAR,ATMP,WTMP,DEWP,VIS,TIDE
0,2005,2,14,20,0,300,9.3,11.8,99,99,99,999,9999,3.8,999,999,99,99
1,2005,2,14,21,0,290,10.8,12.9,99,99,99,999,9999,4.6,999,999,99,99
2,2005,2,14,22,0,280,11.8,14.4,99,99,99,999,9999,5.1,999,999,99,99
3,2005,2,14,23,0,290,11.8,14.4,99,99,99,999,9999,4.7,999,999,99,99
4,2005,2,15,0,0,280,9.3,11.3,99,99,99,999,9999,4.7,999,999,99,99


In [3]:
data_05.rename(mapper={
    'YYYY': '#YY',
    'WD': 'WDIR',
    'WSP': 'WSPD',
    'D GST': 'GST'
}, axis=1, inplace=True)

In [4]:
data_05.isnull().sum()

#YY     0
MM      0
DD      0
hh      0
mm      0
WDIR    0
WSPD    0
GST     0
WVHT    0
DPD     0
APD     0
MWD     0
BAR     0
ATMP    0
WTMP    0
DEWP    0
VIS     0
TIDE    0
dtype: int64

In [5]:
# there are 28 fields in the 'ATMP' feature that are not reported.
sum(data_05['ATMP'] == 999)

28

In [6]:
data_05[data_05['ATMP'] == 999]

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,BAR,ATMP,WTMP,DEWP,VIS,TIDE
483,2005,3,8,11,0,360,8.2,10.3,99,99,99,999,9999,999.0,999,999,99,99
484,2005,3,8,12,0,360,9.8,11.3,99,99,99,999,9999,999.0,999,999,99,99
485,2005,3,8,13,0,340,9.3,10.8,99,99,99,999,9999,999.0,999,999,99,99
486,2005,3,8,14,0,350,8.2,9.3,99,99,99,999,9999,999.0,999,999,99,99
487,2005,3,8,15,0,350,8.2,9.8,99,99,99,999,9999,999.0,999,999,99,99
488,2005,3,8,16,0,360,8.2,9.8,99,99,99,999,9999,999.0,999,999,99,99
489,2005,3,8,17,0,10,9.8,10.8,99,99,99,999,9999,999.0,999,999,99,99
490,2005,3,8,18,0,20,7.7,9.8,99,99,99,999,9999,999.0,999,999,99,99
491,2005,3,8,19,0,30,7.7,9.3,99,99,99,999,9999,999.0,999,999,99,99
492,2005,3,8,20,0,30,7.2,8.8,99,99,99,999,9999,999.0,999,999,99,99


In [7]:
data_05.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR      int64
WSPD    float64
GST     float64
WVHT      int64
DPD       int64
APD       int64
MWD       int64
BAR       int64
ATMP    float64
WTMP      int64
DEWP      int64
VIS       int64
TIDE      int64
dtype: object

In [8]:
data_05.columns

Index(['#YY', 'MM', 'DD', 'hh', 'mm', 'WDIR', 'WSPD', 'GST', 'WVHT', 'DPD',
       'APD', 'MWD', 'BAR', 'ATMP', 'WTMP', 'DEWP', 'VIS', 'TIDE'],
      dtype='object')

In [9]:
features = ['#YY', 'MM', 'DD', 'hh', 'mm', 'WDIR', 'WSPD', 'GST', 'ATMP', 'DEWP']

In [10]:
data_2005 = data_05[features]

In [11]:
data_2005.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2005,2,14,20,0,300,9.3,11.8,3.8,999
1,2005,2,14,21,0,290,10.8,12.9,4.6,999
2,2005,2,14,22,0,280,11.8,14.4,5.1,999
3,2005,2,14,23,0,290,11.8,14.4,4.7,999
4,2005,2,15,0,0,280,9.3,11.3,4.7,999


In [12]:
data_2005.groupby(['#YY', 'MM', 'DD']).mean().drop('hh', axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mm,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2005,2,14,0.0,290.000000,10.925000,13.375000,4.550000,999.0
2005,2,15,0.0,168.750000,4.912500,5.808333,3.520833,999.0
2005,2,16,0.0,289.583333,7.245833,8.420833,-0.620833,999.0
2005,2,17,0.0,305.000000,9.522727,11.295455,-4.413636,999.0
2005,2,18,0.0,262.916667,6.329167,7.683333,-7.570833,999.0
2005,2,19,0.0,273.333333,7.320833,8.808333,-1.733333,999.0
2005,2,20,0.0,175.000000,8.150000,9.333333,0.658333,999.0
2005,2,21,0.0,225.000000,5.083333,5.820833,1.416667,999.0
2005,2,22,0.0,230.714286,4.964286,5.464286,-0.042857,999.0
2005,2,23,0.0,219.583333,3.679167,4.170833,-1.133333,999.0


<a id='2006'></a>
### Exploring the data for 2006.

In [13]:
data_06 = pd.read_csv('../data/buoy_CHII2/chii2h2006.csv')
print(data_06.shape)
data_06.head()

(2182, 18)


Unnamed: 0,YYYY,MM,DD,hh,mm,WD,WSP,D GST,WVHT,DPD,APD,MWD,BAR,ATMP,WTMP,DEWP,VIS,TIDE
0,2006,3,1,0,0,280,12.4,14.4,99,99,99,999,9999,9.8,999,999,99,99
1,2006,3,30,14,0,190,4.6,5.7,99,99,99,999,9999,5.6,999,999,99,99
2,2006,3,30,16,0,180,6.7,7.2,99,99,99,999,9999,8.1,999,999,99,99
3,2006,3,30,17,0,180,8.8,9.3,99,99,99,999,9999,10.3,999,999,99,99
4,2006,3,30,18,0,190,8.8,10.3,99,99,99,999,9999,11.9,999,999,99,99


In [14]:
data_06.rename(mapper={
    'YYYY': '#YY',
    'WD': 'WDIR',
    'WSP': 'WSPD',
    'D GST': 'GST'
}, axis=1, inplace=True)

In [15]:
data_06.tail()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,BAR,ATMP,WTMP,DEWP,VIS,TIDE
2177,2006,12,31,17,0,140,10.3,12.9,99,99,99,999,9999,5.5,999,999,99,99
2178,2006,12,31,18,0,150,9.8,12.4,99,99,99,999,9999,5.7,999,999,99,99
2179,2006,12,31,20,0,150,8.8,10.3,99,99,99,999,9999,7.5,999,999,99,99
2180,2006,12,31,21,0,160,7.2,8.8,99,99,99,999,9999,9.4,999,999,99,99
2181,2006,12,31,22,0,230,10.3,13.4,99,99,99,999,9999,13.6,999,999,99,99


In [16]:
data_2006 = data_06[features]
data_2006.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2006,3,1,0,0,280,12.4,14.4,9.8,999
1,2006,3,30,14,0,190,4.6,5.7,5.6,999
2,2006,3,30,16,0,180,6.7,7.2,8.1,999
3,2006,3,30,17,0,180,8.8,9.3,10.3,999
4,2006,3,30,18,0,190,8.8,10.3,11.9,999


In [17]:
pd.options.display.max_rows = 500

In [18]:
data_2006.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1).tail(100)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006,7,12,310.434783,5.556522,6.295652,19.695652,999.0
2006,7,13,185.416667,2.991667,3.575,20.6125,999.0
2006,7,14,130.869565,4.695652,5.521739,23.043478,999.0
2006,7,15,201.111111,4.233333,4.911111,26.15,999.0
2006,7,16,220.0,5.568182,6.568182,29.077273,999.0
2006,7,17,199.285714,8.714286,10.678571,29.528571,999.0
2006,7,18,124.285714,9.942857,11.842857,27.428571,999.0
2006,7,19,76.0,6.72,7.633333,23.04,999.0
2006,10,1,193.0,4.19,4.905,15.79,999.0
2006,10,2,142.222222,9.811111,11.705556,19.527778,999.0


<a id='2007'></a>
### Exploring the data for 2007.
 - 10/10-12/2007 are missing

In [19]:
data_07 = pd.read_csv('../data/buoy_CHII2/chii2h2007.csv')
print(data_07.shape)
data_07.head()

(6697, 18)


Unnamed: 0,#YY,MM,DD,hh,mm,WDI,R WSP,D GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,#yr,mo,dy,hr,mn,deg,T m/s,m/s,m,sec,sec,deg,T hPa,degC,degC,degC,nmi,ft
1,2007,1,1,0,0,210,11.8,17,99,99,99,999,9999,11.8,999,999,99,99
2,2007,1,1,1,0,210,9.3,11.3,99,99,99,999,9999,10.6,999,999,99,99
3,2007,1,1,2,0,220,11.3,14.4,99,99,99,999,9999,10.2,999,999,99,99
4,2007,1,1,4,0,200,13.4,15.4,99,99,99,999,9999,8,999,999,99,99


In [20]:
sum(data_07['ATMP'] == 999)

0

In [21]:
sum(data_07['WDI'] == 999)

0

In [22]:
sum(data_07['R WSP'] == 999)

0

In [23]:
data_07.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDI,R WSP,D GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,#yr,mo,dy,hr,mn,deg,T m/s,m/s,m,sec,sec,deg,T hPa,degC,degC,degC,nmi,ft
1,2007,1,1,0,0,210,11.8,17,99,99,99,999,9999,11.8,999,999,99,99
2,2007,1,1,1,0,210,9.3,11.3,99,99,99,999,9999,10.6,999,999,99,99
3,2007,1,1,2,0,220,11.3,14.4,99,99,99,999,9999,10.2,999,999,99,99
4,2007,1,1,4,0,200,13.4,15.4,99,99,99,999,9999,8,999,999,99,99


In [24]:
data_07.rename(mapper={
    'WDI': 'WDIR',
    'R WSP': 'WSPD',
    'D GST': 'GST'
}, axis=1, inplace=True)

In [25]:
data_07.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,#yr,mo,dy,hr,mn,deg,T m/s,m/s,m,sec,sec,deg,T hPa,degC,degC,degC,nmi,ft
1,2007,1,1,0,0,210,11.8,17,99,99,99,999,9999,11.8,999,999,99,99
2,2007,1,1,1,0,210,9.3,11.3,99,99,99,999,9999,10.6,999,999,99,99
3,2007,1,1,2,0,220,11.3,14.4,99,99,99,999,9999,10.2,999,999,99,99
4,2007,1,1,4,0,200,13.4,15.4,99,99,99,999,9999,8,999,999,99,99


In [26]:
data_2007 = data_07[features]

In [27]:
data_2007.drop(index=0, inplace=True)
print(data_2007.shape)
data_2007.head()

(6696, 10)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
1,2007,1,1,0,0,210,11.8,17.0,11.8,999
2,2007,1,1,1,0,210,9.3,11.3,10.6,999
3,2007,1,1,2,0,220,11.3,14.4,10.2,999
4,2007,1,1,4,0,200,13.4,15.4,8.0,999
5,2007,1,1,5,0,220,10.8,16.0,7.5,999


In [28]:
data_2007.dtypes

#YY     object
MM      object
DD      object
hh      object
mm      object
WDIR    object
WSPD    object
GST     object
ATMP    object
DEWP    object
dtype: object

In [29]:
data_2007.shape

(6696, 10)

In [30]:
data_2007[['WSPD', 'GST', 'ATMP']] = data_2007[['WSPD', 'GST', 'ATMP']].astype('float')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [31]:
data_2007.shape

(6696, 10)

In [32]:
data_2007.dtypes

#YY      object
MM       object
DD       object
hh       object
mm       object
WDIR     object
WSPD    float64
GST     float64
ATMP    float64
DEWP     object
dtype: object

In [33]:
data_2007[['#YY', 'MM', 'DD', 'hh', 'mm', 'WDIR']] = data_2007[['#YY', 'MM', 'DD', 'hh', 'mm', 'WDIR']].astype('int')

In [34]:
data_2007.shape

(6696, 10)

In [35]:
data_2007.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR      int64
WSPD    float64
GST     float64
ATMP    float64
DEWP     object
dtype: object

In [36]:
data_2007[data_2007['MM'] == 12]

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
6064,2007,12,1,0,0,260,7.7,9.8,-3.0,999
6065,2007,12,1,1,0,280,7.7,9.8,-3.7,999
6066,2007,12,1,2,0,280,6.7,7.7,-4.0,999
6067,2007,12,1,3,0,290,6.2,7.7,-4.2,999
6068,2007,12,1,4,0,290,5.7,7.2,-4.4,999
6069,2007,12,1,6,0,320,5.1,6.2,-4.7,999
6070,2007,12,1,8,0,320,3.1,4.1,-4.5,999
6071,2007,12,1,9,0,270,3.6,4.6,-4.3,999
6072,2007,12,1,10,0,300,2.1,3.1,-4.2,999
6073,2007,12,1,11,0,320,2.1,3.1,-4.0,999


In [37]:
data_07 = data_2007.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1)

In [38]:
data_07.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2007,1,1,258.333333,9.95,12.933333,5.316667
2007,1,2,254.0,5.715,6.585,2.43
2007,1,3,198.823529,10.717647,12.935294,3.541176
2007,1,4,194.117647,12.870588,15.458824,5.852941
2007,1,5,222.5,7.385,8.775,8.41


In [39]:
data_07.shape

(362, 4)

In [40]:
# It is strange that there are only 362 rows in the grouped data above.  

data_2007['MM'] #['DD' == 26]

1        1
2        1
3        1
4        1
5        1
6        1
7        1
8        1
9        1
10       1
11       1
12       1
13       1
14       1
15       1
16       1
17       1
18       1
19       1
20       1
21       1
22       1
23       1
24       1
25       1
26       1
27       1
28       1
29       1
30       1
31       1
32       1
33       1
34       1
35       1
36       1
37       1
38       1
39       1
40       1
41       1
42       1
43       1
44       1
45       1
46       1
47       1
48       1
49       1
50       1
51       1
52       1
53       1
54       1
55       1
56       1
57       1
58       1
59       1
60       1
61       1
62       1
63       1
64       1
65       1
66       1
67       1
68       1
69       1
70       1
71       1
72       1
73       1
74       1
75       1
76       1
77       1
78       1
79       1
80       1
81       1
82       1
83       1
84       1
85       1
86       1
87       1
88       1
89       1
90       1
91       1

In [41]:
data_2007['DD'].value_counts(ascending=True)

31    116
29    196
10    198
12    202
11    203
9     207
4     207
30    211
15    213
13    213
8     213
28    214
6     217
26    218
27    220
2     221
16    222
5     223
17    224
25    224
14    225
22    225
24    226
20    230
18    231
7     231
3     232
19    232
1     232
23    233
21    237
Name: DD, dtype: int64

In [42]:
# this shows that the there are three days missing in November.
for i in range(1, 13):
    print(i, len(data_2007[data_2007['MM'] == i]['DD'].value_counts()))

1 31
2 28
3 31
4 30
5 31
6 30
7 31
8 31
9 30
10 31
11 27
12 31


In [43]:
nov_2007 = data_2007[data_2007['MM'] == 11.0]

In [44]:
# showing that days 10-12 in November are not represented.  We might try and populate this information with
# other data.
for i in range(1, 31):
    print(i, len(nov_2007[nov_2007['DD'] == i]))

1 24
2 20
3 22
4 20
5 20
6 20
7 20
8 17
9 5
10 0
11 0
12 0
13 7
14 21
15 14
16 22
17 21
18 21
19 23
20 22
21 22
22 20
23 22
24 21
25 22
26 22
27 17
28 22
29 21
30 22


In [45]:
data_2007.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR      int64
WSPD    float64
GST     float64
ATMP    float64
DEWP     object
dtype: object

In [46]:
data_2007.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
1,2007,1,1,0,0,210,11.8,17.0,11.8,999
2,2007,1,1,1,0,210,9.3,11.3,10.6,999
3,2007,1,1,2,0,220,11.3,14.4,10.2,999
4,2007,1,1,4,0,200,13.4,15.4,8.0,999
5,2007,1,1,5,0,220,10.8,16.0,7.5,999


<a id='2008'></a>
### Exploring the data for 2008.

In [47]:
data_08 = pd.read_csv('../data/buoy_CHII2/chii2h2008.csv')
print(data_08.shape)
data_08.head()

(5138, 18)


Unnamed: 0,#YY,MM,DD,hh,mm,WDI,R WSP,D GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,#yr,mo,dy,hr,mn,deg,T m/s,m/s,m,sec,sec,deg,hPa,degC,degC,degC,nmi,ft
1,2008,1,1,0,0,140,4.1,5.1,99,99,99,999,9999,-0.3,999,999,99,99
2,2008,1,1,1,0,100,3.1,3.6,99,99,99,999,9999,-0.4,999,999,99,99
3,2008,1,1,2,0,50,4.1,4.6,99,99,99,999,9999,-0.4,999,999,99,99
4,2008,1,1,3,0,340,4.6,5.1,99,99,99,999,9999,-0.4,999,999,99,99


In [48]:
data_08.rename(mapper={
    'WDI': 'WDIR',
    'R WSP': 'WSPD',
    'D GST': 'GST'
}, axis=1, inplace=True)
data_08.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,#yr,mo,dy,hr,mn,deg,T m/s,m/s,m,sec,sec,deg,hPa,degC,degC,degC,nmi,ft
1,2008,1,1,0,0,140,4.1,5.1,99,99,99,999,9999,-0.3,999,999,99,99
2,2008,1,1,1,0,100,3.1,3.6,99,99,99,999,9999,-0.4,999,999,99,99
3,2008,1,1,2,0,50,4.1,4.6,99,99,99,999,9999,-0.4,999,999,99,99
4,2008,1,1,3,0,340,4.6,5.1,99,99,99,999,9999,-0.4,999,999,99,99


In [49]:
data_08.dtypes

#YY     object
MM      object
DD      object
hh      object
mm      object
WDIR    object
WSPD    object
GST     object
WVHT    object
DPD     object
APD     object
MWD     object
PRES    object
ATMP    object
WTMP    object
DEWP    object
VIS     object
TIDE    object
dtype: object

In [50]:
data_08 = data_08.drop(index=0)

In [51]:
data_08.head(100)

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
1,2008,1,1,0,0,140,4.1,5.1,99,99,99,999,9999,-0.3,999,999,99,99
2,2008,1,1,1,0,100,3.1,3.6,99,99,99,999,9999,-0.4,999,999,99,99
3,2008,1,1,2,0,50,4.1,4.6,99,99,99,999,9999,-0.4,999,999,99,99
4,2008,1,1,3,0,340,4.6,5.1,99,99,99,999,9999,-0.4,999,999,99,99
5,2008,1,1,4,0,310,6.7,7.7,99,99,99,999,9999,-0.4,999,999,99,99
6,2008,1,1,5,0,330,13.9,16.5,99,99,99,999,9999,-0.9,999,999,99,99
7,2008,1,1,10,0,300,11.8,15.4,99,99,99,999,9999,-4.9,999,999,99,99
8,2008,1,1,11,0,300,11.3,14.4,99,99,99,999,9999,-4.9,999,999,99,99
9,2008,1,1,12,0,290,10.3,12.4,99,99,99,999,9999,-4.8,999,999,99,99
10,2008,1,1,14,0,280,10.8,12.9,99,99,99,999,9999,-5.7,999,999,99,99


In [52]:
data_08 = data_08[features]
data_08[['WSPD', 'GST', 'ATMP']] = data_08[['WSPD', 'GST', 'ATMP']].astype('float')
data_08[['#YY', 'MM', 'DD', 'hh', 'mm', 'WDIR']] = data_08[['#YY', 'MM', 'DD', 'hh', 'mm', 'WDIR']].astype('int')
data_2008 = data_08[features]
print(data_2008.shape)
data_2008.head()

(5137, 10)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
1,2008,1,1,0,0,140,4.1,5.1,-0.3,999
2,2008,1,1,1,0,100,3.1,3.6,-0.4,999
3,2008,1,1,2,0,50,4.1,4.6,-0.4,999
4,2008,1,1,3,0,340,4.6,5.1,-0.4,999
5,2008,1,1,4,0,310,6.7,7.7,-0.4,999


In [53]:
data_08.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008,1,1,266.315789,9.052632,10.857895,-4.078947
2008,1,2,291.578947,9.252632,11.521053,-10.663158
2008,1,3,229.411765,8.576471,10.205882,-9.817647
2008,1,4,190.5,12.5,15.035,-3.05
2008,1,5,189.285714,11.471429,13.685714,3.685714
2008,1,6,180.952381,10.033333,11.37619,8.238095
2008,1,7,201.5,11.84,13.775,14.71
2008,1,8,172.666667,10.546667,12.266667,10.613333
2008,1,9,254.5,7.815,9.745,2.135
2008,1,10,117.777778,5.883333,6.616667,2.633333


In [54]:
data_2008.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR      int64
WSPD    float64
GST     float64
ATMP    float64
DEWP     object
dtype: object

In [55]:
# this shows that the there are missing values January, February, March, April, May, and December.
nov_2008 = data_2008[data_2008['MM'] == 11.0]
for i in range(1, 13):
    print(i, len(data_2008[data_2008['MM'] == i]['DD'].value_counts()))

1 20
2 0
3 0
4 0
5 0
6 30
7 31
8 31
9 30
10 31
11 30
12 24


In [56]:
nov_2008 = data_2008[data_2008['MM'] == 11]

In [57]:
# There are no missing values in November.
for i in range(1, 31):
    print(i, len(nov_2008[nov_2008['DD'] == i]))

1 23
2 22
3 24
4 23
5 24
6 23
7 24
8 23
9 22
10 24
11 13
12 22
13 24
14 22
15 21
16 24
17 21
18 24
19 23
20 23
21 23
22 24
23 22
24 24
25 24
26 24
27 22
28 22
29 23
30 24


<a id='2009'></a>

### Exploring the data for 2009.

In [58]:
data_09 = pd.read_csv('../data/buoy_CHII2/chii2h2009.csv')
print(data_09.shape)
data_09.head()

(8288, 18)


Unnamed: 0,#YY,MM,DD,hh,mm,WDI,R WSP,D GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,2009,1,1,0,0,280,2.6,3.6,99,99,99,999,9999,-6.6,999,999,99,99
1,2009,1,1,1,0,260,3.1,4.1,99,99,99,999,9999,-6.7,999,999,99,99
2,2009,1,1,2,0,270,4.1,5.1,99,99,99,999,9999,-7.3,999,999,99,99
3,2009,1,1,3,0,270,4.6,5.7,99,99,99,999,9999,-7.7,999,999,99,99
4,2009,1,1,4,0,260,2.6,4.6,99,99,99,999,9999,-7.4,999,999,99,99


In [59]:
data_09.rename(mapper={
    'WDI': 'WDIR',
    'R WSP': 'WSPD',
    'D GST': 'GST'
}, axis=1, inplace=True)
data_09.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,2009,1,1,0,0,280,2.6,3.6,99,99,99,999,9999,-6.6,999,999,99,99
1,2009,1,1,1,0,260,3.1,4.1,99,99,99,999,9999,-6.7,999,999,99,99
2,2009,1,1,2,0,270,4.1,5.1,99,99,99,999,9999,-7.3,999,999,99,99
3,2009,1,1,3,0,270,4.6,5.7,99,99,99,999,9999,-7.7,999,999,99,99
4,2009,1,1,4,0,260,2.6,4.6,99,99,99,999,9999,-7.4,999,999,99,99


### Exploring the data for 2010.

In [60]:
data_10 = pd.read_csv('../data/buoy_CHII2/chii2h2010.csv')
print(data_10.shape)
data_10.head()

(8466, 18)


Unnamed: 0,#YY,MM,DD,hh,mm,WDI,R WSP,D GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,2010,1,1,0,0,320,11.8,13.9,99,99,99,999,9999,-7.5,999,999,99,99
1,2010,1,1,1,0,300,9.8,12.9,99,99,99,999,9999,-8.5,999,999,99,99
2,2010,1,1,2,0,300,10.3,13.4,99,99,99,999,9999,-8.5,999,999,99,99
3,2010,1,1,3,0,300,10.3,13.4,99,99,99,999,9999,-9.0,999,999,99,99
4,2010,1,1,4,0,300,9.3,12.4,99,99,99,999,9999,-9.5,999,999,99,99


In [61]:
data_10.rename(mapper={
    'WDI': 'WDIR',
    'R WSP': 'WSPD',
    'D GST': 'GST'
}, axis=1, inplace=True)
data_10.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,2010,1,1,0,0,320,11.8,13.9,99,99,99,999,9999,-7.5,999,999,99,99
1,2010,1,1,1,0,300,9.8,12.9,99,99,99,999,9999,-8.5,999,999,99,99
2,2010,1,1,2,0,300,10.3,13.4,99,99,99,999,9999,-8.5,999,999,99,99
3,2010,1,1,3,0,300,10.3,13.4,99,99,99,999,9999,-9.0,999,999,99,99
4,2010,1,1,4,0,300,9.3,12.4,99,99,99,999,9999,-9.5,999,999,99,99


### Exploring the data for 2011.

In [62]:
data_11 = pd.read_csv('../data/buoy_CHII2/chii2h2011.csv')
print(data_11.shape)
data_11.head()

(8499, 18)


Unnamed: 0,#YY,MM,DD,hh,mm,WDI,R WSP,D GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,2011,1,1,0,0,180,12.4,13.4,99,99,99,999,9999,11.1,999,999,99,99
1,2011,1,1,1,0,180,9.8,11.3,99,99,99,999,9999,9.8,999,999,99,99
2,2011,1,1,2,0,200,10.8,11.3,99,99,99,999,9999,11.8,999,999,99,99
3,2011,1,1,3,0,190,10.3,11.3,99,99,99,999,9999,10.9,999,999,99,99
4,2011,1,1,4,0,200,11.3,12.9,99,99,99,999,9999,12.2,999,999,99,99


In [63]:
data_11.rename(mapper={
    'WDI': 'WDIR',
    'R WSP': 'WSPD',
    'D GST': 'GST'
}, axis=1, inplace=True)
data_11.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,2011,1,1,0,0,180,12.4,13.4,99,99,99,999,9999,11.1,999,999,99,99
1,2011,1,1,1,0,180,9.8,11.3,99,99,99,999,9999,9.8,999,999,99,99
2,2011,1,1,2,0,200,10.8,11.3,99,99,99,999,9999,11.8,999,999,99,99
3,2011,1,1,3,0,190,10.3,11.3,99,99,99,999,9999,10.9,999,999,99,99
4,2011,1,1,4,0,200,11.3,12.9,99,99,99,999,9999,12.2,999,999,99,99


### Exploring the data for 2012.

In [64]:
data_12 = pd.read_csv('../data/buoy_CHII2/chii2h2012.csv')
print(data_12.shape)
data_12.head()

(8589, 18)


Unnamed: 0,#YY,MM,DD,hh,mm,WDI,R WSP,D GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,2012,1,1,0,0,170,12.9,16.0,99,99,99,999,9999,2.9,999,999,99,99
1,2012,1,1,1,0,170,13.9,16.5,99,99,99,999,9999,3.0,999,999,99,99
2,2012,1,1,2,0,170,14.4,16.5,99,99,99,999,9999,3.2,999,999,99,99
3,2012,1,1,3,0,180,12.4,14.4,99,99,99,999,9999,3.7,999,999,99,99
4,2012,1,1,4,0,170,9.3,9.8,99,99,99,999,9999,3.9,999,999,99,99


In [65]:
data_12.rename(mapper={
    'WDI': 'WDIR',
    'R WSP': 'WSPD',
    'D GST': 'GST'
}, axis=1, inplace=True)
data_12.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,2012,1,1,0,0,170,12.9,16.0,99,99,99,999,9999,2.9,999,999,99,99
1,2012,1,1,1,0,170,13.9,16.5,99,99,99,999,9999,3.0,999,999,99,99
2,2012,1,1,2,0,170,14.4,16.5,99,99,99,999,9999,3.2,999,999,99,99
3,2012,1,1,3,0,180,12.4,14.4,99,99,99,999,9999,3.7,999,999,99,99
4,2012,1,1,4,0,170,9.3,9.8,99,99,99,999,9999,3.9,999,999,99,99


### Exploring the data for 2013.

In [66]:
data_13 = pd.read_csv('../data/buoy_CHII2/chii2h2013.csv')
print(data_13.shape)
data_13.head()

(30419, 18)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSP,D GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,2013,1,1,0,0,330,4.1,5.1,99,99,99,999,9999,-2.6,999,999,99,99
1,2013,1,1,1,0,330,5.1,6.2,99,99,99,999,9999,-3.0,999,999,99,99
2,2013,1,1,2,0,340,8.8,10.3,99,99,99,999,9999,-3.7,999,999,99,99
3,2013,1,1,3,0,350 1,0.3,12.4,99,99,99,999,9999,-5.1,999,999,99,99
4,2013,1,1,4,0,330,9.3,10.8,99,99,99,999,9999,-5.6,999,999,99,99


In [67]:
data_13.rename(mapper={
    'WDI': 'WDIR',
    'R WSP': 'WSPD',
    'D GST': 'GST'
}, axis=1, inplace=True)
data_13.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSP,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,2013,1,1,0,0,330,4.1,5.1,99,99,99,999,9999,-2.6,999,999,99,99
1,2013,1,1,1,0,330,5.1,6.2,99,99,99,999,9999,-3.0,999,999,99,99
2,2013,1,1,2,0,340,8.8,10.3,99,99,99,999,9999,-3.7,999,999,99,99
3,2013,1,1,3,0,350 1,0.3,12.4,99,99,99,999,9999,-5.1,999,999,99,99
4,2013,1,1,4,0,330,9.3,10.8,99,99,99,999,9999,-5.6,999,999,99,99


In [68]:
data_13.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR     object
WSP     float64
GST     float64
WVHT      int64
DPD       int64
APD       int64
MWD       int64
PRES      int64
ATMP    float64
WTMP      int64
DEWP      int64
VIS       int64
TIDE      int64
dtype: object

<a id='2014'></a>
### Exploring the data for 2014.

### Table of contents

Exploring the data:  
[2005](#2005), 
[2006](#2006), 
[2007](#2007),  
[2008](#2008)  
[2009](#2009)  
[2010](#2010)  
[2011](#2011)  
[2012](#2012)  
[2013](#2013)  
[2014](#2014), 
[2015](#2015),
[2016](#2016), 
[2017](#2017), 
[2018](#2018), 
[2019](#2019) 

#### Missing data
 - [Dec 5-13, 2015](#Dec5-13,2015)
 - [Nov. 30, 2016](#Nov30,2016)
 - [Dec 25, 2017](#Dec25,2017)
 - [Dec. 27-31, 2018](#Dec27-31,2018)

In [69]:
data_14 = pd.read_csv('../data/buoy_CHII2/chii2h2014.csv')
print(data_14.shape)
data_14.head()

(33141, 18)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,2014,1,1,0,0,280,2.6,3.6,99,99,99,999,9999,-10.4,999,999,99,99
1,2014,1,1,0,30,310,2.1,3.6,99,99,99,999,9999,-10.4,999,999,99,99
2,2014,1,1,0,45,290,3.1,4.1,99,99,99,999,9999,-10.4,999,999,99,99
3,2014,1,1,1,0,280,3.1,3.6,99,99,99,999,9999,-10.3,999,999,99,99
4,2014,1,1,1,15,290,3.1,4.1,99,99,99,999,9999,-10.3,999,999,99,99


In [70]:
data_14.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR     object
WSPD     object
GST     float64
WVHT      int64
DPD       int64
APD       int64
MWD       int64
PRES      int64
ATMP    float64
WTMP      int64
DEWP      int64
VIS       int64
TIDE      int64
dtype: object

### Cleaning the 'WDIR' feature

In [137]:
series = data_14['WDIR']

In [139]:
data0 = []
j = 0 
for i in range(len(data_14['WDIR'])):
    if i % 1000 == 0:
        j += i
        print(f'Checking item {i}')
    
    if len(series.iloc[i].split()) == 1:
        pass
    else:
        data0 = series.iloc[i]
        data1 = data0.split()
        series.iloc[i] = data1[0]

Checking item 0
Checking item 1000
Checking item 2000
Checking item 3000
Checking item 4000
Checking item 5000
Checking item 6000
Checking item 7000
Checking item 8000
Checking item 9000
Checking item 10000
Checking item 11000
Checking item 12000
Checking item 13000
Checking item 14000
Checking item 15000
Checking item 16000
Checking item 17000
Checking item 18000
Checking item 19000
Checking item 20000
Checking item 21000
Checking item 22000
Checking item 23000
Checking item 24000
Checking item 25000
Checking item 26000
Checking item 27000
Checking item 28000
Checking item 29000
Checking item 30000
Checking item 31000
Checking item 32000
Checking item 33000


In [140]:
data_14['WDIR'] = series
data_14['WDIR'].head()

0    280
1    310
2    290
3    280
4    290
Name: WDIR, dtype: object

In [143]:
for i in range(len(data_14['WDIR'])):
    if len(data_14['WDIR'].loc[i].split()) == 1:
        pass
    else:
        print(i, data_14['WDIR'].loc[i].split())
print('Everything checks out.')

Everything checks out.


In [144]:
data_14['WDIR'] = data_14['WDIR'].astype('int')

### Cleaning the 'WSPD' feature

In [166]:
data_14['WSPD'].iloc[35: 50]

35      6.7
36      7.2
37      7.7
38      7.2
39      8.2
40    9.8 1
41    9.8 1
42    8.8 1
43    9.3 1
44    9.3 1
45    9.8 1
46    0.8 1
47    0.3 1
48    0.3 1
49    0.3 1
Name: WSPD, dtype: object

In [167]:
wspd = data_14['WSPD']

In [170]:
start_time = time.time()

data0 = []
j = 0 
for i in range(len(wspd)):
    if i % 1000 == 0:
        j += i
        print(f'{100 - 100 * i/ len(wspd)}% left until complete')
    
    if len(wspd.iloc[i].split()) == 1:
        pass
    else:
        data0 = wspd.iloc[i]
        data1 = data0.split()
        wspd.iloc[i] = data1[0]
        
end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)  

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')

100.0% left until complete
96.98258954165536% left until complete
93.9651790833107% left until complete
90.94776862496606% left until complete
87.9303581666214% left until complete
84.91294770827676% left until complete
81.8955372499321% left until complete
78.87812679158746% left until complete
75.8607163332428% left until complete
72.84330587489816% left until complete
69.82589541655352% left until complete
66.80848495820887% left until complete
63.791074499864216% left until complete
60.77366404151957% left until complete
57.75625358317492% left until complete
54.73884312483027% left until complete
51.72143266648562% left until complete
48.70402220814098% left until complete
45.68661174979633% left until complete
42.66920129145168% left until complete
39.65179083310703% left until complete
36.63438037476238% left until complete
33.61696991641773% left until complete
30.599559458073088% left until complete
27.58214899972843% left until complete
24.56473854138379% left until complete


In [171]:
data_14['WSPD'] = wspd
data_14['WSPD'].head()

0    2.6
1    2.1
2    3.1
3    3.1
4    3.1
Name: WSPD, dtype: object

In [175]:
for i in range(len(data_14['WSPD'])):
    if len(data_14['WSPD'].loc[i].split()) == 1:
        pass
    else:
        print(i, data_14['WSPD'].loc[i].split())
print('Everything checks out.')

Everything checks out.


In [177]:
data_14['WSPD'] = data_14['WSPD'].astype('float')

In [178]:
data_2014 = data_14[features]

In [179]:
# saving the 2017 chii2 buoy data to a csv.
data_2014.to_csv('../data/buoy_CHII2/chii2_buoy_data_2014.csv')

In [180]:
# print(len(data_2017.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1)))
data_2014.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014,1,1,113.294118,4.143529,4.822353,-7.041176,999.0
2014,1,2,69.512195,2.452439,3.840244,-6.847561,999.0
2014,1,3,253.292683,5.176829,5.54878,-12.076829,999.0
2014,1,4,194.597701,5.387356,4.114943,-4.341379,999.0
2014,1,5,316.626506,5.042169,4.374699,-4.050602,999.0


In [181]:
# checking to see which months are missing days
for i in range(1, 13):
    print(i, len(data_2014[data_2017['MM'] == i]['DD'].value_counts()))

1 31
2 31
3 30
4 31
5 31
6 31
7 31
8 31
9 0
10 0
11 0
12 0


  This is separate from the ipykernel package so we can avoid doing imports until


<a id='2015'></a>
### Exploring the data for 2015.

#### Missing data
[Dec 5-13, 2015](#Dec5-13,2015)

In [71]:
data_15 = pd.read_csv('../data/buoy_CHII2/chii2h2015.csv')
print(data_15.shape)
data_15.head()

(34570, 18)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,2015,1,1,0,0,220,10.8,13.9,99,99,99,999,9999,-7.2,999,999.0,99,99
1,2015,1,1,0,15,220,11.8,15.4,99,99,99,999,9999,-7.2,999,999.0,99,99
2,2015,1,1,0,30,220,11.8,17.0,99,99,99,999,9999,-7.1,999,999.0,99,99
3,2015,1,1,0,45,220,12.4,17.0,99,99,99,999,9999,-6.9,999,999.0,99,99
4,2015,1,1,1,0,220,12.9,19.0,99,99,99,999,9999,-7.0,999,999.0,99,99


In [72]:
data_15.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR      int64
WSPD    float64
GST     float64
WVHT      int64
DPD       int64
APD       int64
MWD       int64
PRES      int64
ATMP    float64
WTMP      int64
DEWP    float64
VIS       int64
TIDE      int64
dtype: object

In [112]:
data_2015 = data_15[features]
print(data_2015.shape)
data_2015.head()

(34570, 10)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2015,1,1,0,0,220,10.8,13.9,-7.2,999.0
1,2015,1,1,0,15,220,11.8,15.4,-7.2,999.0
2,2015,1,1,0,30,220,11.8,17.0,-7.1,999.0
3,2015,1,1,0,45,220,12.4,17.0,-6.9,999.0
4,2015,1,1,1,0,220,12.9,19.0,-7.0,999.0


In [113]:
print(len(data_2015.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1)))
data_2015.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1).head()

356


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015,1,1,232.083333,10.191667,14.289583,-6.083333,999.0
2015,1,2,220.0,5.33125,6.541667,-2.022917,999.0
2015,1,3,168.229167,4.597917,5.357292,0.307292,999.0
2015,1,4,314.0625,7.702083,9.019792,-1.535417,999.0
2015,1,5,270.625,9.589583,12.18125,-15.46875,999.0


In [114]:
# checking to see which months are missing days
for i in range(1, 13):
    print(i, len(data_2015[data_2015['MM'] == i]['DD'].value_counts()))

1 31
2 28
3 31
4 30
5 31
6 30
7 31
8 31
9 30
10 31
11 30
12 22


<a id='Dec5-13,2015'></a>

In [116]:
# missing data, Dec. 5-13, 2015
data_2015.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1).tail(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015,12,3,276.875,7.585417,9.529167,1.207292,999.0
2015,12,4,236.557377,6.57541,8.780328,1.632787,999.0
2015,12,14,203.125,10.6625,12.76875,9.83125,999.0
2015,12,15,187.301587,5.646825,6.633333,6.005556,832.679365
2015,12,16,151.532847,10.159124,11.724818,7.272263,4.477372
2015,12,17,247.021277,10.13617,11.895035,1.838298,-3.85461
2015,12,18,259.71831,8.914085,10.582394,-0.924648,-7.611268
2015,12,19,264.861111,8.551389,9.996528,-3.352778,-9.75625
2015,12,20,195.138889,11.464583,12.726389,3.779861,-3.752778
2015,12,21,214.236111,12.613194,14.398611,8.756944,5.384722


In [119]:
data_2015.head()

Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2015,1,1,0,0,220,10.8,13.9,-7.2,999.0
1,2015,1,1,0,15,220,11.8,15.4,-7.2,999.0
2,2015,1,1,0,30,220,11.8,17.0,-7.1,999.0
3,2015,1,1,0,45,220,12.4,17.0,-6.9,999.0
4,2015,1,1,1,0,220,12.9,19.0,-7.0,999.0


<a id='2016'></a>
### Exploring the data for 2016.

#### Missing data
[Nov. 30, 2016](#Nov30,2016)

In [73]:
data_16 = pd.read_csv('../data/buoy_CHII2/chii2h2016.csv')
print(data_16.shape)
data_16.head()

(51583, 18)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,2016,1,1,0,0,250,9.8,11.3,99,99,99,999,9999,-2.4,999,-6.9,99,99
1,2016,1,1,0,10,240,8.8,10.8,99,99,99,999,9999,-2.4,999,-6.7,99,99
2,2016,1,1,0,20,270,8.8,10.3,99,99,99,999,9999,-2.2,999,-6.3,99,99
3,2016,1,1,0,30,260,7.2,8.2,99,99,99,999,9999,-2.2,999,-6.1,99,99
4,2016,1,1,0,40,270,8.2,9.3,99,99,99,999,9999,-2.4,999,-6.3,99,99


In [74]:
data_16.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR      int64
WSPD    float64
GST     float64
WVHT      int64
DPD       int64
APD       int64
MWD       int64
PRES      int64
ATMP    float64
WTMP      int64
DEWP    float64
VIS       int64
TIDE      int64
dtype: object

In [75]:
data_2016 = data_16[features]
print(data_2016.shape)
data_2016.head()

(51583, 10)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2016,1,1,0,0,250,9.8,11.3,-2.4,-6.9
1,2016,1,1,0,10,240,8.8,10.8,-2.4,-6.7
2,2016,1,1,0,20,270,8.8,10.3,-2.2,-6.3
3,2016,1,1,0,30,260,7.2,8.2,-2.2,-6.1
4,2016,1,1,0,40,270,8.2,9.3,-2.4,-6.3


In [76]:
print(len(data_2016.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1)))
data_2016.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1).head()

364


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016,1,1,252.0,9.413571,11.295714,-3.38,-7.514286
2016,1,2,248.601399,7.246853,8.623077,-0.991608,-5.634266
2016,1,3,300.567376,9.843972,11.155319,-1.019149,-5.549645
2016,1,4,221.111111,9.327778,10.625694,-1.409722,-6.588889
2016,1,5,161.875,7.882639,8.947222,-1.968056,-6.58125


In [77]:
# checking to see which months are missing days
for i in range(1, 13):
    print(i, len(data_2016[data_2016['MM'] == i]['DD'].value_counts()))

1 31
2 29
3 31
4 30
5 31
6 30
7 31
8 31
9 30
10 31
11 29
12 30


<a id='Nov30,2016'></a>

In [78]:
# missing data, Nov. 30, 2016
data_2016.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1).tail(31)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016,11,29,201.509434,11.606604,13.133962,10.160377,7.50566
2016,12,1,262.380952,9.968254,11.492063,4.822222,1.128571
2016,12,2,290.661765,6.722794,7.754412,4.061765,-0.605147
2016,12,3,290.0,4.265278,5.179861,1.820139,-4.464583
2016,12,4,190.069444,3.270833,4.004167,1.681944,-2.06875
2016,12,5,234.097222,6.974306,8.056944,1.435417,0.190972
2016,12,6,165.177305,6.448936,7.458156,3.535461,1.037589
2016,12,7,257.916667,10.124306,11.851389,-1.45,-6.64375
2016,12,8,271.736111,12.561806,14.530556,-4.947917,-9.446528
2016,12,9,282.553191,8.238298,9.552482,-3.839716,-8.42766


<a id='2017'></a>
### Exploring the data for 2017.

#### Missing data
[Dec 25, 2017](#Dec25,2017)

In [79]:
data_17 = pd.read_csv('../data/buoy_CHII2/chii2h2017.csv')
print(data_17.shape)
data_17.head()

(50474, 18)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,2017,1,1,0,0,280,8.8,10.3,99,99,99,999,9999,1.3,999,-6.2,99,99
1,2017,1,1,0,10,280,7.2,8.2,99,99,99,999,9999,1.2,999,-6.4,99,99
2,2017,1,1,0,20,280,7.2,7.7,99,99,99,999,9999,1.1,999,-6.2,99,99
3,2017,1,1,0,30,280,7.7,8.2,99,99,99,999,9999,1.0,999,-6.2,99,99
4,2017,1,1,0,40,280,7.7,8.8,99,99,99,999,9999,0.9,999,-6.3,99,99


In [80]:
data_17.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR     object
WSPD    float64
GST     float64
WVHT      int64
DPD       int64
APD       int64
MWD       int64
PRES      int64
ATMP    float64
WTMP      int64
DEWP    float64
VIS       int64
TIDE      int64
dtype: object

In [81]:
data_2017 = data_17[features]
print(data_2017.shape)
data_2017.head()

(50474, 10)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2017,1,1,0,0,280,8.8,10.3,1.3,-6.2
1,2017,1,1,0,10,280,7.2,8.2,1.2,-6.4
2,2017,1,1,0,20,280,7.2,7.7,1.1,-6.2
3,2017,1,1,0,30,280,7.7,8.2,1.0,-6.2
4,2017,1,1,0,40,280,7.7,8.8,0.9,-6.3


### The 'WDIR' feature has some values in it with trailing 1.  We address this issue now.

In [82]:
# an example
data_2017['WDIR'].loc[50232] #.astype('int')

'290 1'

In [83]:
len(data_2017)

50474

In [84]:
data_2017['WDIR'].loc[50232].split()

['290', '1']

In [85]:
data_2017['WDIR'].loc[0]

'280'

In [86]:
data_2017['WDIR'].loc[0].split()

['280']

In [87]:
len(data_2017['WDIR'].loc[0].split())

1

In [88]:
for i in range(24000, 25000):
    
    print(i, data_2017['WDIR'].loc[i].split())

24000 ['290']
24001 ['290']
24002 ['290']
24003 ['290']
24004 ['280']
24005 ['280']
24006 ['280']
24007 ['280']
24008 ['290']
24009 ['280']
24010 ['280']
24011 ['280']
24012 ['280']
24013 ['270']
24014 ['270']
24015 ['270']
24016 ['260']
24017 ['260']
24018 ['270']
24019 ['270']
24020 ['270']
24021 ['260']
24022 ['260']
24023 ['260']
24024 ['260']
24025 ['260']
24026 ['270']
24027 ['270']
24028 ['260']
24029 ['270']
24030 ['280']
24031 ['270']
24032 ['270']
24033 ['270']
24034 ['280']
24035 ['280']
24036 ['280']
24037 ['280']
24038 ['280']
24039 ['280']
24040 ['280']
24041 ['290']
24042 ['280']
24043 ['280']
24044 ['270', '1']
24045 ['280']
24046 ['270']
24047 ['290']
24048 ['280']
24049 ['280']
24050 ['270']
24051 ['270']
24052 ['270']
24053 ['280', '1']
24054 ['290', '1']
24055 ['280']
24056 ['290']
24057 ['300']
24058 ['270']
24059 ['300']
24060 ['280', '1']
24061 ['290']
24062 ['290', '1']
24063 ['280']
24064 ['280']
24065 ['280']
24066 ['290']
24067 ['250']
24068 ['280', '1']
2406

24724 ['190']
24725 ['190']
24726 ['190']
24727 ['200']
24728 ['200']
24729 ['200']
24730 ['190']
24731 ['180']
24732 ['180']
24733 ['200']
24734 ['200']
24735 ['200']
24736 ['200']
24737 ['200']
24738 ['200']
24739 ['210']
24740 ['200']
24741 ['210']
24742 ['210']
24743 ['210']
24744 ['210']
24745 ['210']
24746 ['210']
24747 ['210']
24748 ['200']
24749 ['200']
24750 ['200']
24751 ['200']
24752 ['200']
24753 ['190']
24754 ['200']
24755 ['210']
24756 ['10']
24757 ['340']
24758 ['330']
24759 ['330']
24760 ['10']
24761 ['30']
24762 ['40']
24763 ['50']
24764 ['50']
24765 ['50']
24766 ['50']
24767 ['60']
24768 ['70']
24769 ['60']
24770 ['50']
24771 ['70']
24772 ['70']
24773 ['80']
24774 ['70']
24775 ['80']
24776 ['90']
24777 ['80']
24778 ['60']
24779 ['80']
24780 ['50']
24781 ['40']
24782 ['90']
24783 ['90']
24784 ['80']
24785 ['90']
24786 ['90']
24787 ['90']
24788 ['90']
24789 ['90']
24790 ['100']
24791 ['140']
24792 ['150']
24793 ['170']
24794 ['170']
24795 ['180']
24796 ['170']
24797 ['1

In [89]:
series = data_2017['WDIR']

In [92]:
# data = []
# j = 0 
# for i in range(len(data_2017['WDIR'])):
#     if i % 1000 == 0:
#         j += i
#         print(f'Checking item {i}')
    
#     if len(series.loc[i].split()) == 1:
#         pass
#     else:
#         data0 = series.loc[i]
#         data1 = data0.split()
#         series.loc[i] = data1[0]

Checking item 0
Checking item 1000


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Checking item 2000
Checking item 3000
Checking item 4000
Checking item 5000
Checking item 6000
Checking item 7000
Checking item 8000
Checking item 9000
Checking item 10000
Checking item 11000
Checking item 12000
Checking item 13000
Checking item 14000
Checking item 15000
Checking item 16000
Checking item 17000
Checking item 18000
Checking item 19000
Checking item 20000
Checking item 21000
Checking item 22000
Checking item 23000
Checking item 24000
Checking item 25000
Checking item 26000
Checking item 27000
Checking item 28000
Checking item 29000
Checking item 30000
Checking item 31000
Checking item 32000
Checking item 33000
Checking item 34000
Checking item 35000
Checking item 36000
Checking item 37000
Checking item 38000
Checking item 39000
Checking item 40000
Checking item 41000
Checking item 42000
Checking item 43000
Checking item 44000
Checking item 45000
Checking item 46000
Checking item 47000
Checking item 48000
Checking item 49000
Checking item 50000


In [94]:
data_2017['WDIR'] = series
data_2017['WDIR'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


0    280
1    280
2    280
3    280
4    280
Name: WDIR, dtype: object

In [95]:
for i in range(len(data_2017['WDIR'])):
    if len(data_2017['WDIR'].loc[i].split()) == 1:
        pass
    else:
        print(i, data_2017['WDIR'].loc[i].split())
print('Everything checks out.')

Everything checks out.


In [97]:
data_2017['WDIR'] = data_2017['WDIR'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [151]:
# saving the 2017 chii2 buoy data to a csv.
data_2017.to_csv('../data/buoy_CHII2/chii2_buoy_data_2017.csv')

In [152]:
# print(len(data_2017.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1)))
data_2017.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017,1,1,224.444444,5.077083,5.627083,-0.279861,-5.688889
2017,1,2,107.569444,5.1125,5.3625,3.128472,0.209028
2017,1,3,214.513889,4.586111,5.272222,3.626389,3.413194
2017,1,4,281.527778,2.915278,12.878472,-6.640972,-11.584028
2017,1,5,279.366197,6.039437,10.133803,-11.634507,-17.369718


In [153]:
# checking to see which months are missing days
for i in range(1, 13):
    print(i, len(data_2017[data_2017['MM'] == i]['DD'].value_counts()))

1 31
2 28
3 31
4 30
5 31
6 30
7 31
8 31
9 30
10 31
11 30
12 30


<a id='Dec25,2017'></a>

In [101]:
# missing data, Dec. 25, 2017
data_2017.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1).tail(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017,12,21,109.444444,5.759722,7.068056,2.261806,-3.025694
2017,12,22,204.929577,4.599296,5.015493,3.185211,-0.066901
2017,12,23,334.305556,7.000694,7.984722,-0.15625,-4.715278
2017,12,24,247.387387,5.000901,5.863964,-3.881081,-8.551351
2017,12,26,270.277778,7.566667,8.941667,-14.719444,999.0
2017,12,27,297.535211,7.492254,8.773944,-15.364789,999.0
2017,12,28,242.361111,5.629167,6.679167,-12.113194,618.443056
2017,12,29,262.361111,6.245833,7.38125,-10.185417,-13.697222
2017,12,30,293.194444,5.498611,10.222222,-11.781944,-16.509722
2017,12,31,301.830986,7.321127,8.495775,-13.88169,-18.978169


<a id='2018'></a>
### Exploring the data for 2018.

#### Missing data

[Dec. 27-31, 2018](#Dec27-31,2018)

In [102]:
data_18 = pd.read_csv('../data/buoy_CHII2/chii2h2018.csv')
print(data_18.shape)
data_18.head()

(50269, 18)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,TIDE
0,2018,1,1,0,0,300,9.3,10.8,99,99,99,999,9999,-11.2,999,-18.6,99,99
1,2018,1,1,0,10,310,8.8,9.8,99,99,99,999,9999,-11.5,999,-18.6,99,99
2,2018,1,1,0,20,300,9.8,10.3,99,99,99,999,9999,-11.6,999,-18.5,99,99
3,2018,1,1,0,30,310,9.3,10.8,99,99,99,999,9999,-11.9,999,-18.8,99,99
4,2018,1,1,0,40,300,9.8,11.3,99,99,99,999,9999,-12.1,999,-18.4,99,99


In [103]:
data_18.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR      int64
WSPD    float64
GST     float64
WVHT      int64
DPD       int64
APD       int64
MWD       int64
PRES      int64
ATMP    float64
WTMP      int64
DEWP    float64
VIS       int64
TIDE      int64
dtype: object

In [104]:
data_2018 = data_18[features]
print(data_2018.shape)
data_2018.head()

(50269, 10)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2018,1,1,0,0,300,9.3,10.8,-11.2,-18.6
1,2018,1,1,0,10,310,8.8,9.8,-11.5,-18.6
2,2018,1,1,0,20,300,9.8,10.3,-11.6,-18.5
3,2018,1,1,0,30,310,9.3,10.8,-11.9,-18.8
4,2018,1,1,0,40,300,9.8,11.3,-12.1,-18.4


In [105]:
print(len(data_2018.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1)))
data_2018.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1).head()

360


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018,1,1,291.785714,8.993571,10.382143,-16.937857,-22.438571
2018,1,2,257.0,7.611429,8.913571,-16.668571,-21.556429
2018,1,3,256.319444,10.136111,11.827083,-10.247917,-15.077083
2018,1,4,316.808511,8.17234,9.485106,-11.898582,-16.797872
2018,1,5,313.194444,8.295139,9.545139,-13.047222,-18.440972


In [106]:
for i in range(1, 13):
    print(i, len(data_2018[data_2018['MM'] == i]['DD'].value_counts()))

1 31
2 28
3 31
4 30
5 31
6 30
7 31
8 31
9 30
10 31
11 30
12 26


<a id='Dec27-31,2018'></a>

In [107]:
# missing data, Dec. 27-31, 2018
data_2018.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1).tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018,12,22,260.352113,5.314789,6.111972,-0.233099,-3.452817
2018,12,23,249.084507,5.961972,6.719718,1.383099,-2.31831
2018,12,24,252.708333,7.311111,8.48125,-0.649306,-7.449306
2018,12,25,196.736111,4.321528,4.95,2.933333,-1.990972
2018,12,26,154.117647,1.503529,1.694118,3.534118,1.8


<a id='2019'></a>
### Exploring the data for 2019.

In [108]:
# data_19 = pd.read_csv('../data/buoy_CHII2/chii2h2019.csv')

### Combining all of the dataframes from 2015 - 2018 into one giant dataframe.

### Table of contents

Exploring the data:  
[2005](#2005), 
[2006](#2006), 
[2007](#2007),  
[2008](#2008)  
[2009](#2009)  
[2010](#2010)  
[2011](#2011)  
[2012](#2012)  
[2013](#2013)  
[2014](#2014), 
[2015](#2015),
[2016](#2016), 
[2017](#2017), 
[2018](#2018), 
[2019](#2019)  

#### Missing data
 - [Dec 5-13, 2015](#Dec5-13,2015)
 - [Nov. 30, 2016](#Nov30,2016)
 - [Dec 25, 2017](#Dec25,2017)
 - [Dec. 27-31, 2018](#Dec27-31,2018)

In [4]:
data = pd.concat([data_2015, data_2016, data_2017, data_2018], axis=0)
print(data.shape)
data.head()

NameError: name 'data_2015' is not defined

In [183]:
print(data.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1).shape)
data.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm'], axis=1).head()

(1444, 5)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP,DEWP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015,1,1,232.083333,10.191667,14.289583,-6.083333,999.0
2015,1,2,220.0,5.33125,6.541667,-2.022917,999.0
2015,1,3,168.229167,4.597917,5.357292,0.307292,999.0
2015,1,4,314.0625,7.702083,9.019792,-1.535417,999.0
2015,1,5,270.625,9.589583,12.18125,-15.46875,999.0


In [184]:
data.dtypes

#YY       int64
MM        int64
DD        int64
hh        int64
mm        int64
WDIR      int64
WSPD    float64
GST     float64
ATMP    float64
DEWP    float64
dtype: object

In [10]:
data = pd.read_csv('../data/buoy_CHII2/chii2_buoy_data_2015_2018.csv')

In [11]:
data.to_csv('../data/buoy_CHII2/chii2_buoy_data_2015_2018.csv', index=False)

In [12]:
data = pd.read_csv('../data/buoy_CHII2/chii2_buoy_data_2015_2018.csv')
print(data.shape)
data.head()

(186896, 12)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,0,0,2015,1,1,0,0,220,10.8,13.9,-7.2,999.0
1,1,1,2015,1,1,0,15,220,11.8,15.4,-7.2,999.0
2,2,2,2015,1,1,0,30,220,11.8,17.0,-7.1,999.0
3,3,3,2015,1,1,0,45,220,12.4,17.0,-6.9,999.0
4,4,4,2015,1,1,1,0,220,12.9,19.0,-7.0,999.0


In [14]:
data = data.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)

In [15]:
data.to_csv('../data/buoy_CHII2/chii2_buoy_data_2015_2018.csv', index=False)

In [16]:
data = pd.read_csv('../data/buoy_CHII2/chii2_buoy_data_2015_2018.csv')
print(data.shape)
data.head()

(186896, 10)


Unnamed: 0,#YY,MM,DD,hh,mm,WDIR,WSPD,GST,ATMP,DEWP
0,2015,1,1,0,0,220,10.8,13.9,-7.2,999.0
1,2015,1,1,0,15,220,11.8,15.4,-7.2,999.0
2,2015,1,1,0,30,220,11.8,17.0,-7.1,999.0
3,2015,1,1,0,45,220,12.4,17.0,-6.9,999.0
4,2015,1,1,1,0,220,12.9,19.0,-7.0,999.0


In [24]:
grouped_data = data.groupby(['#YY', 'MM', 'DD']).mean().drop(['hh', 'mm', 'DEWP'], axis=1)

In [26]:
print(grouped_data.shape)
grouped_data.head()

(1444, 4)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WDIR,WSPD,GST,ATMP
#YY,MM,DD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015,1,1,232.083333,10.191667,14.289583,-6.083333
2015,1,2,220.0,5.33125,6.541667,-2.022917
2015,1,3,168.229167,4.597917,5.357292,0.307292
2015,1,4,314.0625,7.702083,9.019792,-1.535417
2015,1,5,270.625,9.589583,12.18125,-15.46875
