# 1-Data Engineering

In [3]:
# import dependencies
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt

In [4]:
# Read the station metadata and the weather measurements from CSV into DataFrames
stations = "Resources/hawaii_stations.csv"
weather = "Resources/hawaii_measurements.csv"
stations_df, weather_df = (pd.read_csv(csv_path) for csv_path in (stations, weather))

In [5]:
# Sanity check: count the non-null entries in each column of the stations table
stations_df.count()

station      9
name         9
latitude     9
longitude    9
elevation    9
dtype: int64

In [6]:
# Preview the first rows of the stations table
stations_df.head()

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6


In [7]:
# Count the non-null entries in each column of the measurements table; a column
# whose count falls short of the others contains missing values
weather_df.count()

station    19550
date       19550
prcp       18103
tobs       19550
dtype: int64

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
weather_df.describe()

Unnamed: 0,prcp,tobs
count,18103.0,19550.0
mean,0.160644,73.097954
std,0.468746,4.523527
min,0.0,53.0
25%,0.0,70.0
50%,0.01,73.0
75%,0.11,76.0
max,11.53,87.0


In [9]:
# Preview the first 10 measurement rows; missing prcp readings show up as NaN
weather_df.head(10)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73
5,USC00519397,2010-01-07,0.06,70
6,USC00519397,2010-01-08,0.0,64
7,USC00519397,2010-01-09,0.0,68
8,USC00519397,2010-01-10,0.0,73
9,USC00519397,2010-01-11,0.01,64


In [10]:
# Flag the rows whose precipitation reading is missing.
# NOTE: nan_data is just another name for weather_df (no copy is made), so the
# 'NaN Count' column is added to weather_df as well. The cleaning cell later in
# this notebook removes that column again before the CSV export.
nan_data = weather_df
nan_data['NaN Count'] = nan_data['prcp'].isnull()
# Show a short preview instead of rendering the whole measurements table
nan_data.head(10)

Unnamed: 0,station,date,prcp,tobs,NaN Count
0,USC00519397,2010-01-01,0.08,65,False
1,USC00519397,2010-01-02,0.00,63,False
2,USC00519397,2010-01-03,0.00,74,False
3,USC00519397,2010-01-04,0.00,76,False
4,USC00519397,2010-01-06,,73,True
5,USC00519397,2010-01-07,0.06,70,False
6,USC00519397,2010-01-08,0.00,64,False
7,USC00519397,2010-01-09,0.00,68,False
8,USC00519397,2010-01-10,0.00,73,False
9,USC00519397,2010-01-11,0.01,64,False


In [11]:
# Count the missing precipitation readings, i.e. how many rows the missing-value
# cleanup will touch. Series.sum() on the boolean mask is vectorized, whereas
# the builtin sum() would iterate over the rows one by one in Python.
nan_data['prcp'].isnull().sum()

1447

In [12]:
# Group the stations table by station ID. Displaying the groupby object only
# shows its repr; no aggregation is computed in this cell.
group_station = stations_df.groupby("station")
group_station

<pandas.core.groupby.DataFrameGroupBy object at 0x000001B6C35FD198>

In [13]:
# Average precipitation (prcp) and temperature (tobs) per station. This is one
# overall mean per station -- there is no monthly breakdown. pivot_table
# aggregates with the mean by default; it is spelled out here for clarity.
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.pivot_table.html
#
# The input is weather_df (the frame nan_data was aliased to) rather than
# nan_data itself: nan_data is rebound to the pivot result below, so reading
# from it would make this cell operate on its own output when re-run.
nan_data = pd.pivot_table(
    weather_df,
    index=['station'],
    values=['prcp', 'tobs'],
    aggfunc='mean',
)
nan_data

Unnamed: 0_level_0,prcp,tobs
station,Unnamed: 1_level_1,Unnamed: 2_level_1
USC00511918,0.047971,71.615968
USC00513117,0.141921,72.689184
USC00514830,0.121058,74.873297
USC00516128,0.429988,70.915008
USC00517948,0.063602,74.684402
USC00518838,0.207222,72.72407
USC00519281,0.212352,71.663781
USC00519397,0.04902,74.553231
USC00519523,0.114961,74.543649


In [14]:
# Work on independent copies so the cleaning steps can never touch the raw
# frames (pd.DataFrame(df) does not guarantee a copy of the underlying data).
clean_stations_df = stations_df.copy()

# Remove the helper flag column if an earlier cell added it (errors='ignore'
# keeps this cell runnable when it was not), then replace missing precipitation
# readings with 0. Rows are kept, not dropped. Only 'prcp' is filled: it is the
# only column with missing values, and a blanket fillna(0) could silently put
# zeros into the station/date columns if they ever had gaps.
clean_weather_df = (
    weather_df
    .drop('NaN Count', axis=1, errors='ignore')
    .fillna({'prcp': 0})
)

# After cleaning, no precipitation value may be missing
assert not clean_weather_df['prcp'].isnull().any()

# Export the cleaned CSV files
clean_weather_df.to_csv("clean_hawaii_measurements.csv", index=False)
clean_stations_df.to_csv("clean_hawaii_stations.csv", index=False)

In [15]:
# Preview the cleaned measurements instead of rendering the whole table
clean_weather_df.head(10)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.00,63
2,USC00519397,2010-01-03,0.00,74
3,USC00519397,2010-01-04,0.00,76
4,USC00519397,2010-01-06,0.00,73
5,USC00519397,2010-01-07,0.06,70
6,USC00519397,2010-01-08,0.00,64
7,USC00519397,2010-01-09,0.00,68
8,USC00519397,2010-01-10,0.00,73
9,USC00519397,2010-01-11,0.01,64


In [17]:
# Display the cleaned stations table (small enough to show in full)
clean_stations_df

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6
5,USC00519523,"WAIMANALO EXPERIMENTAL FARM, HI US",21.33556,-157.71139,19.5
6,USC00519281,"WAIHEE 837.5, HI US",21.45167,-157.84889,32.9
7,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9
8,USC00516128,"MANOA LYON ARBO 785.2, HI US",21.3331,-157.8025,152.4
