In [1]:
# First, import our dependencies.  (Perhaps more than we will actually need, initially, etc.)

import pandas as pd
import csv
import os
import numpy as np

#    **** MEASUREMENTS DATA SECTION !! ****

In [3]:
# Path to the first (measurements) CSV file.
# Forward slashes are used because "\h" is not a valid Python escape sequence
# (newer Python versions warn about it) and a backslash-separated path only
# resolves on Windows.  This also matches the path passed to pd.read_csv below.
# NOTE: the next cell rebinds this same name to the loaded DataFrame.

hawaii_measurements = "Resources/hawaii_measurements.csv"

In [4]:
# Load the raw (uncleaned, original) measurements CSV into a DataFrame and preview its first five rows:

hawaii_measurements = pd.read_csv(filepath_or_buffer="Resources/hawaii_measurements.csv")
hawaii_measurements.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [6]:
# Summary statistics for the numeric columns (prcp, tobs), as a quick check that the values look plausible:

hawaii_measurements.describe()

Unnamed: 0,prcp,tobs
count,18103.0,19550.0
mean,0.160644,73.097954
std,0.468746,4.523527
min,0.0,53.0
25%,0.0,70.0
50%,0.01,73.0
75%,0.11,76.0
max,11.53,87.0


In [11]:
# Check the DataFrame's shape (rows, columns) to confirm the full dataset has the same
# columns as the head() preview above:

hawaii_measurements.shape

# The 4 columns reported match the preview: station, date, prcp, tobs.

(19550, 4)

In [13]:
# Count fully duplicated rows in the measurements DataFrame (0 means there are none):
hawaii_measurements.duplicated().sum()

0

In [34]:
# Count NULL (missing) values per column.  The head() preview above already showed one
# missing prcp value (row 4):

hawaii_measurements.isnull().sum()

station       0
date          0
prcp       1447
tobs          0
dtype: int64

In [19]:
# prcp is the only column with nulls (1447 rows), so drop every row that has any
# missing value.  axis=0 / how="any" are dropna()'s defaults, spelled out for clarity.

hawaii_measurements = hawaii_measurements.dropna(axis=0, how="any")

In [21]:
# Re-check the DataFrame's shape (rows, columns) to see how many rows the null drop removed:

hawaii_measurements.shape

(18103, 4)

In [None]:
# It appears that (19550 - 18103) = 1447 rows with null values were eliminated.
#   **** This means that our new data set was reduced by about 7.4% of the observed values (rows).
#  Based on this, I think it's ok to proceed with the analysis (dropping nulls) w/o bringing
# large shifts in our resulting analysis.

# HOWEVER, let's test the above to be sure...................................

In [22]:
# Dropping the rows with a null prcp also removed those rows' tobs readings, so re-run the
# summary statistics to confirm the tobs mean has not shifted much:

hawaii_measurements.describe()

Unnamed: 0,prcp,tobs
count,18103.0,18103.0
mean,0.160644,72.994863
std,0.468746,4.512107
min,0.0,53.0
25%,0.0,70.0
50%,0.01,73.0
75%,0.11,76.0
max,11.53,87.0


In [None]:
#  From above, we see the "tobs" count falls by 1447 and the mean shifts from 73.10 (pre-drop dataframe)
# ... to 72.99 (above).   This shouldn't be too significant, so I think it's ok to eliminate the nulls.


In [23]:
# Write the cleaned measurements (rows with null values removed) to a new CSV.
# index=False keeps the DataFrame's row index out of the file.

hawaii_measurements.to_csv(path_or_buf="Resources/clean_hawaii_measurements.csv", index=False)

#    **** STATIONS DATA SECTION !! ****

In [24]:
# Path to the second (stations) CSV file.
# Forward slashes are used because "\h" is not a valid Python escape sequence
# (newer Python versions warn about it) and a backslash-separated path only
# resolves on Windows.  This also matches the path passed to pd.read_csv below.
# NOTE: the next cell rebinds this same name to the loaded DataFrame.

hawaii_stations = "Resources/hawaii_stations.csv"

In [26]:
# Load the raw (uncleaned, original) stations CSV into a DataFrame and preview its first five rows:

hawaii_stations = pd.read_csv(filepath_or_buffer="Resources/hawaii_stations.csv")
hawaii_stations.head(n=5)

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6


In [29]:
# Summary statistics for the numeric columns (latitude, longitude, elevation), as a quick
# check that the values look plausible:

hawaii_stations.describe()

Unnamed: 0,latitude,longitude,elevation
count,9.0,9.0,9.0
mean,21.393826,-157.867098,60.977778
std,0.086442,0.103873,103.465547
min,21.2716,-158.0111,0.9
25%,21.3331,-157.9751,7.0
50%,21.3934,-157.8374,14.6
75%,21.45167,-157.8025,32.9
max,21.5213,-157.71139,306.6


In [30]:
# Check the DataFrame's shape (rows, columns) to confirm the full dataset has the same
# columns as the head() preview above:

hawaii_stations.shape

# The 5 columns reported match the preview: station, name, latitude, longitude, elevation.

(9, 5)

In [31]:
# Count fully duplicated rows in the stations DataFrame (0 means there are none):
hawaii_stations.duplicated().sum()

0

In [32]:
# Count NULL (missing) values per column in the stations DataFrame:

hawaii_stations.isnull().sum()

# No null values in any column of the stations CSV.

station      0
name         0
latitude     0
longitude    0
elevation    0
dtype: int64

In [35]:
# No null values were found, so write the stations data out unchanged as the "clean" CSV.
# index=False keeps the DataFrame's row index out of the file.

hawaii_stations.to_csv(path_or_buf="Resources/clean_hawaii_stations.csv", index=False)