In [1]:
# Dependencies and Setup
import datetime as dt
import os
import time
import urllib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

In [2]:
# Locations of the raw measurement and station CSV files.
Measurements_path = "Resources/hawaii_measurements.csv"
Stations_path = "Resources/hawaii_stations.csv"

# Read both files as UTF-8 into their own DataFrames.
Measurement_df, Stations_df = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (Measurements_path, Stations_path)
)

In [3]:
# Preview the first five measurement rows.
Measurement_df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [4]:
# Preview the first five station rows.
Stations_df.head(n=5)

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# measurement columns; a count lower than the other columns' signals missing values.
Measurement_df.describe()

Unnamed: 0,prcp,tobs
count,18103.0,19550.0
mean,0.160644,73.097954
std,0.468746,4.523527
min,0.0,53.0
25%,0.0,70.0
50%,0.01,73.0
75%,0.11,76.0
max,11.53,87.0


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# station columns (latitude, longitude, elevation).
Stations_df.describe()

Unnamed: 0,latitude,longitude,elevation
count,9.0,9.0,9.0
mean,21.393826,-157.867098,60.977778
std,0.086442,0.103873,103.465547
min,21.2716,-158.0111,0.9
25%,21.3331,-157.9751,7.0
50%,21.3934,-157.8374,14.6
75%,21.45167,-157.8025,32.9
max,21.5213,-157.71139,306.6


In [7]:
# Per-column flag: True if the station table has at least one missing value there.
Stations_df.isnull().any(axis=0)

station      False
name         False
latitude     False
longitude    False
elevation    False
dtype: bool

In [8]:
# Per-column flag: True if the measurement table has at least one missing value there.
Measurement_df.isnull().any(axis=0)

station    False
date       False
prcp        True
tobs       False
dtype: bool

In [9]:
# Discard every measurement row that has a missing value in any column.
# The original index labels are kept (no reset), so gaps in the index are expected.
Measurement_df = Measurement_df.dropna(axis=0, how="any")

In [10]:
# Re-check for missing values after dropping incomplete rows; every flag should be False.
Measurement_df.isnull().any(axis=0)

station    False
date       False
prcp       False
tobs       False
dtype: bool

In [11]:
# Attach station metadata to every measurement row.
# The join key is named explicitly instead of relying on pandas' default of
# "all shared column names", so a column added to both frames later cannot
# silently change the join. validate="many_to_one" raises MergeError if a
# station code appears more than once in Stations_df, which would otherwise
# duplicate measurement rows.
surfsup_merge = pd.merge(
    Measurement_df,
    Stations_df,
    how="outer",
    on="station",
    validate="many_to_one",
)
surfsup_merge.head()

Unnamed: 0,station,date,prcp,tobs,name,latitude,longitude,elevation
0,USC00519397,2010-01-01,0.08,65,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00519397,2010-01-02,0.0,63,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
2,USC00519397,2010-01-03,0.0,74,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
3,USC00519397,2010-01-04,0.0,76,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
4,USC00519397,2010-01-07,0.06,70,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0


In [12]:
# Remove merged rows with a missing value in any column, e.g. rows the outer
# join could not match on both sides.
surfsup_merge = surfsup_merge.dropna(axis=0, how="any")

In [13]:
# Per-column flag: True if the merged table still has a missing value there.
surfsup_merge.isnull().any(axis=0)

station      False
date         False
prcp         False
tobs         False
name         False
latitude     False
longitude    False
elevation    False
dtype: bool

In [14]:
# Give each remaining measurement row a sequential surrogate key starting at 1.
Measurement_df['M_ID'] = range(1, 1 + Measurement_df.shape[0])
Measurement_df.head(n=5)

Unnamed: 0,station,date,prcp,tobs,M_ID
0,USC00519397,2010-01-01,0.08,65,1
1,USC00519397,2010-01-02,0.0,63,2
2,USC00519397,2010-01-03,0.0,74,3
3,USC00519397,2010-01-04,0.0,76,4
5,USC00519397,2010-01-07,0.06,70,5


In [15]:
# Move the surrogate key to the first column.
# The key is selected by name rather than by rotating the last column to the
# front, so re-running this cell leaves the column order unchanged instead of
# shifting another column (e.g. tobs) ahead of M_ID.
cols = ['M_ID'] + [col for col in Measurement_df.columns if col != 'M_ID']
Measurement_df = Measurement_df[cols]
Measurement_df.head()

Unnamed: 0,M_ID,station,date,prcp,tobs
0,1,USC00519397,2010-01-01,0.08,65
1,2,USC00519397,2010-01-02,0.0,63
2,3,USC00519397,2010-01-03,0.0,74
3,4,USC00519397,2010-01-04,0.0,76
5,5,USC00519397,2010-01-07,0.06,70


In [16]:
# Number the stations sequentially from 1; the S prefix is applied in a later step.
Stations_df['S_ID'] = range(1, 1 + Stations_df.shape[0])
Stations_df

Unnamed: 0,station,name,latitude,longitude,elevation,S_ID
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0,1
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6,2
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0,3
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9,4
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6,5
5,USC00519523,"WAIMANALO EXPERIMENTAL FARM, HI US",21.33556,-157.71139,19.5,6
6,USC00519281,"WAIHEE 837.5, HI US",21.45167,-157.84889,32.9,7
7,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9,8
8,USC00516128,"MANOA LYON ARBO 785.2, HI US",21.3331,-157.8025,152.4,9


In [17]:
# Turn the numeric station key into a string key of the form S1, S2, ...
# Any existing leading "S" is stripped first so that re-running this cell
# still yields S1 rather than SS1.
Stations_df['S_ID'] = 'S' + Stations_df['S_ID'].astype(str).str.lstrip('S')
Stations_df

Unnamed: 0,station,name,latitude,longitude,elevation,S_ID
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0,S1
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6,S2
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0,S3
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9,S4
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6,S5
5,USC00519523,"WAIMANALO EXPERIMENTAL FARM, HI US",21.33556,-157.71139,19.5,S6
6,USC00519281,"WAIHEE 837.5, HI US",21.45167,-157.84889,32.9,S7
7,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9,S8
8,USC00516128,"MANOA LYON ARBO 785.2, HI US",21.3331,-157.8025,152.4,S9


In [18]:
# Move the station key to the first column.
# The key is selected by name rather than by rotating the last column to the
# front, so re-running this cell leaves the column order unchanged instead of
# shifting another column (e.g. elevation) ahead of S_ID.
cols = ['S_ID'] + [col for col in Stations_df.columns if col != 'S_ID']
Stations_df = Stations_df[cols]
Stations_df.head()

Unnamed: 0,S_ID,station,name,latitude,longitude,elevation
0,S1,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,S2,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,S3,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,S4,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,S5,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6


In [19]:
# Give each merged row a sequential surrogate key starting at 1.
surfsup_merge['ID'] = range(1, 1 + surfsup_merge.shape[0])
surfsup_merge.head(n=5)

Unnamed: 0,station,date,prcp,tobs,name,latitude,longitude,elevation,ID
0,USC00519397,2010-01-01,0.08,65,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0,1
1,USC00519397,2010-01-02,0.0,63,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0,2
2,USC00519397,2010-01-03,0.0,74,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0,3
3,USC00519397,2010-01-04,0.0,76,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0,4
4,USC00519397,2010-01-07,0.06,70,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0,5


In [20]:
# Write the cleaned tables to CSV (header row included, index column omitted).
# to_csv does not create missing directories, so make sure Output/ exists
# first; exist_ok=True makes this a no-op when the directory is already there.
os.makedirs("Output", exist_ok=True)
surfsup_merge.to_csv("Output/Clean_SurfMerge.csv", index=False, header=True)
Measurement_df.to_csv("Output/Clean_Measurement.csv", index=False, header=True)
Stations_df.to_csv("Output/Clean_Station.csv", index=False, header=True)