## Import Package

In [1]:
import pandas as pd
from pandas import DataFrame, read_csv
import numpy as np
import datetime as dt
import scipy
import math
# We'll also import seaborn, a Python graphing library
import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt

## Access Dataset

In [2]:
fname = 'https://www.ncei.noaa.gov/orders/cdo/993891.csv'
data = pd.read_csv(fname, sep = ',') 
data

Unnamed: 0,STATION,STATION_NAME,ELEVATION,LATITUDE,LONGITUDE,DATE,REPORTTPYE,HOURLYSKYCONDITIONS,HOURLYVISIBILITY,HOURLYPRSENTWEATHERTYPE,...,MonthlyMaxSeaLevelPressureTime,MonthlyMinSeaLevelPressureValue,MonthlyMinSeaLevelPressureDate,MonthlyMinSeaLevelPressureTime,MonthlyTotalHeatingDegreeDays,MonthlyTotalCoolingDegreeDays,MonthlyDeptFromNormalHeatingDD,MonthlyDeptFromNormalCoolingDD,MonthlyTotalSeasonToDateHeatingDD,MonthlyTotalSeasonToDateCoolingDD
0,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 15:55,FM-15,CLR:00,10.00,,...,-9999,,-9999,-9999,,,,,,
1,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 16:15,FM-15,CLR:00,10.00,,...,-9999,,-9999,-9999,,,,,,
2,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 16:35,FM-15,CLR:00,10.00,,...,-9999,,-9999,-9999,,,,,,
3,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 19:15,FM-15,CLR:00,10.00,,...,-9999,,-9999,-9999,,,,,,
4,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 19:35,FM-15,CLR:00,10.00,,...,-9999,,-9999,-9999,,,,,,
5,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 19:55,FM-15,CLR:00,10.00,,...,-9999,,-9999,-9999,,,,,,
6,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 20:15,FM-15,CLR:00,10.00,,...,-9999,,-9999,-9999,,,,,,
7,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 20:35,FM-15,CLR:00,10.00,,...,-9999,,-9999,-9999,,,,,,
8,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 20:55,FM-15,CLR:00,10.00,,...,-9999,,-9999,-9999,,,,,,
9,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 21:15,FM-15,CLR:00,10.00,,...,-9999,,-9999,-9999,,,,,,


## Exploratory Data Analysis (original dataset)

In [3]:
data['HOURLYVISIBILITY'].mean()

TypeError: Can't convert 'int' object to str implicitly

## Data Wrangling

In [4]:
dataCleaned = data
dataCleaned.head()

Unnamed: 0,STATION,STATION_NAME,ELEVATION,LATITUDE,LONGITUDE,DATE,REPORTTPYE,HOURLYSKYCONDITIONS,HOURLYVISIBILITY,HOURLYPRSENTWEATHERTYPE,...,MonthlyMaxSeaLevelPressureTime,MonthlyMinSeaLevelPressureValue,MonthlyMinSeaLevelPressureDate,MonthlyMinSeaLevelPressureTime,MonthlyTotalHeatingDegreeDays,MonthlyTotalCoolingDegreeDays,MonthlyDeptFromNormalHeatingDD,MonthlyDeptFromNormalCoolingDD,MonthlyTotalSeasonToDateHeatingDD,MonthlyTotalSeasonToDateCoolingDD
0,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 15:55,FM-15,CLR:00,10.0,,...,-9999,,-9999,-9999,,,,,,
1,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 16:15,FM-15,CLR:00,10.0,,...,-9999,,-9999,-9999,,,,,,
2,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 16:35,FM-15,CLR:00,10.0,,...,-9999,,-9999,-9999,,,,,,
3,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 19:15,FM-15,CLR:00,10.0,,...,-9999,,-9999,-9999,,,,,,
4,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 19:35,FM-15,CLR:00,10.0,,...,-9999,,-9999,-9999,,,,,,


### Remove SOD row

In [5]:
dataCleaned = DataFrame(dataCleaned[dataCleaned['REPORTTPYE'] != 'SOD'])
dataCleaned.head()

Unnamed: 0,STATION,STATION_NAME,ELEVATION,LATITUDE,LONGITUDE,DATE,REPORTTPYE,HOURLYSKYCONDITIONS,HOURLYVISIBILITY,HOURLYPRSENTWEATHERTYPE,...,MonthlyMaxSeaLevelPressureTime,MonthlyMinSeaLevelPressureValue,MonthlyMinSeaLevelPressureDate,MonthlyMinSeaLevelPressureTime,MonthlyTotalHeatingDegreeDays,MonthlyTotalCoolingDegreeDays,MonthlyDeptFromNormalHeatingDD,MonthlyDeptFromNormalCoolingDD,MonthlyTotalSeasonToDateHeatingDD,MonthlyTotalSeasonToDateCoolingDD
0,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 15:55,FM-15,CLR:00,10.0,,...,-9999,,-9999,-9999,,,,,,
1,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 16:15,FM-15,CLR:00,10.0,,...,-9999,,-9999,-9999,,,,,,
2,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 16:35,FM-15,CLR:00,10.0,,...,-9999,,-9999,-9999,,,,,,
3,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 19:15,FM-15,CLR:00,10.0,,...,-9999,,-9999,-9999,,,,,,
4,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 19:35,FM-15,CLR:00,10.0,,...,-9999,,-9999,-9999,,,,,,


### Convert string into float

In [6]:
for value in dataCleaned['HOURLYDRYBULBTEMPF']:
    if type(value) is str:
        dataCleaned.ix[dataCleaned.HOURLYDRYBULBTEMPF == value, 'HOURLYDRYBULBTEMPF'] = float(value.replace('s', ''))

In [7]:
for value in dataCleaned['HOURLYVISIBILITY']:
    if type(value) is str:
        s = value
        value = value.replace('s', '')
        value = value.replace('*', '0')
        value = value.replace('V', '')
        try:
            dataCleaned.ix[dataCleaned.HOURLYVISIBILITY == s, 'HOURLYVISIBILITY'] = float(value)
        except ValueError:
            print(s)

In [8]:
for value in dataCleaned['HOURLYPrecip']:
    if type(value) is str:
        s = value
        value = value.replace('s', '')
        value = value.replace('*', '0')
        value = value.replace('V', '')
        try:
            dataCleaned.ix[dataCleaned.HOURLYPrecip == s, 'HOURLYPrecip'] = float(value)
        except ValueError:
            print(s)

### Transfer datetype from string into datetime

In [9]:
dataCleaned['DATE'] = pd.to_datetime(dataCleaned['DATE'], coerce = False)
print(type(data['DATE']))
#data['DATE'] = data['DATE'].map(lambda x: datetime.strptime(str(x), '%Y-%m-%d %H:%M'))
#data.loc[:1, 'DATE']

<class 'pandas.core.series.Series'>


### Add Extra Columns (DAY, MONTH, YEAR)

In [10]:
dataCleaned['DAY'] = dataCleaned['DATE'].map(lambda a : a.date())
dataCleaned['MONTH'] = dataCleaned['DATE'].map(lambda a : int(a.month))
dataCleaned['YEAR'] = dataCleaned['DATE'].map(lambda a : int(a.year))

dataCleaned.head()

Unnamed: 0,STATION,STATION_NAME,ELEVATION,LATITUDE,LONGITUDE,DATE,REPORTTPYE,HOURLYSKYCONDITIONS,HOURLYVISIBILITY,HOURLYPRSENTWEATHERTYPE,...,MonthlyMinSeaLevelPressureTime,MonthlyTotalHeatingDegreeDays,MonthlyTotalCoolingDegreeDays,MonthlyDeptFromNormalHeatingDD,MonthlyDeptFromNormalCoolingDD,MonthlyTotalSeasonToDateHeatingDD,MonthlyTotalSeasonToDateCoolingDD,DAY,MONTH,YEAR
0,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 15:55:00,FM-15,CLR:00,10,,...,-9999,,,,,,,2014-07-31,7,2014
1,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 16:15:00,FM-15,CLR:00,10,,...,-9999,,,,,,,2014-07-31,7,2014
2,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 16:35:00,FM-15,CLR:00,10,,...,-9999,,,,,,,2014-07-31,7,2014
3,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 19:15:00,FM-15,CLR:00,10,,...,-9999,,,,,,,2014-07-31,7,2014
4,WBAN:00169,CHESTER AIRPORT CT US,127.1,41.384,-72.506,2014-07-31 19:35:00,FM-15,CLR:00,10,,...,-9999,,,,,,,2014-07-31,7,2014


## Save Cleaned Data

In [27]:
dataCleaned.to_csv("Cleaned_Data.csv", sep=',', encoding='utf-8')