## Cleaning the daily temperature data for O'Hare International Airport for the years 2000–2019

### Data source

https://www.ncdc.noaa.gov/cdo-web/search

### Data in the final dataset

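- DATE - Date of the observation
- TAVG - Average temperature as reported by the station
- TMAX - Maximum temperature
- TMIN - Minimum temperature
- AVG - Midpoint of the daily maximum and minimum values, i.e. (TMAX + TMIN) / 2
- error - Difference between the two averages, i.e. TAVG minus AVG
- LOC - Location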

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the full O'Hare daily export and report its (rows, columns) shape.
raw_csv_path = '../data/temp_data/Ohare_FULL_temp_2000_2019.csv'

df100 = pd.read_csv(raw_csv_path)
df100.shape

(7046, 78)

In [3]:
# Preview the first rows of the raw station export.
df100.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,AWND_ATTRIBUTES,FMTM,FMTM_ATTRIBUTES,...,WT19,WT19_ATTRIBUTES,WT21,WT21_ATTRIBUTES,WT22,WT22_ATTRIBUTES,WV03,WV03_ATTRIBUTES,WV20,WV20_ATTRIBUTES
0,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.995,-87.9336,201.8,2000-01-01,7.38,",,X",148.0,",,X",...,,,,,,,,,,
1,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.995,-87.9336,201.8,2000-01-02,13.87,",,X",1537.0,",,X",...,,,,,,,,,,
2,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.995,-87.9336,201.8,2000-01-03,12.75,",,X",1828.0,",,X",...,,,,,,,,,,
3,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.995,-87.9336,201.8,2000-01-04,13.42,",,X",227.0,",,X",...,,,,,,,,,,
4,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.995,-87.9336,201.8,2000-01-05,11.18,",,X",2017.0,",,X",...,,,,,,,,,,


In [4]:
# Preview the last rows of the raw station export to see where the record ends.
df100.tail()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,AWND_ATTRIBUTES,FMTM,FMTM_ATTRIBUTES,...,WT19,WT19_ATTRIBUTES,WT21,WT21_ATTRIBUTES,WT22,WT22_ATTRIBUTES,WV03,WV03_ATTRIBUTES,WV20,WV20_ATTRIBUTES
7041,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.995,-87.9336,201.8,2019-04-12,16.55,",,W",,,...,,,,,,,,,,
7042,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.995,-87.9336,201.8,2019-04-13,12.3,",,W",,,...,,,,,,,,,,
7043,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.995,-87.9336,201.8,2019-04-14,18.12,",,W",,,...,,,,,,,,,,
7044,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.995,-87.9336,201.8,2019-04-15,8.28,",,W",,,...,,,,,,,,,,
7045,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.995,-87.9336,201.8,2019-04-16,,,,,...,,,,,,,,,,


In [5]:
# Widen the pandas display limits so wide/long frames are shown without truncation.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 500)

In [6]:
# Show the first rows again; with the display options set above, no columns are elided.
df100.head(5)

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,AWND_ATTRIBUTES,FMTM,FMTM_ATTRIBUTES,PGTM,PGTM_ATTRIBUTES,PRCP,PRCP_ATTRIBUTES,SNOW,SNOW_ATTRIBUTES,SNWD,SNWD_ATTRIBUTES,TAVG,TAVG_ATTRIBUTES,TMAX,TMAX_ATTRIBUTES,TMIN,TMIN_ATTRIBUTES,TSUN,TSUN_ATTRIBUTES,WDF2,WDF2_ATTRIBUTES,WDF5,WDF5_ATTRIBUTES,WESD,WESD_ATTRIBUTES,WSF2,WSF2_ATTRIBUTES,WSF5,WSF5_ATTRIBUTES,WT01,WT01_ATTRIBUTES,WT02,WT02_ATTRIBUTES,WT03,WT03_ATTRIBUTES,WT04,WT04_ATTRIBUTES,WT05,WT05_ATTRIBUTES,WT06,WT06_ATTRIBUTES,WT07,WT07_ATTRIBUTES,WT08,WT08_ATTRIBUTES,WT09,WT09_ATTRIBUTES,WT11,WT11_ATTRIBUTES,WT13,WT13_ATTRIBUTES,WT14,WT14_ATTRIBUTES,WT15,WT15_ATTRIBUTES,WT16,WT16_ATTRIBUTES,WT17,WT17_ATTRIBUTES,WT18,WT18_ATTRIBUTES,WT19,WT19_ATTRIBUTES,WT21,WT21_ATTRIBUTES,WT22,WT22_ATTRIBUTES,WV03,WV03_ATTRIBUTES,WV20,WV20_ATTRIBUTES
0,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.995,-87.9336,201.8,2000-01-01,7.38,",,X",148.0,",,X",134.0,",,W",0.0,"T,,0,2400",0.0,",,0",0.0,",,0,",42.0,",,W",48.0,",,0",35.0,",,0",,,200.0,",,X",210.0,",,X",0.0,",,0",17.9,",,X",23.9,",,X",,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,",,0",,,,,,,,,,,,,,
1,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.995,-87.9336,201.8,2000-01-02,13.87,",,X",1537.0,",,X",1536.0,",,W",0.01,",,0,2400",0.0,",,0",0.0,",,0,",48.0,",,W",60.0,",,0",35.0,",,0",,,240.0,",,X",230.0,",,X",0.0,",,0",25.9,",,X",36.9,",,X",1.0,",,0",,,,,,,,,,,,,,,,,,,1.0,",,X",1.0,",,X",,,1.0,",,0",,,,,,,,,,,,,,
2,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.995,-87.9336,201.8,2000-01-03,12.75,",,X",1828.0,",,X",1248.0,",,W",0.25,",,0,2400",0.4,",,0",0.0,",,0,",35.0,",,W",38.0,",,0",32.0,",,0",,,30.0,",,X",80.0,",,X",0.0,",,0",17.9,",,X",25.1,",,X",1.0,",,0",,,,,,,,,,,,,,,,,,,1.0,",,X",1.0,",,X",,,1.0,",,0",,,1.0,",,0",,,,,,,,,,
3,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.995,-87.9336,201.8,2000-01-04,13.42,",,X",227.0,",,X",210.0,",,W",0.0,"T,,0,2400",0.0,"T,,0",0.0,",,0,",28.0,",,W",33.0,",,0",23.0,",,0",,,320.0,",,X",300.0,",,X",0.0,",,0",21.9,",,X",29.1,",,X",1.0,",,0",,,,,,,,,,,,,,,,,,,1.0,",,X",,,,,,,,,1.0,",,0",,,,,,,,,,
4,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.995,-87.9336,201.8,2000-01-05,11.18,",,X",2017.0,",,X",2102.0,",,W",0.0,"T,,0,2400",0.0,"T,,0",0.0,",,0,",20.0,",,W",26.0,",,0",13.0,",,0",,,170.0,",,X",160.0,",,X",0.0,",,0",23.0,",,X",25.9,",,X",1.0,",,0",,,,,,,,,,,,,,,,,,,1.0,",,X",,,,,,,,,1.0,",,0",,,,,,,,,,


In [7]:
# Count the missing values in each column of the raw export.
df100.isna().sum()

STATION               0
NAME                  0
LATITUDE              0
LONGITUDE             0
ELEVATION             0
DATE                  0
AWND                  1
AWND_ATTRIBUTES       1
FMTM               2758
FMTM_ATTRIBUTES    2758
PGTM               2905
PGTM_ATTRIBUTES    2905
PRCP                  1
PRCP_ATTRIBUTES       1
SNOW                  1
SNOW_ATTRIBUTES       1
SNWD                 62
SNWD_ATTRIBUTES      62
TAVG               2803
TAVG_ATTRIBUTES    2803
TMAX                  1
TMAX_ATTRIBUTES       1
TMIN                  1
TMIN_ATTRIBUTES       1
TSUN               6293
TSUN_ATTRIBUTES    6293
WDF2                  1
WDF2_ATTRIBUTES       1
WDF5                 23
WDF5_ATTRIBUTES      23
WESD               3028
WESD_ATTRIBUTES    3028
WSF2                  1
WSF2_ATTRIBUTES       1
WSF5                 23
WSF5_ATTRIBUTES      23
WT01               4152
WT01_ATTRIBUTES    4152
WT02               6828
WT02_ATTRIBUTES    6828
WT03               6304
WT03_ATTRIBUTES 

In [8]:
# Keep only the date and the three temperature columns.
# The explicit .copy() makes df_Ohare an independent DataFrame, so the column
# assignments made on it later do not raise SettingWithCopyWarning and never
# depend on whether pandas returned a view or a copy of df100.
df_Ohare = df100[['DATE', 'TAVG', 'TMAX', 'TMIN']].copy()

In [9]:
# Peek at the first days on which TAVG is missing.
temp_cols = ['DATE', 'TAVG', 'TMAX', 'TMIN']
df100.loc[df100['TAVG'].isna(), temp_cols].head()

Unnamed: 0,DATE,TAVG,TMAX,TMIN
85,2000-03-26,,68.0,36.0
86,2000-03-27,,56.0,38.0
437,2001-03-13,,42.0,31.0
2039,2005-08-01,,93.0,69.0
2040,2005-08-02,,92.0,65.0


In [10]:
# Rows where TAVG is missing, restricted to the date and temperature columns.
# NOTE(review): this repeats the previous cell's query unchanged; consider removing one of the two.
df100[['DATE', 'TAVG', 'TMAX', 'TMIN']][df100['TAVG'].isnull() == True].head()

Unnamed: 0,DATE,TAVG,TMAX,TMIN
85,2000-03-26,,68.0,36.0
86,2000-03-27,,56.0,38.0
437,2001-03-13,,42.0,31.0
2039,2005-08-01,,93.0,69.0
2040,2005-08-02,,92.0,65.0


In [11]:
# AVG is the midpoint of the daily extremes: (TMAX + TMIN) / 2.
daily_extremes_sum = df_Ohare['TMAX'] + df_Ohare['TMIN']
df_Ohare['AVG'] = daily_extremes_sum / 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [12]:
# error is how far the reported TAVG lies from the TMAX/TMIN midpoint (TAVG minus AVG).
df_Ohare['error'] = df_Ohare['TAVG'].sub(df_Ohare['AVG'])
df_Ohare.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,DATE,TAVG,TMAX,TMIN,AVG,error
0,2000-01-01,42.0,48.0,35.0,41.5,0.5
1,2000-01-02,48.0,60.0,35.0,47.5,0.5
2,2000-01-03,35.0,38.0,32.0,35.0,0.0
3,2000-01-04,28.0,33.0,23.0,28.0,0.0
4,2000-01-05,20.0,26.0,13.0,19.5,0.5


In [13]:
# Inspect the column dtypes of the working frame.
df_Ohare.dtypes

DATE      object
TAVG     float64
TMAX     float64
TMIN     float64
AVG      float64
error    float64
dtype: object

In [14]:
# Tag every row with the station label, then preview the result.
station_label = 'ohare'
df_Ohare['LOC'] = station_label
df_Ohare.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,DATE,TAVG,TMAX,TMIN,AVG,error,LOC
0,2000-01-01,42.0,48.0,35.0,41.5,0.5,ohare
1,2000-01-02,48.0,60.0,35.0,47.5,0.5,ohare
2,2000-01-03,35.0,38.0,32.0,35.0,0.0,ohare
3,2000-01-04,28.0,33.0,23.0,28.0,0.0,ohare
4,2000-01-05,20.0,26.0,13.0,19.5,0.5,ohare


In [15]:
# Write the cleaned temperature table to disk, leaving out the row index.
cleaned_csv_path = '../data/temp_data/ohare_temp_2000_2019.csv'
df_Ohare.to_csv(cleaned_csv_path, index=False)

In [16]:
# Parse the DATE column into pandas datetime values.
# NOTE(review): the CSV was already written in the previous cell, before this
# conversion, so this step only changes the in-memory frame.
df_Ohare['DATE'] = pd.to_datetime(df_Ohare['DATE'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [17]:
# Re-check the dtypes to confirm the DATE conversion took effect.
df_Ohare.dtypes

DATE     datetime64[ns]
TAVG            float64
TMAX            float64
TMIN            float64
AVG             float64
error           float64
LOC              object
dtype: object