## Cleaning the weather data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

% matplotlib inline

### First, read the data, remove the unnecessary columns, and make the date column a date.

In [2]:
w_raw = pd.read_csv('./Weather_4.csv')

In [3]:
w_raw.columns.values

array(['STATION', 'STATION_NAME', 'DATE', 'PRCP', 'SNWD', 'SNOW', 'TAVG',
       'TMAX', 'TMIN', 'WESD', 'AWND', 'WDF2', 'WDF5', 'WSF2', 'WSF5',
       'PGTM', 'FMTM', 'WT09', 'WT14', 'WT07', 'WT01', 'WT15', 'WT17',
       'WT06', 'WT21', 'WT05', 'WT02', 'WT11', 'WT22', 'WT04', 'WT13',
       'WT16', 'WT08', 'WT18', 'WT03', 'WT19'], dtype=object)

In [4]:
w_raw2 = w_raw[['DATE','PRCP','SNOW','TAVG','TMAX','TMIN','AWND']]

In [5]:
w_raw2.head(5)

Unnamed: 0,DATE,PRCP,SNOW,TAVG,TMAX,TMIN,AWND
0,20100101,0.0,0.0,-9999,16,5,10.3
1,20100102,0.0,0.0,-9999,11,2,11.9
2,20100103,0.0,0.0,-9999,18,-1,10.3
3,20100104,0.0,0.0,-9999,18,7,11.4
4,20100105,0.0,0.0,-9999,23,13,8.5


In [6]:
w_raw2['date'] = pd.to_datetime(w_raw2.DATE,format='%Y%m%d')
w_raw2['month'] = w_raw2.date.map(lambda x: x.month)
w_raw2['year'] = w_raw2.date.map(lambda x: x.year)
w_raw2['day'] = w_raw2.date.map(lambda x: x.day)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [7]:
w_raw2.head(5)

Unnamed: 0,DATE,PRCP,SNOW,TAVG,TMAX,TMIN,AWND,date,month,year,day
0,20100101,0.0,0.0,-9999,16,5,10.3,2010-01-01,1,2010,1
1,20100102,0.0,0.0,-9999,11,2,11.9,2010-01-02,1,2010,2
2,20100103,0.0,0.0,-9999,18,-1,10.3,2010-01-03,1,2010,3
3,20100104,0.0,0.0,-9999,18,7,11.4,2010-01-04,1,2010,4
4,20100105,0.0,0.0,-9999,23,13,8.5,2010-01-05,1,2010,5


In [8]:
w_raw2 = w_raw2.drop(['DATE'],axis=1)
w_raw2.head(5)

Unnamed: 0,PRCP,SNOW,TAVG,TMAX,TMIN,AWND,date,month,year,day
0,0.0,0.0,-9999,16,5,10.3,2010-01-01,1,2010,1
1,0.0,0.0,-9999,11,2,11.9,2010-01-02,1,2010,2
2,0.0,0.0,-9999,18,-1,10.3,2010-01-03,1,2010,3
3,0.0,0.0,-9999,18,7,11.4,2010-01-04,1,2010,4
4,0.0,0.0,-9999,23,13,8.5,2010-01-05,1,2010,5


In [9]:
w_raw2.columns = [w.lower() for w in w_raw2.columns]

In [10]:
w_raw2.head(5)

Unnamed: 0,prcp,snow,tavg,tmax,tmin,awnd,date,month,year,day
0,0.0,0.0,-9999,16,5,10.3,2010-01-01,1,2010,1
1,0.0,0.0,-9999,11,2,11.9,2010-01-02,1,2010,2
2,0.0,0.0,-9999,18,-1,10.3,2010-01-03,1,2010,3
3,0.0,0.0,-9999,18,7,11.4,2010-01-04,1,2010,4
4,0.0,0.0,-9999,23,13,8.5,2010-01-05,1,2010,5


In [11]:
w = w_raw2

### Next, remove the -9999 code for missing values and make them nulls

In [12]:
w[w==-9999] = np.nan

In [13]:
(w==-9999).sum()

prcp     0
snow     0
tavg     0
tmax     0
tmin     0
awnd     0
date     0
month    0
year     0
day      0
dtype: int64

In [14]:
w.head(5)

Unnamed: 0,prcp,snow,tavg,tmax,tmin,awnd,date,month,year,day
0,0.0,0.0,,16.0,5.0,10.3,2010-01-01,1,2010,1
1,0.0,0.0,,11.0,2.0,11.9,2010-01-02,1,2010,2
2,0.0,0.0,,18.0,-1.0,10.3,2010-01-03,1,2010,3
3,0.0,0.0,,18.0,7.0,11.4,2010-01-04,1,2010,4
4,0.0,0.0,,23.0,13.0,8.5,2010-01-05,1,2010,5


In [15]:
w.isnull().sum()

prcp        2
snow        2
tavg     1186
tmax        2
tmin        2
awnd        2
date        0
month       0
year        0
day         0
dtype: int64

### Save the pandas dataframe

In [16]:
w.to_csv('Weather_clean.csv',index=False)

In [20]:
w.year.dtype

dtype('int64')