In [3]:
import os

import pandas as pd
'''
Selected dataset:
Shunyi(顺义)，Changping(昌平)，Huairou(怀柔)，Aotizhongxin(奥体)，Tiantan(天坛)
'''
# Simple data cleaning:
# merge the year/month/day/hour columns into one datetime index and drop the columns that are no longer needed



def padding(x):
    """Render a month/day/hour number as text, prefixing '0' when it is below 10.

    Example: 1 -> '01', 12 -> '12'.

    :param x: month, day or hour as a number (single or double digit)
    :return: ``str(x)`` when ``x >= 10``, otherwise ``'0' + str(x)``
    """
    text = str(x)
    if x >= 10:
        # already two digits wide, nothing to prepend
        return text
    return '0' + text

def cleanup(loc, data_dir='data'):
    '''Clean one location's raw CSV and save it indexed by timestamp.

    Reads ``<data_dir>/PRSA_Data_<loc>_20130301-20170228.csv``, combines the
    ``year``/``month``/``day``/``hour`` columns into a single datetime index
    named ``date``, drops those four columns plus ``station``, and writes the
    result to ``<data_dir>/cleanup/<loc>.csv``.

    :param loc: location string used in the input and output file names
    :param data_dir: directory holding the raw CSV files; the cleaned file is
        written to its ``cleanup`` sub-directory (created if missing)
    :return: None (the cleaned data is written to disk)
    :raises FileNotFoundError: if the raw CSV for ``loc`` does not exist
    :raises KeyError: if one of the expected columns is missing
    '''
    raw_path = os.path.join(data_dir, 'PRSA_Data_' + loc + '_20130301-20170228.csv')
    out_dir = os.path.join(data_dir, 'cleanup')

    # load into dataframe; 'No' is only a running row number
    raw = pd.read_csv(raw_path, index_col=['No'])

    # Assemble the timestamps from the component columns in one vectorised
    # call instead of building and re-parsing a string for every row.
    time_cols = ['year', 'month', 'day', 'hour']
    timestamps = pd.to_datetime(raw[time_cols])

    # Build a new frame rather than mutating `raw` in place.
    cleaned = raw.drop(columns=time_cols + ['station'])
    cleaned.index = pd.DatetimeIndex(timestamps, name='date')

    # The output directory may not exist on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)
    cleaned.to_csv(os.path.join(out_dir, loc + '.csv'))


In [6]:
# Locations whose raw CSV files are cleaned; each name is inserted into the
# input and output file names by cleanup().
locations = [
    'Shunyi',
    'Changping',
    'Huairou',
    'Aotizhongxin',
    'Tiantan',
]

# Clean every selected location in turn (one output CSV per location).
for loc in locations:
    cleanup(loc)

In [7]:
# Validate: reload one of the cleaned files and display it to check the result.
# The file is read without index_col/parse_dates, so `date` comes back as an
# ordinary column next to a default integer index.

pd.read_csv('data/cleanup/Changping.csv')

Unnamed: 0,date,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM
0,2013-03-01 00:00:00,3.0,6.0,13.0,7.0,300.0,85.0,-2.3,1020.8,-19.7,0.0,E,0.5
1,2013-03-01 01:00:00,3.0,3.0,6.0,6.0,300.0,85.0,-2.5,1021.3,-19.0,0.0,ENE,0.7
2,2013-03-01 02:00:00,3.0,3.0,22.0,13.0,400.0,74.0,-3.0,1021.3,-19.9,0.0,ENE,0.2
3,2013-03-01 03:00:00,3.0,6.0,12.0,8.0,300.0,81.0,-3.6,1021.8,-19.1,0.0,NNE,1.0
4,2013-03-01 04:00:00,3.0,3.0,14.0,8.0,300.0,81.0,-3.5,1022.3,-19.4,0.0,N,2.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35059,2017-02-28 19:00:00,28.0,47.0,4.0,14.0,300.0,,11.7,1008.9,-13.3,0.0,NNE,1.3
35060,2017-02-28 20:00:00,12.0,12.0,3.0,23.0,500.0,64.0,10.9,1009.0,-14.0,0.0,N,2.1
35061,2017-02-28 21:00:00,7.0,23.0,5.0,17.0,500.0,68.0,9.5,1009.4,-13.0,0.0,N,1.5
35062,2017-02-28 22:00:00,11.0,20.0,3.0,15.0,500.0,72.0,7.8,1009.6,-12.6,0.0,NW,1.4
