In [1]:
import pandas as pd

In [7]:
def preprocess(filename):
    """Preprocess NCDC weather data.

    Reads a whitespace-delimited file, converts the numeric missing-value
    sentinels to NaN, and separates the flag characters that are attached to
    the MAX, MIN and PRCP values from the numbers themselves.

    Parameters
    ----------
    filename : str, path object or file-like object
        Handed straight to ``pandas.read_csv``. The first line is treated as
        a header row and replaced by the column names in ``fields``.

    Returns
    -------
    pandas.DataFrame
        The columns listed in ``fields`` (``MAX``, ``MIN`` and ``PRCP`` as
        plain floats), plus boolean ``MAX_flag`` / ``MIN_flag`` columns that
        are True where the raw value ended in ``*``, plus one ``PRCP_<letter>``
        indicator column for every trailing flag letter found in PRCP.
    """

    fields = ['STN', 'WBAN', 'YEARMODA', 'TEMP', 'TEMP_count', 'DEWP', 'DEWP_count', 'SLP', 'SLP_count', 'STP', 'STP_count', 'VISIB', 'VISIB_count', 'WDSP', 'WDSP_count', 'MXSPD', 'GUST', 'MAX', 'MIN', 'PRCP', 'SNDP', 'FRSHTT']

    # Per-column sentinel values that the raw file uses for "missing".
    missing_markers = {'TEMP': [9999.9],
                       'DEWP': [9999.9],
                       'SLP': [9999.9],
                       'STP': [9999.9],
                       'VISIB': [999.9],
                       'WDSP': [999.9],
                       'MXSPD': [999.9],
                       'GUST': [999.9],
                       'MAX': ['9999.9'],  # doesn't matter whether float or str
                       'MIN': ['9999.9'],
                       'PRCP': ['99.99'],
                       'SNDP': [999.9]}

    df = pd.read_csv(filename,
                     sep=r'\s+',
                     names=fields,
                     header=0,
                     parse_dates=['YEARMODA'],
                     na_values=missing_markers)

    def split_flag(value, is_flag):
        """Split one raw cell into ``(number, flag_or_None)``.

        Only strings can carry a flag. Anything else (floats, NaN for a
        missing value) is already numeric and is passed through untouched.
        The last character is removed only when ``is_flag`` recognises it, so
        an unflagged string such as ``'0.25'`` keeps all of its digits.
        """
        if not isinstance(value, str):
            return value, None
        text = value.strip()
        if text and is_flag(text[-1]):
            return float(text[:-1]), text[-1]
        return float(text), None

    def is_star(char):
        return char == '*'

    flagged = df.copy()

    # MAX / MIN: an optional trailing '*' becomes a boolean companion column.
    for column in ('MAX', 'MIN'):
        flagged[column] = df[column].map(lambda v: split_flag(v, is_star)[0])
        flagged[column + '_flag'] = df[column].map(
            lambda v: split_flag(v, is_star)[1] is not None)

    # PRCP: an optional trailing letter becomes one indicator column per
    # letter. Cells without a letter map to None, which get_dummies ignores,
    # so numeric values never turn into spurious indicator columns.
    flagged['PRCP'] = df['PRCP'].map(lambda v: split_flag(v, str.isalpha)[0])
    prcp_flag = df['PRCP'].map(lambda v: split_flag(v, str.isalpha)[1])
    prcp_dummies = pd.get_dummies(prcp_flag).add_prefix('PRCP_')

    return flagged.join(prcp_dummies)

In [9]:
# Raw extracts to load, in chronological order.
RAW_CA_FILES = [
    '../data/raw/CA/CA_1981-1985.txt',
    '../data/raw/CA/CA_1985-1989.txt',
    '../data/raw/CA/CA_1989-1993.txt',
    '../data/raw/CA/CA_1993-1997.txt',
    '../data/raw/CA/CA_1997-2001.txt',
    '../data/raw/CA/CA_2001-2005.txt',
    '../data/raw/CA/CA_2005-2009.txt',
    '../data/raw/CA/CA_2009-2015.txt',
]

# Run every file through preprocess() so all frames share one schema
# (whitespace-split columns, NaN for missing sentinels, flags separated).
# NOTE(review): the last six files used to be read with a bare pd.read_csv;
# this assumes they have the same layout as the first two - confirm.
dfs = [preprocess(path) for path in RAW_CA_FILES]