In [2]:
import pandas as pd
import numpy as np
from pandasql import sqldf
import folium
import bokeh

In [24]:
def getUniqueStations(fileName, dataStartLine=100):
    """Return the sorted, de-duplicated station codes found in a data file.

    The station code of a record is taken from the fixed character
    columns 2-4 (slice ``[2:5]``) of each line from ``dataStartLine`` onward.

    Parameters
    ----------
    fileName : str
        Path of the text file to scan.
    dataStartLine : int, optional
        Zero-based index of the first data record; every earlier line is
        skipped. Defaults to 100, the value that used to be hard-coded.

    Returns
    -------
    list of str
        Unique station codes in ascending (string) order.
    """
    with open(fileName) as data:
        loadedData = data.readlines()

    stationCodes = set()
    for line in loadedData[dataStartLine:]:
        code = line[2:5]
        # Blank or too-short lines (e.g. a trailing newline at the end of the
        # file) carry no station columns; without this guard they would add
        # a bogus '' or '\n' entry to the result.
        if code.strip():
            stationCodes.add(code)
    return sorted(stationCodes)

In [25]:
# Collect every station code present in the export and show the resulting list.
uniqueStationNumbers = getUniqueStations(
    r'D:\git\pandas-bokeh\data\KNMI_20161227.txt'
)
print(uniqueStationNumbers)

['209', '210', '215', '225', '235', '240', '242', '248', '249', '251', '257', '258', '260', '265', '267', '269', '270', '273', '275', '277', '278', '279', '280', '283', '285', '286', '290', '308', '310', '311', '312', '313', '315', '316', '319', '323', '324', '330', '331', '340', '343', '344', '348', '350', '356', '370', '375', '377', '380', '391']


In [26]:
def loadWeatherData(fileName,stationNumbers):
    """Load weather records and station metadata from a text export.

    The file layout is addressed through fixed, zero-based line indices:
    line 4 is the space-separated station-metadata header, lines 5-53 are
    the station-metadata rows, line 97 is the comma-separated weather
    header and lines 100 onward are the weather records.

    Parameters
    ----------
    fileName : str
        Path of the text file to read.
    stationNumbers : str or collection of str
        Station code(s) to keep; a record is kept when characters 2-4 of
        its line (slice ``[2:5]``) are contained in this collection. A
        single code may be given as a plain string.

    Returns
    -------
    tuple
        ``(weatherDict, spatialDict, headers)``. Both dicts map a column
        name to the list of that column's values (evaluated as a Python
        literal where possible, otherwise kept as text). ``headers`` is the
        list of weather column names followed by the spatial column names.
    """
    # ('209') is just the string '209', not a tuple. Membership tests on a
    # string are substring tests ('092' in '2092' is True, and an empty slice
    # matches any string), so a bare string is wrapped into a one-element tuple.
    if isinstance(stationNumbers, str):
        stationNumbers = (stationNumbers,)

    weatherHeaderList = []
    weatherDict = {}
    spatialHeaderList = []
    spatialDict = {}

    def clearWhitespace(element, charactersToRemove):
        """Return element with every character in charactersToRemove dropped."""
        return ''.join(character for character in element
                       if character not in charactersToRemove)

    def cleanHeader(headerLine, headerList, headerDict, separator):
        """Append the column names of headerLine to headerList and give each
        one an empty list in headerDict."""
        splitHeaderList = headerLine.split(separator)
        # Operations in order: drop the first two characters of the first
        # field (the hashtag prefix), keep the main body, drop the last
        # character of the final field (the trailing line break).
        uncleanedHeaderList = [splitHeaderList[0][2:]] \
                            + splitHeaderList[1:-1] \
                            + [splitHeaderList[-1][:-1]]
        for element in uncleanedHeaderList:
            cleanedHeader = clearWhitespace(element, (' '))
            # Ensure that only valid headers are added (empty = not appended)
            if cleanedHeader:
                headerList.append(cleanedHeader)
                headerDict[cleanedHeader] = []

    def parseValue(text):
        """Return text evaluated as a Python expression, or text itself when
        evaluation fails (e.g. empty fields or station names)."""
        # NOTE(review): eval() executes whatever expression the file contains.
        # This is only acceptable for trusted input files; consider replacing
        # it with ast.literal_eval or explicit int()/float() conversion.
        try:
            return eval(text)
        except Exception:
            # Narrower than a bare except so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            return text

    # Read everything up front so the file is closed before parsing starts.
    with open(fileName) as data:
        loadedData = data.readlines()

    # Process weather data header
    cleanHeader(loadedData[97], weatherHeaderList, weatherDict, ',')

    # Process main weather data
    for line in loadedData[100:]:
        if line[2:5] not in stationNumbers:
            continue
        for elementNumber, element in enumerate(line.split(',')):
            cleanedElement = clearWhitespace(element, (' ', '\n'))
            weatherDict[weatherHeaderList[elementNumber]].append(parseValue(cleanedElement))

    # Process spatial data headers
    cleanHeader(loadedData[4], spatialHeaderList, spatialDict, ' ')

    # Process main spatial data
    # NOTE(review): this slice covers 49 lines (indices 5-53); confirm that it
    # matches the number of station rows actually present in the file.
    columnCount = len(spatialHeaderList)
    for line in loadedData[5:54]:
        # Split on whitespace and drop the first token of the row.
        values = line.split()[1:]
        # Remove trailing colon after first element
        values[0] = values[0][:-1]
        # A row with more tokens than headers has a last column containing
        # spaces; fold the surplus tokens back into that last column.
        while len(values) > columnCount:
            values[columnCount-1] = str(values[columnCount-1]) + ' ' + str(values[columnCount])
            del values[columnCount]
        for elementNumber, element in enumerate(values):
            spatialDict[spatialHeaderList[elementNumber]].append(parseValue(element))

    # Combine header lists
    headers = weatherHeaderList + spatialHeaderList

    return weatherDict, spatialDict, headers

In [27]:
#Loading time is approx 9 seconds for two stations when selecting the full list of records, but rises quickly with more stations
#It does however significantly reduce the overall loading time as pandas does not handle large dataframes efficiently
#It is advised not to load more than a couple of stations at once, e.g. by looping over them
#A one-element tuple needs the trailing comma: ('209') is just the string '209'
data = loadWeatherData(r'D:\git\pandas-bokeh\data\KNMI_20161227.txt',('209',))

In [28]:
#Joining the two datasets together
weatherDF = pd.DataFrame(data[0])[1:].apply(pd.to_numeric)
spatialDF = pd.DataFrame(data[1])

#Runtime is about 5mins
performSQL = lambda q: sqldf(q, globals())

sql = """
        SELECT * FROM weatherDF
        JOIN spatialDF ON spatialDF.stn = weatherDF.stn;
      """

weatherAndSpatialDataDF = performSQL(sql)

In [29]:
#Select the distinct station numbers present in the joined data
sql = """
        SELECT DISTINCT(STN) FROM weatherAndSpatialDataDF
      """
#The CSV text is split on line breaks; the [:-1] filters the trailing empty record that this syntax returns
uniqueStationNumbers = (
    performSQL(sql)
    .to_csv(None, header=False, index=False)
    .split('\n')[:-1]
)
print(uniqueStationNumbers)

['209']


In [30]:
#Runtime for an avg selection takes about 5min per marker
#Plots graphs into folium markers, then adds markers to map & loads map inline
#evening_map = folium.Map(location=[52.092560, 5.109378],zoom_start=13)

for number in uniqueStationNumbers:
    #Average of the FG column over all rows of the current station
    #NOTE(review): the variable name says "max" but the query computes avg(FG) - confirm which is intended
    sql = """
        SELECT avg(FG) FROM weatherAndSpatialDataDF
        WHERE STN = {stationNumber}
      """.format(stationNumber=number)
    maxWindSpeedList = (
        performSQL(sql)
        .to_csv(None, header=False, index=False)
        .split('\n')[:-1]
    )
    print(maxWindSpeedList)

    #Longitude, latitude and name of the current station, taken from the first returned row
    sql = """
        SELECT "LON(east)", "LAT(north)", "NAME" FROM weatherAndSpatialDataDF
        WHERE STN = {stationNumber}
        GROUP BY NAME
      """.format(stationNumber=number)
    stationLocationAndName = (
        performSQL(sql)
        .to_csv(None, header=False, index=False)
        .split('\n')[:-1][0]
        .split(',')
    )

    #Specify path to load figures from
    #url = r"http://localhost:8888/files/UtrechtTraffic/utrecht/git/plots/{}.png".format(code)
    #graph ='<img src="{}">'.format(url)

    #http://gis.stackexchange.com/questions/185897/how-can-i-include-html-in-a-folium-marker-popup
    #
    #marker.apply(lambda row: folium.Marker([row['latitude'], row['longitude']],\
    #                        popup=folium.Popup(folium.element.IFrame(html=graph,
    #                        width=550, height=500),\
    #                        max_width=550))\
    #                        .add_to(evening_map), axis =1) 
#evening_map.save('evening.html')
#evening_map

['73.1510370502']
