Setup necessary modules

In [1]:
%use dataframe
%use kandy

Setup file columns and folders

In [2]:
import java.nio.file.Paths
import java.nio.file.Files
import java.io.BufferedInputStream
import org.apache.commons.csv.CSVFormat
import kotlin.io.path.inputStream

// Accessors for columns that are read from the weather CSV files by their explicit header name.
val date by column<Int>("DATE")
val name by column<String>("NAME")
val latitude by column<Double>("LATITUDE")
val longitude by column<Double>("LONGITUDE")
val elevation by column<Double>("ELEVATION")
val snow by column<Double>("SNOW")
val tavg by column<Double>("TAVG")
val tmin by column<Double>("TMIN")
val tmax by column<Double>("TMAX")

// Accessors for the columns that the aggregations in this notebook write their results into.
val count by column<Int>("count")
val average by column<Double>()
val max by column<Double>()
val min by column<Double>()

// hacky: derive the project root by cutting the notebook's source sub-path out of the working directory
val projectFolder = Paths.get(
    System.getProperty("user.dir").replace("/src/main/kotlin/org/data/first/", "")
).normalize()

// Input, intermediate and output folders, all resolved against the project root.
val rawDataFolder = projectFolder.resolve("raw-data")
val rawDataYearlyDataFolder = rawDataFolder.resolve("yearly-weather")
val refinedDataFolder = projectFolder.resolve("refined-data")
val refinedDataYearlyDataFolder = refinedDataFolder.resolve("yearly-data")
val presentableDataFolder = projectFolder.resolve("presentable-data")

// Echo every resolved folder so a wrong working directory is visible right away.
listOf(
    projectFolder,
    rawDataFolder,
    rawDataYearlyDataFolder,
    refinedDataFolder,
    refinedDataYearlyDataFolder,
    presentableDataFolder,
).forEach { println(it) }

val highElevationStart = 2000.0

/Users/jsrnicek/IdeaProjects/weather-data/weather-data
/Users/jsrnicek/IdeaProjects/weather-data/weather-data/raw-data
/Users/jsrnicek/IdeaProjects/weather-data/weather-data/raw-data/yearly-weather
/Users/jsrnicek/IdeaProjects/weather-data/weather-data/refined-data
/Users/jsrnicek/IdeaProjects/weather-data/weather-data/refined-data/yearly-data
/Users/jsrnicek/IdeaProjects/weather-data/weather-data/presentable-data


Count all raw data samples

In [3]:

// Total number of lines across all files in the raw yearly folder.
// Every line of every file is counted, so any header line is part of the total.
// Files.list keeps a directory handle open until the stream is closed, hence `use`.
Files.list(rawDataYearlyDataFolder).use { files ->
    files.mapToInt { Files.readAllLines(it).size }.sum()
}

2658574

Refine the data to only the samples that have minimum, maximum and average temperatures available

In [4]:

// Refines the raw yearly files: keeps only rows whose TAVG, TMIN and TMAX columns exist,
// are non-null and are not NaN, and writes the survivors in batches of 1000 source files
// to "yearly_data_part_<index>.csv" in the refined folder.

// Make sure the target folder exists before the first part file is written.
Files.createDirectories(refinedDataYearlyDataFolder)

// Files.list keeps a directory handle open until the stream is closed, hence `use`.
Files.list(rawDataYearlyDataFolder)
    .use { it.toList() }
    .chunked(1000)
    .forEachIndexed { part, chunk ->
        val outFile = refinedDataYearlyDataFolder.resolve("yearly_data_part_$part.csv")
        val refined = chunk
            .map { file ->
                DataFrame.read(file.toFile())
                    // The && chain short-circuits, so a value is only read once its column is known to exist.
                    .filter { it.containsKey("TAVG") && it.get("TAVG") != null && !tavg.getValue(it).isNaN() }
                    .filter { it.containsKey("TMIN") && it.get("TMIN") != null && !tmin.getValue(it).isNaN() }
                    .filter { it.containsKey("TMAX") && it.get("TMAX") != null && !tmax.getValue(it).isNaN() }
            }
            .concat()
        Files.writeString(outFile, refined.toCsv(CSVFormat.DEFAULT))
    }


Count data samples after refinement

In [5]:

// Total number of lines across the refined part files.
// Every line of every file is counted, so any header line is part of the total.
// Only "yearly_data_part_*" files are counted: a later cell writes a derived
// "solid-data-since-*.csv" into this same folder, and it must not be counted as refined input.
// Files.list keeps a directory handle open until the stream is closed, hence `use`.
Files.list(refinedDataYearlyDataFolder).use { files ->
    files
        .filter { it.fileName.toString().startsWith("yearly_data_part_") }
        .mapToInt { Files.readAllLines(it).size }
        .sum()
}

869944

Aggregate all files into a single file with the count of data samples and the min, max and average temperatures

In [16]:


// Builds "basic-temparatures.csv": one row per DATE with the number of samples and the
// mean of TAVG / TMIN / TMAX over all refined part files.
//
// The files are processed five at a time. Stage 1 reduces each chunk to one row per DATE
// (sample count + means). Stage 2 merges those rows across chunks.
Files.writeString(
    presentableDataFolder.resolve("basic-temparatures.csv"),
    Files.list(refinedDataYearlyDataFolder)
        // Close the directory stream, and skip the derived "solid-data-since-*.csv" that a
        // later cell writes into this same folder (it would double-count its rows).
        .use { files -> files.filter { it.fileName.toString().startsWith("yearly_data_part_") }.toList() }
        .chunked(5)
        .map { chunk ->
            chunk
                // `use` closes each file stream as soon as the frame has been read.
                .map { file -> BufferedInputStream(file.inputStream()).use { DataFrame.readCSV(it) } }
                .concat()
                .groupBy(date)
                .sortBy(date)
                .aggregate {
                    count() into count
                    mean(tavg) into average
                    mean(tmin) into min
                    mean(tmax) into max
                }
        }
        .concat()
        .convert(count).with { it.toInt() }
        .groupBy(date)
        .sortBy(date)
        .aggregate {
            // Each row here is one chunk's mean for this DATE. Chunks hold different numbers of
            // samples, so the means are weighted by their sample count; a plain mean of means
            // would give a chunk with one sample the same influence as a chunk with thousands.
            val totalSamples = map { count.getValue(it) }.sum()
            val weightedAverage = map { average.getValue(it) * count.getValue(it) }.sum() / totalSamples
            val weightedMin = map { min.getValue(it) * count.getValue(it) }.sum() / totalSamples
            val weightedMax = map { max.getValue(it) * count.getValue(it) }.sum() / totalSamples

            totalSamples into count
            weightedAverage into average
            weightedMin into min
            weightedMax into max
        }
        .toCsv()
)

/Users/jsrnicek/IdeaProjects/weather-data/weather-data/presentable-data/basic-temparatures.csv

Plot min/max temperatures

In [17]:

// Plots the yearly mean minimum and mean maximum temperature from "basic-temparatures.csv"
// as two line layers over the year.
BufferedInputStream(presentableDataFolder.resolve("basic-temparatures.csv").toFile().inputStream())
    // `use` closes the file stream once the frame has been read.
    .use { DataFrame.readCSV(it) }
    .plot {
        // Layer 1: minimum temperatures. This layer also carries the axis titles.
        line {
            x(date) {
                axis {
                    name = "Year"
                    breaks(format = "{d}")
                }
            }
            y(min) {
                scale = continuous(0.0..20.0)
                axis {
                    name = "Temperatures"
                }
            }
            color(min) {
                scale = continuous(Color.BLUE..Color.RED)
                legend {
                    name = "Temperature in Celsius"
                }
            }
        }
        // Layer 2: maximum temperatures, with the same y range and colour scale as layer 1.
        line {
            x(date)
            y(max) { scale = continuous(0.0..20.0) }
            color(max) {
                scale = continuous(Color.BLUE..Color.RED)
                legend {
                    name = "Temperature in Celsius"
                }
            }
        }

        layout.size = 1500 to 800
    }

Plot weather stations

In [18]:

// Plots the per-year `count` column of "basic-temparatures.csv" as a line over the year.
BufferedInputStream(presentableDataFolder.resolve("basic-temparatures.csv").toFile().inputStream())
    // `use` closes the file stream once the frame has been read.
    .use { DataFrame.readCSV(it) }
    .plot {
        line {
            x(date) {
                axis {
                    name = "Year"
                    breaks(format = "{d}")
                }
            }
            y(count) {
                axis {
                    name = "Weather stations"
                }
            }
            color(count) {
                scale = continuous(Color.RED..Color.GREEN)
                legend {
                    name = "Weather stations"
                }
            }
        }

        layout.size = 1500 to 800
    }

Put data for all stations into GPX format so we can feed it into a map

In [19]:
/**
 * A single GPX waypoint: a named point at the given latitude and longitude.
 *
 * [toString] renders the waypoint as a `<wpt>` XML element.
 */
data class GpxEntry(val name: String, val latitude: Double, val longitude: Double) {
    /** Renders this entry as a `<wpt>` element with the name as XML-escaped text content. */
    override fun toString(): String {
        return """<wpt lat="${latitude}" lon="${longitude}">
		            <name>${escapeXmlText(name)}</name>
	              </wpt>"""
    }

    /**
     * Escapes the characters that are not allowed verbatim in XML text content.
     * `&` is replaced first so the entities produced for `<` and `>` are not escaped again.
     */
    private fun escapeXmlText(text: String): String =
        text.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
}

/**
 * A GPX 1.1 document wrapping the given waypoints.
 *
 * [toString] renders the XML declaration, the `<gpx>` root element and, inside it,
 * the string form of every entry in [points], one per line.
 */
data class GpxTemplate(val points: List<GpxEntry>) {
    override fun toString(): String {
        // Each waypoint is rendered through its own toString(); waypoints are separated by newlines.
        val waypoints = points.joinToString(separator = "\n") { it.toString() }
        return """<?xml version="1.0" encoding="UTF-8"?>
                    <gpx version="1.1" xmlns="http://www.topografix.com/GPX/1/1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.topografix.com/GPX/1/1 http://www.topografix.com/GPX/1/1/gpx.xsd">
	                    ${waypoints}
                    </gpx>"""
    }
}

In [20]:


// One GpxEntry per distinct (NAME, LATITUDE, LONGITUDE) combination found in the refined part files.
// Files are read five at a time and reduced to their distinct combinations per chunk,
// then the per-chunk results are merged and reduced once more.
val points = Files.list(refinedDataYearlyDataFolder)
    // Close the directory stream; only the "yearly_data_part_*" files are refined input
    // (a later cell writes a derived "solid-data-since-*.csv" into this same folder).
    .use { files -> files.filter { it.fileName.toString().startsWith("yearly_data_part_") }.toList() }
    .chunked(5)
    .map { chunk ->
        chunk
            // `use` closes each file stream as soon as the frame has been read.
            .map { file -> BufferedInputStream(file.inputStream()).use { DataFrame.readCSV(it) } }
            .concat()
            .groupBy(name, latitude, longitude)
            .count()
    }
    .concat()
    .groupBy(name, latitude, longitude)
    .count()
    .map {
        GpxEntry(
            name = it.getValue(name), latitude = it.getValue(latitude), longitude = it.getValue(longitude)
        )
    }

Files.writeString(presentableDataFolder.resolve("gpx-visualization.gpx"), GpxTemplate(points).toString())


/Users/jsrnicek/IdeaProjects/weather-data/weather-data/presentable-data/gpx-visualization.gpx

In [21]:
// Builds "duplicate-values.csv": every (NAME, DATE) pair with DATE >= 1900 that occurs
// more than once in the refined part files, newest year first.
Files.writeString(
    presentableDataFolder.resolve("duplicate-values.csv"),
    Files.list(refinedDataYearlyDataFolder)
        // Close the directory stream, and read only the "yearly_data_part_*" files: a later cell
        // writes a derived "solid-data-since-*.csv" into this same folder, and reading it here
        // would report every row it contains as a duplicate.
        .use { files -> files.filter { it.fileName.toString().startsWith("yearly_data_part_") }.toList() }
        .chunked(5)
        .map { chunk ->
            chunk
                .map { file ->
                    // `use` closes the file stream as soon as the frame has been read.
                    BufferedInputStream(file.inputStream()).use { DataFrame.readCSV(it) }
                        .filter { date.getValue(it) >= 1900 }
                }
                .concat()
                .groupBy(name, date)
                .aggregate {
                    count() into count
                }
        }
        .concat()
        .convert(count).with { it.toInt() }
        // The same (NAME, DATE) pair can appear in several chunks, so the per-chunk counts are summed.
        .groupBy(name, date)
        .aggregate {
            sum(count) into count
        }
        .filter { count.getValue(it) > 1 } // keep only pairs that occur more than once
        .sortByDesc(date)
        .toCsv()
)

class java.lang.Double cannot be cast to class java.lang.Integer (java.lang.Double and java.lang.Integer are in module java.base of loader 'bootstrap')
java.lang.ClassCastException: class java.lang.Double cannot be cast to class java.lang.Integer (java.lang.Double and java.lang.Integer are in module java.base of loader 'bootstrap')
	at Line_28_jupyter$res28$3.invoke(Line_28.jupyter.kts:28)
	at Line_28_jupyter$res28$3.invoke(Line_28.jupyter.kts:20)
	at org.jetbrains.kotlinx.dataframe.impl.GroupByImplKt.aggregateGroupBy(GroupByImpl.kt:87)
	at org.jetbrains.kotlinx.dataframe.impl.GroupByImpl.aggregate(GroupByImpl.kt:58)
	at Line_28_jupyter.<init>(Line_28.jupyter.kts:20)
	at java.base/jdk.internal.reflect.DirectConstructorHandleAccessor.newInstance(DirectConstructorHandleAccessor.java:62)
	at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:502)
	at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:486)
	at kotlin.script.experimental.jvm.Bas

In [22]:
// Preview: the first 10 rows of "duplicate-values.csv".
BufferedInputStream(presentableDataFolder.resolve("duplicate-values.csv").inputStream())
    // `use` closes the file stream once the frame has been read.
    .use { DataFrame.readCSV(it) }
    .take(10)

NAME,DATE,count
SAVANNAH HILTON HEAD INTERNATIONAL AI...,2023,2.0
"MARYBOROUGH, AS",2023,2.0
"TERRE HAUTE HULMAN REGIONAL AIRPORT, ...",2023,2.0
"APALACHICOLA AIRPORT, FL US",2023,2.0
"VICTORIA REGIONAL AIRPORT, TX US",2023,2.0
"SAN JOSE, CA US",2023,2.0
"CORNER BROOK, NL CA",2023,2.0
"YORKTON, SK CA",2023,2.0
"BAIE COMEAU, QC CA",2023,2.0
"TROIS RIVIERES, QC CA",2023,2.0


In [29]:
// First year of the window, and the number of yearly rows needed to cover every year
// from limitYear through 2023 inclusive (the end year is hard-coded).
val limitYear = 1950
val expectedValueCount = 2023 - limitYear + 1

// Names of stations that have at least expectedValueCount rows dated limitYear or later.
// This is a coarse pre-filter: rows are counted as-is, so repeated (station, year) rows
// also count towards the threshold.
val potentialCandidates = Files.list(refinedDataYearlyDataFolder)
    // Close the directory stream; only the "yearly_data_part_*" files are refined input
    // (a later statement writes a derived "solid-data-since-*.csv" into this same folder).
    .use { files -> files.filter { it.fileName.toString().startsWith("yearly_data_part_") }.toList() }
    .chunked(5)
    .map { chunk ->
        chunk
            .map { file ->
                // `use` closes the file stream as soon as the frame has been read.
                BufferedInputStream(file.inputStream()).use { DataFrame.readCSV(it) }
                    .filter { date.getValue(it) >= limitYear }
            }
            .concat()
            .groupBy(name)
            .aggregate {
                count() into count
            }
    }
    .concat()
    .convert(count).with { it.toInt() }
    // A station can appear in several chunks, so the per-chunk counts are summed.
    .groupBy(name)
    .aggregate {
        sum(count) into count
    }
    .filter { count.getValue(it) >= expectedValueCount } // enough rows to possibly cover the whole window
    .sortByDesc(count)
    .select(name)
    .convert(name).with { it.toString() }
    .map { name.getValue(it) }

// Names of the candidate stations that have exactly one row for every year of the window,
// i.e. exactly expectedValueCount distinct (NAME, DATE) pairs with DATE >= limitYear.
val weatherStations = run {
    // Hash lookup instead of a linear List.contains for every row.
    val candidateNames = potentialCandidates.toSet()

    Files.list(refinedDataYearlyDataFolder)
        // Close the directory stream; only the "yearly_data_part_*" files are refined input
        // (a later statement writes a derived "solid-data-since-*.csv" into this same folder).
        .use { files -> files.filter { it.fileName.toString().startsWith("yearly_data_part_") }.toList() }
        .chunked(5)
        .map { chunk ->
            chunk
                .map { file ->
                    // `use` closes the file stream as soon as the frame has been read.
                    BufferedInputStream(file.inputStream()).use { DataFrame.readCSV(it) }
                        .filter { name.getValue(it) in candidateNames }
                        // The year must be compared with limitYear. Comparing it with
                        // expectedValueCount (a row count) let every year through, so years
                        // before the window were counted in the completeness check below.
                        .filter { date.getValue(it) >= limitYear }
                }
                .concat()
        }
        .concat()
        .distinctBy(name, date) // repeated (station, year) rows must count only once
        .groupBy(name)
        .sortBy(name, date)
        .aggregate {
            count() into count
        }
        .convert(count).with { it.toInt() }
        .filter { count.getValue(it) == expectedValueCount }
        .select(name)
        .convert(name).with { it.toString() }
        .map { name.getValue(it) }
}


// Writes "solid-data-since-<limitYear>.csv": all rows dated limitYear or later that belong
// to one of the stations in weatherStations.
Files.writeString(
    refinedDataYearlyDataFolder.resolve("solid-data-since-$limitYear.csv"),
    run {
        // Hash lookup instead of a linear List.contains for every row.
        val solidStations = weatherStations.toSet()

        Files.list(refinedDataYearlyDataFolder)
            // The output file lives in the folder that is being read. Restricting the input to
            // the "yearly_data_part_*" files keeps a previous run's output from being fed back
            // in (which would duplicate its rows). `use` closes the directory stream.
            .use { files -> files.filter { it.fileName.toString().startsWith("yearly_data_part_") }.toList() }
            .chunked(5)
            .map { chunk ->
                chunk
                    .map { file ->
                        // `use` closes the file stream as soon as the frame has been read.
                        BufferedInputStream(file.inputStream()).use { DataFrame.readCSV(it) }
                            .filter { name.getValue(it) in solidStations }
                            .filter { date.getValue(it) >= limitYear }
                    }
                    .concat()
            }
            .concat()
            .toCsv()
    }
)

/Users/jsrnicek/IdeaProjects/weather-data/weather-data/refined-data/yearly-data/solid-data-since-1950.csv

In [30]:
// Builds "yearly-temparatures-since-<limitYear>.csv": per DATE, the mean of TMIN, TMAX and TAVG
// over all rows of "solid-data-since-<limitYear>.csv".
Files.writeString(
    presentableDataFolder.resolve("yearly-temparatures-since-$limitYear.csv"),
    BufferedInputStream(refinedDataYearlyDataFolder.resolve("solid-data-since-$limitYear.csv").inputStream())
        // `use` closes the file stream once the frame has been read.
        .use { DataFrame.readCSV(it) }
        .groupBy(date)
        .aggregate {
            mean(tmin) into min
            mean(tmax) into max
            mean(tavg) into average
        }
        .toCsv()
)

/Users/jsrnicek/IdeaProjects/weather-data/weather-data/presentable-data/yearly-temparatures-since-1950.csv

Plot solid data since 1950 (the value of limitYear)

In [31]:
// Plots the yearly mean minimum and mean maximum temperature from
// "yearly-temparatures-since-<limitYear>.csv" as two line layers over the year.
BufferedInputStream(presentableDataFolder.resolve("yearly-temparatures-since-$limitYear.csv").toFile().inputStream())
    // `use` closes the file stream once the frame has been read.
    .use { DataFrame.readCSV(it) }
    .plot {
        // Layer 1: minimum temperatures. This layer also carries the axis titles.
        line {
            x(date) {
                axis {
                    name = "Year"
                    breaks(format = "{d}")
                }
            }
            y(min) {
                scale = continuous(0.0..20.0)
                axis {
                    name = "Temperatures"
                }
            }
            color(min) {
                scale = continuous(Color.BLUE..Color.RED)
                legend {
                    name = "Temperature in Celsius"
                }
            }
        }
        // Layer 2: maximum temperatures, with the same y range and colour scale as layer 1.
        line {
            x(date)
            y(max) { scale = continuous(0.0..20.0) }
            color(max) {
                scale = continuous(Color.BLUE..Color.RED)
                legend {
                    name = "Temperature in Celsius"
                }
            }
        }

        layout.size = 1500 to 800
    }