# Imports

In [1]:
using DataFrames
using Statistics
using CSV
using Dates
using Pipe: @pipe
using CategoricalArrays
using ShiftedArrays
using StatsBase

In [2]:
# Set the COLUMNS environment variable for this session.
# NOTE(review): presumably done so wide DataFrames are displayed without
# column truncation — confirm against the display backend in use.
ENV["COLUMNS"]=1200

1200

# Load and Process Data

In [3]:
# Read the city-level temperature CSV and materialise it as a DataFrame.
# The trailing semicolon suppresses the cell's display of the full table.
df_temp = DataFrame(CSV.File("Data/GlobalLandTemperaturesByCity.csv"));

In [4]:
# Report the table dimensions as a (rows, columns) tuple.
size(df_temp)

(8599212, 7)

In [5]:
# Preview the first five rows of the raw table.
first(df_temp, 5)

Unnamed: 0_level_0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
Unnamed: 0_level_1,Date,Float64?,Float64?,String31,String63,String7,String7
1,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
2,1743-12-01,missing,missing,Århus,Denmark,57.05N,10.33E
3,1744-01-01,missing,missing,Århus,Denmark,57.05N,10.33E
4,1744-02-01,missing,missing,Århus,Denmark,57.05N,10.33E
5,1744-03-01,missing,missing,Århus,Denmark,57.05N,10.33E


In [6]:
# Build (and time) a per-city, per-year temperature table with signed decimal
# coordinates, then preview its first five rows. The result is not assigned.
#
# Steps:
#   1. Derive :year, :month (abbreviated name) and :day from :dt and make
#      :City, :Country and :month categorical.
#   2. Drop every row that has a missing value in any column.
#   3. Keep only (City, year) groups that still have more than 11 rows.
#   4. Keep only the "Dec" rows and average :AverageTemperature per
#      (year, City, Country, Latitude, Longitude).
#   5. Convert the "57.05N" / "8.73W" style coordinate strings to signed floats.
#
# NOTE(review): because of step 4, :AvgTemp_year is the mean of the December
# readings of each complete year, not a twelve-month mean — confirm that this
# is the intended definition.
@time @pipe df_temp |>
transform(
    _,
    :dt => ByRow(year) => :year,
    :dt => ByRow(monthabbr) => :month,
    :dt => ByRow(day) => :day,
    :City => categorical => :City,
    :Country => categorical => :Country
) |>
transform(_, :month => categorical => :month) |>
dropmissing(_) |>
groupby(_, [:City, :year]) |>
transform(_, nrow) |>
filter(:nrow => x -> x > 11, _) |>
filter(:month => x -> x=="Dec", _) |>
groupby(_, [:year, :City, :Country, :Latitude, :Longitude]) |>
combine(_, :AverageTemperature => mean => :AvgTemp_year) |>
transform(
    _,
    # Southern latitudes are negative. (The hemisphere letter of a latitude is
    # N or S, so the sign must be taken from "S", not "W".)
    :Latitude => ByRow(x -> contains(x, "S") ?  -1 : 1) => :signLat,
    # Numeric magnitude: everything before the hemisphere letter.
    :Latitude => ByRow(x -> parse(Float64, split(String(x), r"[a-zA-Z]")[1])) => :Lat,
    # Western longitudes are negative. (The hemisphere letter of a longitude is
    # E or W, so the sign must be taken from "W", not "S".)
    :Longitude => ByRow(x -> contains(x, "W") ?  -1 : 1) => :signLong,
    :Longitude => ByRow(x -> parse(Float64, split(String(x), r"[a-zA-Z]")[1])) => :Long,
) |>
transform(
    # Apply the hemisphere sign to the parsed magnitudes.
    _, [:Lat, :signLat] => ByRow((x,y) -> x*y) => :Lat,
    [:Long, :signLong] => ByRow((x,y) -> x*y) => :Long
) |>
select(_, [:year, :City, :Country, :AvgTemp_year, :Lat, :Long]) |>
first(_, 5)

 15.229073 seconds (43.84 M allocations: 6.249 GiB, 4.93% gc time, 0.21% compilation time)


Unnamed: 0_level_0,year,City,Country,AvgTemp_year,Lat,Long
Unnamed: 0_level_1,Int64,Cat…,Cat…,Float64,Float64,Float64
1,1753,Århus,Denmark,-3.228,57.05,10.33
2,1754,Århus,Denmark,1.45,57.05,10.33
3,1755,Århus,Denmark,1.664,57.05,10.33
4,1756,Århus,Denmark,-0.652,57.05,10.33
5,1757,Århus,Denmark,0.325,57.05,10.33


In [7]:
"""
    shift_day(lowInt=1, highInt=9)

Return a random, non-zero `Dates.Day` offset.

The number of days is drawn uniformly from the integers `lowInt:highInt`
with `0` excluded, so the returned period never leaves a date unchanged.

# Arguments
- `lowInt`: lower bound of the candidate range (inclusive).
- `highInt`: upper bound of the candidate range (inclusive).

# Throws
- `ArgumentError`: if the range contains no non-zero value
  (for example `shift_day(0, 0)` or `lowInt > highInt`).
"""
function shift_day(lowInt=1, highInt=9)
    # A generator over the range avoids materialising it twice.
    candidates = [d for d in lowInt:1:highInt if d != 0]
    if isempty(candidates)
        throw(ArgumentError(
            "no non-zero day offset available in $(lowInt):$(highInt)",
        ))
    end
    # `rand` on a collection picks one element uniformly at random.
    return Dates.Day(rand(candidates))
end

shift_day (generic function with 3 methods)

In [48]:
# Add :date_shifted, where all dates of one city are moved by the same random
# non-zero offset: the transformation function receives a whole group's :dt
# column and calls `shift_day` once for it. The result is then ordered by the
# original date for display.
sort(
    transform(
        groupby(df_temp, :City),
        :dt => (dates -> dates .+ shift_day(-9, 9)) => :date_shifted,
    ),
    :dt,
)

Unnamed: 0_level_0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,date_shifted
Unnamed: 0_level_1,Date,Float64?,Float64?,String31,String63,String7,String7,Date
1,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E,1743-11-07
2,1743-11-01,10.013,2.291,Çorlu,Turkey,40.99N,27.69E,1743-10-25
3,1743-11-01,10.779,1.942,A Coruña,Spain,42.59N,8.73W,1743-10-31
4,1743-11-01,6.425,1.628,Aachen,Germany,50.63N,6.34E,1743-11-09
5,1743-11-01,6.068,1.737,Aalborg,Denmark,57.05N,10.33E,1743-10-29
6,1743-11-01,8.758,1.886,Aberdeen,United Kingdom,57.05N,1.48W,1743-10-28
7,1743-11-01,7.478,1.866,Aix En Provence,France,44.20N,4.47E,1743-10-31
8,1743-11-01,3.209,1.961,Akron,United States,40.99N,80.95W,1743-10-24
9,1743-11-01,7.801,2.002,Albacete,Spain,39.38N,2.08W,1743-11-06
10,1743-11-01,6.652,2.015,Alcalá De Henares,Spain,40.99N,4.26W,1743-11-03


In [38]:
# Build a jittered timeline for every city series.
#
# For each series the first row keeps its original date; every later row
# stores the gap to the previous observation plus a random non-zero offset of
# -9..9 days. The running sum of that column (a start date followed by day
# periods) yields the shifted dates.
#
# The lag and the running sum are evaluated per series. Without the grouping,
# the "interval" at every city boundary is the jump from one city's last date
# back to the next city's first date, and the random drift accumulated over
# all previous cities leaks into the following ones.
# A series is identified by the same keys the yearly aggregation uses, since a
# city name alone may not be unique.
df_temp2 = @pipe df_temp |>
    groupby(_, [:City, :Country, :Latitude, :Longitude]) |>
    # Gap to the previous observation of the same series; `missing` on its first row.
    transform(_, :dt => (x -> x - lag(x)) => :date_interval) |>
    # First row of a series: keep the start date. Other rows: jittered gap.
    transform(_, [:date_interval, :dt] => ByRow( (x,y) -> ismissing(x) ? y : x + shift_day(-9, 9)) => :date_interval_shifted) |>
    groupby(_, [:City, :Country, :Latitude, :Longitude]) |>
    # Start date + cumulative jittered gaps = shifted date of every row.
    transform(_, :date_interval_shifted => cumsum => :date_shifted) |>
    # Narrow the column to a concrete Date element type.
    transform(_, :date_shifted => ByRow(Date) => :date_shifted);

In [39]:
# Preview the first five rows, including the interval and shifted-date columns.
first(df_temp2, 5)

Unnamed: 0_level_0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,date_interval,date_interval_shifted,date_shifted
Unnamed: 0_level_1,Date,Float64?,Float64?,String31,String63,String7,String7,Day?,Abstrac…,Date
1,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E,missing,1743-11-01,1743-11-01
2,1743-12-01,missing,missing,Århus,Denmark,57.05N,10.33E,30 days,21 days,1743-11-22
3,1744-01-01,missing,missing,Århus,Denmark,57.05N,10.33E,31 days,26 days,1743-12-18
4,1744-02-01,missing,missing,Århus,Denmark,57.05N,10.33E,31 days,22 days,1744-01-09
5,1744-03-01,missing,missing,Århus,Denmark,57.05N,10.33E,29 days,38 days,1744-02-16
