In [1]:
!pip install pandas pycountry modin[dask] geopy pycountry-convert

Looking in indexes: https://pypi.org/simple, https://alexandr.onbysh%40ring.com:****@artifactory.svc.ring.com/api/pypi/pypi-local/simple
[33mYou are using pip version 19.0.3, however version 20.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import pandas as pd
import pycountry
from typing import Optional
from vega_datasets import data
from pprint import pprint
from geopy.geocoders import GoogleV3
from geopy.exc import GeocoderTimedOut
from multiprocessing import Pool
import time
from geopy.extra.rate_limiter import RateLimiter
import pycountry_convert as pc


import plotly.graph_objects as go
import altair as alt
from tqdm import tqdm
import numpy as np

from IPython.core.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))

alt.data_transformers.disable_max_rows()
tqdm.pandas()
%load_ext nb_black
%matplotlib inline


The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version



<IPython.core.display.Javascript object>

## Data preparation
You could start from "Prepared results"

In [None]:
def get_country_by_ab(ab) -> Optional[str]:
    country = pycountry.countries.get(alpha_3=ab)
    if country:
        return country.name


def alpha_2_to_alpha_3(ab) -> Optional[str]:
    country = pycountry.countries.get(alpha_2=ab)
    if country:
        return country.alpha_3

In [None]:
stations = pd.read_csv("data/stations.csv")
stations = stations.dropna(subset=['lat', 'lon'])
stations = stations.rename(columns={"usaf": "stn"})
stations['code'] = None
stations.head(10)

In [None]:
geolocator = GoogleV3(api_key='api-key')
geocode = RateLimiter(geolocator.reverse, min_delay_seconds=1)
def find_country(coords, *args, **kwargs):
    try:
        lat, lon = coords
        location = geocode([lat, lon], exactly_one=True)
        for record in location.raw['address_components']:
            if 'country' in record['types']:
                return alpha_2_to_alpha_3(record['short_name'])
    except GeocoderTimedOut:
        return find_country(lat, lon)
    except Exception as e:
        print(e)
    
print(find_country((33, -96)))

# parallel
# coords = list(zip(stations.lat.tolist(), stations.lon.tolist()))

# with Pool(64) as p:
#     r = list(tqdm(p.imap(find_country, coords), total=len(coords)))
# stations['code'] = r

# non parallel
# stations['code'] = stations.progress_apply(lambda row: find_country(row.lat, row.lon) if not row.code else row.code, axis=1)

# stage 1
# stations.code.sort_values().unique()
# stations['country'] = stations['code'].apply(lambda x : get_country_by_ab(x))
# stations = stations.set_index(["stn", "wban"])

# stage 2
# temp = pd.read_csv('data/temperature.csv', dtype={'stn': str})
# temp["avg_temp"] = (temp["avg_temp"] - 32) * 5 / 9
# temp = temp.set_index(['stn', 'wban'])
# temp = temp.join(stations, how='inner')
# temp = temp.reset_index()
# df = temp
# df.to_csv('data/aggregated_with_countries.csv')
# df.head()

## Prepared results

In [4]:
df = pd.read_csv("data/aggregated_with_countries.csv")
df = df.dropna(axis=0, subset=["code"])
grouped = df.groupby("code").mean()
df["diff_from_avg_temp"] = df.progress_apply(
    lambda x: x.avg_temp - grouped.loc[x.code].avg_temp, axis=1
)

100%|██████████| 475862/475862 [01:13<00:00, 6485.12it/s]


<IPython.core.display.Javascript object>

## Visualization

In [5]:
source = alt.topo_feature(data.world_110m.url, "countries")

color_scale = alt.Scale(scheme="redyellowblue")

slider = alt.binding_range(min=float(df.year.min()), max=float(df.year.max()), step=1)
select_year = alt.selection_single(
    name="year", fields=["year"], bind=slider, init={"year": float(df.year.min())}
)

background = (
    alt.Chart(source)
    .mark_geoshape(fill="lightgray", stroke="white")
    .properties(width=800, height=600)
    .project("equirectangular")
)

# airport positions on background
points = (
    alt.Chart(df[df.year == 2010])
    .mark_circle()
    .encode(
        longitude="lon:Q",
        latitude="lat:Q",
        tooltip=["code:N", "lon:Q", "lat:Q"],
        color=alt.Color("diff_from_avg_temp:Q", scale=color_scale, sort="descending"),
    )
    .properties()
)

background + points

<IPython.core.display.Javascript object>

## Plot 1

In [7]:
grouped = df.groupby(by=["country", "year", "code"]).mean()
grouped = grouped.reset_index(["country", "code"])

years = grouped.index.unique().sort_values().tolist()[:-2]
first_year = years[0]

fig = go.Figure(
    frames=[
        go.Frame(
            data=go.Choropleth(
                locations=grouped.loc[year]["code"],
                z=grouped.loc[year]["diff_from_avg_temp"],
                text=grouped.loc[year]["country"],
                colorscale=[
                    "rgb(77,116,177)",
                    "rgb(180, 212, 232)",
                    "rgb(245, 183, 106)",
                    "rgb(216, 83, 56)",
                ],
                zmin=-2,
                zmax=2,
                autocolorscale=False,
                reversescale=False,
                marker_line_color="darkgray",
                marker_line_width=0.5,
                colorbar_title="°C",
            ),
            name=year,
        )
        for year in years[1:]
    ]
)

fig.add_trace(
    go.Choropleth(
        locations=grouped.loc[first_year]["code"],
        z=grouped.loc[first_year]["diff_from_avg_temp"],
        text=grouped.loc[first_year]["country"],
        colorscale=[
            "rgb(77,116,177)",
            "rgb(180, 212, 232)",
            "rgb(245, 183, 106)",
            "rgb(216, 83, 56)",
        ],
        zmin=-3,
        zmax=3,
        autocolorscale=False,
        reversescale=False,
        marker_line_color="darkgray",
        marker_line_width=0.5,
        colorbar_title="°C",
    )
)


def frame_args(duration):
    return {
        "frame": {"duration": duration},
        "mode": "immediate",
        "fromcurrent": True,
        "transition": {"duration": duration, "easing": "linear"},
    }


sliders = [
    {
        "pad": {"b": 10, "t": 60},
        "len": 0.9,
        "x": 0.1,
        "y": 0,
        "steps": [
            {"args": [[f.name], frame_args(0)], "label": f.name, "method": "animate",}
            for k, f in enumerate(fig.frames)
        ],
    }
]

# Layout
#         titleFontSize=18,
#         titleFontWeight="normal",
#         titleFont="Ubuntu Mono",
#         titleColor="#5D646F",
#         titleX=55,
#         titleY=-10,
#         titleAngle=0,
#         labelFontSize=14,
#         labelFont="Ubuntu Mono",
#         labelFontWeight="lighter",
#         labelColor="#5D646F",
#         labelPadding=5,

fig.update_layout(
    title="Global temperature anomalies",
    width=1200,
    height=800,
    plot_bgcolor="blue",
    font=dict(family="Ubuntu Mono", size=22, color="#5D646F"),
    geo=dict(showframe=False, showcoastlines=False, projection_type="robinson"),
    updatemenus=[
        {
            "buttons": [
                {
                    "args": [None, frame_args(50)],
                    "label": "&#9654;",  # play symbol
                    "method": "animate",
                },
                {
                    "args": [[None], frame_args(0)],
                    "label": "&#9724;",  # pause symbol
                    "method": "animate",
                },
            ],
            "direction": "left",
            "pad": {"r": 10, "t": 70},
            "type": "buttons",
            "x": 0.1,
            "y": 0,
        }
    ],
    sliders=sliders,
)

fig.show()

<IPython.core.display.Javascript object>

## Temperature by decade

In [None]:
code_to_continent = {
    "AF": "Africa",
    "AN": "Antarctica",
    "AS": "Asia",
    "EU": "Europe",
    "NA": "North america",
    "OC": "Oceania",
    "SA": "South america",
}


def country_to_continent(country):
    try:
        continent = pc.country_alpha2_to_continent_code(
            pc.country_name_to_country_alpha2(country, cn_name_format="default")
        )
        return code_to_continent[continent]
    except:
        return None

In [None]:
data = df.copy()
index = df.groupby("country").median().sort_values("avg_temp").index

N = 42
top = [x for x in index.tolist() if len(x) < 20]
top = set([top[0]] + top[:: len(index) // N] + [index[25]] + [top[-1]])

data = data[data["country"].isin(top)]

rank = {
    x: rank
    for rank, x in enumerate(
        df.groupby("country").median().sort_values("avg_temp").index
    )
}
data["rank"] = data["country"].apply(lambda x: rank[x])
data = data.groupby(["country", "year"]).mean().sort_values("rank")

data = data.reset_index()
data["continent"] = data["country"].apply(lambda x: country_to_continent(x))
data.loc[data["country"] == "Antarctica", "continent"] = "Antarctica"
data = data.dropna(0, subset=["continent"])

In [None]:
temp_avg = float(round(data.avg_temp.mean(), 1),)
y_scale = np.arange(start=-20, stop=50, step=5).tolist() + [temp_avg]
y_scale.remove(20)
box = (
    alt.Chart(data)
    .mark_boxplot(extent="min-max")
    .encode(
        x=alt.X("country:O", sort=["rank"], axis=alt.Axis(labelAngle=90)),
        y=alt.Y(
            "avg_temp:Q", scale=alt.Scale(bins=y_scale), title="annual temperature °C",
        ),
        color=alt.Color(
            "continent:O",
            title="Part of world",
            scale=alt.Scale(
                range=[
                    "#E28230",
                    "#57BAEA",
                    "#F8CC46",
                    "#296AAE",
                    "#D22E26",
                    "#566594",
                    "#A92A7C",
                ]
            ),
        ),
    )
)
line = (
    alt.Chart(data)
    .mark_rule(color="#B7263D", strokeDash=[5, 3], strokeWidth=2, strokeOpacity=0.5)
    .encode(y="mean(avg_temp):Q")
)
world_mean = (
    alt.Chart(data)
    .mark_text(
        text=f"world mean {temp_avg} °C",
        fontSize=13,
        fontWeight="lighter",
        font="Ubuntu Mono",
    )
    .encode(
        opacity=alt.value(0.1),
        color=alt.value("#B7263D"),
        x=alt.value(70),
        y=alt.value(108),
    )
)
# line
plot = (
    (line + box + world_mean)
    .configure(
        title=alt.TitleConfig(
            anchor="start",
            color="#5D646F",
            fontSize=30,
            font="Ubuntu Mono",
            fontWeight=600,
        ),
        background="#F3F7F7",
        padding=40,
    )
    .configure_axisY(
        titleFontSize=18,
        titleFontWeight="normal",
        titleFont="Ubuntu Mono",
        titleColor="#5D646F",
        titleX=55,
        titleY=-10,
        titleAngle=0,
        labelFontSize=14,
        labelFont="Ubuntu Mono",
        labelFontWeight="lighter",
        labelColor="#5D646F",
        labelPadding=5,
    )
    .configure_axisX(
        titleFontSize=18,
        titleFontWeight="normal",
        titleFont="Ubuntu Mono",
        titleColor="#5D646F",
        titleX=920,
        titleY=-10,
        labelFontSize=14,
        labelAngle=0,
        labelFont="Ubuntu Mono",
        labelFontWeight="lighter",
        labelColor="#5D646F",
        labelPadding=5,
    )
    .configure_legend(
        titleFontSize=18,
        titleFontWeight="normal",
        titleFont="Ubuntu Mono",
        titleColor="#5D646F",
        labelFontSize=14,
        labelFont="Ubuntu Mono",
        labelFontWeight="lighter",
        labelColor="#5D646F",
        labelPadding=5,
    )
    .configure_axis(grid=False)
    .configure_view(stroke="#F3F7F7")
    .properties(title="World average temperature per country")
)
plot.save("plot2.png", scale_factor=2.0, webdriver="firefox")
plot

## Plot 3

In [None]:
data = df.copy()

rank = {
    x: rank
    for rank, x in enumerate(
        df.groupby("country").median().sort_values("avg_temp").index
    )
}
data["rank"] = data["country"].apply(lambda x: rank[x])
data = data.groupby(["country", "year"]).mean().sort_values("rank")

data = data.reset_index()
data["decade"] = data["year"].apply(lambda x: f"{x // 10}{0 if x % 10 < 5 else 5}'s")
# data["decade"] = data["year"].apply(lambda x: f"{x // 10}0's")
data = data[(data.year < 2010) & (data.year >= 1940)]

# N = 42
# top = [x for x in index.tolist() if len(x) < 20]
# top = set([top[0]] + top[:: len(index) // N] + [index[25]] + [top[-1]])

# data = data[data["country"].isin(top)]

data["continent"] = data["country"].apply(lambda x: country_to_continent(x))
data.loc[data["country"] == "Antarctica", "continent"] = "Antarctica"

In [None]:
step = 20
overlap = 8

plot = (
    alt.Chart(data, height=step)
    .transform_joinaggregate(mean_temp="mean(diff_from_avg_temp)", groupby=["decade"])
    .transform_bin(["bin_max", "bin_min"], "diff_from_avg_temp")
    .transform_aggregate(
        value="count()", groupby=["decade", "mean_temp", "bin_min", "bin_max"]
    )
    .transform_impute(
        impute="value", groupby=["decade", "mean_temp"], key="bin_max", value=0
    )
    .mark_area(
        interpolate="monotone", fillOpacity=0.8, stroke="lightgray", strokeWidth=1,
    )
    .encode(
        alt.X(
            "bin_max:Q",
            bin="binned",
            scale=alt.Scale(domain=[-12, 10]),
            title="Change in temperature observed at meteostation °C",
        ),
        alt.Y("value:Q", scale=alt.Scale(range=[step, -step * overlap]), axis=None),
        alt.Fill(
            "mean_temp:Q",
            scale=alt.Scale(domain=[-1, 1], scheme="lightmulti"),
            title="change in °C",
        ),
        tooltip=["decade", "mean_temp:Q"],
    )
    .facet(
        row=alt.Row(
            "decade",
            title=None,
            header=alt.Header(
                labelAlign="right",
                labelFontSize=18,
                labelAngle=0,
                labelFont="Ubuntu Mono",
                labelColor="#5D646F",
                labelPadding=5,
            ),
        )
    )
    .properties(title="Change in temperature compared to normal", bounds="flush")
    .configure_facet(spacing=0)
    .configure(
        title=alt.TitleConfig(
            dy=-20,
            anchor="start",
            color="#5D646F",
            fontSize=30,
            font="Ubuntu Mono",
            fontWeight=600,
        ),
        background="#F3F7F7",
        padding=40,
    )
    .configure_axisX(
        titleFontSize=18,
        titleFontWeight="normal",
        titleFont="Ubuntu Mono",
        titleColor="#5D646F",
        titleX=200,
        titleY=40,
        labelFontSize=14,
        labelAngle=0,
        labelPadding=10,
    )
    .configure_view(stroke="#F3F7F7")
    .configure_legend(
        titleFontSize=18,
        titleFont="Ubuntu Mono",
        titleColor="#5D646F",
        labelFontSize=14,
        labelFont="Ubuntu Mono",
        labelFontWeight="lighter",
        labelColor="#5D646F",
        gradientLength=523,
        gradientThickness=40,
    )
)
plot.save("plot3.png", scale_factor=2.0, webdriver="firefox")
plot