## Data visualization final project

In [25]:
# import necessary libraries
import pandas as pd
import altair as alt

In [26]:
# load the temperature-anomaly table (path is relative to the working directory)
csv_path = "Global Temperature Anomalies.csv"
data = pd.read_csv(csv_path)

Prepare the data for data viz

In [27]:
# Summarise the raw table: row count, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423 entries, 0 to 422
Data columns (total 20 columns):
Hemisphere    423 non-null object
Year          423 non-null int64
Jan           423 non-null float64
Feb           423 non-null float64
Mar           423 non-null float64
Apr           423 non-null float64
May           423 non-null float64
Jun           423 non-null float64
Jul           423 non-null float64
Aug           423 non-null float64
Sep           423 non-null float64
Oct           423 non-null float64
Nov           423 non-null float64
Dec           423 non-null float64
J-D           423 non-null float64
D-N           420 non-null float64
DJF           420 non-null float64
MAM           423 non-null float64
JJA           423 non-null float64
SON           423 non-null float64
dtypes: float64(18), int64(1), object(1)
memory usage: 66.2+ KB


In [28]:
# Prepare data for viz: reshape the wide table (one column per month) into a
# long table with one row per (Hemisphere, month) observation.
#
# Each monthly frame is built with rename/assign, which return new objects, so
# nothing is written into a slice of `data` and pandas raises no
# SettingWithCopyWarning -- there is no need to silence warnings globally.
col_name = data.columns

monthly_frames = []
# columns 2..13 are the twelve calendar months (Jan..Dec); month runs 1..12
for month, month_col in enumerate(col_name[2:14], start=1):
    monthly = (
        data[['Hemisphere', month_col]]
        .rename(columns={month_col: 'value'})
        # 'YYYY-M' label, e.g. '1880-1'; parsed into a datetime in a later cell
        .assign(Time=data['Year'].astype(str) + '-' + str(month))
    )
    monthly_frames.append(monthly)

# concatenate once (instead of growing the frame inside the loop); the original
# row labels are kept, so they repeat once per month
df = pd.concat(monthly_frames)
df.head(10)

Unnamed: 0,Hemisphere,value,Time
0,Global,-0.17,1880-1
1,Global,-0.18,1881-1
2,Global,0.18,1882-1
3,Global,-0.28,1883-1
4,Global,-0.12,1884-1
5,Global,-0.58,1885-1
6,Global,-0.43,1886-1
7,Global,-0.71,1887-1
8,Global,-0.33,1888-1
9,Global,-0.08,1889-1


In [29]:
# replace the repeating row labels with a fresh 0..n-1 index;
# the old labels are kept as an 'index' column
df = df.reset_index(drop=False)

In [30]:
# Summarise the reshaped long-format frame (row count, columns, dtypes).
# df.head(5) / df.tail(10) are handy alternatives for eyeballing actual rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5076 entries, 0 to 5075
Data columns (total 4 columns):
index         5076 non-null int64
Hemisphere    5076 non-null object
value         5076 non-null float64
Time          5076 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 158.7+ KB


In [31]:
# parse the 'YYYY-M' strings into timestamps so the charts can use Time as a temporal field
parsed_time = pd.to_datetime(df['Time'], format='%Y-%m')
df['Time'] = parsed_time


In [32]:
# By default Altair refuses to chart a DataFrame with more than 5000 rows,
# and df has 5076. Lift that limit instead of deleting rows: df is stacked
# month by month, so rows 5000-5075 are the tail of the December block --
# dropping them would remove 76 December values rather than the most recent
# observations, and the hard-coded range would break if the row count changed.
alt.data_transformers.disable_max_rows()

For task 1: present the change in temperature over time

In [33]:
### base model

# clicking a legend entry selects that Hemisphere series (legend-bound multi-selection)
selection = alt.selection_multi(fields=["Hemisphere"], bind='legend')

# encodings, named one by one so the chart definition below stays short
time_axis = alt.X('Time:T', axis=alt.Axis(format='%Y', title='date', tickSize=0))
change_axis = alt.Y('value', axis=alt.Axis(format='.2f', title='Temperature change (C)'))
series_color = alt.Color("Hemisphere", scale=alt.Scale(scheme="spectral"))
# selected series are drawn fully opaque, the others fade to 0.2
series_opacity = alt.condition(selection, alt.value(1), alt.value(.2))

s_3 = (
    alt.Chart(df, title='Global Temperature change')
    .mark_circle()
    .encode(
        x=time_axis,
        y=change_axis,
        color=series_color,
        tooltip=['Hemisphere', 'Time', 'value'],
        opacity=series_opacity,
    )
    .add_selection(selection)
)

# thin horizontal rule at y = 0, i.e. "no change"
ref_line = alt.Chart().mark_rule(color='black', strokeWidth=0.5).encode(y=alt.datum(0))

# stack scatter + reference line and enable pan/zoom
alt.layer(s_3, ref_line).properties(width=800, height=200).interactive()

1. From the graph, we can observe that the temperature change (Global, Northern Hemisphere, Southern Hemisphere) shows an upward trend, and temperatures are rising at a steady pace.
2. Clicking on the Northern and Southern Hemisphere entries in the legend shows that the Northern Hemisphere has a higher temperature change than the Southern Hemisphere.

For task 2: exploring and generating new knowledge

In [34]:
# Interactive comparison of the Global / Northern / Southern series:
# a clickable side legend, a zero reference line, and a hover rule that
# labels the temperature change of every series at the hovered time.

# clicking a row of the side "legend" chart selects that Hemisphere
selection = alt.selection_multi(fields=["Hemisphere"])
# selected series keep their magma colour, everything else turns light gray;
# the built-in legend is switched off because a custom one is drawn below
color = alt.condition(selection,
                      alt.Color('Hemisphere:N', scale=alt.Scale(scheme="magma"), legend=None),
                      alt.value('lightgray'))

# main scatter of all observations
s_3 = alt.Chart(df).mark_circle(opacity=0.3).encode(
    x=alt.X('Time:T', axis=alt.Axis(format='%Y', title='date')),
    y=alt.Y('value', axis=alt.Axis(format='.2f', title='Temperature change (C)')),
    color=color,
    tooltip=['Hemisphere', 'Time', 'value']
)

# custom clickable legend: one row per Hemisphere, drives `selection`
legend = alt.Chart(df).mark_point().encode(
    y=alt.Y('Hemisphere:N', axis=alt.Axis(orient='right')),
    color=color
).add_selection(selection)

# thin horizontal rule at y = 0, i.e. "no change"
ref_line = alt.Chart().mark_rule(color='black', strokeWidth=0.5).encode(
    y=alt.datum(0)
)

# x-value of the cursor: picks the Time nearest to the mouse, nothing when idle
nearest = alt.selection(type='single', nearest=True, on="mouseover",
                        fields=['Time'], empty="none")

# one line per Hemisphere; only used as the base for `points` and `text_1`,
# it is not added to the final layer itself
line_v = alt.Chart(df).mark_line(interpolate="basis").encode(
    x='Time:T',
    y='value:Q',
    color='Hemisphere:N')

# invisible points spanning the chart; they carry the hover selection
selectors = alt.Chart(df).mark_point().encode(
    x="Time:T",
    opacity=alt.value(0),
).add_selection(nearest)

# highlight the points at the hovered Time
points = line_v.mark_point().encode(
    # opacity must lie in [0, 1]; the previous value of 2 was out of range
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
)

# text label with the value of each series at the hovered Time
text_1 = line_v.mark_text(align='left', dx=10, dy=-30).encode(
    text=alt.condition(nearest, 'value:Q', alt.value(''))
)

# vertical rule at the hovered Time
rule = alt.Chart(df).mark_rule(color="grey").encode(
    x='Time:T'
).transform_filter(nearest)

# put all the layers into one pan/zoom chart and place the custom legend beside it
alt.layer(s_3, ref_line, selectors, text_1, points, rule).properties(width=800, height=200).interactive() | legend

1. By clicking the legend, the user can select a different Hemisphere or the Global trend.
2. When hovering the mouse over the data, the vertical reference line shows the chosen time point and also shows the temperature-change values at that time point for the Northern Hemisphere, the Southern Hemisphere and the Global series.
3. By zooming, the user can find even more detailed information about each location and time point.

Below is the design that I initially had (Design 1 in my experiment). In this design, I put each independent trend line (Northern, Southern, Global) in its own chart, stacked vertically. After the experiment and interviews with my participants, I decided to abandon this design and went with the ones above.

In [35]:
# split the long table into one frame per series (rows matching each Hemisphere label)
N, S, G = (
    df[df['Hemisphere'] == label]
    for label in ('Northern', 'Southern', 'Global')
)

In [36]:
def _hemisphere_scatter(frame, title, color):
    """Build the faint scatter panel for one series.

    frame -- rows of df for a single Hemisphere value
    title -- chart title
    color -- mark colour (the panel's trend line uses the same one)

    Returns an 800x200 Altair chart whose y axis is fixed to (-2, 2), so the
    three panels share the same vertical scale.
    """
    return alt.Chart(frame, title=title).mark_circle(color=color, opacity=0.2).encode(
        x=alt.X('Time:T', axis=alt.Axis(format='%Y', title='date', tickSize=0)),
        y=alt.Y('value', axis=alt.Axis(format='.2f', title='Temperature change (C)'),
                scale=alt.Scale(domain=(-2, 2))),
        tooltip=['Hemisphere', 'Time', 'value'],
    ).properties(width=800, height=200)


def _loess_trend(scatter, color):
    """Derive a LOESS-smoothed trend line (value over Time) from a scatter panel."""
    return scatter.transform_loess('Time', 'value').mark_line(color=color)


# one scatter panel per series
s_N = _hemisphere_scatter(N, 'Temperature change of Northern Hemisphere', 'blue')
s_S = _hemisphere_scatter(S, 'Temperature change of Southern Hemisphere', 'green')
s_G = _hemisphere_scatter(G, 'Temperature change of Global', 'orange')

# thin horizontal rule at y = 0, i.e. "no change"
ref_line = alt.Chart().mark_rule(color='black', strokeWidth=0.5).encode(
    y=alt.datum(0)
)

# add trend lines
L_line_G = _loess_trend(s_G, 'orange')
L_line_S = _loess_trend(s_S, 'green')
L_line_N = _loess_trend(s_N, 'blue')

# vertically stack the 3 graphs, each layered as scatter + zero line + trend
alt.vconcat(
    alt.layer(s_N, ref_line, L_line_N),
    alt.layer(s_S, ref_line, L_line_S),
    alt.layer(s_G, ref_line, L_line_G),
)