**Important:**
> If you are inspecting this notebook through Github, you should hover to the upper right part of the notebook (over theta symbol) and click on *external view available with nbviewer*. This is necessary if you want to see charts on Github.

---
# Exploratory Data Analysis
---
DataVis Supplementary Material

In [1]:
import numpy as np
import pandas as pd
import altair as alt
from sklearn import datasets
from sklearn.metrics import mutual_info_score as mis
from sklearn.preprocessing import StandardScaler

In [2]:
# Uncomment if you are using dark jupyter lab/notebook theme
#alt.renderers.set_embed_options(theme='dark')

---
## Data import

In [3]:
# Load the bundled scikit-learn datasets; `.frame` is the DataFrame that holds
# the feature columns together with the 'target' column
data_iris = datasets.load_iris(as_frame=True).frame
data_diabetes = datasets.load_diabetes(as_frame=True).frame
data_wine = datasets.load_wine(as_frame=True).frame

In [4]:
data_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [5]:
data_diabetes.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [6]:
data_wine.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


---
## Correlation matrix chart

In [7]:
def create_corr_chart (data):
    """Build a Pearson correlation heatmap with the value printed in each cell.

    Parameters
    ----------
    data : pandas.DataFrame
        Table of features. A 'target' column is dropped if present, and only
        numeric/boolean columns take part in the correlation.

    Returns
    -------
    altair.LayerChart
        Colored rectangles with a text layer on top, filtered to the pairs
        where ``var_1 < var_2`` (one triangle, without the diagonal).
    """
    # Removing target column (if there is one) and every column that cannot be correlated
    features = data.drop (columns = 'target', errors = 'ignore').select_dtypes (include = ['number', 'bool'])

    # Long format: one row per (var_1, var_2) pair, with explicitly named columns
    corr = (
        features.corr(method = 'pearson')
        .rename_axis('var_1')
        .reset_index()
        .melt(id_vars = 'var_1', var_name = 'var_2', value_name = 'correlation')
    )

    # Create correlation matrix chart.
    # The color domain is pinned to [-1, 1] so that 0 always maps to the neutral
    # middle of the diverging scheme; otherwise the domain follows the observed
    # min/max (the diagonal of 1.0 is filtered out below) and the colors shift.
    heatmap = alt.Chart(corr).mark_rect().encode(
        alt.X ('var_1', title = None, axis = alt.Axis(labelAngle = -45)),
        alt.Y ('var_2', title = None),
        alt.Color(
            'correlation',
            legend = None,
            scale = alt.Scale(scheme = 'redblue', reverse = True, domain = [-1, 1]),
        ),
    ).properties(
        width = alt.Step(40),
        height = alt.Step(40)
    )

    # Create text values for each colored element on top of existing chart;
    # strong correlations sit on dark cells, so their label is switched to white
    labels = heatmap.mark_text(size = 12).encode(
        alt.Text ('correlation', format = ".2f"),
        color = alt.condition("abs(datum.correlation) > 0.5", alt.value('white'), alt.value('black'))
    )

    # If we want to return the chart with upper triangle as well, we should remove .transform...
    return (heatmap + labels).transform_filter("datum.var_1 < datum.var_2")
    

In [8]:
# Correlation heatmap for the wine dataset; uncomment the last two lines
# (and comment out the first two) to chart the diabetes dataset instead
chart_corr_wine = create_corr_chart (data_wine)
chart_corr_wine
#chart_corr_diabetes = create_corr_chart (data_diabetes)
#chart_corr_diabetes


### Exercise:
(MEDIUM HARD) Calculate the mutual information matrix for each feature (dataframe column) and plot it.

---
## Parallel coordinates

In [9]:
def create_parallel_chart (data):
    """Draw a parallel coordinates chart with one line per row of ``data``.

    ``data`` must contain a 'target' column. Every other column becomes one
    position on the x axis, and all columns share a single y axis.
    Lines are colored by 'target'.
    """
    # Long format: one record per (row, variable), keeping the row id and the class
    long_data = data.reset_index().melt(id_vars = ['index', 'target'])

    lines = alt.Chart(long_data).mark_line()
    lines = lines.encode(
        x = alt.X ('variable:N'),
        y = alt.Y ('value:Q'),
        color = alt.Color ('target:N'),
        detail = alt.Detail ('index:N'),  # groups the points of one original row into one line
        opacity = alt.value(0.4),
    )

    return lines.properties(width = 1000)


In [10]:
# 'proline' and 'magnesium' are omitted because their values are much larger than those of
# the other columns, so on the shared y axis they flatten every other variable.
# Try plotting without dropping them to see the effect.
chart_parallel_wine = create_parallel_chart (data_wine.drop (['proline', 'magnesium'], axis = 1))
chart_parallel_wine
#chart_parallel_iris = create_parallel_chart (data_iris)
#chart_parallel_iris

In [11]:
# Let's see what happens if we scale data
# Only the feature columns are standardized; 'target' is re-attached unchanged afterwards
features_wine = data_wine.drop (columns = 'target')
scaler_wine = StandardScaler()
scaled_data_wine = pd.DataFrame(
    scaler_wine.fit_transform (features_wine),
    columns = features_wine.columns,
    index = features_wine.index,  # keep the original row labels so 'target' aligns on assignment
)
scaled_data_wine['target'] = data_wine['target']
scaled_data_wine


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,1.518613,-0.562250,0.232053,-1.169593,1.913905,0.808997,1.034819,-0.659563,1.224884,0.251717,0.362177,1.847920,1.013009,0
1,0.246290,-0.499413,-0.827996,-2.490847,0.018145,0.568648,0.733629,-0.820719,-0.544721,-0.293321,0.406051,1.113449,0.965242,0
2,0.196879,0.021231,1.109334,-0.268738,0.088358,0.808997,1.215533,-0.498407,2.135968,0.269020,0.318304,0.788587,1.395148,0
3,1.691550,-0.346811,0.487926,-0.809251,0.930918,2.491446,1.466525,-0.981875,1.032155,1.186068,-0.427544,1.184071,2.334574,0
4,0.295700,0.227694,1.840403,0.451946,1.281985,0.808997,0.663351,0.226796,0.401404,-0.319276,0.362177,0.449601,-0.037874,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,0.876275,2.974543,0.305159,0.301803,-0.332922,-0.985614,-1.424900,1.274310,-0.930179,1.142811,-1.392758,-1.231206,-0.021952,2
174,0.493343,1.412609,0.414820,1.052516,0.158572,-0.793334,-1.284344,0.549108,-0.316950,0.969783,-1.129518,-1.485445,0.009893,2
175,0.332758,1.744744,-0.389355,0.151661,1.422412,-1.129824,-1.344582,0.549108,-0.422075,2.224236,-1.612125,-1.485445,0.280575,2
176,0.209232,0.227694,0.012732,0.151661,1.422412,-1.033684,-1.354622,1.354888,-0.229346,1.834923,-1.568252,-1.400699,0.296498,2


In [12]:
# Same parallel coordinates chart on the standardized data; the same two columns
# are dropped so the result can be compared with the unscaled chart
chart_parallel_scaled_wine = create_parallel_chart (scaled_data_wine.drop (['proline', 'magnesium'], axis = 1))
chart_parallel_scaled_wine

### Exercise:
(EASY) Instead of standardizing data, normalize it. Then plot that data with parallel coordinates chart.

### Exercise:
(HARD) Plot minimum, maximum and medium values for each and every variable on parallel coordinates chart. Help and examples can be found here:
* https://github.com/altair-viz/altair/issues/1034
* https://stackoverflow.com/questions/54671453/parallel-coordinates-in-vega-lite/54701776#54701776
* https://vega.github.io/vega/examples/parallel-coordinates/

---
## Scatter plot matrix

In [13]:
def create_scatter_matrix (data):
    """Build a scatter plot matrix over every pair of feature columns.

    Each column of ``data`` except 'target' appears once as a row and once as
    a column of the matrix; points are colored by 'target'.
    """
    # Every column except 'target' is treated as a feature
    features = [name for name in data.columns if name != 'target']

    # Single panel definition; the x/y fields are filled in by the repeat below
    panel = alt.Chart(data).mark_circle().encode(
        x = alt.X(alt.repeat("column"), type = 'quantitative', scale = alt.Scale (nice = True)),
        y = alt.Y(alt.repeat("row"), type = 'quantitative', scale = alt.Scale (nice = True)),
        color = 'target:N',
    ).properties(width = 150, height = 150)

    # Append .interactive() here to enable pan/zoom
    return panel.repeat(row = features, column = features)


In [14]:
# Scatter plot matrix for the iris dataset; the commented lines build the same
# chart for the wine dataset instead
#chart_scatter_wine = create_scatter_matrix (data_wine)
#chart_scatter_wine
chart_scatter_iris = create_scatter_matrix (data_iris)
chart_scatter_iris


### Exercise:
(MEDIUM HARD) Show only the lower triangle of the image above.