In [1]:
import pandas as pd

df = pd.read_csv("dataset/SecondaryData/secondary_data_no_miss.csv",sep=';')

In [2]:
# checks if a str is a number that could be interpreted as a float
def is_number(val):
    """
    Parameters
    ----------
    val: str, arbitrary input

    Returns
    -------
    bool, True if val is interpretable as a float and False else
    """

    try:
        float(val)
        return True
    except ValueError:
        return False

In [16]:
import matplotlib.pyplot as plt

In [2]:
import altair as alt
from dython import nominal
import altair_viewer
import scipy
from altair import pipe, limit_rows, to_values
t = lambda data: pipe(data, limit_rows(max_rows=100000), to_values)
alt.data_transformers.register('custom', t)
alt.data_transformers.enable('custom')
alt.renderers.enable('altair_viewer', inline=True)


def get_correlation_dataframe(data, **kwargs):
    """
    Parameters
    ----------
    data: pandas.DataFrame
    DataFrame with nominal or metrical columns

    kwargs:
    show_progress: bool, default=False
    Prints each row if True

    Returns
    -------
    var name=data_corr: pandas.DataFrame,
    with two column names and their correlation
    """

    if 'show_progress' not in kwargs:
        kwargs['show_progress'] = False
    data_corr = pd.DataFrame(columns=['variable1', 'variable2', 'correlation', 'correlation_rounded'])
    for variable1 in data:
        for variable2 in data:
            # nominal-nominal -> Theils U
            if type(data[variable1][0]) == str and type(data[variable2][0]) == str:
                corr = nominal.theils_u(data[variable1], data[variable2], nan_replace_value='f')
            # metircal-metrical -> Pearsons R
            elif is_number(data[variable1][0]) and is_number(data[variable2][0]):
                corr = scipy.stats.stats.pearsonr(data[variable1], data[variable2])[0]
                # change range from [-1, 1] to [0, 1] as the other metrics
                corr = (corr + 1) / 2
            # metrical-nominal -> correlation ratio
            elif type(data[variable1][0]) == str and is_number(data[variable2][0]):
                corr = nominal.correlation_ratio(data[variable1], data[variable2], nan_replace_value='f')
            elif type(data[variable2][0]) == str and is_number(data[variable1][0]):
                corr = nominal.correlation_ratio(data[variable2], data[variable1], nan_replace_value='f')
            else:
                print('var1-type: ' + str(type(data[variable1][0])) + ', var2-type: ' + str(type(data[variable2][0])))
                print('var1: ' + str(data[variable1][0]) + ', var2: ' + str(data[variable2][0]))
            new_row = {'variable1': variable1, 'variable2': variable2,
                'correlation': corr, 'correlation_rounded': round(corr, 2)}
            data_corr = pd.concat([data_corr, pd.DataFrame([new_row])], ignore_index=True)
            if kwargs['show_progress']:
                print(new_row)
    return data_corr


def get_correlation_chart(data, **kwargs):
    """
    Parameters
    ----------
    data: pandas.DataFrame
    data with nominal or metrical columns

    kwargs:
    show_progress: bool, default=False,
    prints each row if True

    Returns
    -------
    altair.Chart,
    correlation heatmap of the data columns based on get_correlation_dataframe
    """

    if 'show_progress' not in kwargs:
        kwargs['show_progress'] = False

    data_corr = get_correlation_dataframe(data, show_progress=kwargs['show_progress'])

    base_chart = alt.Chart(data_corr).encode(
        alt.X('variable1:N', sort=data.columns.values),
        alt.Y('variable2:N', sort=data.columns.values)
    )

    corr_chart = base_chart.mark_rect().encode(
        alt.Color('correlation:Q', scale=alt.Scale(scheme='greys')),
    )

    text_chart = base_chart.mark_text().encode(
        alt.Text('correlation_rounded:Q'),
        color = (alt.condition(
            alt.datum.correlation > 0.5,
            alt.value('white'),
            alt.value('black')
        ))
    )

    return corr_chart + text_chart


ValueError: 
To use the 'altair_viewer' renderer, you must install the altair_viewer
package; see http://github.com/altair-viz/altair_viewer/
for more information.


In [20]:
from sklearn.preprocessing import LabelEncoder
encoded_data = df.copy()
le = LabelEncoder()
encoded_data['class'] = le.fit_transform(df['class'])
encoded_data = pd.get_dummies(encoded_data)

chart = get_correlation_chart(encoded_data)
view.show(chart)


  corr = scipy.stats.stats.pearsonr(data[variable1], data[variable2])[0]


RuntimeError: Internal: _stream is not defined.