In [1]:
from os.path import join as dir_join
from os.path import exists as dir_exists
from os import makedirs, listdir
import re
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import time
from itertools import combinations
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support
from IPython.core import display as ICD

from utills import Candidate, Platoon
from pattern_miner import Miner

import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

def get_trajectory_id(text):
    m = re.search('client_(.+).csv', text)
    if m:
        found = m.group(1)
        return found
    else:
        raise ValueError()

TRAJ_FOLDER = 'paths'
columns = ['lat', 'long', 'datetime', 'trajectory_id']
FILE_NAME = dir_join(TRAJ_FOLDER, 'processed.csv')

if not dir_exists(FILE_NAME):
    if not dir_exists(TRAJ_FOLDER):
        raise ValueError(TRAJ_FOLDER + ' does not exist')
    folder_files = istdir(TRAJ_FOLDER)
    list_df = []
    for filename in folder_files:
        df = pd.read_csv(dir_join(TRAJ_FOLDER, filename), names=columns)
        df['datetime'] = pd.to_datetime(df['datetime'], unit='s')
        df['trajectory_id'] = get_trajectory_id(filename)
        list_df += [df]
    df = pd.concat(list_df, ignore_index=True)
    df.to_csv(FILE_NAME, index=False)

df = pd.read_csv(FILE_NAME, parse_dates=[columns.index('datetime')], dtype={'lat': np.float32, 'long': np.float32, 'trajectory_id': np.str_})
df.head()

Unnamed: 0,lat,long,datetime,trajectory_id
0,359.5,416.5,2017-12-21 13:01:42,4976
1,359.5,416.5,2017-12-21 13:01:49,4976
2,359.5,416.5,2017-12-21 13:01:54,4976
3,364.5,426.5,2017-12-21 13:01:59,4976
4,379.5,456.5,2017-12-21 13:02:06,4976


In [2]:
savefolder = 'results_large'
if not dir_exists(savefolder):
    makedirs(savefolder)

sampling_interval = pd.Timedelta(minutes=1)
split_border = pd.Timedelta(days=1)
max_time_interval = df['datetime'].max()-df['datetime'].min()
pl = Platoon(2, 60, 5, max_time_interval // sampling_interval)
miner = Miner(df, pl, sampling_interval)

time1 = time.time()
print('Start time: ' + time.ctime())
#miner.extract_staypoints_heatmap(10)
#miner.save_staypoints_heatmap(dir_join(savefolder, 'staypoints_heatmap.npy'))
miner.load_staypoints_heatmap(dir_join(savefolder, 'staypoints_heatmap.npy'))
time2 = time.time()
print('Staypoints extraction done. Time: ' + str(time2 - time1))
miner.unify_datetime(split_border)
time3 = time.time()
print('Data unification done. Time: ' + str(time3 - time2))
#miner.compute_candidate_stars(5)
#miner.save_candidate_stars(dir_join(savefolder, 'candidate_stars.json'))
miner.load_candidate_stars(dir_join(savefolder, 'candidate_stars.json'))
time4 = time.time()
print('Candidate stars computing done. Time: ' + str(time4 - time3))
miner.compute_pattern_set()
time5 = time.time()
print('Pattern set computing done. Time: ' + str(time5 - time4))
#miner.compute_connection_rate()
#miner.save_connection_rate(dir_join(savefolder, 'connection_rate.npz'))
miner.load_connection_rate(dir_join(savefolder, 'connection_rate.npz'))
time6 = time.time()
print('Connection rate computing done. Time: ' + str(time6 - time5))
print('All time: ' + str(time6 - time1))
print('Finish time: ' + time.ctime())

Start time: Sat May 12 15:48:48 2018
Staypoints extraction done. Time: 0.011251211166381836
Data unification done. Time: 206.62967610359192
Candidate stars computing done. Time: 0.1656208038330078
Pattern set computing done. Time: 8.089789628982544
Connection rate computing done. Time: 0.00606083869934082
All time: 214.9023985862732
Finish time: Sat May 12 15:52:23 2018


In [3]:
userinfo = pd.read_csv('user_info.csv', parse_dates=['userinfo_dateofbirth'], dtype={'userinfo_sso': np.str_}).fillna('not_set')
bins = pd.date_range(userinfo['userinfo_dateofbirth'].min() - pd.Timedelta(days=365), pd.to_datetime('today'), freq='10Y')
bins = bins[[0,3]+list(range(6,len(bins)))] # drop redundant
userinfo['dateofbirth_cat'] = pd.cut(userinfo['userinfo_dateofbirth'], bins, labels=[str(bins[i-1].year)+'-'+str(bins[i].year) for i in range(1, len(bins))])

race_cut = {}
race_cut['GreaterEuropean,WestEuropean,Italian'] = 'Italian'
race_cut['GreaterEuropean,WestEuropean,Germanic'] = 'Germanic'
race_cut['GreaterEuropean,WestEuropean,Nordic'] = 'Nordic'
race_cut['GreaterEuropean,British'] = 'British'
race_cut['GreaterAfrican,Africans'] = 'Africans'
race_cut['Asian,GreaterEastAsian,EastAsian'] = 'EastAs'
race_cut['GreaterEuropean,WestEuropean,French'] = 'French'
race_cut['GreaterEuropean,EastEuropean'] = 'EastEu'
race_cut['GreaterEuropean,WestEuropean,Hispanic'] = 'Hispanic'
race_cut['GreaterAfrican,Muslim'] = 'Muslim'
race_cut['GreaterEuropean,Jewish'] = 'Jewish'
race_cut['Asian,IndianSubContinent'] = 'Indian'
race_cut['Asian,GreaterEastAsian,Japanese'] = 'Japanese'
userinfo['race'] = userinfo['race'].apply(lambda x: race_cut[x])

userinfo.head()

Unnamed: 0,userinfo_sso,userinfo_dateofbirth,userinfo_language,race,Gender,dateofbirth_cat
0,1,1970-01-01,de,Italian,male,1969-1979
1,5,1984-11-01,en,Italian,male,1979-1989
2,13,1970-01-01,de,Germanic,male,1969-1979
3,25,1961-09-23,en,Italian,female,1939-1969
4,28,1965-10-15,en,Germanic,male,1939-1969


In [4]:
savefolder_userinfo = dir_join(savefolder, 'userinfo_hist')
if not dir_exists(savefolder_userinfo):
    makedirs(savefolder_userinfo)

cat_cols = ['userinfo_language', 'race', 'Gender', 'dateofbirth_cat']
G = miner.graph()
for column_name in cat_cols:
    for i in G:
        column = userinfo[userinfo['userinfo_sso'].isin(list(G.neighbors(i)))][column_name]
        pd.Series(dict((y, x) for x, y in np.array(np.unique(column, return_counts=True))[::-1].T)).plot(kind='bar')
        plt.title(' '.join([column_name, 'Person', str(i)]))
        plt.tight_layout()
        plt.savefig(dir_join(savefolder_userinfo, '.'.join([str(i), column_name, 'png'])))
        plt.clf()
    column = userinfo[column_name]
    pd.Series(dict((y, x) for x, y in np.array(np.unique(column, return_counts=True))[::-1].T)).plot(kind='bar')
    plt.title(' '.join([column_name, 'All']))
    plt.tight_layout()
    plt.savefig(dir_join(savefolder_userinfo, '.'.join([column_name, 'All', 'png'])))
    plt.clf()

<matplotlib.figure.Figure at 0x7f7fc26563c8>

In [5]:
savefolder_unary = dir_join(savefolder, 'unary_hist')
if not dir_exists(savefolder_unary):
    makedirs(savefolder_unary)

cat_cols = ['userinfo_language', 'race', 'Gender', 'dateofbirth_cat']
G = miner.graph()
for column_name in cat_cols:
    values_dict = {}
    for value in userinfo[column_name].unique():
        values_dict[value] = pd.Series()
    for i in G:
        value = userinfo[userinfo['userinfo_sso']==i][column_name].iloc[0]
        column = userinfo[userinfo['userinfo_sso'].isin(list(G.neighbors(i)))][column_name]
        if(len(column) > 0):
            values_dict[value] = values_dict[value].add(column.value_counts(normalize=True), fill_value=0)
    for value in values_dict:
        if(len(values_dict[value]) > 0):
            values_dict[value].plot(kind='bar')
            plt.title(' '.join([column_name, value]))
            plt.tight_layout()
            plt.savefig(dir_join(savefolder_unary, '.'.join([column_name, value, 'png'])))
            plt.clf()
        else:
            print(' '.join(['Value', value, 'skipped']))

<matplotlib.figure.Figure at 0x7f7fc2cc0a20>

In [6]:
savefolder_binary = dir_join(savefolder, 'binary_hist')
if not dir_exists(savefolder_binary):
    makedirs(savefolder_binary)

cat_cols = ['userinfo_language', 'race', 'Gender', 'dateofbirth_cat']
G = miner.graph()
for column_name in combinations(cat_cols, 2):
    column_name_list = list(column_name)
    values_dict = {}
    for value in userinfo[column_name_list].groupby(column_name_list).count().reset_index().values:
        values_dict[tuple(value)] = pd.Series()
    for i in G:
        value = tuple(userinfo[userinfo['userinfo_sso']==i][column_name_list].values[0])
        columns = userinfo[userinfo['userinfo_sso'].isin(list(G.neighbors(i)))][column_name_list].groupby(column_name_list).size()
        columns /= columns.sum()
        if(len(columns) > 0):
            values_dict[value] = values_dict[value].add(columns, fill_value=0) if len(values_dict[value]) > 0 else columns
    for value in values_dict:
        if(len(values_dict[value]) > 0):
            values_dict[value].plot(kind='bar')
            plt.title(' '.join([' '.join(column_name), ' '.join(value)]))
            plt.tight_layout()
            plt.savefig(dir_join(savefolder_binary, '.'.join(['_'.join(column_name), '_'.join(value), 'png'])))
            plt.clf()
        else:
            print(' '.join(['Value', ' '.join(value), 'skipped']))

Value de Italian skipped
Value de Nordic skipped
Value it EastEu skipped
Value it French skipped
Value it not_set skipped
Value de 1909-1939 skipped
Value de 1979-1989 skipped
Value it 1909-1939 skipped
Value it 1999-2009 skipped
Value Africans male skipped
Value EastAs male skipped
Value Hispanic female skipped
Value Nordic female skipped
Value Africans 1909-1939 skipped
Value Africans 1939-1969 skipped
Value Africans 1999-2009 skipped
Value EastAs 1909-1939 skipped
Value EastAs 1939-1969 skipped
Value EastAs 1989-1999 skipped
Value EastAs 1999-2009 skipped
Value EastEu 1909-1939 skipped
Value EastEu 1979-1989 skipped
Value EastEu 1999-2009 skipped
Value French 1909-1939 skipped
Value French 1999-2009 skipped
Value Hispanic 1909-1939 skipped
Value Hispanic 1979-1989 skipped
Value Hispanic 1999-2009 skipped
Value Indian 1909-1939 skipped
Value Indian 1939-1969 skipped
Value Indian 1979-1989 skipped
Value Italian 1909-1939 skipped
Value Japanese 1909-1939 skipped
Value Japanese 1939-196

<matplotlib.figure.Figure at 0x7f7fc2f915c0>

In [142]:
def get_common_data(df, neighbors, cat_cols, label_col, mode='unary'):
    data_dict = {}
    if mode == 'binary':
        for column_name in combinations(cat_cols, 2):
            column_name_list = list(column_name)
            values_dict = {}
            #unique_values = df[column_name_list].groupby(column_name_list).count().reset_index().values
            unique_values = df[column_name_list].groupby(column_name_list).size().index.tolist()
            for value in unique_values:
                #values_dict[tuple(value)] = pd.Series(index=unique_values).fillna(0)
                values_dict[value] = pd.Series(index=unique_values).fillna(0)
            for i in neighbors:
                value = tuple(df[df[label_col]==i][column_name_list].values[0])
                columns = df[df[label_col].isin(neighbors[i])][column_name_list].groupby(column_name_list).size()
                columns = pd.Series(columns.values / columns.sum(), index=columns.index.tolist())
                if len(columns) > 0:
                    values_dict[value] = values_dict[value].add(columns, fill_value=0)
            for value in values_dict:
                values_dict[value] = values_dict[value] / values_dict[value].sum()
            data_dict[column_name] = values_dict
    elif mode == 'unary':
        for column_name in cat_cols:
            values_dict = {}
            unique_values = df[column_name].unique()
            for value in unique_values:
                values_dict[value] = pd.Series(index=unique_values).fillna(0)
            for i in neighbors:
                value = df[df[label_col]==i][column_name].iloc[0]
                column = df[df[label_col].isin(neighbors[i])][column_name]
                if len(column) > 0:
                    values_dict[value] = values_dict[value].add(column.value_counts(normalize=True), fill_value=0)
            for value in values_dict:
                values_dict[value] = values_dict[value] / values_dict[value].sum()
            data_dict[column_name] = values_dict
    else:
        raise NotImplementedError()
    return data_dict

def get_user_data(df, neighbors, cat_cols, label_col, mode='unary'):
    data_dict = {}
    if mode == 'binary':
        for column_name in combinations(cat_cols, 2):
            column_name_list = list(column_name)
            values_dict = {}
            unique_values = df[column_name_list].groupby(column_name_list).count().reset_index().values
            for i in neighbors:
                values_dict[i] = pd.Series(index=unique_values).fillna(0)
                columns = df[df[label_col].isin(neighbors[i])][column_name_list].groupby(column_name_list).size()
                columns = pd.Series(columns.values / columns.sum(), index=columns.index.tolist())
                if len(columns) > 0:
                    values_dict[i] = values_dict[i].add(columns, fill_value=0)
            data_dict[column_name] = values_dict
    elif mode == 'unary':
        for column_name in cat_cols:
            values_dict = {}
            for i in neighbors:
                values_dict[i] = pd.Series(index=df[column_name].unique()).fillna(0)
                column = df[df[label_col].isin(neighbors[i])][column_name]
                if len(column) > 0:
                    values_dict[i] = values_dict[i].add(column.value_counts(normalize=True), fill_value=0)
            data_dict[column_name] = values_dict
    else:
        raise NotImplementedError()
    return data_dict

def test_prediction(df, G, cat_cols, label_col, n_splits, mode='unary'):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    X = df[df[label_col].isin(G)]
    score = dict((cat, []) for cat in cat_cols)
    cat_labels = dict((cat, np.sort(df[cat].unique())) for cat in cat_cols)
    score_labels = ['precision', 'recall', 'fscore', 'count']
    for train_index, test_index in kf.split(X):
        # prepare data, make histograms
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        neighbors_train = dict((i, list(G.neighbors(i))) for i in G if i in X_train[label_col].values)
        neighbors_test = dict((i, list(G.neighbors(i))) for i in G if i in X_test[label_col].values)
        data_train = get_common_data(X_train, neighbors_train, cat_cols, label_col, mode=mode)
        data_test = get_user_data(X_train, neighbors_test, cat_cols, label_col, mode=mode)
        # get true data
        true_dict = dict((col, pd.Series()) for col in cat_cols)
        for i in X_test.iterrows():
            for col in cat_cols:
                true_dict[col][i[1][label_col]] = i[1][col]
        # make a prediction
        pred_dict = {}
        for cat, persons_dict in data_test.items():
            pred_cat_dict = pd.Series()
            for i in persons_dict:
                deviation = [(value, np.linalg.norm(persons_dict[i] - hist, 1)) for value, hist in data_train[cat].items()]
                pred_cat_dict[i] = min(deviation, key=(lambda item:item[1]))[0]
            pred_dict[cat] = pred_cat_dict
        if mode == 'binary':
            pred_dict_final = {}
            for col in cat_cols:
                pred_cat_dict = pd.Series()
                for i in X_test[label_col]:
                    votes = [pred_dict[cat][i][cat.index(col)] for cat in pred_dict if col in cat]
                    pred_cat_dict[i] = max(votes, key=votes.count)
                pred_dict_final[col] = pred_cat_dict
            pred_dict = pred_dict_final
        # compute the metrics
        for cat in score:
            score[cat] += [precision_recall_fscore_support(true_dict[cat], pred_dict[cat], labels=cat_labels[cat])]
    # make metrics prettier
    for cat in score:
        mean_score = np.mean(score[cat], axis=0)
        mean_score[-1] *= n_splits
        score[cat] = pd.DataFrame(mean_score, index=score_labels, columns=cat_labels[cat]).round(2)
    return score

In [143]:
n_splits = 10
label_col = 'userinfo_sso'
G = miner.graph()
cat_cols = ['userinfo_language', 'race', 'Gender', 'dateofbirth_cat']
score_unary = test_prediction(userinfo, G, cat_cols, label_col, n_splits, mode='unary')
score_binary = test_prediction(userinfo, G, cat_cols, label_col, n_splits, mode='binary')

savefolder_unary_res = dir_join(savefolder, 'unary_res')
if not dir_exists(savefolder_unary_res):
    makedirs(savefolder_unary_res)

for cat, df in score_unary.items():
    df.to_csv(dir_join(savefolder_unary_res, '.'.join([cat, 'unary', 'csv'])))
    print(' '.join([cat, 'unary']))
    ICD.display(df)
    print()
    print()

savefolder_binary_res = dir_join(savefolder, 'binary_res')
if not dir_exists(savefolder_binary_res):
    makedirs(savefolder_binary_res)

for cat, df in score_binary.items():
    df.to_csv(dir_join(savefolder_binary_res, '.'.join([cat, 'binary', 'csv'])))
    print(' '.join([cat, 'binary']))
    ICD.display(df)
    print()
    print()

userinfo_language unary


Unnamed: 0,de,en,it
precision,0.07,0.85,0.05
recall,0.15,0.38,0.27
fscore,0.08,0.5,0.09
count,16.0,138.0,9.0




race unary


Unnamed: 0,Africans,British,EastAs,EastEu,French,Germanic,Hispanic,Indian,Italian,Japanese,Jewish,Muslim,Nordic
precision,0.05,0.08,0.0,0.1,0.0,0.12,0.0,0.1,0.05,0.0,0.05,0.0,0.05
recall,0.1,0.08,0.0,0.05,0.0,0.06,0.0,0.05,0.02,0.0,0.03,0.0,0.05
fscore,0.07,0.07,0.0,0.07,0.0,0.07,0.0,0.07,0.02,0.0,0.04,0.0,0.05
count,5.0,30.0,2.0,13.0,7.0,19.0,6.0,10.0,44.0,2.0,12.0,9.0,4.0




Gender unary


Unnamed: 0,female,male,not_set
precision,0.32,0.47,0.28
recall,0.4,0.28,0.52
fscore,0.32,0.33,0.32
count,49.0,84.0,30.0




dateofbirth_cat unary


Unnamed: 0,1909-1939,1939-1969,1969-1979,1979-1989,1989-1999,1999-2009
precision,0.0,0.08,0.2,0.07,0.26,0.1
recall,0.0,0.05,0.12,0.14,0.16,0.15
fscore,0.0,0.06,0.15,0.08,0.16,0.12
count,2.0,34.0,39.0,20.0,54.0,14.0




userinfo_language binary


Unnamed: 0,de,en,it
precision,0.1,0.93,0.05
recall,0.65,0.23,0.05
fscore,0.17,0.35,0.05
count,16.0,138.0,9.0




race binary


Unnamed: 0,Africans,British,EastAs,EastEu,French,Germanic,Hispanic,Indian,Italian,Japanese,Jewish,Muslim,Nordic
precision,0.05,0.15,0.0,0.0,0.0,0.1,0.0,0.5,0.23,0.0,0.2,0.0,0.0
recall,0.2,0.14,0.0,0.0,0.0,0.03,0.0,0.38,0.11,0.0,0.09,0.0,0.0
fscore,0.08,0.14,0.0,0.0,0.0,0.05,0.0,0.42,0.15,0.0,0.12,0.0,0.0
count,5.0,30.0,2.0,13.0,7.0,19.0,6.0,10.0,44.0,2.0,12.0,9.0,4.0




Gender binary


Unnamed: 0,female,male,not_set
precision,0.33,0.5,0.12
recall,0.45,0.42,0.12
fscore,0.35,0.44,0.07
count,49.0,84.0,30.0




dateofbirth_cat binary


Unnamed: 0,1909-1939,1939-1969,1969-1979,1979-1989,1989-1999,1999-2009
precision,0.0,0.21,0.0,0.0,0.0,0.0
recall,0.0,1.0,0.0,0.0,0.0,0.0
fscore,0.0,0.34,0.0,0.0,0.0,0.0
count,2.0,34.0,39.0,20.0,54.0,14.0






In [140]:
import math

def get_mean_std(df, neighbors, cat_cols, label_col, mode='unary'):
    data_dict = {}
    if mode == 'binary':
        for column_name in combinations(cat_cols, 2):
            column_name_list = list(column_name)
            values_dict = {}
            moments_dict = {}
            unique_values = df[column_name_list].groupby(column_name_list).size().index.tolist()
            zero_series = pd.Series(index=unique_values).fillna(0)
            for value in unique_values:
                values_dict[value] = []
            for i in neighbors:
                value = tuple(df[df[label_col]==i][column_name_list].values[0])
                columns = df[df[label_col].isin(neighbors[i])][column_name_list].groupby(column_name_list).size()
                columns = pd.Series(columns.values / columns.sum(), index=columns.index.tolist())
                if len(columns) > 0:
                    values_dict[value] += [zero_series.add(columns, fill_value=0).values]
            for value in values_dict:
                moments_dict[value] = (np.mean(np.array(values_dict[value]), axis=0), np.std(np.array(values_dict[value]), axis=0))
            data_dict[column_name] = moments_dict
    elif mode == 'unary':
        for column_name in cat_cols:
            values_dict = {}
            moments_dict = {}
            unique_values = df[column_name].unique()
            zero_series = pd.Series(index=unique_values).fillna(0)
            for value in unique_values:
                values_dict[value] = []
            for i in neighbors:
                value = df[df[label_col]==i][column_name].iloc[0]
                column = df[df[label_col].isin(neighbors[i])][column_name]
                if len(column) > 0:
                    values_dict[value] += [zero_series.add(column.value_counts(normalize=True), fill_value=0).values]
            for value in values_dict:
                moments_dict[value] = (np.mean(np.array(values_dict[value]), axis=0), np.std(np.array(values_dict[value]), axis=0))
            data_dict[column_name] = moments_dict
    else:
        raise NotImplementedError()
    return data_dict

In [141]:
label_col = 'userinfo_sso'
G = miner.graph()
cat_cols = ['userinfo_language', 'race', 'Gender', 'dateofbirth_cat']
X = userinfo[userinfo[label_col].isin(G)]
neighbors = dict((i, list(G.neighbors(i))) for i in G)
num = 1
dfs = {}
for mode in ['unary', 'binary']:
    res = get_mean_std(X, neighbors, cat_cols, label_col, mode=mode)
    for cat, val in res.items():
        values = list(val.values())
        N = len(val.keys())
        metric_mat = np.zeros((N, N))
        for i in range(N):
            for j in range(N):
                metric_mat[i, j] = np.linalg.norm(values[i][0] - values[j][0], 1) - num * np.linalg.norm(values[i][1], 1) if i!=j else 1.
        dfs[cat] = pd.DataFrame(metric_mat, index=val.keys(), columns=[str(key) for key in val.keys()]).round(2)
        dfs[cat].to_csv(dir_join(savefolder_unary_res if mode=='unary' else savefolder_binary_res, '.'.join(['metric', str(cat), mode, 'csv'])))
        print(' '.join([str(cat), mode]))
        ICD.display(dfs[cat])
        print()
        print()

userinfo_language unary


Unnamed: 0,en,de,it
en,1.0,-0.63,-0.61
de,-0.57,1.0,-0.35
it,-0.49,-0.3,1.0




race unary


Unnamed: 0,Italian,Germanic,EastAs,British,French,Hispanic,EastEu,Africans,Muslim,Jewish,Nordic,Indian,Japanese
Italian,1.0,-1.08,-0.2,-1.29,-1.0,-1.02,-0.99,-0.67,-1.19,-1.15,-1.12,-1.22,-1.23
Germanic,-1.37,1.0,-0.66,-1.51,-1.36,-1.1,-1.12,-1.17,-1.19,-1.21,-1.35,-1.42,-1.32
EastAs,0.66,0.48,1.0,0.53,0.58,0.79,0.7,0.37,0.58,0.77,0.71,0.42,0.52
British,-1.18,-1.12,-0.22,1.0,-0.93,-0.78,-0.85,-0.69,-1.05,-0.94,-1.1,-1.07,-1.07
French,-0.92,-1.01,-0.21,-0.97,1.0,-0.76,-0.86,-0.56,-0.66,-1.04,-1.04,-0.88,-0.86
Hispanic,-1.04,-0.83,-0.08,-0.9,-0.84,1.0,-0.81,-0.36,-0.79,-0.9,-0.9,-0.92,-0.87
EastEu,-0.93,-0.78,-0.1,-0.9,-0.88,-0.73,1.0,-0.84,-0.82,-0.8,-0.85,-0.76,-0.88
Africans,-0.28,-0.5,-0.1,-0.41,-0.24,0.05,-0.5,1.0,-0.29,-0.14,-0.32,-0.32,-0.43
Muslim,-0.82,-0.53,0.1,-0.78,-0.36,-0.4,-0.51,-0.31,1.0,-0.54,-0.57,-0.7,-0.79
Jewish,-0.89,-0.67,0.17,-0.8,-0.86,-0.62,-0.6,-0.28,-0.66,1.0,-0.7,-0.77,-0.75




Gender unary


Unnamed: 0,male,not_set,female
male,1.0,-0.7,-0.61
not_set,-0.58,1.0,-0.43
female,-0.54,-0.47,1.0




dateofbirth_cat unary


Unnamed: 0,1979-1989,1969-1979,1909-1939,1939-1969,1989-1999,1999-2009
1979-1989,1.0,-0.54,0.02,-0.58,-0.5,-0.53
1969-1979,-0.79,1.0,-0.45,-1.06,-0.99,-0.94
1909-1939,-0.09,-0.32,1.0,-0.31,-0.3,-0.4
1939-1969,-0.55,-0.78,-0.17,1.0,-0.64,-0.67
1989-1999,-0.84,-1.08,-0.53,-1.01,1.0,-0.99
1999-2009,-0.39,-0.55,-0.14,-0.55,-0.51,1.0




('userinfo_language', 'race') binary


Unnamed: 0,"('de', 'Africans')","('de', 'British')","('de', 'French')","('de', 'Germanic')","('de', 'Jewish')","('de', 'Muslim')","('en', 'Africans')","('en', 'British')","('en', 'EastAs')","('en', 'EastEu')",...,"('en', 'Indian')","('en', 'Italian')","('en', 'Japanese')","('en', 'Jewish')","('en', 'Muslim')","('en', 'Nordic')","('it', 'Hispanic')","('it', 'Indian')","('it', 'Italian')","('it', 'Jewish')"
"(de, Africans)",1.0,1.57,1.37,1.21,1.33,1.67,1.75,1.49,1.17,1.76,...,1.37,1.61,1.53,1.5,1.68,1.56,2.0,1.38,1.46,1.56
"(de, British)",0.75,1.0,-0.45,-0.12,0.91,0.28,0.14,-0.44,0.4,-0.14,...,-0.48,-0.45,-0.44,-0.31,-0.32,-0.26,1.12,-0.31,-0.0,0.23
"(de, French)",1.37,0.37,1.0,0.65,1.66,1.04,1.12,0.48,1.38,0.82,...,0.34,0.45,0.43,0.42,0.66,0.56,1.94,0.48,0.71,0.93
"(de, Germanic)",0.01,-0.5,-0.55,1.0,0.6,-0.19,0.11,-0.37,-0.18,-0.09,...,-0.65,-0.39,-0.48,-0.39,-0.25,-0.27,0.78,-0.52,-0.3,-0.01
"(de, Jewish)",1.33,1.72,1.66,1.8,1.0,1.67,2.0,1.63,1.83,1.63,...,1.69,1.54,1.74,1.55,1.74,1.66,2.0,1.66,1.83,1.78
"(de, Muslim)",1.67,1.1,1.04,1.02,1.67,1.0,1.29,1.13,1.5,1.24,...,0.96,0.95,1.03,0.91,1.14,1.17,2.0,0.99,0.97,1.33
"(en, Africans)",0.61,-0.18,-0.01,0.18,0.86,0.16,1.0,-0.2,0.32,-0.19,...,-0.1,-0.05,-0.17,-0.01,-0.17,-0.18,0.86,-0.08,-0.05,0.3
"(en, British)",-0.35,-1.46,-1.37,-1.0,-0.21,-0.71,-0.91,1.0,-0.55,-1.19,...,-1.39,-1.47,-1.34,-1.35,-1.34,-1.33,0.13,-1.29,-1.05,-0.8
"(en, EastAs)",0.33,0.39,0.54,0.19,1.0,0.67,0.62,0.46,1.0,0.71,...,0.35,0.57,0.49,0.6,0.45,0.56,1.17,0.38,0.58,0.83
"(en, EastEu)",0.13,-0.96,-0.81,-0.52,-0.0,-0.39,-0.68,-0.98,-0.09,1.0,...,-0.9,-1.01,-0.89,-0.99,-0.81,-1.01,0.32,-0.92,-0.9,-0.44




('userinfo_language', 'Gender') binary


Unnamed: 0,"('de', 'female')","('de', 'male')","('de', 'not_set')","('en', 'female')","('en', 'male')","('en', 'not_set')","('it', 'female')","('it', 'male')"
"(de, female)",1.0,-0.02,0.37,-0.26,-0.39,-0.18,0.24,-0.26
"(de, male)",-0.11,1.0,0.32,-0.19,-0.21,-0.35,0.37,-0.21
"(de, not_set)",0.14,0.17,1.0,0.09,0.08,0.21,0.34,0.26
"(en, female)",-0.98,-0.83,-0.41,1.0,-1.06,-0.93,-0.51,-0.99
"(en, male)",-1.11,-0.84,-0.4,-1.05,1.0,-0.93,-0.48,-0.98
"(en, not_set)",-0.71,-0.8,-0.09,-0.74,-0.75,1.0,-0.17,-0.76
"(it, female)",0.02,0.23,0.34,-0.0,0.01,0.14,1.0,0.02
"(it, male)",-0.7,-0.57,0.04,-0.7,-0.7,-0.66,-0.2,1.0




('userinfo_language', 'dateofbirth_cat') binary


Unnamed: 0,"('de', '1939-1969')","('de', '1969-1979')","('de', '1989-1999')","('de', '1999-2009')","('en', '1909-1939')","('en', '1939-1969')","('en', '1969-1979')","('en', '1979-1989')","('en', '1989-1999')","('en', '1999-2009')","('it', '1939-1969')","('it', '1969-1979')","('it', '1979-1989')","('it', '1989-1999')"
"(de, 1939-1969)",1.0,-0.26,-0.2,0.15,0.08,-0.29,-0.34,-0.11,-0.2,-0.24,-0.21,-0.14,0.22,0.07
"(de, 1969-1979)",-0.11,1.0,0.01,0.32,0.31,-0.14,-0.14,-0.08,-0.06,-0.14,-0.11,-0.15,0.32,0.27
"(de, 1989-1999)",-0.53,-0.47,1.0,0.13,-0.44,-0.43,-0.54,-0.26,-0.46,-0.46,-0.7,-0.35,-0.02,-0.06
"(de, 1999-2009)",0.9,0.92,1.21,1.0,1.0,0.73,0.83,0.95,0.69,0.71,1.17,0.92,0.78,1.1
"(en, 1909-1939)",-0.17,-0.1,-0.36,0.0,1.0,-0.02,-0.06,0.16,0.01,-0.27,-0.12,0.11,0.0,0.3
"(en, 1939-1969)",-0.76,-0.76,-0.56,-0.48,-0.23,1.0,-1.01,-0.87,-0.92,-0.91,-0.58,-0.8,-0.56,-0.56
"(en, 1969-1979)",-1.31,-1.27,-1.18,-0.89,-0.78,-1.52,1.0,-1.32,-1.49,-1.38,-1.18,-1.2,-0.93,-0.98
"(en, 1979-1989)",-0.62,-0.74,-0.44,-0.3,-0.09,-0.91,-0.85,1.0,-0.75,-0.76,-0.56,-0.81,-0.61,-0.76
"(en, 1989-1999)",-1.16,-1.17,-1.08,-1.01,-0.7,-1.41,-1.46,-1.2,1.0,-1.28,-1.07,-1.07,-0.95,-0.95
"(en, 1999-2009)",-0.52,-0.58,-0.41,-0.32,-0.3,-0.72,-0.69,-0.54,-0.6,1.0,-0.38,-0.47,-0.45,-0.28




('race', 'Gender') binary


Unnamed: 0,"('Africans', 'female')","('Africans', 'not_set')","('British', 'female')","('British', 'male')","('British', 'not_set')","('EastAs', 'not_set')","('EastEu', 'female')","('EastEu', 'male')","('EastEu', 'not_set')","('French', 'female')",...,"('Japanese', 'male')","('Japanese', 'not_set')","('Jewish', 'female')","('Jewish', 'male')","('Jewish', 'not_set')","('Muslim', 'female')","('Muslim', 'male')","('Muslim', 'not_set')","('Nordic', 'male')","('Nordic', 'not_set')"
"(Africans, female)",1.0,-0.25,-0.44,-0.41,-0.26,-0.21,-0.02,-0.49,-0.6,0.02,...,-0.41,-0.34,-0.56,-0.33,0.5,0.5,-0.43,-0.54,-0.45,-0.33
"(Africans, not_set)",1.25,1.0,1.36,1.69,1.69,1.0,1.5,1.39,1.54,1.8,...,1.55,1.44,1.45,1.66,2.0,2.0,1.67,1.54,1.52,1.65
"(British, female)",-0.88,-0.58,1.0,-1.09,-0.84,-0.51,-0.85,-0.67,-0.94,-0.73,...,-1.38,-1.18,-1.26,-1.01,-0.12,-0.0,-1.06,-1.31,-1.38,-1.28
"(British, male)",-0.67,-0.07,-0.92,1.0,-0.86,-0.33,-0.79,-0.65,-0.76,-0.6,...,-0.89,-0.66,-1.12,-0.78,0.16,0.17,-1.16,-1.32,-1.19,-0.95
"(British, not_set)",-0.24,0.22,-0.37,-0.57,1.0,0.3,-0.29,-0.5,-0.47,-0.19,...,-0.32,-0.14,-0.33,-0.4,0.27,0.52,-0.83,-0.51,-0.45,-0.47
"(EastAs, not_set)",0.46,0.17,0.6,0.6,0.93,1.0,1.02,0.38,0.9,0.85,...,0.8,0.56,0.8,0.92,1.17,1.0,0.72,0.62,0.64,0.65
"(EastEu, female)",-0.36,-0.33,-0.74,-0.86,-0.65,0.02,1.0,-0.26,-0.69,-0.62,...,-0.73,-0.68,-0.97,-0.89,-0.08,0.12,-0.64,-0.8,-0.99,-0.56
"(EastEu, male)",-0.32,0.06,-0.06,-0.22,-0.35,-0.11,0.24,1.0,-0.23,0.18,...,-0.14,0.02,-0.05,0.11,0.63,0.66,-0.3,-0.29,-0.12,-0.29
"(EastEu, not_set)",-0.06,0.58,0.04,0.04,0.04,0.78,0.17,0.13,1.0,0.46,...,0.0,0.17,-0.05,-0.12,0.99,1.04,-0.13,-0.09,-0.16,0.1
"(French, female)",0.54,0.83,0.23,0.19,0.31,0.71,0.24,0.53,0.45,1.0,...,0.09,0.29,0.17,0.22,0.0,1.03,0.18,0.1,0.23,0.14




('race', 'dateofbirth_cat') binary


Unnamed: 0,"('Africans', '1969-1979')","('Africans', '1979-1989')","('Africans', '1989-1999')","('British', '1909-1939')","('British', '1939-1969')","('British', '1969-1979')","('British', '1979-1989')","('British', '1989-1999')","('British', '1999-2009')","('EastAs', '1969-1979')",...,"('Jewish', '1969-1979')","('Jewish', '1989-1999')","('Jewish', '1999-2009')","('Muslim', '1969-1979')","('Muslim', '1979-1989')","('Muslim', '1989-1999')","('Muslim', '1999-2009')","('Nordic', '1969-1979')","('Nordic', '1979-1989')","('Nordic', '1999-2009')"
"(Africans, 1969-1979)",1.0,1.5,1.5,1.5,1.09,1.21,1.34,1.04,1.08,0.5,...,1.2,1.37,1.25,1.25,0.75,1.15,1.17,1.05,1.5,1.02
"(Africans, 1979-1989)",2.0,1.0,2.0,2.0,1.63,1.68,1.59,1.79,1.95,2.0,...,1.61,1.77,1.4,1.88,2.0,1.68,1.75,1.65,1.5,1.81
"(Africans, 1989-1999)",1.0,1.0,1.0,0.33,0.69,0.25,0.7,0.48,0.3,1.0,...,0.26,0.54,0.8,0.42,0.5,0.78,0.63,0.59,0.67,0.43
"(British, 1909-1939)",2.0,2.0,1.33,1.0,1.7,1.47,1.31,1.82,1.35,2.0,...,1.4,1.82,1.85,1.48,1.5,1.77,1.71,1.88,2.0,1.71
"(British, 1939-1969)",0.16,0.2,0.26,0.27,1.0,-0.48,-0.32,-0.47,-0.12,0.44,...,-0.49,-0.29,-0.18,-0.38,0.09,-0.48,-0.61,-0.45,0.28,-0.37
"(British, 1969-1979)",-0.01,-0.05,-0.47,-0.25,-0.77,1.0,-0.58,-0.95,-0.49,0.15,...,-0.71,-0.53,-0.44,-0.59,-0.22,-0.64,-0.73,-0.74,-0.32,-0.81
"(British, 1979-1989)",0.44,0.19,0.3,-0.09,-0.29,-0.26,1.0,-0.25,-0.04,0.55,...,-0.24,-0.2,-0.29,-0.07,0.25,-0.55,-0.54,-0.19,0.31,-0.26
"(British, 1989-1999)",0.2,0.45,0.14,0.48,-0.38,-0.57,-0.19,1.0,0.11,0.41,...,-0.09,-0.2,-0.37,-0.11,0.26,-0.23,-0.48,-0.46,-0.02,-0.36
"(British, 1999-2009)",0.22,0.58,-0.07,-0.02,-0.06,-0.13,-0.01,0.08,1.0,0.3,...,-0.37,0.2,0.08,-0.22,-0.2,-0.29,-0.18,-0.02,0.58,-0.45
"(EastAs, 1969-1979)",1.0,2.0,2.0,2.0,1.87,1.88,1.94,1.75,1.67,1.0,...,2.0,2.0,1.9,1.94,1.5,1.95,1.83,1.61,2.0,1.81




('Gender', 'dateofbirth_cat') binary


Unnamed: 0,"('female', '1909-1939')","('female', '1939-1969')","('female', '1969-1979')","('female', '1979-1989')","('female', '1989-1999')","('female', '1999-2009')","('male', '1909-1939')","('male', '1939-1969')","('male', '1969-1979')","('male', '1979-1989')","('male', '1989-1999')","('male', '1999-2009')","('not_set', '1939-1969')","('not_set', '1969-1979')","('not_set', '1979-1989')","('not_set', '1989-1999')","('not_set', '1999-2009')"
"(female, 1909-1939)",1.0,1.15,1.12,1.24,0.95,1.28,2.0,1.19,1.28,1.03,1.31,0.88,1.41,1.33,1.11,1.28,1.14
"(female, 1939-1969)",0.38,1.0,-0.31,-0.24,-0.3,-0.27,1.01,-0.31,-0.37,-0.15,-0.47,-0.06,-0.32,-0.21,0.18,-0.31,-0.36
"(female, 1969-1979)",-0.84,-1.5,1.0,-1.33,-1.39,-1.39,-0.07,-1.35,-1.31,-1.33,-1.41,-1.16,-1.34,-1.25,-0.84,-1.36,-1.28
"(female, 1979-1989)",0.52,-0.17,-0.08,1.0,-0.15,-0.23,0.93,-0.32,-0.29,-0.32,-0.18,-0.03,-0.07,-0.22,0.39,0.03,-0.15
"(female, 1989-1999)",-0.86,-1.34,-1.24,-1.25,1.0,-1.1,0.05,-1.45,-1.29,-1.26,-1.2,-1.13,-1.16,-1.21,-0.8,-1.17,-1.35
"(female, 1999-2009)",0.39,-0.38,-0.32,-0.41,-0.18,1.0,0.61,-0.31,-0.48,-0.2,-0.39,-0.01,-0.22,-0.21,0.18,-0.16,-0.15
"(male, 1909-1939)",2.0,1.79,1.89,1.65,1.87,1.49,1.0,1.8,1.63,1.83,1.74,1.98,1.9,1.69,2.0,1.9,1.82
"(male, 1939-1969)",-0.25,-0.96,-0.82,-1.04,-1.07,-0.86,0.36,1.0,-0.95,-0.9,-0.96,-0.72,-0.93,-0.96,-0.28,-0.75,-0.98
"(male, 1969-1979)",-0.17,-1.05,-0.8,-1.04,-0.93,-1.04,0.18,-0.97,1.0,-0.77,-0.99,-0.59,-0.93,-0.98,-0.47,-0.82,-1.04
"(male, 1979-1989)",-0.66,-1.06,-1.06,-1.3,-1.14,-1.01,0.14,-1.16,-1.01,1.0,-1.02,-0.95,-0.94,-0.92,-0.59,-0.96,-0.95




