In [1]:
from os.path import join as dir_join
from os.path import exists as dir_exists
from os import makedirs, listdir
import re
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import time
from itertools import combinations
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support
from IPython.core import display as ICD

from utills import Candidate, Platoon
from pattern_miner import Miner

import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

def get_trajectory_id(text):
    m = re.search('client_(.+).csv', text)
    if m:
        found = m.group(1)
        return found
    else:
        raise ValueError()

TRAJ_FOLDER = 'paths'
columns = ['lat', 'long', 'datetime', 'trajectory_id']
FILE_NAME = dir_join(TRAJ_FOLDER, 'processed.csv')

if not dir_exists(FILE_NAME):
    if not dir_exists(TRAJ_FOLDER):
        raise ValueError(TRAJ_FOLDER + ' does not exist')
    folder_files = istdir(TRAJ_FOLDER)
    list_df = []
    for filename in folder_files:
        df = pd.read_csv(dir_join(TRAJ_FOLDER, filename), names=columns)
        df['datetime'] = pd.to_datetime(df['datetime'], unit='s')
        df['trajectory_id'] = get_trajectory_id(filename)
        list_df += [df]
    df = pd.concat(list_df, ignore_index=True)
    df.to_csv(FILE_NAME, index=False)

df = pd.read_csv(FILE_NAME, parse_dates=[columns.index('datetime')], dtype={'lat': np.float32, 'long': np.float32, 'trajectory_id': np.str_})
df.head()

Unnamed: 0,lat,long,datetime,trajectory_id
0,359.5,416.5,2017-12-21 13:01:42,4976
1,359.5,416.5,2017-12-21 13:01:49,4976
2,359.5,416.5,2017-12-21 13:01:54,4976
3,364.5,426.5,2017-12-21 13:01:59,4976
4,379.5,456.5,2017-12-21 13:02:06,4976


In [2]:
savefolder = 'results_large'
if not dir_exists(savefolder):
    makedirs(savefolder)

sampling_interval = pd.Timedelta(minutes=1)
split_border = pd.Timedelta(days=1)
max_time_interval = df['datetime'].max()-df['datetime'].min()
pl = Platoon(2, 60, 5, max_time_interval // sampling_interval)
miner = Miner(df, pl, sampling_interval)

time1 = time.time()
print('Start time: ' + time.ctime())
#miner.extract_staypoints_heatmap(10)
#miner.save_staypoints_heatmap(dir_join(savefolder, 'staypoints_heatmap.npy'))
miner.load_staypoints_heatmap(dir_join(savefolder, 'staypoints_heatmap.npy'))
time2 = time.time()
print('Staypoints extraction done. Time: ' + str(time2 - time1))
miner.unify_datetime(split_border)
time3 = time.time()
print('Data unification done. Time: ' + str(time3 - time2))
#miner.compute_candidate_stars(5)
#miner.save_candidate_stars(dir_join(savefolder, 'candidate_stars.json'))
miner.load_candidate_stars(dir_join(savefolder, 'candidate_stars.json'))
time4 = time.time()
print('Candidate stars computing done. Time: ' + str(time4 - time3))
miner.compute_pattern_set()
time5 = time.time()
print('Pattern set computing done. Time: ' + str(time5 - time4))
#miner.compute_connection_rate()
#miner.save_connection_rate(dir_join(savefolder, 'connection_rate.npz'))
miner.load_connection_rate(dir_join(savefolder, 'connection_rate.npz'))
time6 = time.time()
print('Connection rate computing done. Time: ' + str(time6 - time5))
print('All time: ' + str(time6 - time1))
print('Finish time: ' + time.ctime())

Start time: Mon May  7 12:14:32 2018
Staypoints extraction done. Time: 0.011429309844970703
Data unification done. Time: 198.77884316444397
Candidate stars computing done. Time: 0.14970827102661133
Pattern set computing done. Time: 7.496948957443237
Connection rate computing done. Time: 0.004793405532836914
All time: 206.44172310829163
Finish time: Mon May  7 12:17:59 2018


In [3]:
userinfo = pd.read_csv('user_info.csv', parse_dates=['userinfo_dateofbirth'], dtype={'userinfo_sso': np.str_}).fillna('not_set')
bins = pd.date_range(userinfo['userinfo_dateofbirth'].min() - pd.Timedelta(days=365), pd.to_datetime('today'), freq='10Y')
bins = bins[[0,3]+list(range(6,len(bins)))] # drop redundant
userinfo['dateofbirth_cat'] = pd.cut(userinfo['userinfo_dateofbirth'], bins, labels=[str(bins[i-1].year)+'-'+str(bins[i].year) for i in range(1, len(bins))])

race_cut = {}
race_cut['GreaterEuropean,WestEuropean,Italian'] = 'Italian'
race_cut['GreaterEuropean,WestEuropean,Germanic'] = 'Germanic'
race_cut['GreaterEuropean,WestEuropean,Nordic'] = 'Nordic'
race_cut['GreaterEuropean,British'] = 'British'
race_cut['GreaterAfrican,Africans'] = 'Africans'
race_cut['Asian,GreaterEastAsian,EastAsian'] = 'EastAs'
race_cut['GreaterEuropean,WestEuropean,French'] = 'French'
race_cut['GreaterEuropean,EastEuropean'] = 'EastEu'
race_cut['GreaterEuropean,WestEuropean,Hispanic'] = 'Hispanic'
race_cut['GreaterAfrican,Muslim'] = 'Muslim'
race_cut['GreaterEuropean,Jewish'] = 'Jewish'
race_cut['Asian,IndianSubContinent'] = 'Indian'
race_cut['Asian,GreaterEastAsian,Japanese'] = 'Japanese'
userinfo['race'] = userinfo['race'].apply(lambda x: race_cut[x])

userinfo.head()

Unnamed: 0,userinfo_sso,userinfo_dateofbirth,userinfo_language,race,Gender,dateofbirth_cat
0,1,1970-01-01,de,Italian,male,1969-1979
1,5,1984-11-01,en,Italian,male,1979-1989
2,13,1970-01-01,de,Germanic,male,1969-1979
3,25,1961-09-23,en,Italian,female,1939-1969
4,28,1965-10-15,en,Germanic,male,1939-1969


In [4]:
savefolder_userinfo = dir_join(savefolder, 'userinfo_hist')
if not dir_exists(savefolder_userinfo):
    makedirs(savefolder_userinfo)

cat_cols = ['userinfo_language', 'race', 'Gender', 'dateofbirth_cat']
G = miner.graph()
for column_name in cat_cols:
    for i in G:
        column = userinfo[userinfo['userinfo_sso'].isin(list(G.neighbors(i)))][column_name]
        pd.Series(dict((y, x) for x, y in np.array(np.unique(column, return_counts=True))[::-1].T)).plot(kind='bar')
        plt.title(' '.join([column_name, 'Person', str(i)]))
        plt.tight_layout()
        plt.savefig(dir_join(savefolder_userinfo, '.'.join([str(i), column_name, 'png'])))
        plt.clf()
    column = userinfo[column_name]
    pd.Series(dict((y, x) for x, y in np.array(np.unique(column, return_counts=True))[::-1].T)).plot(kind='bar')
    plt.title(' '.join([column_name, 'All']))
    plt.tight_layout()
    plt.savefig(dir_join(savefolder_userinfo, '.'.join([column_name, 'All', 'png'])))
    plt.clf()

<matplotlib.figure.Figure at 0x7f297aed90f0>

In [5]:
savefolder_unary = dir_join(savefolder, 'unary_hist')
if not dir_exists(savefolder_unary):
    makedirs(savefolder_unary)

cat_cols = ['userinfo_language', 'race', 'Gender', 'dateofbirth_cat']
G = miner.graph()
for column_name in cat_cols:
    values_dict = {}
    for value in userinfo[column_name].unique():
        values_dict[value] = pd.Series()
    for i in G:
        value = userinfo[userinfo['userinfo_sso']==i][column_name].iloc[0]
        column = userinfo[userinfo['userinfo_sso'].isin(list(G.neighbors(i)))][column_name]
        if(len(column) > 0):
            values_dict[value] = values_dict[value].add(column.value_counts(normalize=True), fill_value=0)
    for value in values_dict:
        if(len(values_dict[value]) > 0):
            values_dict[value].plot(kind='bar')
            plt.title(' '.join([column_name, value]))
            plt.tight_layout()
            plt.savefig(dir_join(savefolder_unary, '.'.join([column_name, value, 'png'])))
            plt.clf()
        else:
            print(' '.join(['Value', value, 'skipped']))

<matplotlib.figure.Figure at 0x7f297bc8afd0>

In [6]:
savefolder_binary = dir_join(savefolder, 'binary_hist')
if not dir_exists(savefolder_binary):
    makedirs(savefolder_binary)

cat_cols = ['userinfo_language', 'race', 'Gender', 'dateofbirth_cat']
G = miner.graph()
for column_name in combinations(cat_cols, 2):
    column_name_list = list(column_name)
    values_dict = {}
    for value in userinfo[column_name_list].groupby(column_name_list).count().reset_index().values:
        values_dict[tuple(value)] = pd.Series()
    for i in G:
        value = tuple(userinfo[userinfo['userinfo_sso']==i][column_name_list].values[0])
        columns = userinfo[userinfo['userinfo_sso'].isin(list(G.neighbors(i)))][column_name_list].groupby(column_name_list).size()
        columns /= columns.sum()
        if(len(columns) > 0):
            values_dict[value] = values_dict[value].add(columns, fill_value=0) if len(values_dict[value]) > 0 else columns
    for value in values_dict:
        if(len(values_dict[value]) > 0):
            values_dict[value].plot(kind='bar')
            plt.title(' '.join([' '.join(column_name), ' '.join(value)]))
            plt.tight_layout()
            plt.savefig(dir_join(savefolder_binary, '.'.join(['_'.join(column_name), '_'.join(value), 'png'])))
            plt.clf()
        else:
            print(' '.join(['Value', ' '.join(value), 'skipped']))

Value de Italian skipped
Value de Nordic skipped
Value it EastEu skipped
Value it French skipped
Value it not_set skipped
Value de 1909-1939 skipped
Value de 1979-1989 skipped
Value it 1909-1939 skipped
Value it 1999-2009 skipped
Value Africans male skipped
Value EastAs male skipped
Value Hispanic female skipped
Value Nordic female skipped
Value Africans 1909-1939 skipped
Value Africans 1939-1969 skipped
Value Africans 1999-2009 skipped
Value EastAs 1909-1939 skipped
Value EastAs 1939-1969 skipped
Value EastAs 1989-1999 skipped
Value EastAs 1999-2009 skipped
Value EastEu 1909-1939 skipped
Value EastEu 1979-1989 skipped
Value EastEu 1999-2009 skipped
Value French 1909-1939 skipped
Value French 1999-2009 skipped
Value Hispanic 1909-1939 skipped
Value Hispanic 1979-1989 skipped
Value Hispanic 1999-2009 skipped
Value Indian 1909-1939 skipped
Value Indian 1939-1969 skipped
Value Indian 1979-1989 skipped
Value Italian 1909-1939 skipped
Value Japanese 1909-1939 skipped
Value Japanese 1939-196

<matplotlib.figure.Figure at 0x7f297bb8f780>

In [74]:
def get_common_data(df, neighbors, cat_cols, label_col, mode='unary'):
    data_dict = {}
    if mode == 'binary':
        for column_name in combinations(cat_cols, 2):
            column_name_list = list(column_name)
            values_dict = {}
            unique_values = df[column_name_list].groupby(column_name_list).count().reset_index().values
            for value in unique_values:
                values_dict[tuple(value)] = pd.Series(index=unique_values).fillna(0)
            for i in neighbors:
                value = tuple(df[df[label_col]==i][column_name_list].values[0])
                columns = df[df[label_col].isin(list(neighbors[i]))][column_name_list].groupby(column_name_list).size()
                columns = pd.Series(columns.values / columns.sum(), index=columns.index.tolist())
                if len(columns) > 0:
                    values_dict[value] = values_dict[value].add(columns, fill_value=0)
            for value in values_dict:
                values_dict[value] = values_dict[value] / values_dict[value].sum()
            data_dict[column_name] = values_dict
    elif mode == 'unary':
        for column_name in cat_cols:
            values_dict = {}
            unique_values = df[column_name].unique()
            for value in unique_values:
                values_dict[value] = pd.Series(index=unique_values).fillna(0)
            for i in neighbors:
                value = df[df[label_col]==i][column_name].iloc[0]
                column = df[df[label_col].isin(neighbors[i])][column_name]
                if len(column) > 0:
                    values_dict[value] = values_dict[value].add(column.value_counts(normalize=True), fill_value=0)
            for value in values_dict:
                values_dict[value] = values_dict[value] / values_dict[value].sum()
            data_dict[column_name] = values_dict
    else:
        raise NotImplementedError()
    return data_dict

def get_user_data(df, neighbors, cat_cols, label_col, mode='unary'):
    data_dict = {}
    if mode == 'binary':
        for column_name in combinations(cat_cols, 2):
            column_name_list = list(column_name)
            values_dict = {}
            unique_values = df[column_name_list].groupby(column_name_list).count().reset_index().values
            for i in neighbors:
                values_dict[i] = pd.Series(index=unique_values).fillna(0)
                columns = df[df[label_col].isin(neighbors[i])][column_name_list].groupby(column_name_list).size()
                columns = pd.Series(columns.values / columns.sum(), index=columns.index.tolist())
                if len(columns) > 0:
                    values_dict[i] = values_dict[i].add(columns, fill_value=0)
            data_dict[column_name] = values_dict
    elif mode == 'unary':
        for column_name in cat_cols:
            values_dict = {}
            for i in neighbors:
                values_dict[i] = pd.Series(index=df[column_name].unique()).fillna(0)
                column = df[df[label_col].isin(neighbors[i])][column_name]
                if len(column) > 0:
                    values_dict[i] = values_dict[i].add(column.value_counts(normalize=True), fill_value=0)
            data_dict[column_name] = values_dict
    else:
        raise NotImplementedError()
    return data_dict

def test_prediction(df, G, cat_cols, label_col, n_splits, mode='unary'):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    X = df[df[label_col].isin(G)]
    score = dict((cat, []) for cat in cat_cols)
    cat_labels = dict((cat, np.sort(df[cat].unique())) for cat in cat_cols)
    score_labels = ['precision', 'recall', 'fscore', 'count']
    for train_index, test_index in kf.split(X):
        # prepare data, make histograms
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        neighbors_train = dict((i, list(G.neighbors(i))) for i in G if i in X_train[label_col].values)
        neighbors_test = dict((i, list(G.neighbors(i))) for i in G if i in X_test[label_col].values)
        data_train = get_common_data(X_train, neighbors_train, cat_cols, label_col, mode=mode)
        data_test = get_user_data(X_train, neighbors_test, cat_cols, label_col, mode=mode)
        # get true data
        true_dict = dict((col, pd.Series()) for col in cat_cols)
        for i in X_test.iterrows():
            for col in cat_cols:
                true_dict[col][i[1][label_col]] = i[1][col]
        # make a prediction
        pred_dict = {}
        for cat, persons_dict in data_test.items():
            pred_cat_dict = pd.Series()
            for i in persons_dict:
                deviation = [(value, np.linalg.norm(persons_dict[i] - hist, 1)) for value, hist in data_train[cat].items()]
                pred_cat_dict[i] = min(deviation, key=(lambda item:item[1]))[0]
            pred_dict[cat] = pred_cat_dict
        if mode == 'binary':
            pred_dict_final = {}
            for col in cat_cols:
                pred_cat_dict = pd.Series()
                for i in X_test[label_col]:
                    votes = [pred_dict[cat][i][cat.index(col)] for cat in pred_dict if col in cat]
                    pred_cat_dict[i] = max(votes, key=votes.count)
                pred_dict_final[col] = pred_cat_dict
            pred_dict = pred_dict_final
        # compute the metrics
        for cat in score:
            score[cat] += [precision_recall_fscore_support(true_dict[cat], pred_dict[cat], labels=cat_labels[cat])]
    # make metrics prettier
    for cat in score:
        mean_score = np.mean(score[cat], axis=0)
        mean_score[-1] *= n_splits
        score[cat] = pd.DataFrame(mean_score, index=score_labels, columns=cat_labels[cat]).round(2)
    return score

In [75]:
n_splits = 10
label_col = 'userinfo_sso'
G = miner.graph()
cat_cols = ['userinfo_language', 'race', 'Gender', 'dateofbirth_cat']
score_unary = test_prediction(userinfo, G, cat_cols, label_col, n_splits, mode='unary')
score_binary = test_prediction(userinfo, G, cat_cols, label_col, n_splits, mode='binary')

savefolder_unary_res = dir_join(savefolder, 'unary_res')
if not dir_exists(savefolder_unary_res):
    makedirs(savefolder_unary_res)

for cat, df in score_unary.items():
    df.to_csv(dir_join(savefolder_unary_res, '.'.join([cat, 'unary', 'csv'])))
    print(' '.join([cat, 'unary']))
    ICD.display(df)
    print()
    print()

savefolder_binary_res = dir_join(savefolder, 'binary_res')
if not dir_exists(savefolder_binary_res):
    makedirs(savefolder_binary_res)

for cat, df in score_binary.items():
    df.to_csv(dir_join(savefolder_binary_res, '.'.join([cat, 'binary', 'csv'])))
    print(' '.join([cat, 'binary']))
    ICD.display(df)
    print()
    print()

userinfo_language unary


Unnamed: 0,de,en,it
precision,0.07,0.85,0.05
recall,0.15,0.38,0.27
fscore,0.08,0.5,0.09
count,16.0,138.0,9.0




race unary


Unnamed: 0,Africans,British,EastAs,EastEu,French,Germanic,Hispanic,Indian,Italian,Japanese,Jewish,Muslim,Nordic
precision,0.05,0.08,0.0,0.1,0.0,0.12,0.0,0.1,0.05,0.0,0.05,0.0,0.05
recall,0.1,0.08,0.0,0.05,0.0,0.06,0.0,0.05,0.02,0.0,0.03,0.0,0.05
fscore,0.07,0.07,0.0,0.07,0.0,0.07,0.0,0.07,0.02,0.0,0.04,0.0,0.05
count,5.0,30.0,2.0,13.0,7.0,19.0,6.0,10.0,44.0,2.0,12.0,9.0,4.0




Gender unary


Unnamed: 0,female,male,not_set
precision,0.32,0.47,0.28
recall,0.4,0.28,0.52
fscore,0.32,0.33,0.32
count,49.0,84.0,30.0




dateofbirth_cat unary


Unnamed: 0,1909-1939,1939-1969,1969-1979,1979-1989,1989-1999,1999-2009
precision,0.0,0.08,0.2,0.07,0.26,0.1
recall,0.0,0.05,0.12,0.14,0.16,0.15
fscore,0.0,0.06,0.15,0.08,0.16,0.12
count,2.0,34.0,39.0,20.0,54.0,14.0




userinfo_language binary


Unnamed: 0,de,en,it
precision,0.1,0.93,0.05
recall,0.65,0.23,0.05
fscore,0.17,0.35,0.05
count,16.0,138.0,9.0




race binary


Unnamed: 0,Africans,British,EastAs,EastEu,French,Germanic,Hispanic,Indian,Italian,Japanese,Jewish,Muslim,Nordic
precision,0.05,0.15,0.0,0.0,0.0,0.1,0.0,0.5,0.23,0.0,0.2,0.0,0.0
recall,0.2,0.14,0.0,0.0,0.0,0.03,0.0,0.38,0.11,0.0,0.09,0.0,0.0
fscore,0.08,0.14,0.0,0.0,0.0,0.05,0.0,0.42,0.15,0.0,0.12,0.0,0.0
count,5.0,30.0,2.0,13.0,7.0,19.0,6.0,10.0,44.0,2.0,12.0,9.0,4.0




Gender binary


Unnamed: 0,female,male,not_set
precision,0.29,0.5,0.25
recall,0.3,0.45,0.24
fscore,0.26,0.46,0.24
count,49.0,84.0,30.0




dateofbirth_cat binary


Unnamed: 0,1909-1939,1939-1969,1969-1979,1979-1989,1989-1999,1999-2009
precision,0.01,0.0,0.0,0.0,0.0,0.0
recall,0.2,0.0,0.0,0.0,0.0,0.0
fscore,0.02,0.0,0.0,0.0,0.0,0.0
count,2.0,34.0,39.0,20.0,54.0,14.0




