In [2]:
import itertools

import pandas as pd
import nimfa
import scipy
from timeit import default_timer as timer

from sklearn.cluster import KMeans

from hawkes.cluster import FormatAndSplit, ToSparse, GetClosestCustomerCluster
from hawkes.utils import GetTimeSeriesFromDF, GetInfluenceMatrix
from hawkes.hawkes import ExpectationMaximization, Q

%pylab inline
%load_ext autoreload
%autoreload 2

Populating the interactive namespace from numpy and matplotlib


In [3]:
df = pd.read_csv('/Users/arnaud/cellule/data/bnpp/ETSAnonymousPricesFull.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
train_df, train_coo, test_df, test_coo = FormatAndSplit(df, cut_date=pd.to_datetime('20131231', format='%Y%m%d'))

In [22]:
ranks = [2, 3, 4, 5, 6, 7, 8, 9, 10]

def plop(rank, ntries=5):
    
    print rank
    scores = []
    for i in range(ntries):

        #### TRAIN

        # Clustering model
        cust = GetClosestCustomerCluster('GAZPRU', train_coo, train_df, rank=rank)
        print "Cluster closest to GAZPRU: %s" % cust

        idx = [c in cust for c in train_df['Customer']]
        dd = train_df.ix[idx, :]
        print "Train set after filtering: %s" % str(dd.shape)

        # Leader-Follower model
        train_time, train_mark = GetTimeSeriesFromDF(dd)
        mu, a, b, p = ExpectationMaximization(train_time, niter=100)

        customer_ids = train_mark["Customer"].values
        g, unique_ids = GetInfluenceMatrix(p, customer_ids)

        #### TEST

        idx = [c in cust for c in test_df['Customer']]
        dd = test_df.ix[idx, :]
        print "Test set after filtering: %s" % str(dd.shape)

        test_time, test_mark = GetTimeSeriesFromDF(dd)

        gg = g.copy()
        threshold = 0.01
        gg[gg < threshold] = 0
        print "Number of strong inluencers: %s" % sum(gg > 0)

        influencer_dict = dict((unique_ids[i], unique_ids[l]) for i, l in enumerate(scipy.sparse.lil_matrix(gg).rows) if l != [])

        customer_ids_test = test_mark["Customer"].values

        tpr = 0
        fpr = 0
        k = 100
        m = 0
        for influencer, influencee in influencer_dict.iteritems():
            for i in np.where(customer_ids_test == influencer)[0]:
                m += 1
                tpr += sum(j in influencee for j in customer_ids_test[i: i + k]) / float(k)
                fpr += (k - sum(j in influencee for j in customer_ids_test[i: i + k])) / float(k)

        tpr /= m
        fpr /= m
        scores.append(tpr)
    return scores

In [23]:
from joblib import Parallel, delayed

In [None]:
Parallel(n_jobs=5)(plop(rank=i) for i in range(2, 10))

Cluster closest to GAZPRU: set([577, 674, 1284, 906, 737, 268, 1002, 492, 205, 14, 612, 212, 698, 482, 932, 314, 763, 957, 1028, 415])
Train set after filtering: (5614, 58)
Test set after filtering: (9141, 58)
Number of strong inluencers: 1
Cluster closest to GAZPRU: set([577, 674, 1028, 815, 673, 1002, 875, 492, 482, 14, 751, 612, 1170, 275, 212, 398, 1284, 698, 932, 906])
Train set after filtering: (4649, 58)
Test set after filtering: (8526, 58)
Number of strong inluencers: 3
Cluster closest to GAZPRU: set([673, 259, 1028, 1061, 1284, 577, 268, 1002, 875, 492, 205, 14, 815, 1170, 212, 612, 698, 763, 906, 751])
Train set after filtering: (5211, 58)
Test set after filtering: (9057, 58)
Number of strong inluencers: 6
Cluster closest to GAZPRU: set([577, 674, 1028, 673, 314, 1002, 492, 482, 14, 751, 612, 1170, 275, 212, 398, 1284, 698, 932, 957, 415])
Train set after filtering: (5190, 58)
Test set after filtering: (8869, 58)
Number of strong inluencers: 1
Cluster closest to GAZPRU: set([