In [1]:
import pandas as pd

In [2]:
df = pd.read_table('user-ct-test-collection-02.txt')

In [4]:
# make columns in a lower case. Mixed cases are hard to type. '_' added to query for it collides with pandas method 'query.'
df.columns = ['anonid', 'query_', 'querytime', 'itemrank', 'clickurl']

In [5]:
df.querytime = pd.to_datetime(df.querytime)

In [6]:
df.head(5)

Unnamed: 0,anonid,query_,querytime,itemrank,clickurl
0,479,family guy,2006-03-01 16:01:20,,
1,479,also sprach zarathustra,2006-03-02 14:48:55,,
2,479,family guy movie references,2006-03-03 22:37:46,1.0,http://www.familyguyfiles.com
3,479,top grossing movies of all time,2006-03-03 22:42:42,1.0,http://movieweb.com
4,479,top grossing movies of all time,2006-03-03 22:42:42,2.0,http://www.imdb.com


In [21]:
def get_tally(x):
    ret = dict()
    x_query = x.query_.fillna('')
    ret['n_query'] = len(x)
    q_len = x_query.apply(lambda y: len(y))
    ret['query_len_sum'] = q_len.sum()
    ret['query_len_mean'] = q_len.mean()
    q_word_len = x_query.apply(lambda y: len(y.split(' ')))
    ret['query_word_sum'] = q_word_len.sum()
    ret['query_word_mean'] = q_word_len.mean()
    ret['time_span'] = (x.querytime.max() - x.querytime.min()).seconds
    ret['weekday_min'] = x.querytime.min().weekday()
    ret['day_min'] = x.querytime.min().day
    ret['month_min'] = x.querytime.min().month
    ret['weekday_max'] = x.querytime.max().weekday()
    ret['day_max'] = x.querytime.max().day
    ret['month_max'] = x.querytime.max().month
    return pd.Series(ret)

df_grp = df.groupby('anonid').apply(get_tally).reset_index()

In [30]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [24]:
anonid = df_grp.anonid
df_grp = df_grp.drop('anonid', axis=1)

In [31]:
# if the scale of each feature is not even, the shape of the cluster will be deformed 
df_grp_ss = StandardScaler().fit_transform(df_grp)

In [32]:
cluster_id = KMeans(n_clusters=5).fit_predict(df_grp)

In [35]:
pd.Series(cluster_id).value_counts()

0    22420
1    14416
4    11420
3     8942
2     8802
dtype: int64

In [36]:
pd.DataFrame({'anonid':anonid, 'cluster_id':cluster_id})

Unnamed: 0,anonid,cluster_id
0,479,2
1,507,1
2,946,0
3,1020,0
4,1021,4
...,...,...
65995,24968272,0
65996,24968286,0
65997,24968768,0
65998,24969002,0
