In [26]:
__author__ = "Dongjie Fan"
import pandas as pd
import numpy as np
from shapely.geometry import Point, MultiPoint
import yaml
import cPickle as pickle
import geopandas as gpd
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.mixture import GaussianMixture




In [2]:
# Load the de-duplicated tweet table and turn the raw `Tweeted_At` strings
# into real timestamps in one chained expression.
tw = pd.read_csv("./tw_without_duplication.csv").assign(
    Tweeted_At=lambda frame: pd.to_datetime(frame['Tweeted_At'])
)

In [3]:
def fetch_hour(dt):
    """Return the hour-of-day of every timestamp in the Series ``dt``."""
    def _hour_of(stamp):
        return stamp.hour

    return dt.map(_hour_of)

def fetch_weeday(dt):
    """Return the English weekday name ('Monday' ... 'Sunday') per timestamp.

    Parameters
    ----------
    dt : pandas.Series of timestamps

    Returns
    -------
    pandas.Series
        Same index as ``dt``; weekday names as strings, NaN for missing
        timestamps.

    Notes
    -----
    The name is derived from ``weekday()`` (Monday == 0) rather than from
    ``Timestamp.weekday_name``: that attribute was deprecated in pandas 0.23
    and removed in 1.0, whereas ``weekday()`` exists in every version.
    (The function name keeps its historical spelling so callers still work.)
    """
    day_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday')

    def _name_of(stamp):
        # NaT has no usable weekday; keep it as a missing value.
        if pd.isnull(stamp):
            return np.nan
        return day_names[stamp.weekday()]

    return dt.map(_name_of)

# Derive the two time features used by the work / home filters below.
tweet_times = tw['Tweeted_At']
tw['Hour'] = fetch_hour(tweet_times)
tw['wd'] = fetch_weeday(tweet_times)

In [4]:
# Corpus size: number of tweets, then number of distinct usernames.
print(len(tw))
print(tw['Username'].unique().size)

473629
69097


<br/><br/><br/><br/>
### tw_work

In [5]:
# "Work" window: hours 12 through 16 on Monday-Friday.
hour4work = [12, 13, 14, 15, 16]
print(hour4work)
weekday4work = ['Tuesday', 'Wednesday', 'Thursday', 'Monday', 'Friday']
print(weekday4work)

[12, 13, 14, 15, 16]
['Tuesday', 'Wednesday', 'Thursday', 'Monday', 'Friday']


In [6]:
# Keep only tweets sent inside the work window, then report how much of the
# corpus (tweets and users) survives the filter.
in_work_window = tw['Hour'].isin(hour4work) & tw['wd'].isin(weekday4work)
tw_work = tw[in_work_window].reset_index(drop=True)

n_work_users = len(tw_work['Username'].unique())
n_all_users = len(tw['Username'].unique())
print("# of Tweets: {}\n% of Tweets: {:.2f}%".format(
    tw_work.shape[0], 100.0 * tw_work.shape[0] / tw.shape[0]))
print("# of Users: {}\n% of Users: {:.2f}%".format(
    n_work_users, 100.0 * n_work_users / n_all_users))

# of Tweets: 51119
% of Tweets: 10.79%
# of Users: 18113
% of Users: 26.21%


In [7]:
# Distinct usernames that tweeted inside the work window.
users = tw_work['Username'].unique()

In [8]:
# `Geo` is a string that parses to a mapping; presumably it is the repr of a
# Python 2 dict, which is why the YAML key still carries the unicode prefix
# ("u'coordinates'").
# yaml.safe_load replaces yaml.load: it only ever builds plain scalars, lists
# and dicts (yaml.load can construct arbitrary Python objects from the input)
# and it does not need the explicit Loader argument that PyYAML >= 6 requires.
tw_work['latlon'] = tw_work['Geo'].map(
    lambda geo_str: yaml.safe_load(geo_str)["u'coordinates'"])
# The pair is passed to Point(x, y) in its stored order; the clustering cells
# later read it back as [p.y, p.x].
tw_work['geometry'] = tw_work['latlon'].map(
    lambda coords: Point(coords[0], coords[1]))

In [13]:
# Collect every work-window point per user as {username: [Point, ...]}.
# No cell in the notebook defined `userLoc_Work` (it only existed as leftover
# kernel state), so on a fresh "Restart & Run All" the dump below raised a
# NameError. It is rebuilt here the same way the home section builds
# `userLoc_Home`. A single groupby pass is used; rows whose Username is
# missing are skipped by groupby.
userLoc_Work = {}
for user, user_points in tw_work.groupby('Username', sort=False)['geometry']:
    userLoc_Work[user] = list(user_points)

# Persist the mapping so the clustering step can be re-run without re-parsing.
with open("./userLoc_Work.p", 'wb') as f:
    pickle.dump(userLoc_Work, f)

<br/><br/><br/><br/><br/>

### tw_home 

In [14]:
# "Home" window: late evening (22-23), night (0-2) and early morning (6-8)
# on Monday-Thursday.
hour4home = [22, 23, 0, 1, 2, 6, 7, 8]
print(hour4home)
weekday4home = ['Tuesday', 'Wednesday', 'Thursday', 'Monday']
print(weekday4home)

[22, 23, 0, 1, 2, 6, 7, 8]
['Tuesday', 'Wednesday', 'Thursday', 'Monday']


In [15]:
# Keep only tweets sent inside the home window, then report how much of the
# corpus (tweets and users) survives the filter.
in_home_window = tw['Hour'].isin(hour4home) & tw['wd'].isin(weekday4home)
tw_home = tw[in_home_window].reset_index(drop=True)

n_home_users = len(tw_home['Username'].unique())
n_total_users = len(tw['Username'].unique())
print("# of Tweets: {}\n% of Tweets: {:.2f}%".format(
    tw_home.shape[0], 100.0 * tw_home.shape[0] / tw.shape[0]))
print("# of Users: {}\n% of Users: {:.2f}%".format(
    n_home_users, 100.0 * n_home_users / n_total_users))

# of Tweets: 117719
% of Tweets: 24.85%
# of Users: 31337
% of Users: 45.35%


In [16]:
# Distinct usernames that tweeted inside the home window (rebinds `users`,
# which previously held the work-window usernames).
users = tw_home['Username'].unique()

In [17]:
# `Geo` is a string that parses to a mapping; presumably it is the repr of a
# Python 2 dict, which is why the YAML key still carries the unicode prefix
# ("u'coordinates'").
# yaml.safe_load replaces yaml.load: it only ever builds plain scalars, lists
# and dicts (yaml.load can construct arbitrary Python objects from the input)
# and it does not need the explicit Loader argument that PyYAML >= 6 requires.
tw_home['latlon'] = tw_home['Geo'].map(
    lambda geo_str: yaml.safe_load(geo_str)["u'coordinates'"])
# The pair is passed to Point(x, y) in its stored order; the clustering cells
# later read it back as [p.y, p.x].
tw_home['geometry'] = tw_home['latlon'].map(
    lambda coords: Point(coords[0], coords[1]))

In [18]:
# Collect every home-window point per user as {username: [Point, ...]}.
# One groupby pass replaces the previous per-user boolean scan of the whole
# frame, which cost (number of users) x (number of tweets) comparisons.
# Row order inside each user's list is preserved; rows whose Username is
# missing are skipped by groupby.
userLoc_Home = {}
for user, user_points in tw_home.groupby('Username', sort=False)['geometry']:
    userLoc_Home[user] = list(user_points)

In [19]:
# Persist the per-user home points (pickle is imported in the first cell).
with open("./userLoc_Home.p", 'wb') as pickle_file:
    pickle.dump(userLoc_Home, pickle_file)

<br/><br/><br/><br/>

### Function: Gaussian Mixture Clustering

In [24]:
def Location(user, points=None):
    """Estimate one representative location for ``user``.

    The user's points are clustered with Gaussian mixtures for several
    candidate component counts. The count with the highest positive mean
    silhouette score wins, and the centroid of the most populated cluster of
    that model is returned. If no candidate scores above 0, all points are
    treated as a single cluster.

    Parameters
    ----------
    user : hashable
        Key into the module-level ``geo`` dict. Only used when ``points`` is
        not supplied.
    points : list of [a, b] coordinate pairs, optional
        Coordinates to cluster instead of ``geo[user]``. Lets the function be
        called without relying on the global ``geo``.

    Returns
    -------
    shapely geometry
        ``MultiPoint(...).centroid`` of the selected points.
    """
    X = geo[user] if points is None else points

    # With fewer than 4 points clustering is skipped: plain centroid.
    if len(X) < 4:
        return MultiPoint(X).centroid

    maxK = 10
    best_score = 0
    # None means "no multi-cluster model beat a silhouette of 0", i.e. keep
    # every point in ONE cluster.
    best_labels = None

    for n_clusters in range(2, min(maxK, len(X) - 1)):
        gm = GaussianMixture(n_components=n_clusters, random_state=324)
        cluster_labels = gm.fit(X).predict(X)
        # silhouette_score is undefined when everything lands in one cluster.
        if len(set(cluster_labels)) != 1:
            silhouette_avg = silhouette_score(X, cluster_labels)
            if silhouette_avg > best_score:
                best_score = silhouette_avg
                # The fit is deterministic (fixed random_state), so the
                # winning labels are kept instead of refitting afterwards.
                best_labels = cluster_labels

    if best_labels is None:
        return MultiPoint(X).centroid

    # idxmax() returns the cluster *label* with the highest count. The former
    # argmax() is positional on pandas >= 1.0 and, because value_counts() is
    # sorted descending, would always yield 0 there.
    target_cluster = pd.Series(best_labels).value_counts().idxmax()
    members = [pt for pt, lab in zip(X, best_labels) if lab == target_cluster]
    return MultiPoint(members).centroid




### Format & Run

#### Work

In [27]:
geo = {}
for user, pList in userLoc_Work.iteritems():
    X = []
    for p in pList:
        X.append([p.y, p.x])
    geo[user] = X
    
print len(geo.keys())
work = {}
for i, user in enumerate(geo.keys()):
    #print user
    print "\r", 100.0*(i+1)/len(geo.keys()), "%",
    work[user] = Location(user)

18113
100.0 %


#### Home

In [28]:
geo = {}
for user, pList in userLoc_Home.iteritems():
    X = []
    for p in pList:
        X.append([p.y, p.x])
    geo[user] = X
    
print len(geo.keys())
home = {}
for i, user in enumerate(geo.keys()):
    #print user
    print "\r", 100.0*(i+1)/len(geo.keys()), "%",
    home[user] = Location(user)

31337
100.0 %


### Output

In [69]:
# One row per user (index = username), one column holding the estimated point.
df_work = (pd.DataFrame.from_dict(work, orient='index')
           .rename(columns={0: "Work"}))

df_home = (pd.DataFrame.from_dict(home, orient='index')
           .rename(columns={0: "Home"}))

In [70]:
# Users with BOTH a home and a work estimate.
df_inner = df_home.join(df_work, how='inner')
print("inner_join")
print("inner {}".format(len(df_inner)))
print("home {}".format(len(df_home)))
print("work {}".format(len(df_work)))

# Every home user; Work stays empty where no work estimate exists.
df_home_leftjoin = df_home.join(df_work, how='left')
print("\nhome & work: leftjoin")
print("home_leftjoin {}".format(len(df_home_leftjoin)))
print("home {}".format(len(df_home)))
print("work {}".format(len(df_work)))

inner_join
inner 9326
home 31337
work 18113

home & work: leftjoin
home_leftjoin 31337
home 31337
work 18113


#### inner join

In [71]:
# Preview the first two users that have both a Home and a Work point.
df_inner.head(2)

Unnamed: 0,Home,Work
carlscrush,POINT (-73.97339207333334 40.75688146),POINT (-74.00140622666667 40.73572657666667)
mayataughtme,POINT (-73.79655447499999 40.71864909999999),POINT (-73.7965552 40.7186549)


In [75]:
# Flatten the Home / Work points into plain numeric x / y columns.
for col, src, axis in (('Hx', 'Home', 'x'), ('Hy', 'Home', 'y'),
                       ('Wx', 'Work', 'x'), ('Wy', 'Work', 'y')):
    df_inner[col] = df_inner[src].map(lambda pt, axis=axis: getattr(pt, axis))
df_inner.head(2)

Unnamed: 0,Home,Work,Hx,Hy,Wx,Wy
carlscrush,POINT (-73.97339207333334 40.75688146),POINT (-74.00140622666667 40.73572657666667),-73.973392,40.756881,-74.001406,40.735727
mayataughtme,POINT (-73.79655447499999 40.71864909999999),POINT (-73.7965552 40.7186549),-73.796554,40.718649,-73.796555,40.718655


In [76]:
# Save the inner-join result twice: as CSV and as a pickle of the DataFrame.
df_inner.to_csv("home_work_inner_join.csv")
df_inner.to_pickle("home_work_inner_join.p")

<br/>
#### home left join

In [77]:
# Preview the left join: every home user, Work empty where there is no match.
df_home_leftjoin.head(2)

Unnamed: 0,Home,Work
lancewhite,POINT (-73.86230676 40.76844627),
julietterichey,POINT (-73.968379035 40.679262225),


In [86]:
# Home is present for every row of the left join, so x / y can be read directly.
home_points = df_home_leftjoin['Home']
df_home_leftjoin['Hx'] = home_points.map(lambda pt: pt.x)
df_home_leftjoin['Hy'] = home_points.map(lambda pt: pt.y)

def p_x(point):
    """Return ``point.x``, or NaN when ``point`` has no ``x`` attribute.

    The previous version evaluated ``point.x`` without returning it, so the
    result was always None, even for real points. Only AttributeError (what a
    missing value such as a float NaN raises) is treated as "no point"; any
    other error now propagates instead of being silently swallowed.
    """
    try:
        return point.x
    except AttributeError:
        return np.nan
    
def p_y(point):
    """Return ``point.y``, or NaN when ``point`` has no ``y`` attribute.

    The previous version evaluated ``point.y`` without returning it, so the
    result was always None, even for real points. Only AttributeError (what a
    missing value such as a float NaN raises) is treated as "no point"; any
    other error now propagates instead of being silently swallowed.
    """
    try:
        return point.y
    except AttributeError:
        return np.nan
    
# Work is empty for home users without a work estimate (left join), so the
# tolerant p_x / p_y accessors are mapped here instead of a bare .x / .y.
df_home_leftjoin['Wx'] = df_home_leftjoin['Work'].map(p_x)
df_home_leftjoin['Wy'] = df_home_leftjoin['Work'].map(p_y)
df_home_leftjoin.head(2)

Unnamed: 0,Home,Work,Hx,Hy,Wx,Wy
lancewhite,POINT (-73.86230676 40.76844627),,-73.862307,40.768446,,
julietterichey,POINT (-73.968379035 40.679262225),,-73.968379,40.679262,,


In [87]:
# Save the left-join result twice: as CSV and as a pickle of the DataFrame.
df_home_leftjoin.to_csv("home_work_left_join.csv")
df_home_leftjoin.to_pickle("home_work_left_join.p")