In [1]:
import pandas as pd

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from sklearn import mixture

In [3]:
import_file = 'userReviewDates.json'
review = pd.read_json(import_file, lines=True)
review = review.sort_values('user_id')

In [4]:
review = review.reset_index()
del review['index']

In [5]:
review

Unnamed: 0,business_id,date,latitude,longitude,review_count,review_id,stars_x,stars_y,user_id,user_latitude,user_longitude,user_city,user_state
0,GT0K4EdSSxe_LMU6SPr-_A,2012-06-09,36.128561,-115.171130,245,MbiEXTlp-rEbK-ZL-WDYew,5,4.5,---1lKK3aKOuomHnwAkAow,,,,
1,d7Jgj1h_MILumtsTlb2aXA,2011-12-11,36.232720,-115.250343,245,X7-Kqkzg2J4rpKJGXvXfUg,5,4.0,---1lKK3aKOuomHnwAkAow,,,,
2,2BbFeotL85cIaBjSq1SWiA,2010-10-17,36.168099,-115.192230,245,YYIAglOQmbto4z3LNv1T8w,1,2.5,---1lKK3aKOuomHnwAkAow,,,,
3,y8d90Pt16Nip-B5UXWBP-w,2011-01-05,36.156796,-115.334387,245,8rTiUXrFD9J10RI4Q2O6Qw,4,4.0,---1lKK3aKOuomHnwAkAow,,,,
4,eduRavkml8awmPachSZXuw,2010-11-09,36.163874,-115.289944,245,IQtxzBq4m-iPJDHYAWOGHg,1,3.5,---1lKK3aKOuomHnwAkAow,,,,
5,rq5dgoksPHkJwJNQKlGQ7w,2010-10-16,36.164180,-115.289630,245,-UtICN8nUQ4g9qIHlQRrxw,5,4.0,---1lKK3aKOuomHnwAkAow,,,,
6,VmssDHtnBjYJQ6cWiwV9kA,2012-09-07,36.159126,-115.193705,245,88D5hXtby96ZlAKPCMhGcw,4,4.5,---1lKK3aKOuomHnwAkAow,,,,
7,bPcqucuuClxYrIM8xWoArg,2011-09-29,36.124840,-115.325222,245,-pk4s5YUD0grEEBt2QYlDA,5,4.0,---1lKK3aKOuomHnwAkAow,,,,
8,78TC3sZSYBzBsSJ0z5pyhw,2010-11-05,36.112501,-115.170579,245,nHYLl06G_Yt8dcRpzCJFiQ,1,2.5,---1lKK3aKOuomHnwAkAow,,,,
9,SfYMEQ2W-u6ixeakITwQ_g,2015-11-30,36.167685,-115.138382,245,Nym3-4o44VF2maR63LdI6w,1,4.0,---1lKK3aKOuomHnwAkAow,,,,


In [6]:
userLoc = review['user_id'].drop_duplicates().to_frame().reset_index()
del userLoc['index']
userLoc['gmm_latitude'] = pd.Series()
userLoc['gmm_longitude'] = pd.Series()
userLoc['gmm_city'] = pd.Series()
userLoc['gmm_state'] = pd.Series()

In [7]:
# Return array of latitudes and array of longitudes corresponding to given user
def user_locations(user):
    user = [user]
    tmp = review.query('user_id == @user').reset_index(drop=True)
    print('User: %s' % user)
    print('Reviews Included: %d' % len(tmp))
    print('Total Reviews: %d' % tmp['review_count'][0])
    return tmp['latitude'], tmp['longitude']

In [8]:
COORDINATE_DIFFERENCE = 0.6
def same_city(a,b):
    if abs(a[0]-b[0])<COORDINATE_DIFFERENCE and abs(a[1]-b[1])<COORDINATE_DIFFERENCE:
        return True
    else:
        return False

In [9]:
def estimate_clusters_number(locations):
    i =1 
    while i < len(locations):
        if same_city(locations[0],locations[i]):
            return i
        i = i+1
    return len(locations)

In [10]:
def gmm(latitudes, longitudes):
    X_train = []
    for i in range(len(latitudes)):
        X_train.append([latitudes[i],longitudes[i]])
    X_train = np.array(X_train)
    
    if len(X_train) > 4:
        components = 4
    else:
        components = len(X_train)
        
    # fit a Gaussian Mixture Model with one component
    clf = mixture.GaussianMixture(n_components=components, covariance_type='full')
    clf.fit(X_train)
    components = estimate_clusters_number(clf.means_)
    clusters = mixture.GaussianMixture(n_components=components, covariance_type='full')  
    clusters.fit(X_train)
    
    return clusters

In [11]:
def scatterplot(X_train, center, clf):
    # display predicted scores by the model as a contour plot
    x = np.linspace(center[0]-0.2, center[0]+0.2)
    y = np.linspace(center[1]-0.2, center[1]+0.2)
    X, Y = np.meshgrid(x, y)
    XX = np.array([X.ravel(), Y.ravel()]).T
    Z = -clf.score_samples(XX)
    Z = Z.reshape(X.shape)

    CS = plt.contour(X, Y, Z, norm=LogNorm(vmin=1.0, vmax=1000.0),
                     levels=np.logspace(0, 3, 10))
    CB = plt.colorbar(CS, shrink=0.8, extend='both')
    plt.scatter(X_train[:, 0], X_train[:, 1], .9)

    plt.title('Businesses Visited by User')
    plt.axis('tight')
    plt.show()

In [12]:
import reverse_geocoder as rg
def locate(latitudes, longitudes):
    # create training set
    X_train = []
    for i in range(len(latitudes)):
        X_train.append([latitudes[i],longitudes[i]])
    X_train = np.array(X_train)

    # fit a Gaussian Mixture Model with one component
    clf = gmm(latitudes, longitudes)
    center = clf.means_[0]
    
    # print location
    coordinates = (center[0], center[1])
    results = rg.search(coordinates)[0]    
    #print('User is located at: %s, %s' % (center[0],center[1]))
    #print('%s, %s' % (results['name'],results['admin1']))
    
    # plot users reviews on scatterplot
    # scatterplot(X_train, center, clf)
    
    return center[0], center[1], results['name'], results['admin1']

In [13]:
def locate_user(user):
    loc = user_locations(user)
    return locate(loc[0], loc[1])

In [20]:
for i in range(500,1000): 
    loc = locate_user(userLoc['user_id'][i])
    userLoc['gmm_latitude'][i] = loc[0]
    userLoc['gmm_longitude'][i] = loc[1]
    userLoc['gmm_city'][i] = loc[2]
    userLoc['gmm_state'][i] = loc[3]

User: [u'-0lkJ7L73QCyBy9CFlEMbw']
Reviews Included: 8
Total Reviews: 9


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


User: [u'-0lvsWf588dnw1kmCYYbRQ']
Reviews Included: 6
Total Reviews: 6
User: [u'-0mHKX97D-nIMEK4a_xVCQ']
Reviews Included: 2
Total Reviews: 10
User: [u'-0mY6OJwkx2WWRf9png10g']
Reviews Included: 2
Total Reviews: 6
User: [u'-0mrmedVkALZ0w-wROseZQ']
Reviews Included: 2
Total Reviews: 30
User: [u'-0muMwSLdns0RO_CiZZr6g']
Reviews Included: 1
Total Reviews: 3
User: [u'-0n8WMjkhy5ddeXLajS5qw']
Reviews Included: 1
Total Reviews: 1
User: [u'-0nLfoS1LBFsKT-29WtdaA']
Reviews Included: 3
Total Reviews: 6
User: [u'-0nU6VcSRIXrEBRpzQ81sw']
Reviews Included: 1
Total Reviews: 7
User: [u'-0n_ZiTUqTdxuL-DvhBchQ']
Reviews Included: 3
Total Reviews: 4
User: [u'-0njrYCmpFuyEmPpoq0Q_A']
Reviews Included: 16
Total Reviews: 22
User: [u'-0nn8SIOQd-OuDCj_MmIog']
Reviews Included: 1
Total Reviews: 21
User: [u'-0o2Bh6R4m0DLKpPxTUOCg']
Reviews Included: 1
Total Reviews: 34
User: [u'-0oBg8owTGPK1rA5cFHWTA']
Reviews Included: 2
Total Reviews: 15
User: [u'-0oR6dXVfcKOf-s4UgJufw']
Reviews Included: 1
Total Reviews: 2

User: [u'-17vUkGqkl1-Pkw5TTcTrw']
Reviews Included: 6
Total Reviews: 9
User: [u'-17vo-ag35TT-gccu6XbnA']
Reviews Included: 1
Total Reviews: 15
User: [u'-18GyWFB6EinwzNzu04C5Q']
Reviews Included: 1
Total Reviews: 2
User: [u'-18LP5sg3N7CNYTu92xYBQ']
Reviews Included: 3
Total Reviews: 13
User: [u'-18YJeEWx5JPyVNOlQKqyg']
Reviews Included: 1
Total Reviews: 2
User: [u'-18xKH4MNc7sZOr0wZtcbQ']
Reviews Included: 1
Total Reviews: 1
User: [u'-18zoSaTHn5bxNdeVSLrMA']
Reviews Included: 1
Total Reviews: 14
User: [u'-191gKrqDzXGUrpl7npkXw']
Reviews Included: 11
Total Reviews: 16
User: [u'-191lsPrknB-gE_-BzAriQ']
Reviews Included: 3
Total Reviews: 119
User: [u'-19AGgxNOk1rtapUqx5RBg']
Reviews Included: 2
Total Reviews: 3
User: [u'-19OWJW18aIrc1zbktkvpg']
Reviews Included: 1
Total Reviews: 46
User: [u'-19x1bavUQaF7o_gGu_fVw']
Reviews Included: 2
Total Reviews: 5
User: [u'-1A44e1S-j7c5_0MS8bcfA']
Reviews Included: 6
Total Reviews: 9
User: [u'-1A6IYfwlgaaZ2NWFMieLQ']
Reviews Included: 1
Total Reviews: 

User: [u'-1XJd9aCxVk903rNDExioA']
Reviews Included: 5
Total Reviews: 6
User: [u'-1XS4L8uFgB3no08IoIHRA']
Reviews Included: 1
Total Reviews: 1
User: [u'-1XYK7Iw3I-CsgdCzvKlLw']
Reviews Included: 1
Total Reviews: 4
User: [u'-1XoQ9EEnjUj_y2VEcgBcg']
Reviews Included: 7
Total Reviews: 7
User: [u'-1XoWvPIn-rVfJJhRLa_ig']
Reviews Included: 1
Total Reviews: 3
User: [u'-1YACsV4z7rm34CBqOYJ-w']
Reviews Included: 1
Total Reviews: 2
User: [u'-1YGFe7qDm2yl43AqCbMNg']
Reviews Included: 10
Total Reviews: 14
User: [u'-1YMDVgj9wcrhb6Olnva0w']
Reviews Included: 2
Total Reviews: 2
User: [u'-1YPP8INFTfrjoUe5OTtFw']
Reviews Included: 2
Total Reviews: 45
User: [u'-1YQDURCL6a_wuZmNXmKzw']
Reviews Included: 2
Total Reviews: 3
User: [u'-1YYwiZAadRbNqypbKjNVw']
Reviews Included: 2
Total Reviews: 18
User: [u'-1YqQbi8n6NtXxvuU0d-lg']
Reviews Included: 3
Total Reviews: 5
User: [u'-1YsI_5oyppCRToyZhdiSw']
Reviews Included: 4
Total Reviews: 4
User: [u'-1Z4LaMUkNapdYEKvKiM5A']
Reviews Included: 1
Total Reviews: 1
Us

User: [u'-1x8gXJnrI-FeZPNvnfbRg']
Reviews Included: 6
Total Reviews: 7
User: [u'-1xIfezEraPK2HRTHWIWMA']
Reviews Included: 1
Total Reviews: 1
User: [u'-1xh43lAhmrByuMzcQ2sNw']
Reviews Included: 6
Total Reviews: 150
User: [u'-1xx5JBawGkg_-BVHXPphQ']
Reviews Included: 1
Total Reviews: 21
User: [u'-1xzSuE5LYeCUEipCzaWgg']
Reviews Included: 2
Total Reviews: 2
User: [u'-1yGJJLmqVsEftxCF_u24Q']
Reviews Included: 2
Total Reviews: 7
User: [u'-1yH0yBVi9P9OU2wfuPQBA']
Reviews Included: 2
Total Reviews: 10
User: [u'-1yMTlr6vCQIhDfrwaalxw']
Reviews Included: 1
Total Reviews: 42
User: [u'-1yVLNCQbL2k6aXr4IeeFQ']
Reviews Included: 2
Total Reviews: 2
User: [u'-1yiMxBpuXFRCmdOI8mLHA']
Reviews Included: 2
Total Reviews: 7
User: [u'-1ykw3QEfkENFve23ACG6Q']
Reviews Included: 1
Total Reviews: 4
User: [u'-1ylb5p7SjuuWjqRNhlsDA']
Reviews Included: 1
Total Reviews: 3
User: [u'-1zQA2f_syMAdA04PUWNNw']
Reviews Included: 4
Total Reviews: 4
User: [u'-1zuExrGiB1oySlVTy4o3w']
Reviews Included: 1
Total Reviews: 5
U

User: [u'-2Jc11rAnH93phnpzG9neQ']
Reviews Included: 1
Total Reviews: 1
User: [u'-2JcoKQzZ9O6scHEEuH8dg']
Reviews Included: 1
Total Reviews: 46
User: [u'-2Jvpx4EHoCFcB2n7i212g']
Reviews Included: 4
Total Reviews: 7
User: [u'-2K0yp7lBT_JUOzGkpdJ_g']
Reviews Included: 1
Total Reviews: 5
User: [u'-2Ke8BKJYn8zWUgbNJAhFQ']
Reviews Included: 1
Total Reviews: 1
User: [u'-2Knew0B7qFed1Sd7StW7A']
Reviews Included: 1
Total Reviews: 7
User: [u'-2LAw4y4cqjP7vAbv8zCMQ']
Reviews Included: 3
Total Reviews: 11
User: [u'-2LMSDH0mOSgRnYcqicReQ']
Reviews Included: 1
Total Reviews: 12
User: [u'-2LO9Cavyl1ANkd4mZbZeA']
Reviews Included: 1
Total Reviews: 1
User: [u'-2LVjbw3pNGCPTlNx-5pYg']
Reviews Included: 1
Total Reviews: 1
User: [u'-2LW2r5N9KkGfbvs__WZEw']
Reviews Included: 1
Total Reviews: 3
User: [u'-2LjwTDhb6XXTwPHpMpfNw']
Reviews Included: 1
Total Reviews: 2
User: [u'-2LrFWoRv5Sm-QFhBLEnLw']
Reviews Included: 1
Total Reviews: 16
User: [u'-2LxKHpADfxD4Sq1CE97iw']
Reviews Included: 1
Total Reviews: 2
Us

In [21]:
with open('updatedReview.json', 'w') as f:
    f.write(review.to_json(orient='records', lines=True))

In [24]:
userLoc[:1000]

Unnamed: 0,user_id,gmm_latitude,gmm_longitude,gmm_city,gmm_state
0,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
1,---94vtJ_5o_nikEs6hUjg,33.612122,-111.998175,Paradise Valley,Arizona
2,---PLwSf5gKdIoVnyRHgBA,33.617955,-111.949844,Paradise Valley,Arizona
3,---cu1hq55BP9DWVXXKHZg,33.538439,-112.057240,Phoenix,Arizona
4,---fhiwiwBYrvqhpXgcWDQ,36.123843,-115.170219,Paradise,Nevada
5,---udAKDsn0yQXmzbWQNSw,36.147153,-115.299976,Summerlin South,Nevada
6,--0RtXvcOIE4XbErYca6Rw,35.203507,-80.743549,Matthews,North Carolina
7,--0WZ5gklOfbUIodJuKfaQ,36.167903,-115.138076,Las Vegas,Nevada
8,--0kuuLmuYBe3Rmu0Iycww,36.136545,-115.267314,Spring Valley,Nevada
9,--0sXNBv6IizZXuV-nl0Aw,36.196944,-115.262117,Spring Valley,Nevada


In [41]:
sample = pd.merge(review, userLoc, on='user_id', right_index=True).dropna(axis=0, how='any')
sample

Unnamed: 0,business_id,date,latitude,longitude,review_count,review_id,stars_x,stars_y,user_id,gmm_latitude,gmm_longitude,gmm_city,gmm_state
0,GT0K4EdSSxe_LMU6SPr-_A,2012-06-09,36.128561,-115.171130,245,MbiEXTlp-rEbK-ZL-WDYew,5,4.5,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
1,d7Jgj1h_MILumtsTlb2aXA,2011-12-11,36.232720,-115.250343,245,X7-Kqkzg2J4rpKJGXvXfUg,5,4.0,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
2,2BbFeotL85cIaBjSq1SWiA,2010-10-17,36.168099,-115.192230,245,YYIAglOQmbto4z3LNv1T8w,1,2.5,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
3,y8d90Pt16Nip-B5UXWBP-w,2011-01-05,36.156796,-115.334387,245,8rTiUXrFD9J10RI4Q2O6Qw,4,4.0,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
4,eduRavkml8awmPachSZXuw,2010-11-09,36.163874,-115.289944,245,IQtxzBq4m-iPJDHYAWOGHg,1,3.5,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
5,rq5dgoksPHkJwJNQKlGQ7w,2010-10-16,36.164180,-115.289630,245,-UtICN8nUQ4g9qIHlQRrxw,5,4.0,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
6,VmssDHtnBjYJQ6cWiwV9kA,2012-09-07,36.159126,-115.193705,245,88D5hXtby96ZlAKPCMhGcw,4,4.5,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
7,bPcqucuuClxYrIM8xWoArg,2011-09-29,36.124840,-115.325222,245,-pk4s5YUD0grEEBt2QYlDA,5,4.0,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
8,78TC3sZSYBzBsSJ0z5pyhw,2010-11-05,36.112501,-115.170579,245,nHYLl06G_Yt8dcRpzCJFiQ,1,2.5,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
9,SfYMEQ2W-u6ixeakITwQ_g,2015-11-30,36.167685,-115.138382,245,Nym3-4o44VF2maR63LdI6w,1,4.0,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada


In [42]:
sample = sample.query('abs(latitude - gmm_latitude) < 0.5')
sample = sample.query('abs(longitude - gmm_longitude) < 0.5')
sample

Unnamed: 0,business_id,date,latitude,longitude,review_count,review_id,stars_x,stars_y,user_id,gmm_latitude,gmm_longitude,gmm_city,gmm_state
0,GT0K4EdSSxe_LMU6SPr-_A,2012-06-09,36.128561,-115.171130,245,MbiEXTlp-rEbK-ZL-WDYew,5,4.5,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
1,d7Jgj1h_MILumtsTlb2aXA,2011-12-11,36.232720,-115.250343,245,X7-Kqkzg2J4rpKJGXvXfUg,5,4.0,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
2,2BbFeotL85cIaBjSq1SWiA,2010-10-17,36.168099,-115.192230,245,YYIAglOQmbto4z3LNv1T8w,1,2.5,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
3,y8d90Pt16Nip-B5UXWBP-w,2011-01-05,36.156796,-115.334387,245,8rTiUXrFD9J10RI4Q2O6Qw,4,4.0,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
4,eduRavkml8awmPachSZXuw,2010-11-09,36.163874,-115.289944,245,IQtxzBq4m-iPJDHYAWOGHg,1,3.5,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
5,rq5dgoksPHkJwJNQKlGQ7w,2010-10-16,36.164180,-115.289630,245,-UtICN8nUQ4g9qIHlQRrxw,5,4.0,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
6,VmssDHtnBjYJQ6cWiwV9kA,2012-09-07,36.159126,-115.193705,245,88D5hXtby96ZlAKPCMhGcw,4,4.5,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
7,bPcqucuuClxYrIM8xWoArg,2011-09-29,36.124840,-115.325222,245,-pk4s5YUD0grEEBt2QYlDA,5,4.0,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
8,78TC3sZSYBzBsSJ0z5pyhw,2010-11-05,36.112501,-115.170579,245,nHYLl06G_Yt8dcRpzCJFiQ,1,2.5,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada
9,SfYMEQ2W-u6ixeakITwQ_g,2015-11-30,36.167685,-115.138382,245,Nym3-4o44VF2maR63LdI6w,1,4.0,---1lKK3aKOuomHnwAkAow,36.148785,-115.229453,Spring Valley,Nevada


In [43]:
with open('sampleDates.json', 'w') as f:
    f.write(sample.to_json(orient='records', lines=True))

In [None]:
import gmplot
def plot_user(user):
    loc = user_locations(user)
    prediction = location_dict[user]
    latitudes = []
    longitudes = []
    latitudes.append(prediction[0])
    longitudes.append(prediction[1])
    gmap = gmplot.GoogleMapPlotter(latitudes[0], longitudes[0], 13)
    gmap.scatter(loc[0], loc[1], 'cornflowerblue', edge_width=10)
    gmap.scatter(latitudes, longitudes, 'red', edge_width=20)
    gmap.draw("maps/%s.html" % user)

In [None]:
plot_user('rW8q706dz5-NnXDzMwVkiw')