In [127]:
# import libraries
import numpy as np
import pandas as pd
import pickle
import os
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from dataparser import parse_json_file
from dataexploration import searched_city_dict

In [128]:
# ------------------------------------
#1) Load data set and parse data into dataframe
#----------------------------
file_path = 'city_search.json'
df = parse_json_file(file_path)
df.head()

Unnamed: 0,session_id,unix_timestamp,cities,country,joining_date,user_id,access_date
0,X061RFWB06K9V,2015-09-17 15:28:28,"New York NY, Newark NJ",UK,2015-03-22,2024,2015-09-17
1,5AZ2X2A9BHH5U,2015-09-04 08:06:31,"New York NY, Jersey City NJ, Philadelphia PA",DE,2015-03-28,2853,2015-09-04
2,SHTB4IYAX4PX6,2015-08-29 10:18:10,San Antonio TX,UK,2015-03-06,10958,2015-08-29
3,JBRB8MZGTX3M4,2015-03-25 07:21:03,Edmonton AB,IT,2015-03-12,7693,2015-03-25
4,YJCMPURC2FL9C,2015-05-02 09:31:07,"Phoenix AZ, Houston TX",UK,2015-02-28,7506,2015-05-02


In [129]:
#--------------------
#2) Get user2cities matrix(dataframe)
#------------------

# Get all visited cities
cities_str = ','.join(df['cities'].values)
cities_set = set(city.strip() for city in cities_str.split(','))

# Get all users
user_array = df['user_id'].unique()
user_count = df['user_id'].nunique()

# Remove rarely visited cities
threahold = 20
least_visited_cities = [item for item in searched_city_dict.keys() if searched_city_dict.get(item) < threahold]
cities_set = cities_set - set(least_visited_cities)

# Generate user by cities matrix with all value as 0
data = np.zeros((user_count, len(cities_set)))
df_user_by_city = pd.DataFrame(data, index = list(user_array), columns = list(cities_set))
df_user_by_city.index.rename('user_id', inplace=True) 
#df_user_by_city.head(3)

# Count the # of searched time per user_id 
for user_id,group in df.groupby('user_id'):
    for item in group['cities'].iteritems(): 
      for c in item[1].split(','):
        if c.strip() in cities_set:
            df_user_by_city.loc[user_id, c.strip()] = df_user_by_city.loc[user_id, c.strip()] + 1
df_user_by_city = df_user_by_city.loc[(df_user_by_city !=0).any(axis=1),:]

df_user_by_city.head()

Unnamed: 0_level_0,San Diego CA,Vancouver BC,Montreal QC,Riverside CA,Seattle WA,Portland OR,Corpus Christi TX,Fort Wayne IN,Fort Worth TX,Baltimore MD,...,Austin TX,Lexington KY,Windsor ON,Los Angeles CA,Columbus OH,Louisville KY,Santa Ana CA,Anaheim CA,Boston MA,Norfolk VA
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
2853,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10958,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
7693,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0
7506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [132]:
# Calculate similarity matrix (between cities)
sparse_data = sparse.csr_matrix(df_user_by_city)
similaries = cosine_similarity(sparse_data.transpose())
similarity_matrix = pd.DataFrame(data=similaries, index = df_user_by_city.columns, columns = df_user_by_city.columns)

similarity_matrix.head()

Unnamed: 0,San Diego CA,Vancouver BC,Montreal QC,Riverside CA,Seattle WA,Portland OR,Corpus Christi TX,Fort Wayne IN,Fort Worth TX,Baltimore MD,...,Austin TX,Lexington KY,Windsor ON,Los Angeles CA,Columbus OH,Louisville KY,Santa Ana CA,Anaheim CA,Boston MA,Norfolk VA
San Diego CA,1.0,0.153038,0.195024,0.228168,0.072586,0.023928,0.033879,0.068319,0.040655,0.058733,...,0.072887,0.038161,0.02784,0.216911,0.086154,0.034628,0.300495,0.258271,0.079473,0.017301
Vancouver BC,0.153038,1.0,0.246816,0.092304,0.326851,0.276571,0.061675,0.056077,0.054823,0.080784,...,0.096751,0.038595,0.050949,0.275043,0.111234,0.026683,0.100604,0.092873,0.061238,0.007777
Montreal QC,0.195024,0.246816,1.0,0.10058,0.087042,0.053229,0.077485,0.085281,0.082431,0.102888,...,0.111946,0.067316,0.06193,0.324776,0.1561,0.055367,0.125048,0.121489,0.209937,0.036481
Riverside CA,0.228168,0.092304,0.10058,1.0,0.032191,0.014149,0.007012,0.049181,0.028047,0.038897,...,0.042426,0.0,0.024693,0.253636,0.03035,0.010238,0.338834,0.369711,0.04112,0.0
Seattle WA,0.072586,0.326851,0.087042,0.032191,1.0,0.396267,0.033857,0.011874,0.0474,0.04226,...,0.081945,0.019068,0.031796,0.089698,0.051293,0.019775,0.033137,0.051621,0.034038,0.011526


In [133]:
# Construct a neighborhood of top k cities that most similar to each city
k = 15
city_neighbours = pd.DataFrame(index = similarity_matrix.columns, columns = range(1,k+1))
for i in range(0, len(similarity_matrix.columns)):
    city_neighbours.iloc[i, :k] = similarity_matrix.iloc[0:, i].sort_values(ascending=False)[:k].index
    
city_neighbours.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
San Diego CA,San Diego CA,Santa Ana CA,Anaheim CA,New York NY,Riverside CA,Los Angeles CA,Toronto ON,Long Beach CA,Montreal QC,Chicago IL,Houston TX,Vancouver BC,Philadelphia PA,Jersey City NJ,Phoenix AZ
Vancouver BC,Vancouver BC,Victoria BC,New York NY,Seattle WA,Toronto ON,Portland OR,Los Angeles CA,Montreal QC,Calgary AB,Chicago IL,Houston TX,Edmonton AB,Philadelphia PA,San Antonio TX,Phoenix AZ
Montreal QC,Montreal QC,New York NY,Toronto ON,OTTAWA ON,Los Angeles CA,Chicago IL,Houston TX,Vancouver BC,Oshawa ON,Philadelphia PA,Boston MA,San Diego CA,San Antonio TX,Dallas TX,Phoenix AZ
Riverside CA,Riverside CA,Anaheim CA,Santa Ana CA,Long Beach CA,Los Angeles CA,San Diego CA,New York NY,Toronto ON,Montreal QC,Vancouver BC,Chicago IL,Kitchener ON,Philadelphia PA,Phoenix AZ,Calgary AB
Seattle WA,Seattle WA,Victoria BC,Portland OR,Vancouver BC,Calgary AB,Edmonton AB,New York NY,San Antonio TX,Toronto ON,Los Angeles CA,Houston TX,Montreal QC,Austin TX,Chicago IL,Philadelphia PA


In [134]:
def recommendcities(user_id):
    user_vector = df_user_by_city.loc[user_index]
    searched_cities = user_vector[user_vector > 0].index.values


    # Get the neighhood cities that similar to the cities the user has earched
    similar_cities = set()
    for cities in city_neighbours.loc[searched_cities.tolist()].values.tolist():
        for city in cities: 
            similar_cities.add(city)        
    #print(len(similar_cities))

    neighbourhood = similarity_matrix[list(similar_cities)].loc[list(similar_cities)]
    #neighbourhood.head(32)

    user_vector = user_vector.loc[list(similar_cities)]

    score = neighbourhood.dot(user_vector).div(neighbourhood.sum(axis=1))

    score = score.drop(searched_cities)

    return score.nlargest(10).index.values

# For test purpose
user_id = 2853
recommendations = recommendcities(user_id)
print('Recommend top 10 cities for user (user_id:', 
      user_id, '):','\n',recommendations)

Recommend top 10 cities for user (user_id: 2853 ): 
 ['Newark NJ' 'Toronto ON' 'Hamilton ON' 'OTTAWA ON' 'Los Angeles CA'
 'Chicago IL' 'Baltimore MD' 'Boston MA' 'Houston TX' 'Phoenix AZ']
