In [98]:
# import libraries
import numpy as np
import pandas as pd
import pickle
import os
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from dataparser import parse_json_file
from dataexploration import searched_city_dict
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import operator
import collections

In [99]:
# ------------------------------------
#1) Load data set and parse data into dataframe
#----------------------------
file_path = 'city_search.json'
df = parse_json_file(file_path)
df.head()

Unnamed: 0,session_id,unix_timestamp,cities,country,joining_date,user_id,access_date
0,X061RFWB06K9V,2015-09-17 15:28:28,"New York NY, Newark NJ",UK,2015-03-22,2024,2015-09-17
1,5AZ2X2A9BHH5U,2015-09-04 08:06:31,"New York NY, Jersey City NJ, Philadelphia PA",DE,2015-03-28,2853,2015-09-04
2,SHTB4IYAX4PX6,2015-08-29 10:18:10,San Antonio TX,UK,2015-03-06,10958,2015-08-29
3,JBRB8MZGTX3M4,2015-03-25 07:21:03,Edmonton AB,IT,2015-03-12,7693,2015-03-25
4,YJCMPURC2FL9C,2015-05-02 09:31:07,"Phoenix AZ, Houston TX",UK,2015-02-28,7506,2015-05-02


In [147]:
#--------------------
#2) Get user2cities matrix(dataframe)
#------------------

# Get all visited cities
cities_str = ','.join(df['cities'].values)
cities_set = set(city.strip() for city in cities_str.split(','))

# Get all users
user_array = df['user_id'].unique()
user_count = df['user_id'].nunique()

# Remove rarely visited cities
threahold = 20
least_visited_cities = [item for item in searched_city_dict.keys() if searched_city_dict.get(item) < threahold]
cities_set = cities_set - set(least_visited_cities)

# Generate user by cities matrix with all value as 0
data = np.zeros((user_count, len(cities_set)))
df_user2city = pd.DataFrame(data, index = list(user_array), columns = list(cities_set))
df_user2city.index.rename('user_id', inplace=True) 
#df_user_by_city.head(3)

# Count the # of searched time per user_id 
for user_id,group in df.groupby('user_id'):
    for item in group['cities'].iteritems(): 
      for c in item[1].split(','):
        if c.strip() in cities_set:
            df_user2city.loc[user_id, c.strip()] =  1
df_user2city = df_user2city.loc[(df_user2city !=0).any(axis=1),:]

df_user2city.head()

Unnamed: 0_level_0,London ON,Arlington TX,San Jose CA,Indianapolis IN,Buffalo NY,Kitchener ON,Calgary AB,Houston TX,Oshawa ON,Corpus Christi TX,...,Victoria BC,Portland OR,Long Beach CA,Hamilton ON,Sacramento CA,Santa Ana CA,Oklahoma City OK,Mesa AZ,Vancouver BC,OTTAWA ON
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2853,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7506,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [157]:
frequent_itemsets = apriori(df_user2city, min_support = 0.02, use_colnames = True)
rules = association_rules(frequent_itemsets, metric = 'lift', min_threshold = 1)
rules = rules.sort_values(by = ['lift'], ascending=False)
rules = rules[rules['lift'] > 3]
rules.head()
#print(rules.shape)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
154,(Santa Ana CA),(Anaheim CA),0.047083,0.044833,0.021118,0.448529,10.004457,0.019007,1.732036
155,(Anaheim CA),(Santa Ana CA),0.044833,0.047083,0.021118,0.471042,10.004457,0.019007,1.8015
750,"(Jersey City NJ, Philadelphia PA)",(Newark NJ),0.04241,0.088281,0.024234,0.571429,6.472829,0.02049,2.127344
755,(Newark NJ),"(Jersey City NJ, Philadelphia PA)",0.088281,0.04241,0.024234,0.27451,6.472829,0.02049,1.319922
920,"(New York NY, Newark NJ)","(Jersey City NJ, Philadelphia PA)",0.081357,0.04241,0.02233,0.274468,6.471845,0.01888,1.319846


In [155]:
rules.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1010 entries, 154 to 31
Data columns (total 9 columns):
antecedents           1010 non-null object
consequents           1010 non-null object
antecedent support    1010 non-null float64
consequent support    1010 non-null float64
support               1010 non-null float64
confidence            1010 non-null float64
lift                  1010 non-null float64
leverage              1010 non-null float64
conviction            1010 non-null float64
dtypes: float64(7), object(2)
memory usage: 78.9+ KB


In [158]:
def non_presonalized_recommend(searched_cities_str):
    searched_cities_list = []
    for ele in searched_cities_str.strip().split(','):
        searched_cities_list.append(ele.strip())
    print('searched_cities_list: ', searched_cities_list)   

    top_k = 10
    
    total_city_list = [ item for item in searched_city_dict.keys()] 
    if len(searched_cities_str.strip()) == 0:
        # it is a cold start   
        num = min(len(total_city_list), top_k)
        print('Recommend most popular cities:\n', total_city_list[0:num-1])
        recommend_list = total_city_list[:num-1]
    else:
        # it is a cold start, but user show a temporary interest list
        recommendcities = {}
        for index, row in rules.iterrows():
            for city in searched_cities_list:
                if city in row['antecedents']:
                    #print(str(row['antecedents']))
                    for item in row['consequents']:  
                        if item.strip() in recommendcities.keys():
                            recommendcities[item.strip()] = recommendcities[item.strip()] + 1
                        else:
                            recommendcities[item.strip()] = 1
        sorted_cities = sorted(recommendcities.items(), key=operator.itemgetter(1))
        sorted_cities.reverse()
        recommendcities = collections.OrderedDict(sorted_cities) 
        cities_association = [n for n in list(recommendcities.keys()) if n not in searched_cities_list]
        cities_top = []
        if len(cities_association) < top_k:
            num = top_k - len(cities_association)
            cities_top = [n for n in list(searched_city_dict.keys()) if n not in searched_cities_list][:num]
        #print('Recommend most relevant cities:\n', list(recommendcities.keys())[:num-1])
        recommend_list = cities_association + cities_top
    return recommend_list


#searched_cities_str = ' '
searched_cities_str = 'Dallas TX, New York NY'

recommend = non_presonalized_recommend(searched_cities_str)
print(recommend)
    

searched_cities_list:  ['Dallas TX', 'New York NY']
['Newark NJ', 'Jersey City NJ', 'Philadelphia PA', 'Toronto ON', 'Los Angeles CA', 'Montreal QC', 'Chicago IL', 'Vancouver BC', 'Houston TX', 'Philadelphia PA']
