In [None]:
# =============================================================================
# MODULES, ALIASES AND GLOBAL ATTRIBUTES
# =============================================================================
import requests, json
import pandas as pd
import numpy as np
import seaborn as sns
import os
from tqdm import tqdm  
from googletrans import Translator
import folium
from folium.plugins import MarkerCluster
import matplotlib.pyplot as plt


# Global attributes
# Google Places API key. It is read from the GOOGLE_API_KEY environment
# variable when that is set, so a real key never has to be written into the
# notebook; otherwise the placeholder below is used and must be replaced by hand.
api_key = os.environ.get('GOOGLE_API_KEY', 'insert your own API KEY here')
# Base URL used for every text-search request; the trailing '?' is kept so that
# a query string can be appended to it directly.
url = "https://maps.googleapis.com/maps/api/place/textsearch/json?"
# Aliases (short names for the os helpers used throughout the file)
join=os.path.join
make=os.makedirs
exists=os.path.exists
# =============================================================================

In [None]:
# =============================================================================
# DATA FETCHING & ONLINE PREPROCESSING
# =============================================================================
def construct_search_query(key='accountant',town='Xanthi'):
    '''
    Build the list of search strings for one venue type in one town.

    Six English variants of "<key> <town>" are generated and then the same
    six strings are translated with ``dest='greek'``; the English variants
    come first in the returned list, followed by their translations.

    Parameters
    ----------
    key: STR, optional
        The search target (venue type).
        The default is 'accountant'.

    town: STR, optional
        The town in which to search.
        The default is 'Xanthi'.

    Returns
    -------
    queries: LIST
        Twelve strings: the six English variants (plain, capitalized and
        upper-case, each with and without the preposition 'in') followed by
        the six translated strings, in matching order.

    '''
    # str.capitalize() upper-cases the first character and lower-cases the
    # rest of the string, so multi-word inputs are not title-cased.
    capital_key, capital_town = key.capitalize(), town.capitalize()
    upper_key, upper_town = key.upper(), town.upper()

    # The order of the variants is part of the output; note that the
    # upper-case pair lists the 'in' form first.
    variants = [
        key + ' ' + town,
        key + ' in ' + town,
        capital_key + ' ' + capital_town,
        capital_key + ' in ' + capital_town,
        upper_key + ' in ' + upper_town,
        upper_key + ' ' + upper_town,
    ]
    english_queries = [variant.strip() for variant in variants]

    # Translate the whole batch in one call and keep only the text.
    translator = Translator()
    translations = translator.translate(english_queries, dest='greek')
    greek_queries = [translation.text for translation in translations]

    return english_queries + greek_queries

def fetch_data(queries,key='accountant',town='Xanthi'):
    '''
    Run every query against the text-search endpoint in ``url`` and collect
    the features of interest of each returned place.

    The combined table is de-duplicated on the place name (first occurrence
    kept) and written to ``../Data/<town>/<key>_<town>.csv`` before it is
    returned.

    Parameters
    ----------
    queries : LIST
        Queries and variants in English and Greek, e.g. the output of
        ``construct_search_query``.

    key : STR, optional
        Venue type; only used to build the output file name.
        The default is 'accountant'.

    town : STR, optional
        Town name; used for the output folder and file name.
        The default is 'Xanthi'.

    Returns
    -------
    df : DATAFRAME
        One row per distinct place name with the columns name,
        user_ratings_total, rating, latitude and longitude. Missing
        user_ratings_total / rating values are stored as NaN.

    '''

    # features of interest (foi)
    foi=['name', 'user_ratings_total','rating', 'latitude', 'longitude']
    # One dict per returned place, keyed by the column names above
    records=[]

    for query in tqdm(queries):
        # Pass the query through `params` so that requests percent-encodes
        # spaces and non-ASCII (Greek) characters instead of relying on raw
        # string concatenation.
        r = requests.get(url, params={'query': query, 'key': api_key})
        x = r.json()

        # Report failed requests (e.g. an invalid key) instead of silently
        # producing an empty table. 'OK' and 'ZERO_RESULTS' are the two
        # non-error statuses documented for this endpoint.
        status = x.get('status')
        if status not in ('OK', 'ZERO_RESULTS'):
            print(f"Query {query!r} returned status {status}: {x.get('error_message', '')}")

        for result in x.get('results', []):
            name = result['name']
            row = {'name': name}
            # These two fields are optional in a result; fall back to NaN.
            for field in ('user_ratings_total', 'rating'):
                if field in result:
                    row[field] = result[field]
                else:
                    print(f'{field} not found for {name}, setting to NaN')
                    row[field] = np.nan
            location = result['geometry']['location']
            row['latitude'] = location['lat']
            row['longitude'] = location['lng']
            records.append(row)

    # Construct dataframe (column order fixed by foi, also when empty)
    df=pd.DataFrame(records, columns=foi)

    # Now remove duplicates: the query variants largely return the same places
    df = df.drop_duplicates(subset='name', keep="first")
    # Save the dataframe locally
    path2data=join(os.path.realpath('..'),'Data', town)
    make(path2data, exist_ok=True)
    fname=join(path2data,key+'_'+town+'.csv')
    # utf-8-sig writes a BOM at the start of the file
    df.to_csv(fname, encoding='utf-8-sig')

    return df


####################################################################
#   Fetch data for accountants, lawyers, banks and insurers
####################################################################
# Runs when the cell is executed: for every (town, venue type) pair the
# query variants are built and sent off; fetch_data stores each resulting
# table under ../Data/<town>/<key>_<town>.csv as a side effect, so the
# returned dataframe is not kept here.
keys=['accountant', 'lawyer','bank', 'insurer']
for town in ['Xanthi','Thessaloniki', 'Athens','heraklion', 'Patras']:
    for key in keys:
        print(key)
        # Get queries in English and Greek
        queries=construct_search_query(key,town)
        # Store preprocessed dataframes
        fetch_data(queries,key,town)



In [None]:
# =============================================================================
# PREPROCESSING
# =============================================================================

# Path declaration
# Root folder for saved figures: the 'Figures' directory next to the current
# working directory (resolved relative to '..').
path2figs=join(os.path.realpath('..'),'Figures')

# =============================================================================
# Towns whose data folders are read, and the venue types analysed; the venue
# names are used below as keys into the per-town data dictionaries.
towns=['Athens','heraklion','Patras','Thessaloniki','Xanthi']
target_venues=['lawyer', 'bank', 'accountant', 'insurer']

def load_data(town):
    '''
    Load every CSV stored for one town into a dictionary of dataframes.

    Files are read from ``../Data/<town>``. The dictionary key is the part of
    the file name before the first underscore (e.g. 'accountant' for
    'accountant_Xanthi.csv').

    Parameters
    ----------
    town : STR
        Name of the town, i.e. of the sub-folder of ``../Data`` to read.

    Returns
    -------
    data : DICT
        Maps the venue type to the dataframe read from its CSV file.

    '''
    # NOTE(review): the previous body called `see` and `load`, neither of
    # which is defined in this file; os.listdir / pd.read_csv are assumed to
    # be what was meant - confirm.
    path2data=os.path.join(os.path.realpath('..'),'Data', town)
    data={} # initialize dictionary to hold the data
    # Sorted so the result does not depend on the file-system listing order
    for file in sorted(os.listdir(path2data)):
        # Skip anything that is not a CSV (hidden files, sub-folders, ...)
        if not file.lower().endswith('.csv'):
            continue
        target_name=file.split('_')[0] # eg: accountant
        # 'utf-8-sig' reads files with or without a leading BOM
        data[target_name]=pd.read_csv(os.path.join(path2data, file),
                                      encoding='utf-8-sig')

    return data



In [None]:

# =============================================================================
# EXPLORATORY DATA ANALYSIS
# =============================================================================
def plot_dist_of_total_ratings(town, data):
    '''
    Plot, for one town, the kernel density estimate of the number of ratings
    per venue type and save the figure.

    Q1:  Can we use the rating as a reliable factor??

    One curve is drawn per entry of ``target_venues`` on the current figure,
    which is then written to
    ``<path2figs>/#n_ratings/<town>/<town>_total_ratings.png`` and closed.

    Parameters
    ----------
    town : STR
        Town name; used for the title, the output folder and the file name.

    data : DICT
        Maps each venue type in ``target_venues`` to a dataframe that has a
        ``user_ratings_total`` column.

    '''
    curr_path2figs=join(path2figs, '#n_ratings',town)
    make(curr_path2figs, exist_ok=True)

    for target in target_venues:
        # kdeplot is the replacement for the deprecated
        # distplot(hist=False, kde=True). Missing values are dropped
        # explicitly so the estimate does not depend on how the installed
        # seaborn version treats NaN.
        n_ratings = data[target].user_ratings_total.dropna()
        sns.kdeplot(n_ratings, label=target)
    plt.legend()
    plt.title(f'{town}')
    plt.ylabel('Gaussian kernel density estimate')
    plt.xlabel('#ratings')
    plt.tight_layout()
    sns.despine()
    figname=join(curr_path2figs,f'{town}_total_ratings.png')
    plt.savefig(figname, dpi=150)
    # Close so that the next town starts from an empty figure
    plt.close()


# Produce one rating-count density figure per town. Note that `town` and
# `data` are module-level names and keep the values of the last iteration.
for town in towns: 
    # load data
    data=load_data(town)
    # plot the total number of ratings
    plot_dist_of_total_ratings(town, data)

In [None]:
# =============================================================================
# DATA PREPROCESSING - OUTLIER DETECTION
# =============================================================================


def plot_mean_longitude(data,state):
    '''
    Save a bar chart of the mean longitude per venue type, with the standard
    deviation drawn as error bars.

    The chart is meant as a quick visual check for outliers: a venue type
    whose places are spread over implausible longitudes shows a large error
    bar.

    Parameters
    ----------
    data : DICT
        Maps each venue type in ``target_venues`` to a dataframe that has a
        ``longitude`` column.

    state : STR
        Free-text tag inserted in the file name
        (``<path2figs>/longitude_<state>_.png``), e.g. 'before_pre'.

    '''
    longs=[np.mean(data[target].longitude) for target in target_venues ]
    error=[np.std(data[target].longitude) for target in target_venues ]
    x_pos=np.arange(0,len(target_venues))

    # Build the plot
    fig, ax = plt.subplots()
    ax.bar(x_pos, longs, yerr=error, align='center', alpha=0.2, ecolor='black', capsize=10)
    ax.set_ylabel('Longitude')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(target_venues)
    plt.title('Longitude means & STDs')
    plt.tight_layout()
    sns.despine()
    # savefig does not create missing folders; make sure the target exists
    # even when no other figure has been written yet.
    make(path2figs, exist_ok=True)
    figname=join(path2figs,f'longitude_{state}_.png')
    plt.savefig(figname, dpi=150)
    plt.close()

# %%
# From here on the analysis is restricted to one town; `town` and `data` are
# module-level names that the clustering code further down reads as well.
town='Athens'
# focus on a single town  
data=load_data(town)
# plot the means before the outlier removal 
plot_mean_longitude(data,'before_pre')
# remove outliers
# Rows whose longitude is below 10 are dropped in place from the bank and
# lawyer tables only.
# NOTE(review): the threshold of 10 is hard-coded; presumably it separates
# places far outside the town from the rest - confirm.
remove_outliers_from=['bank', 'lawyer']
for venue in remove_outliers_from:
    data[venue].drop(data[venue][data[venue].longitude<10].index, inplace=True)
# plot again to verify    
plot_mean_longitude(data,'after_pre')


    
    

# One colour per entry of target_venues (same order); reused by later plots.
colors=['g','r','b','m']

# Scatter of all remaining places, one colour per venue type.
# Latitude is on the x axis and longitude on the y axis.
for venue, color in zip(target_venues,colors):
    curr_data=data[venue]
    # curr_data=target_bad_ratings_area(data, venue, thr)
    
    plt.scatter(curr_data.latitude, curr_data.longitude, color=color,
                # s=1e1*data[venue].rating,
                label=venue)
plt.title(town)
plt.legend()
sns.despine(offset=10, trim=True)
plt.show()


In [None]:
# =============================================================================
# CLUSTERING
# =============================================================================
# Import model
import sklearn
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
distance=sklearn.metrics.pairwise.euclidean_distances

# venue type -> {'coords': array of cluster centres, columns (latitude, longitude)}
centroids_coords={}


for idx, venue in enumerate(target_venues):
    curr_venue=data[venue][['latitude','longitude']]

    # Per venue, fit k-means for every candidate k and score each fit with
    # the silhouette coefficient. The score is only defined for
    # 2 <= k <= n_samples-1, so the upper bound is capped for small tables.
    sil = []
    fitted = []
    kmax = min(10, len(curr_venue)-1)
    for k in range(2, kmax+1):
        model = KMeans(n_clusters = k).fit(curr_venue)
        fitted.append(model)
        sil.append(silhouette_score(curr_venue, model.labels_, metric = 'euclidean'))

    if sil:
        # sil[i] belongs to k = i+2. Reuse the model that produced the best
        # score instead of refitting: a refit starts from a new random
        # initialisation and need not reproduce the scored clustering.
        kmeans = fitted[int(np.argmax(sil))]
    else:
        # Too few places to compare several k: keep a single cluster.
        kmeans = KMeans(n_clusters=1).fit(curr_venue)
    labels = kmeans.labels_
    centroids = kmeans.cluster_centers_
    centroids_coords[venue]={}
    centroids_coords[venue]['coords']=centroids

    # One panel per venue type in a 2x2 layout: places coloured by cluster,
    # cluster centres in red.
    plt.subplot(2,2,idx+1)
    plt.scatter(curr_venue['latitude'], curr_venue['longitude'], c= kmeans.labels_.astype(float), s=50, alpha=0.5)
    plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
    plt.title(f'{venue, town}')
    sns.despine(offset=10, trim=False)
    # NOTE(review): fixed longitude window; presumably chosen for Athens - confirm
    plt.ylim(23.67,23.82 )
plt.tight_layout()
fig = plt.gcf()
plt.suptitle('K-means clustered coordinates for all venues of interest. ', y=1.05)
fig.set_size_inches(10, 6)
plt.show()
      
#%%
# Overlay the cluster centres of all venue types in a single scatter plot.
# Column 0 of each centre array is plotted on the x axis and column 1 on the
# y axis; colours follow the order of target_venues.
for idx, venue in enumerate(target_venues):
    centroids=centroids_coords[venue]['coords']
    plt.scatter(centroids[:, 0], centroids[:, 1], c=colors[idx], s=50, label=venue)    

# The town name in the title is hard-coded, not taken from `town`.
plt.title('Spatial distribution of venues-centroids for the town of Athens', y=1.2, style='oblique')
# Legend is placed below the axes.
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.25),
          ncol=2, fancybox=True, shadow=True, title='Professions.')
plt.xlabel(r'$\mathcal{latitude}$')
plt.ylabel(r'$\mathcal{longitude}$')
sns.despine(offset=10, trim=False)
# plt.tight_layout()



def parse_synergetic_and_antagonistic_elements(centroids_coords,
                                               synergetic=('bank',),
                                               antagonistic=('lawyer', 'accountant', 'insurer')):
    '''
    Split the cluster centres into a synergetic and an antagonistic group and
    flatten each group into one latitude and one longitude array.

    Parameters
    ----------
    centroids_coords : DICT
        Maps a venue type to a dict whose 'coords' entry is a 2-D array with
        one row per centre; column 0 is read as latitude and column 1 as
        longitude.

    synergetic : sequence of STR, optional
        Venue types that form the synergetic group.
        The default is ('bank',).

    antagonistic : sequence of STR, optional
        Venue types that form the antagonistic group.
        The default is ('lawyer', 'accountant', 'insurer').

    Returns
    -------
    elements : DICT
        ``elements['syn']`` and ``elements['ant']`` each hold a 'lat' and a
        'long' 1-D array; within a group the values follow the order of the
        venue types given above.

    Raises
    ------
    KeyError
        If a requested venue type is missing from ``centroids_coords``.

    '''
    def _gather(categories, column):
        # Concatenate one coordinate column over all venue types of a group.
        columns = [np.asarray(centroids_coords[cat]['coords'])[:, column]
                   for cat in categories]
        # np.concatenate rejects an empty list, so an empty group yields an
        # empty array instead.
        return np.concatenate(columns) if columns else np.array([])

    elements = {
        # Synergetic elements: latitude, longitude
        'syn': {'lat': _gather(synergetic, 0),
                'long': _gather(synergetic, 1)},
        # Antagonistic elements: latitude, longitude
        'ant': {'lat': _gather(antagonistic, 0),
                'long': _gather(antagonistic, 1)},
    }

    return elements

def plot_superimposed_grid(elements, candidate_lat, canditate_long, mean_lat, mean_long):
    '''
    Draw the candidate grid together with the antagonistic and synergetic
    elements and the grid-search start point on the current axes.

    Parameters
    ----------
    elements : DICT
        Has the entries 'ant' and 'syn', each with 'lat' and 'long' values.

    candidate_lat, canditate_long : array-like
        Coordinates of the candidate points (x and y values respectively).

    mean_lat, mean_long : FLOAT
        Position of the start point, drawn as a star.

    Notes
    -----
    Latitude is on the x axis and longitude on the y axis. The two shaded
    bands use fixed limits (37.95-37.98 on x, 23.72-23.76 on y). Nothing is
    saved or shown here.

    '''
    # Shared look of the two groups of elements
    element_style = dict(s=50, edgecolors='k')

    antagonists = elements['ant']
    plt.scatter(antagonists['lat'], antagonists['long'],
                label='antagonistic elements', **element_style)

    synergists = elements['syn']
    plt.scatter(synergists['lat'], synergists['long'], color='red',
                label='synergetic elements', **element_style)

    # Candidate grid sits behind everything else (zorder=0)
    plt.scatter(candidate_lat, canditate_long, marker='.', color='k',
                alpha=0.5, label='candidate coords', zorder=0)
    plt.scatter(mean_lat, mean_long, marker='*', s=150, c='magenta',
                edgecolors='k', label='grid search start point.')

    # Highlighted latitude (vertical) and longitude (horizontal) bands
    band_style = dict(color='coral', alpha=0.25, zorder=0)
    plt.axvspan(37.95, 37.98, **band_style)
    plt.axhspan(23.72, 23.76, **band_style)

    # Legend goes below the axes
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15),
               fancybox=True, shadow=True, ncol=2)
    plt.xlabel('Latitude')
    plt.ylabel('Longitude')
    plt.title('The grid of candidate coordinates.')
    plt.tight_layout()

def select_nodes_around_a_synergetic_node(elements, from_lat=37.95, to_lat=37.98):
    '''
    Average the antagonistic elements that lie inside a latitude band.

    Only ``elements['ant']`` is read: every antagonistic element whose
    latitude is strictly between ``from_lat`` and ``to_lat`` is selected, and
    the mean latitude and mean longitude of the selection are returned.

    Parameters
    ----------
    elements : DICT
        ``elements['ant']['lat']`` and ``elements['ant']['long']`` hold the
        coordinates, aligned by position.

    from_lat, to_lat : FLOAT, optional
        Exclusive lower and upper latitude bounds.
        The defaults are 37.95 and 37.98.

    Returns
    -------
    mean_lat, mean_long : FLOAT
        Mean coordinates of the selected elements; NaN when nothing falls
        inside the band.

    '''
    antagonists = elements['ant']
    latitudes = np.asarray(antagonists['lat'])
    longitudes = np.asarray(antagonists['long'])

    # Strict inequalities: elements exactly on a bound are left out
    inside_band = (latitudes > from_lat) & (latitudes < to_lat)

    return np.mean(latitudes[inside_band]), np.mean(longitudes[inside_band])
    
def define_radius_of_interest(elements):
    '''
    Build the grid of candidate coordinates and plot it over the elements.

    The stated objective is to minimize the distance from a synergetic node
    while maximizing the distance from all the other antagonistic elements.
    This function only prepares the visual search: it constructs a 51 x 51
    grid, derives the start point from the antagonistic elements and draws
    everything. It returns nothing.

    Parameters
    ----------
    elements : DICT
        Synergetic / antagonistic coordinates, passed on unchanged to
        ``select_nodes_around_a_synergetic_node`` and
        ``plot_superimposed_grid``.

    '''
    # Fixed search window: 51 evenly spaced values per axis
    latitudes = np.linspace(37.91, 38.02, 51)
    longitudes = np.linspace(23.7, 23.8, 51)

    # Dense grid of every (latitude, longitude) combination
    grid_lat, grid_long = np.meshgrid(latitudes, longitudes, sparse=False)

    # Start point: mean of the antagonistic elements inside the default
    # latitude band
    start_lat, start_long = select_nodes_around_a_synergetic_node(elements)

    # Plot the coordinates for the visual search
    plot_superimposed_grid(elements, grid_lat, grid_long, start_lat, start_long)
        

    


# split the companies into synergetic and antagonistic
# Uses the cluster centres stored in centroids_coords by the clustering step;
# the result is kept in the module-level name `elements`.
elements= parse_synergetic_and_antagonistic_elements(centroids_coords)  

