In [536]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [537]:
# Load the data
df = pd.read_csv("all_data.csv", encoding="utf-8")
df.head()


Unnamed: 0,id,name,latitude,longitude,country,state,city,district,neighbourhood,suburb,street,postcode,categories
0,-17027791,Almacén de la villa,40.393615,-3.600524,Spain,Community of Madrid,Madrid,,,Vicálvaro,Calle Dehesa Vieja,28052,"['access_limited', 'access_limited.private', '..."
1,-16236108,Ballenoil,40.373971,-3.776696,Spain,Community of Madrid,Madrid,,,Latina,Calle de la Sinfonía,28917,"['building', 'building.commercial', 'service',..."
2,-15931847,Bluespace,40.3735,-3.642409,Spain,Community of Madrid,Madrid,,,Villa de Vallecas,Calle Gamonal,28018,"['commercial', 'rental', 'rental.storage']"
3,-15931821,La parrilla del sur,40.37167,-3.639826,Spain,Community of Madrid,Madrid,,,Villa de Vallecas,Avenida de Moratilla de los Meleros,28031,"['building', 'building.catering', 'catering', ..."
4,-15931820,Maxicasa,40.371721,-3.639581,Spain,Community of Madrid,Madrid,,,Villa de Vallecas,Carretera de Villaverde a Vallecas,28031,"['building', 'building.commercial', 'commercia..."


In [538]:
# Split the categories into a list, remove brackets from the strings that have them and the single quotation marks too
df["categories"] = df["categories"].str.replace("[", "").str.replace("]", "").str.replace("'","").str.split(", ")
df_cats = df.explode("categories")
print(df_cats.categories)

0                     access_limited
0             access_limited.private
0                            service
0                  service.recycling
0           service.recycling.centre
                    ...             
21884            catering.restaurant
21884    catering.restaurant.italian
21884      catering.restaurant.pizza
21885                       catering
21885             catering.fast_food
Name: categories, Length: 61342, dtype: object


In [539]:
#Return a list of all the unique categories
print(df_cats.categories.unique())

categories = df_cats.categories.unique()

['access_limited' 'access_limited.private' 'service' 'service.recycling'
 'service.recycling.centre' 'building' 'building.commercial'
 'service.vehicle' 'service.vehicle.fuel' 'commercial' 'rental'
 'rental.storage' 'building.catering' 'catering' 'catering.restaurant'
 'commercial.houseware_and_hardware' 'building.industrial'
 'service.vehicle.repair' 'service.vehicle.repair.car'
 'commercial.elektronics' 'commercial.trade' 'building.office' 'office'
 'office.government' 'wheelchair' 'wheelchair.yes'
 'commercial.outdoor_and_sport' 'commercial.shopping_mall'
 'commercial.supermarket' 'building.facility' 'service.police'
 'building.public_and_civil' 'production' 'production.factory'
 'service.social_facility' 'office.educational_institution'
 'office.government.ministry' 'office.government.social_security'
 'office.foundation' 'service.financial' 'service.financial.bank'
 'office.company' 'office.research' 'building.historic'
 'office.government.administrative' 'tourism' 'tourism.sights

In [540]:
df_cats

Unnamed: 0,id,name,latitude,longitude,country,state,city,district,neighbourhood,suburb,street,postcode,categories
0,-17027791,Almacén de la villa,40.393615,-3.600524,Spain,Community of Madrid,Madrid,,,Vicálvaro,Calle Dehesa Vieja,28052,access_limited
0,-17027791,Almacén de la villa,40.393615,-3.600524,Spain,Community of Madrid,Madrid,,,Vicálvaro,Calle Dehesa Vieja,28052,access_limited.private
0,-17027791,Almacén de la villa,40.393615,-3.600524,Spain,Community of Madrid,Madrid,,,Vicálvaro,Calle Dehesa Vieja,28052,service
0,-17027791,Almacén de la villa,40.393615,-3.600524,Spain,Community of Madrid,Madrid,,,Vicálvaro,Calle Dehesa Vieja,28052,service.recycling
0,-17027791,Almacén de la villa,40.393615,-3.600524,Spain,Community of Madrid,Madrid,,,Vicálvaro,Calle Dehesa Vieja,28052,service.recycling.centre
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21884,11835944044,Eccolo,40.409095,-3.705602,Spain,Community of Madrid,Madrid,Centro,Lavapiés,Embajadores,Calle de Embajadores,28012,catering.restaurant
21884,11835944044,Eccolo,40.409095,-3.705602,Spain,Community of Madrid,Madrid,Centro,Lavapiés,Embajadores,Calle de Embajadores,28012,catering.restaurant.italian
21884,11835944044,Eccolo,40.409095,-3.705602,Spain,Community of Madrid,Madrid,Centro,Lavapiés,Embajadores,Calle de Embajadores,28012,catering.restaurant.pizza
21885,11836002693,Pecados Argetinos,40.409834,-3.707097,Spain,Community of Madrid,Madrid,Centro,Lavapiés,,Calle Ribera de Curtidores,28005,catering


In [541]:
#Create a dictionary with all the main categories (before the dot) as keys and a list of all the subcategories as values
categories_dict = {}
for cat in categories:
    main_cat = cat.split(".")[0]
    #Throw out the categories that have more than one dot
    if len(cat.split(".")) > 2:
        continue
    if main_cat in categories_dict:
        categories_dict[main_cat].append(cat)
    else:
        categories_dict[main_cat] = [cat]
#From each key in the dictionary, remove all the values that do not have a period in them
for key in categories_dict:
    categories_dict[key] = [cat for cat in categories_dict[key] if "." in cat]

In [542]:
new_cats_dict = {k: v for k, v in categories_dict.items() if k in ["commercial", "production", "office", "service", "catering"]}
#print the number of values of all the values as keys
print({k: len(v) for k, v in new_cats_dict.items()})

{'service': 15, 'commercial': 41, 'catering': 9, 'office': 31, 'production': 4}


In [543]:
new_cats_dict["commercial"]

['commercial.houseware_and_hardware',
 'commercial.elektronics',
 'commercial.trade',
 'commercial.outdoor_and_sport',
 'commercial.shopping_mall',
 'commercial.supermarket',
 'commercial.marketplace',
 'commercial.department_store',
 'commercial.tickets_and_lottery',
 'commercial.furniture_and_interior',
 'commercial.books',
 'commercial.convenience',
 'commercial.garden',
 'commercial.vehicle',
 'commercial.health_and_beauty',
 'commercial.florist',
 'commercial.smoking',
 'commercial.food_and_drink',
 'commercial.kiosk',
 'commercial.clothing',
 'commercial.hobby',
 'commercial.toy_and_game',
 'commercial.discount_store',
 'commercial.newsagent',
 'commercial.pet',
 'commercial.gift_and_souvenir',
 'commercial.stationery',
 'commercial.jewelry',
 'commercial.bag',
 'commercial.chemist',
 'commercial.art',
 'commercial.erotic',
 'commercial.watches',
 'commercial.second_hand',
 'commercial.video_and_music',
 'commercial.antiques',
 'commercial.gas',
 'commercial.baby_goods',
 'commer

In [544]:
new_cats_dict["production"]

['production.factory',
 'production.brewery',
 'production.pottery',
 'production.winery']

In [545]:
new_cats_dict["office"]

['office.government',
 'office.educational_institution',
 'office.foundation',
 'office.company',
 'office.research',
 'office.diplomatic',
 'office.insurance',
 'office.political_party',
 'office.employment_agency',
 'office.non_profit',
 'office.estate_agent',
 'office.association',
 'office.financial',
 'office.it',
 'office.notary',
 'office.energy_supplier',
 'office.coworking',
 'office.lawyer',
 'office.charity',
 'office.security',
 'office.travel_agent',
 'office.architect',
 'office.tax_advisor',
 'office.accountant',
 'office.religion',
 'office.newspaper',
 'office.telecommunication',
 'office.consulting',
 'office.advertising_agency',
 'office.logistics',
 'office.financial_advisor']

In [546]:
new_cats_dict["service"]

['service.recycling',
 'service.vehicle',
 'service.police',
 'service.social_facility',
 'service.financial',
 'service.funeral_directors',
 'service.post',
 'service.beauty',
 'service.estate_agent',
 'service.taxi',
 'service.travel_agency',
 'service.cleaning',
 'service.bookmaker',
 'service.tailor',
 'service.locksmith']

In [547]:
new_cats_dict["catering"]

['catering.restaurant',
 'catering.pub',
 'catering.fast_food',
 'catering.bar',
 'catering.cafe',
 'catering.taproom',
 'catering.biergarten',
 'catering.ice_cream',
 'catering.food_court']

In [548]:
#I need to load the data again into another dataframe

df2 = pd.read_csv("all_data.csv", encoding="utf-8")
df2.head()

Unnamed: 0,id,name,latitude,longitude,country,state,city,district,neighbourhood,suburb,street,postcode,categories
0,-17027791,Almacén de la villa,40.393615,-3.600524,Spain,Community of Madrid,Madrid,,,Vicálvaro,Calle Dehesa Vieja,28052,"['access_limited', 'access_limited.private', '..."
1,-16236108,Ballenoil,40.373971,-3.776696,Spain,Community of Madrid,Madrid,,,Latina,Calle de la Sinfonía,28917,"['building', 'building.commercial', 'service',..."
2,-15931847,Bluespace,40.3735,-3.642409,Spain,Community of Madrid,Madrid,,,Villa de Vallecas,Calle Gamonal,28018,"['commercial', 'rental', 'rental.storage']"
3,-15931821,La parrilla del sur,40.37167,-3.639826,Spain,Community of Madrid,Madrid,,,Villa de Vallecas,Avenida de Moratilla de los Meleros,28031,"['building', 'building.catering', 'catering', ..."
4,-15931820,Maxicasa,40.371721,-3.639581,Spain,Community of Madrid,Madrid,,,Villa de Vallecas,Carretera de Villaverde a Vallecas,28031,"['building', 'building.commercial', 'commercia..."


In [549]:
#Remove the category list because I need to work with it
df3 = df2.drop(columns="categories")
#From df2.categories, remove the brackets and the single quotation marks and split the strings into lists.
df2["categories"] = df2["categories"].str.replace("[", "").str.replace("]", "").str.replace("'","").str.split(", ")
#Clean df2.categories, leaving only those in the new_cats_dict
df2["categories"] = df2["categories"].apply(lambda x: [cat for cat in x if any([cat in new_cats_dict[subcat] for subcat in new_cats_dict])])

#Add 5 columns with the main categories
# df3["catering"] = df2["categories"].apply(lambda x: 1 if any([cat in x for cat in new_cats_dict["catering"]]) else 0)
# df3["commercial"] = df2["categories"].apply(lambda x: 1 if any([cat in x for cat in new_cats_dict["commercial"]]) else 0)
# df3["office"] = df2["categories"].apply(lambda x: 1 if any([cat in x for cat in new_cats_dict["office"]]) else 0)   
# df3["production"] = df2["categories"].apply(lambda x: 1 if any([cat in x for cat in new_cats_dict["production"]]) else 0)
# df3["service"] = df2["categories"].apply(lambda x: 1 if any([cat in x for cat in new_cats_dict["service"]]) else 0)
#I want to do this in a more efficient way
for key in new_cats_dict:
    df3[key] = df2["categories"].apply(lambda x: 1 if any([cat in x for cat in new_cats_dict[key]]) else 0)
df3.head()

Unnamed: 0,id,name,latitude,longitude,country,state,city,district,neighbourhood,suburb,street,postcode,service,commercial,catering,office,production
0,-17027791,Almacén de la villa,40.393615,-3.600524,Spain,Community of Madrid,Madrid,,,Vicálvaro,Calle Dehesa Vieja,28052,1,0,0,0,0
1,-16236108,Ballenoil,40.373971,-3.776696,Spain,Community of Madrid,Madrid,,,Latina,Calle de la Sinfonía,28917,1,0,0,0,0
2,-15931847,Bluespace,40.3735,-3.642409,Spain,Community of Madrid,Madrid,,,Villa de Vallecas,Calle Gamonal,28018,0,0,0,0,0
3,-15931821,La parrilla del sur,40.37167,-3.639826,Spain,Community of Madrid,Madrid,,,Villa de Vallecas,Avenida de Moratilla de los Meleros,28031,0,0,1,0,0
4,-15931820,Maxicasa,40.371721,-3.639581,Spain,Community of Madrid,Madrid,,,Villa de Vallecas,Carretera de Villaverde a Vallecas,28031,0,1,0,0,0


In [556]:
#Now I need to add subcategories to the dataframe, so I will add a column for a subcategory list for each main category.
for cat in new_cats_dict.keys():
    df3[cat + "_subcat"] = ""

#These subcategories columns will be filled with the subcategories that are only the categories list of each row, but only those that are exact matches of the elements in new_cats_dict values (make sure the key is not filled in).
#If there is more than one subcategory in new_cats_dict values, the two will be filled in the same row separated by a comma, provided both are different from each other. If they are not different, only one will be filled.
#Then create a column for each subcategory and fill it with 1 if the subcategory is in the list of subcategories of the row and 0 if it is not.
for i, row in df2.iterrows():
    for cat in row["categories"]:
        main_cat = cat.split(".")[0]
        if main_cat in new_cats_dict:
            subcat = cat
            if subcat in new_cats_dict[main_cat]:
                if df3.at[i, main_cat + "_subcat"] == "":
                    df3.at[i, main_cat + "_subcat"] = subcat
                else:
                    if subcat != df3.at[i, main_cat + "_subcat"]:
                        df3.at[i, main_cat + "_subcat"] += ", " + subcat
df3


Unnamed: 0,id,name,latitude,longitude,country,state,city,district,neighbourhood,suburb,...,service,commercial,catering,office,production,service_subcat,commercial_subcat,catering_subcat,office_subcat,production_subcat
0,-17027791,Almacén de la villa,40.393615,-3.600524,Spain,Community of Madrid,Madrid,,,Vicálvaro,...,1,0,0,0,0,service.recycling,,,,
1,-16236108,Ballenoil,40.373971,-3.776696,Spain,Community of Madrid,Madrid,,,Latina,...,1,0,0,0,0,service.vehicle,,,,
2,-15931847,Bluespace,40.373500,-3.642409,Spain,Community of Madrid,Madrid,,,Villa de Vallecas,...,0,0,0,0,0,,,,,
3,-15931821,La parrilla del sur,40.371670,-3.639826,Spain,Community of Madrid,Madrid,,,Villa de Vallecas,...,0,0,1,0,0,,,catering.restaurant,,
4,-15931820,Maxicasa,40.371721,-3.639581,Spain,Community of Madrid,Madrid,,,Villa de Vallecas,...,0,1,0,0,0,,commercial.houseware_and_hardware,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21881,11833111578,Pink’s,40.428734,-3.677708,Spain,Community of Madrid,Madrid,,,Salamanca,...,0,0,1,0,0,,,catering.fast_food,,
21882,11834724643,Po Yo Que Sé,40.402102,-3.561043,Spain,Community of Madrid,Madrid,,,Vicálvaro,...,0,0,1,0,0,,,catering.fast_food,,
21883,11835358178,IN PULSO,40.392546,-3.680149,Spain,Community of Madrid,Madrid,,,Arganzuela,...,0,0,1,0,0,,,catering.restaurant,,
21884,11835944044,Eccolo,40.409095,-3.705602,Spain,Community of Madrid,Madrid,Centro,Lavapiés,Embajadores,...,0,0,1,0,0,,,catering.restaurant,,


In [551]:
#Show me all the rows where a specific column's value = 1
df3["catering.restaurant"].value_counts()

KeyError: 'catering.restaurant'