In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read both CSV exports with the same encoding: the business listings go into df,
# the second table (production.csv) into df_2.
df, df_2 = (
    pd.read_csv(csv_name, encoding="utf-8")
    for csv_name in ("businesses.csv", "production.csv")
)
# TODO: df_2 still has to be added to df -- nothing in this cell combines them yet.
df.head()


Unnamed: 0,id,name,latitude,longitude,country,state,city,district,neighbourhood,suburb,street,postcode,categories
0,3295044257,Transportes La Ruta Ibérica,40.333346,-3.712226,Spain,Community of Madrid,Madrid,,,Villaverde,Calle Laguna del Marquesado,28091,"['office', 'office.company']"
1,3295044256,"Solener, SA",40.333961,-3.713119,Spain,Community of Madrid,Madrid,,,Villaverde,Avenida Real de Pinto,28021,"['office', 'office.company']"
2,3158288079,"Toldos La Paella, SL",40.33865,-3.715053,Spain,Community of Madrid,Madrid,,,Villaverde,Calle del Valle de Tobalina,28091,['office']
3,3158288076,Import Supply,40.339014,-3.715058,Spain,Community of Madrid,Madrid,,,Villaverde,Calle del Valle de Tobalina,28091,['office']
4,4346540453,EULEN.com,40.340444,-3.711864,Spain,Community of Madrid,Madrid,,,Villaverde,Calle Valle de Tobalina,28021,"['office', 'office.company']"


In [3]:
# Parse the stringified category lists (e.g. "['office', 'office.company']") into real
# Python lists, then explode so every (business, category) pair gets its own row.
def _parse_category_list(raw):
    """Turn a stringified list such as "['a', 'b']" into ['a', 'b'].

    Non-string values (NaN, or a list produced by an earlier run of this cell) are
    returned unchanged, so re-running the cell does not wipe the column.
    """
    if not isinstance(raw, str):
        return raw
    # Built-in str.replace is always literal, so "[" can never be interpreted as the
    # start of a regex character class (Series.str.replace's regex default has
    # differed between pandas versions).
    for char in "[]'":
        raw = raw.replace(char, "")
    return raw.split(", ")


df["categories"] = df["categories"].map(_parse_category_list)
df_cats = df.explode("categories")
# Last expression of the cell: shows the exploded category column.
df_cats["categories"]

0                               office
0                       office.company
1                               office
1                       office.company
2                               office
                     ...              
1622    office.educational_institution
1623                            office
1623                 office.government
1623                        wheelchair
1623                    wheelchair.yes
Name: categories, Length: 4238, dtype: object


In [4]:
# Collect every distinct category value once, keep it for the later cells, and print it.
categories = df_cats["categories"].unique()
print(categories)

['office' 'office.company' 'building' 'building.office'
 'office.estate_agent' 'office.educational_institution' 'wheelchair'
 'wheelchair.limited' 'office.insurance' 'office.financial'
 'office.travel_agent' 'office.government' 'wheelchair.yes'
 'office.association' 'office.non_profit' 'service'
 'service.social_facility' 'office.financial_advisor'
 'office.energy_supplier' 'office.it' 'office.charity'
 'office.employment_agency' 'office.architect' 'office.lawyer'
 'office.government.administrative' 'office.political_party'
 'building.industrial' 'office.foundation' 'office.consulting'
 'office.logistics' 'building.residential' 'office.government.legislative'
 'office.diplomatic' 'building.public_and_civil'
 'office.government.public_service' 'office.government.migration'
 'education' 'education.school' 'activity' 'activity.community_center'
 'office.research' 'office.religion' 'office.notary' 'office.tax_advisor'
 'office.government.environment' 'office.advertising_agency'
 'office.ac

In [5]:
# Show the exploded frame: one row per (business, category) pair, so a business
# appears once for every category it carries (the index value repeats for those rows).
df_cats

Unnamed: 0,id,name,latitude,longitude,country,state,city,district,neighbourhood,suburb,street,postcode,categories
0,3295044257,Transportes La Ruta Ibérica,40.333346,-3.712226,Spain,Community of Madrid,Madrid,,,Villaverde,Calle Laguna del Marquesado,28091,office
0,3295044257,Transportes La Ruta Ibérica,40.333346,-3.712226,Spain,Community of Madrid,Madrid,,,Villaverde,Calle Laguna del Marquesado,28091,office.company
1,3295044256,"Solener, SA",40.333961,-3.713119,Spain,Community of Madrid,Madrid,,,Villaverde,Avenida Real de Pinto,28021,office
1,3295044256,"Solener, SA",40.333961,-3.713119,Spain,Community of Madrid,Madrid,,,Villaverde,Avenida Real de Pinto,28021,office.company
2,3158288079,"Toldos La Paella, SL",40.338650,-3.715053,Spain,Community of Madrid,Madrid,,,Villaverde,Calle del Valle de Tobalina,28091,office
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1622,2697745528,Instituto de Física Teórica - IFT,40.549648,-3.687411,Spain,Community of Madrid,Madrid,Fuencarral-El Pardo,El Goloso,,Calle Nicolás Cabrera,28049,office.educational_institution
1623,5941188248,CEDEX,40.565918,-3.711884,Spain,Community of Madrid,Madrid,Fuencarral-El Pardo,El Goloso,,Autovía de Colmenar,28760,office
1623,5941188248,CEDEX,40.565918,-3.711884,Spain,Community of Madrid,Madrid,Fuencarral-El Pardo,El Goloso,,Autovía de Colmenar,28760,office.government
1623,5941188248,CEDEX,40.565918,-3.711884,Spain,Community of Madrid,Madrid,Fuencarral-El Pardo,El Goloso,,Autovía de Colmenar,28760,wheelchair


In [6]:
# Group the unique categories by their main category (the text before the first dot).
# Each value lists the bare main category itself (when it occurs) followed by its
# direct "main.sub" entries; deeper entries such as "office.government.administrative"
# are left out.
categories_dict = {}
for cat in categories:
    # A business without a categories value would show up here as NaN (a float),
    # which has no .split(); skip anything that is not a string.
    if not isinstance(cat, str):
        continue
    parts = cat.split(".")
    # Throw out the categories that have more than one dot
    if len(parts) > 2:
        continue
    # setdefault creates the list the first time a main category is seen.
    categories_dict.setdefault(parts[0], []).append(cat)

In [8]:
# Keep only the main categories of interest. A requested key that does not occur in
# categories_dict is simply absent from the result.
wanted_main_cats = ("commercial", "production", "office", "service", "catering")
new_cats_dict = {k: v for k, v in categories_dict.items() if k in wanted_main_cats}
# Print how many categories are stored under each main category
print({k: len(v) for k, v in new_cats_dict.items()})
# .get() instead of [...] so that a main category missing from the data shows an empty
# list rather than raising KeyError.
# NOTE(review): the saved output of this cell lists 'office.*' entries although the code
# selects "commercial" -- it looks like it predates the last edit; re-run to refresh.
new_cats_dict.get("commercial", [])

{'office': 32, 'service': 5, 'commercial': 3, 'catering': 2}


['office',
 'office.company',
 'office.estate_agent',
 'office.educational_institution',
 'office.insurance',
 'office.financial',
 'office.travel_agent',
 'office.government',
 'office.association',
 'office.non_profit',
 'office.financial_advisor',
 'office.energy_supplier',
 'office.it',
 'office.charity',
 'office.employment_agency',
 'office.architect',
 'office.lawyer',
 'office.political_party',
 'office.foundation',
 'office.consulting',
 'office.logistics',
 'office.diplomatic',
 'office.research',
 'office.religion',
 'office.notary',
 'office.tax_advisor',
 'office.advertising_agency',
 'office.accountant',
 'office.coworking',
 'office.telecommunication',
 'office.newspaper',
 'office.security']