# Capstone Project - The Battle of Neighborhoods (Week 1)

## Moving to Madrid

First, we write code to scrape the Wikipedia page https://es.wikipedia.org/wiki/Anexo:Barrios_administrativos_de_Madrid, extract the data from its table, and load that data into a pandas dataframe.

#### First, we import the libraries we expect to need:

In [3]:
pip install beautifulsoup4

Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/e8/b5/7bb03a696f2c9b7af792a8f51b82974e51c268f15e925fc834876a4efa0b/beautifulsoup4-4.9.0-py3-none-any.whl (109kB)
[K     |████████████████████████████████| 112kB 7.3MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2 (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/05/cf/ea245e52f55823f19992447b008bcbb7f78efc5960d77f6c34b5b45b36dd/soupsieve-2.0-py2.py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.9.0 soupsieve-2.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install lxml

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 5.1MB/s eta 0:00:01     |████                            | 706kB 5.1MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0
Note: you may need to restart the kernel to use updated packages.


In [14]:
import csv
import os
import requests
import urllib
import math
import copy
import pandas as pd	
import numpy as np
from bs4 import BeautifulSoup 

class html_tables(object):
    """Fetch a web page and turn each of its ``<table>`` elements into a DataFrame.

    Attributes set by the constructor:
        url: the address that was requested.
        r: the ``requests`` response for that address.
        url_soup: the parsed page.

    Attributes set by :meth:`read`:
        tables_html: the ``<table>`` elements found in the page.
        tables: one ``pandas.DataFrame`` per element of ``tables_html``.
    """

    def __init__(self, url):
        """Download *url* and parse the response body as HTML.

        Raises:
            requests.exceptions.HTTPError: if the server answers with an
                error status.
        """
        self.url = url
        self.r = requests.get(self.url)
        # Fail loudly on 4xx/5xx instead of silently parsing an error page.
        self.r.raise_for_status()
        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed (and to avoid bs4's warning about a
        # missing parser argument). lxml is installed earlier in this notebook.
        self.url_soup = BeautifulSoup(self.r.text, "lxml")

    def read(self):
        """Convert every table of the page into a DataFrame.

        Each DataFrame has integer row and column labels. The raw text of a
        cell (``get_text()``, not stripped) is stored at the cell's top-left
        position; positions that are only covered by a ``rowspan`` or
        ``colspan`` are left as NaN. Rows without any ``<td>``/``<th>`` are
        ignored.

        Returns:
            list of pandas.DataFrame, in the order the tables were found.
        """
        self.tables_html = self.url_soup.find_all("table")
        self.tables = [self._parse_table(table) for table in self.tables_html]
        return self.tables

    @staticmethod
    def _span(cell, attribute):
        """Return the ``colspan``/``rowspan`` value of *cell* as an int >= 1."""
        try:
            return max(1, int(cell.get(attribute)))
        except (TypeError, ValueError):
            # Attribute missing (None) or not a clean integer: treat as 1.
            return 1

    def _parse_table(self, table):
        """Lay the cells of one ``<table>`` element out on a grid."""
        # Rows without cells carry no data and must not consume a grid row.
        rows = [
            cells
            for cells in (tr.find_all(["td", "th"]) for tr in table.find_all("tr"))
            if cells
        ]

        # Start from the widest row (by cell count) and grow only if span
        # offsets push a cell further to the right.
        width = max((len(cells) for cells in rows), default=0)

        # pending[c] > 0 means column c is still occupied by a rowspan cell
        # that started in an earlier row.
        pending = [0] * width
        placed = {}

        for row_number, cells in enumerate(rows):
            carried = list(pending)
            column = 0
            previous_colspan = 0

            for cell in cells:
                # Step past the previous cell, then past columns that are
                # still reserved by a rowspan from above.
                column += previous_colspan
                while column < len(pending) and pending[column] > 0:
                    column += 1

                placed[(row_number, column)] = cell.get_text()

                if column >= len(carried):
                    carried.extend([0] * (column + 1 - len(carried)))
                rowspan = self._span(cell, "rowspan")
                if rowspan > 1:
                    carried[column] = rowspan

                previous_colspan = self._span(cell, "colspan")

            # One row consumed: every active rowspan has one row less to cover.
            pending = [left - 1 if left > 0 else left for left in carried]

        width = max([width] + [column + 1 for _, column in placed])
        frame = pd.DataFrame(index=range(len(rows)), columns=range(width))
        for (row_number, column), text in placed.items():
            frame.iat[row_number, column] = text
        return frame

#### Then, we download the data from Wikipedia 

In [2]:
# Scrape the Wikipedia page that lists Madrid's administrative neighborhoods.
ssa_url = "https://es.wikipedia.org/wiki/Anexo:Barrios_administrativos_de_Madrid"
ssa = html_tables(ssa_url)
# read() returns one DataFrame per <table> on the page; keep only the first.
first_table = ssa.read()[0]
# Written without header or index, so the scraped header row (row 0 of the
# frame) becomes the first line of the CSV file.
first_table.to_csv("ssa.csv", header = False, index = False)

In [3]:
first_table.head()

Unnamed: 0,0,1,2,3,4
0,Distrito\n,Número\n,Nombre\n,Superficie (km²)[2]​\n,Imagen\n
1,Centro\n,11\n,Palacio\n,"1,471 km²\n",\n
2,,12\n,Embajadores\n,"1,032 km²\n",\n
3,,13\n,Cortes\n,"0,592 km²\n",\n
4,,14\n,Justicia\n,"0,742 km²\n",\n


In [4]:
# Lift pandas' display limits so frames are shown with all columns and rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [5]:
# ssa.csv was saved without a header line of its own, so its first line
# (the scraped table header) is what read_csv uses as column names.
neighborhoods_Madrid=pd.read_csv('ssa.csv') 

In [6]:
neighborhoods_Madrid.head()

Unnamed: 0,Distrito\n,Número\n,Nombre\n,Superficie (km²)[2]​\n,Imagen\n
0,Centro\n,11,Palacio\n,"1,471 km²\n",\n
1,,12,Embajadores\n,"1,032 km²\n",\n
2,,13,Cortes\n,"0,592 km²\n",\n
3,,14,Justicia\n,"0,742 km²\n",\n
4,,15,Universidad\n,"0,947 km²\n",\n


We can drop the unneeded columns

In [7]:
# Keep the two columns that are used from here on instead of deleting the
# others by name. The deleted headers were brittle keys: one of them embeds a
# Wikipedia footnote marker ("[2]") and what appears to be an invisible
# character, so any change to the page's footnotes would raise a KeyError.
# .copy() makes the result an independent frame rather than a view/slice.
neighborhoods_Madrid = neighborhoods_Madrid[['Distrito\n', 'Nombre\n']].copy()
neighborhoods_Madrid.head()

Unnamed: 0,Distrito\n,Nombre\n
0,Centro\n,Palacio\n
1,,Embajadores\n
2,,Cortes\n
3,,Justicia\n
4,,Universidad\n


In [8]:
# NOTE(review): this binds a second name to the same DataFrame object (there
# is no .copy()), so in-place changes made through either name affect both.
neighborhoods_Madrid2 = neighborhoods_Madrid

We rename the columns:

In [13]:
# Translate the Spanish headers to the names used in the rest of the notebook.
# rename() is called without inplace=True on purpose: with inplace=True it
# returns None, and assigning that result would replace the frame with None.
neighborhoods_Madrid = neighborhoods_Madrid.rename(
    columns={'Distrito\n': 'Borough', 'Nombre\n': 'Neighborhood'}
)

In [15]:
neighborhoods_Madrid.head()

Unnamed: 0,Borough,Neighborhood
0,Centro\n,Palacio\n
1,,Embajadores\n
2,,Cortes\n
3,,Justicia\n
4,,Universidad\n


We need to remove the "\n":

In [16]:
# Strip tab/line-break noise from the string cells. Two regex patterns are
# applied, each replaced by the empty string:
#   1. the escape sequences written out literally (a backslash followed by
#      t, n or r);
#   2. the actual tab, newline and carriage-return characters.
# inplace=False returns a new frame, which is assigned back to the same name.
neighborhoods_Madrid = neighborhoods_Madrid.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=False)
neighborhoods_Madrid.head()

Unnamed: 0,Borough,Neighborhood
0,Centro,Palacio
1,,Embajadores
2,,Cortes
3,,Justicia
4,,Universidad


On the Wikipedia page, each borough is a single merged cell spanning all of its neighborhoods, which is why the Borough column contains NaN values. We replace each NaN with the borough from the row above it.

In [17]:
# NOTE(review): this is a second name for the same DataFrame object, not a
# copy.
neighborhoods_Madrid3 = neighborhoods_Madrid

In [18]:
# Carry the last seen value downwards so every row gets its borough.
# DataFrame.ffill() is the direct spelling of fillna(method='ffill'); the
# `method` argument of fillna is deprecated in newer pandas releases.
neighborhoods_Madrid3 = neighborhoods_Madrid3.ffill()
neighborhoods_Madrid3.head()

Unnamed: 0,Borough,Neighborhood
0,Centro,Palacio
1,Centro,Embajadores
2,Centro,Cortes
3,Centro,Justicia
4,Centro,Universidad


In [19]:
neighborhoods_Madrid3.tail()

Unnamed: 0,Borough,Neighborhood
126,Barajas,Alameda de Osuna
127,Barajas,Aeropuerto
128,Barajas,Casco Histórico de Barajas
129,Barajas,Timón
130,Barajas,Corralejos


Finally, we save this data to a CSV file.

In [12]:
# The row index is written too (index=False is not passed), so reading this
# file back yields an extra leading 'Unnamed: 0' column, which a later cell
# deletes.
neighborhoods_Madrid3.to_csv('Madrid_data.csv')

NameError: name 'neighborhoods_Madrid3' is not defined

### Use geopy library to get the latitude and longitude values of Madrid

First, we load Madrid data:

In [15]:
# Reload the cleaned Borough/Neighborhood table saved in the previous step.
neighborhoods_Madrid3=pd.read_csv('Madrid_data.csv') 

In [16]:
# Report the number of distinct boroughs and the number of rows
# (one row per neighborhood).
print(
    'Madrid has {} boroughs and {} neighborhoods.'.format(
        neighborhoods_Madrid3['Borough'].unique().size,
        len(neighborhoods_Madrid3.index),
    )
)

Madrid has 21 boroughs and 131 neighborhoods.


We install the needed libraries

In [22]:
!pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 6.0MB/s ta 0:00:011
[?25hCollecting click (from geocoder)
[?25l  Downloading https://files.pythonhosted.org/packages/d2/3d/fa76db83bf75c4f8d338c2fd15c8d33fdd7ad23a9b5e57eb6c5de26b430e/click-7.1.2-py2.py3-none-any.whl (82kB)
[K     |████████████████████████████████| 92kB 4.9MB/s eta 0:00:011
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Collecting future (from geocoder)
[?25l  Downloading https://files.pythonhosted.org/packages/45/0b/38b06fd9b92dc2b68d58b75f900e97884c45bedd2ff83203d933cf5851c9/future-0.18.2.tar.gz (829kB)
[K     |████████████████████████████████| 829kB 15.5MB/s eta 0:00:01
Building wheel

In [25]:
pip install geocoder

Note: you may need to restart the kernel to use updated packages.


In [2]:
!conda install -c conda-forge geopy --yes 

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          92 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.21.0-py_0



Downloading and Extracting Packages
geopy-1.21.0         | 58 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ##################################### |

In [3]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

Let's find out Madrid's latitude and longitude:

In [4]:
address = 'Madrid'

geolocator = Nominatim(user_agent="madrid_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; stop with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Madrid are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Madrid are 40.4167047, -3.7035825.


Let's check whether we can also get the latitude and longitude of a single borough (here, Centro).

In [5]:
address = 'Centro, Madrid'


# Reuses the geolocator created in the previous cell.
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; stop with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Madrid Centro are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Madrid Centro are 40.417652700000005, -3.7079137662915533.


In [7]:
import geocoder

We create a function that retries the geocoding request until coordinates are returned.

In [8]:
def get_latlng(address, max_attempts=10, pause=1.0):
    """Return ``[latitude, longitude]`` for *address* via the ArcGIS geocoder.

    The query sent is ``'<address>, Madrid, Spain'``. The request is repeated
    while the geocoder returns no coordinates, but only a bounded number of
    times and with a pause in between, so a place that cannot be resolved no
    longer keeps the loop spinning forever.

    Args:
        address: free-text description of the place.
            NOTE(review): ", Madrid, Spain" is appended here, so callers that
            already end the address with ", Madrid" send "Madrid" twice.
        max_attempts: maximum number of geocoding requests to make.
        pause: seconds to wait between two attempts.

    Returns:
        The ``latlng`` value of the first response that has one.

    Raises:
        RuntimeError: if no attempt produced coordinates.
    """
    import time  # local import: not part of the notebook's import cell

    query = '{}, Madrid, Spain'.format(address)
    for attempt in range(1, max_attempts + 1):
        lat_lng_coords = geocoder.arcgis(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        if attempt < max_attempts:
            time.sleep(pause)
    raise RuntimeError(
        'No coordinates found for {!r} after {} attempts'.format(query, max_attempts)
    )
get_latlng('Corralejos, Barajas, Madrid')

[40.465400000000045, -3.6116399999999658]

As seen above, we can get the coordinates of a neighborhood from its address. So we add an Address column by concatenating the Neighborhood and Borough columns and appending "Madrid".

In [21]:
# making copy of team column 
# Take a copy of the Borough column; it is appended to each neighborhood name.
new = neighborhoods_Madrid3["Borough"].copy() 
  
# Build the "Address" column as "<Neighborhood>, <Borough>".
neighborhoods_Madrid3["Address"]= neighborhoods_Madrid3["Neighborhood"].str.cat(new, sep =", ") 
neighborhoods_Madrid3.head()

Unnamed: 0.1,Unnamed: 0,Borough,Neighborhood,Address
0,0,Centro,Palacio,"Palacio, Centro"
1,1,Centro,Embajadores,"Embajadores, Centro"
2,2,Centro,Cortes,"Cortes, Centro"
3,3,Centro,Justicia,"Justicia, Centro"
4,4,Centro,Universidad,"Universidad, Centro"


In [22]:
# Append the city so each address reads "<Neighborhood>, <Borough>, Madrid".
neighborhoods_Madrid3["Address"]=neighborhoods_Madrid3["Address"] + ", Madrid"

In [23]:
neighborhoods_Madrid3.head()

Unnamed: 0.1,Unnamed: 0,Borough,Neighborhood,Address
0,0,Centro,Palacio,"Palacio, Centro, Madrid"
1,1,Centro,Embajadores,"Embajadores, Centro, Madrid"
2,2,Centro,Cortes,"Cortes, Centro, Madrid"
3,3,Centro,Justicia,"Justicia, Centro, Madrid"
4,4,Centro,Universidad,"Universidad, Centro, Madrid"


We can drop the first column.

In [24]:
# 'Unnamed: 0' is the old row index: Madrid_data.csv was written with its
# index, and read_csv loaded that unnamed first column as regular data.
del neighborhoods_Madrid3['Unnamed: 0']
neighborhoods_Madrid3.head()

Unnamed: 0,Borough,Neighborhood,Address
0,Centro,Palacio,"Palacio, Centro, Madrid"
1,Centro,Embajadores,"Embajadores, Centro, Madrid"
2,Centro,Cortes,"Cortes, Centro, Madrid"
3,Centro,Justicia,"Justicia, Centro, Madrid"
4,Centro,Universidad,"Universidad, Centro, Madrid"


Let's add two new columns to the pandas dataframe for the coordinates:

In [25]:
# Placeholder columns filled with empty strings; both are overwritten with
# the geocoded values in a later cell.
neighborhoods_Madrid3["Latitude"] = ""
neighborhoods_Madrid3["Longitude"] = ""
neighborhoods_Madrid3.head()

Unnamed: 0,Borough,Neighborhood,Address,Latitude,Longitude
0,Centro,Palacio,"Palacio, Centro, Madrid",,
1,Centro,Embajadores,"Embajadores, Centro, Madrid",,
2,Centro,Cortes,"Cortes, Centro, Madrid",,
3,Centro,Justicia,"Justicia, Centro, Madrid",,
4,Centro,Universidad,"Universidad, Centro, Madrid",,


We call the defined function for every Address.

In [27]:
# Geocode the addresses one after another, keeping the results in row order.
addresses = neighborhoods_Madrid3['Address']
coords = list(map(get_latlng, addresses.tolist()))

We insert the latitude and longitude of each neighborhood into the pandas dataframe:

In [28]:
# Turn the list of coordinate pairs into a two-column frame.
neighborhoods_Madrid3_coord = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
# Column assignment aligns on the index. NOTE(review): this assumes both
# frames still carry the same default 0..n-1 index — confirm if rows are ever
# filtered or reordered before this cell.
neighborhoods_Madrid3['Latitude'] = neighborhoods_Madrid3_coord['Latitude']
neighborhoods_Madrid3['Longitude'] = neighborhoods_Madrid3_coord['Longitude']
neighborhoods_Madrid3.head()

Unnamed: 0,Borough,Neighborhood,Address,Latitude,Longitude
0,Centro,Palacio,"Palacio, Centro, Madrid",40.40963,-3.87979
1,Centro,Embajadores,"Embajadores, Centro, Madrid",40.39107,-3.69273
2,Centro,Cortes,"Cortes, Centro, Madrid",40.41641,-3.69887
3,Centro,Justicia,"Justicia, Centro, Madrid",40.42446,-3.69672
4,Centro,Universidad,"Universidad, Centro, Madrid",40.42565,-3.70726


We save it in a new CSV file:

In [29]:
# Persist the table with coordinates. The row index is written as well
# (index=False is not passed).
neighborhoods_Madrid3.to_csv('Madrid_data_coord.csv')