# Web scraping

Web scraping is a technique used to extract data from websites. It involves sending HTTP requests to websites, parsing the returned HTML code, and extracting the desired data. Web scraping is a powerful tool for data scientists as it allows them to collect large amounts of data from the web. This data can then be used to train machine learning models, analyse trends, and make informed business decisions.

---
## 1.&nbsp; Import libraries 💾

In [67]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

---
## 2.&nbsp; Beautiful Soup 🍲

[Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) is a Python library that simplifies the process of web scraping. It provides a user-friendly interface for parsing HTML documents, enabling users to extract specific information from websites. Through Beautiful Soup, you can navigate the HTML tree structure, locate elements based on their tags, attributes, and content, and extract the desired data into a structured format.


Beautiful Soup's HTML parser takes the raw, unruly HTML code and transforms it into a neatly organised tree structure, making the information easily accessible and manageable.

In [68]:
from bs4 import BeautifulSoup
from datetime import datetime # to get today's date

In [69]:
#!pip install lat-lon-parser

In [70]:
from lat_lon_parser import parse
# For converting the latitude and longitude to decimal format

In [79]:
def _extract_population(soup):
    """Return the population found in a city's infobox, or None if it is absent.

    Looks for the infobox header cell containing "Population", moves to the
    row that follows the header's row, and converts the first string holding
    digits (thousands separated by commas) to an int.
    """
    # Local import: the notebook does not import ``re`` anywhere else.
    import re

    header = soup.select_one('th.infobox-header:-soup-contains("Population")')
    if header is None:
        return None

    # The figure is read from the row right after the "Population" header row.
    value_row = header.parent.find_next_sibling()
    if value_row is None:
        return None

    figure = value_row.find(string=re.compile(r'\d+'))
    if figure is None:
        return None

    return int(figure.replace(",", ""))


def create_wiki(city_list):
    """Scrape basic facts about each city from its English Wikipedia page.

    Args:
        city_list: Iterable of city names; each name is inserted as-is into
            ``https://en.wikipedia.org/wiki/{city}``.

    Returns:
        A list with one dict per city holding ``city_name``, ``country_name``,
        ``population`` (omitted when the page has no usable population
        entry), ``timestamp`` (scrape date as ``dd.mm.YYYY``), ``latitude``
        and ``longitude`` (decimal degrees rounded to 6 places).

    Raises:
        requests.HTTPError: If Wikipedia answers with an error status.
        IndexError: If the page lacks a heading, an infobox data cell or
            coordinates.
    """
    # Empty list for the dataframe; filled with one dict per city.
    list_for_df = []

    # One timestamp for the whole run so all rows of a batch carry the same date.
    today = datetime.today().strftime("%d.%m.%Y")

    for city in city_list:
        # Wikipedia's universal structure for cities
        url = f"https://en.wikipedia.org/wiki/{city}"

        # Creating the soup for each city. The timeout keeps a stalled
        # connection from hanging the notebook; error responses fail loudly
        # instead of being parsed as if they were city pages.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Empty dictionary to be filled with city information
        dict_for_cities = {}

        dict_for_cities['city_name'] = soup.select('#firstHeading')[0].get_text()

        # Country: text of the first infobox data cell.
        # NOTE(review): assumes the infobox lists the country first — verify
        # when adding new cities.
        dict_for_cities['country_name'] = soup.select('td.infobox-data')[0].get_text(strip=True)

        # Population is optional; the key is left out when it cannot be found.
        population = _extract_population(soup)
        if population is not None:
            dict_for_cities['population'] = population

        dict_for_cities['timestamp'] = today

        # Parsing the coordinate strings to convert them into decimals
        latitude = soup.select('span.latitude')[0].get_text()
        dict_for_cities['latitude'] = round(parse(latitude), 6)
        longitude = soup.select('span.longitude')[0].get_text()
        dict_for_cities['longitude'] = round(parse(longitude), 6)

        list_for_df.append(dict_for_cities)

    return list_for_df

In [80]:
# Cities to scrape; each name is used as the Wikipedia article title
cities = ["Berlin", "Hamburg", "Munich", "London", "Prague", "Istanbul"]
create_wiki(cities)

[{'city_name': 'Berlin',
  'country_name': 'Germany',
  'population': 3596999,
  'timestamp': '15.03.2025',
  'latitude': 52.52,
  'longitude': 13.405},
 {'city_name': 'Hamburg',
  'country_name': 'Germany',
  'population': 1964021,
  'timestamp': '15.03.2025',
  'latitude': 53.55,
  'longitude': 10.0},
 {'city_name': 'Munich',
  'country_name': 'Germany',
  'population': 1510378,
  'timestamp': '15.03.2025',
  'latitude': 48.1375,
  'longitude': 11.575},
 {'city_name': 'London',
  'country_name': 'United Kingdom',
  'population': 8866180,
  'timestamp': '15.03.2025',
  'latitude': 51.507222,
  'longitude': -0.1275},
 {'city_name': 'Prague',
  'country_name': 'Czech Republic',
  'population': 1384732,
  'timestamp': '15.03.2025',
  'latitude': 50.0875,
  'longitude': 14.421389},
 {'city_name': 'Istanbul',
  'country_name': 'Turkey',
  'population': 15701602,
  'timestamp': '15.03.2025',
  'latitude': 41.013611,
  'longitude': 28.955}]

In [81]:
 # make the DataFrame
scraped_rows = create_wiki(cities)  # one dict per city
cities_df = pd.DataFrame(scraped_rows)
cities_df

Unnamed: 0,city_name,country_name,population,timestamp,latitude,longitude
0,Berlin,Germany,3596999,15.03.2025,52.52,13.405
1,Hamburg,Germany,1964021,15.03.2025,53.55,10.0
2,Munich,Germany,1510378,15.03.2025,48.1375,11.575
3,London,United Kingdom,8866180,15.03.2025,51.507222,-0.1275
4,Prague,Czech Republic,1384732,15.03.2025,50.0875,14.421389
5,Istanbul,Turkey,15701602,15.03.2025,41.013611,28.955


In [82]:
#!pip install sqlalchemy
#!pip install pymysql
import sqlalchemy

In [83]:
import pandas as pd

In [84]:
import os
from urllib.parse import quote

schema = "cities"        # Your database name
host = "127.0.0.1"       # Your server (localhost if running locally)
user = "root"            # Your MySQL username
# Never hard-code the password in the notebook: it is read from the
# MYSQL_PASSWORD environment variable (set it before starting Jupyter).
# An unset variable yields an empty password.
password = os.environ.get("MYSQL_PASSWORD", "")
port = 3306              # Default MySQL port

# Create the connection string for SQLAlchemy.
# User and password are percent-escaped so characters such as "@", ":" or "/"
# inside them cannot be mistaken for URL delimiters.
connection_string = (
    f"mysql+pymysql://{quote(user, safe='')}:{quote(password, safe='')}"
    f"@{host}:{port}/{schema}"
)


In [85]:
from sqlalchemy import create_engine
import pandas as pd

# Create the SQLAlchemy engine from the connection URL; it is passed as `con`
# when the cities table is written to the database.
engine = create_engine(connection_string)


In [86]:
# Distinct country names found in the scraped cities, as a one-column table
countries_unique = cities_df['country_name'].unique()
countries_unique_df = pd.DataFrame(countries_unique, columns=['country_name'])
countries_unique_df

Unnamed: 0,country_name
0,Germany
1,United Kingdom
2,Czech Republic
3,Turkey


In [87]:
# Push the unique country names into the `countries` table.
# NOTE: 'append' inserts these rows on every run, so re-running this cell adds
# the same names again unless the table itself rejects duplicates (e.g. a
# UNIQUE constraint on country_name — not visible from this notebook).
countries_unique_df.to_sql('countries',
                           # we want to append, not overwrite
                           if_exists='append',
                           # reuse the shared engine, like the cities upload
                           con=engine,
                           index=False)

4

In [93]:
# Bringing back the country_name info from SQL to Python.
# DISTINCT collapses names that are stored more than once; the shared engine
# is used instead of the raw connection string.
pd.read_sql(
            ''' SELECT DISTINCT country_name
                FROM countries '''
            ,
            con=engine)

Unnamed: 0,country_name
0,Germany
1,United Kingdom
2,Czech Republic
3,Turkey


In [94]:
# Bringing back the whole countries table from SQL to Python (through the
# shared engine), including the country_id column that is used below to link
# cities to countries.
countries_from_sql = pd.read_sql("countries", con=engine)
countries_from_sql

Unnamed: 0,country_id,country_name
0,1,Germany
1,2,United Kingdom
2,3,Czech Republic
3,4,Turkey


In [95]:
# Attach each city's country_id by left-joining the SQL countries table on the
# shared country_name column
cities_table_df = pd.merge(
    cities_df,
    countries_from_sql,
    how="left",
    on="country_name",
)

cities_table_df

Unnamed: 0,city_name,country_name,population,timestamp,latitude,longitude,country_id
0,Berlin,Germany,3596999,15.03.2025,52.52,13.405,1
1,Hamburg,Germany,1964021,15.03.2025,53.55,10.0,1
2,Munich,Germany,1510378,15.03.2025,48.1375,11.575,1
3,London,United Kingdom,8866180,15.03.2025,51.507222,-0.1275,2
4,Prague,Czech Republic,1384732,15.03.2025,50.0875,14.421389,3
5,Istanbul,Turkey,15701602,15.03.2025,41.013611,28.955,4


In [96]:
# country_id now identifies the country, so the name column is dropped
cities_table_df = cities_table_df.drop("country_name", axis="columns")

cities_table_df

Unnamed: 0,city_name,population,timestamp,latitude,longitude,country_id
0,Berlin,3596999,15.03.2025,52.52,13.405,1
1,Hamburg,1964021,15.03.2025,53.55,10.0,1
2,Munich,1510378,15.03.2025,48.1375,11.575,1
3,London,8866180,15.03.2025,51.507222,-0.1275,2
4,Prague,1384732,15.03.2025,50.0875,14.421389,3
5,Istanbul,15701602,15.03.2025,41.013611,28.955,4


In [102]:
# The scraper stores timestamps as day.month.year strings (strftime
# "%d.%m.%Y"). The format is given explicitly so that ambiguous dates such as
# 05.03.2025 are read as 5 March rather than being guessed month-first.
cities_table_df['timestamp'] = pd.to_datetime(cities_table_df['timestamp'],
                                              format="%d.%m.%Y")

datetime64[ns]


In [103]:
# Append the prepared city rows to the `cities` table through the shared engine
cities_table_df.to_sql(name='cities', con=engine, if_exists='append', index=False)

6