# In [None]:
# requirements:
# functions-framework
# pandas
# requests
# bs4
# sqlalchemy
# pymysql

import functions_framework
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import sqlalchemy
import pymysql

@functions_framework.http
def insert_population_parameters(request):
    """HTTP entry point that refreshes the population table.

    Builds the database connection string with ``connection()`` and hands it
    to ``insert_population_data_in_SQL``. The incoming ``request`` object is
    accepted but not read.

    Returns:
        The literal string ``"All Ok"`` once the update call has returned.
    """
    insert_population_data_in_SQL(connection())
    return "All Ok"

def connection(
    *,
    schema="gans_data_pipeline",
    host="127.0.0.1",
    user="root",
    password="password",
    port=3306,
):
    """Build the SQLAlchemy URL for the MySQL database (PyMySQL driver).

    Called without arguments it returns the same URL as before. The defaults
    are placeholders; override them (or edit them) with your own settings.

    Args:
        schema: Name of the MySQL database (schema).
        host: Address of the MySQL / Cloud SQL instance.
        user: MySQL username.
        password: MySQL password.
        port: MySQL port.

    Returns:
        A string of the form
        ``mysql+pymysql://<user>:<password>@<host>:<port>/<schema>``.
    """
    # Imported locally so this function does not depend on a file-level import.
    from urllib.parse import quote_plus

    # Characters such as "@", "/" or ":" inside the credentials would be read
    # as URL delimiters, so the user and password are percent-encoded.
    safe_user = quote_plus(user)
    safe_password = quote_plus(password)
    return f"mysql+pymysql://{safe_user}:{safe_password}@{host}:{port}/{schema}"
    
# Matches the first standalone four-digit number, used as the census year.
_YEAR_PATTERN = re.compile(r"\b\d{4}\b")
# Matches a run of digits with optional thousands separators, e.g. "8,804,190".
_COUNT_PATTERN = re.compile(r"\d[\d,]*")
# Columns written to (and compared against) the "population" table.
_POPULATION_COLUMNS = ["city_id", "population", "year"]


def _extract_year(text):
    """Return the first four-digit year found in ``text`` as an int, else None."""
    match = _YEAR_PATTERN.search(text)
    return int(match.group()) if match else None


def _scrape_city_population(city):
    """Scrape population and year for ``city`` from its English Wikipedia page.

    Returns a dict with the keys "city", "population" and "year", or None when
    the page cannot be fetched or the expected infobox entries are missing.
    """
    page_name = city.replace(" ", "_")  # Wikipedia uses "_" for spaces, e.g. New_York
    url = f"https://en.wikipedia.org/wiki/{page_name}"

    try:
        # A timeout keeps one unresponsive request from hanging the whole run.
        response = requests.get(url, timeout=10)
    except requests.RequestException as error:
        print(f"Request for {city} failed: {error}")
        return None

    if response.status_code != 200:
        print(f"Skipping {city}: HTTP status {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    infobox = soup.find(class_="infobox ib-settlement vcard")
    label = infobox.find(string="Population") if infobox is not None else None
    if label is None:
        print(f"Skipping {city}: no population entry in the infobox")
        return None

    # The figure is in the first table cell after the "Population" label.
    value_cell = label.find_next("td")
    count = _COUNT_PATTERN.search(value_cell.get_text()) if value_cell is not None else None
    if count is None:
        print(f"Skipping {city}: population figure not found")
        return None

    # The year is read from the node directly following the label text.
    year_node = label.next_sibling
    year = _extract_year(year_node.get_text()) if year_node is not None else None
    if year is None:
        print(f"Skipping {city}: No year found")
        return None

    return {
        "city": city,
        "population": int(count.group().replace(",", "")),
        "year": year,
    }


def _select_new_rows(candidates, existing, key_columns):
    """Return the rows of ``candidates`` whose key values are absent from ``existing``.

    Both frames are compared on ``key_columns`` only, after numeric coercion,
    so extra columns in ``existing`` and int/float/str differences between the
    two frames do not hide a match.
    """
    def row_keys(frame):
        numeric = frame[key_columns].apply(pd.to_numeric, errors="coerce")
        return list(numeric.itertuples(index=False, name=None))

    known = set(row_keys(existing))
    is_new = pd.Series(
        [key not in known for key in row_keys(candidates)],
        index=candidates.index,
        dtype=bool,
    )
    return candidates.loc[is_new]


def insert_population_data_in_SQL(connection_string):
    """Scrape population figures for every stored city and append the new ones.

    Reads the "cities" table, scrapes each distinct city name from Wikipedia,
    maps the results back to ``city_id`` and appends to the "population" table
    only those (city_id, population, year) rows that are not already there.
    Cities whose page or infobox cannot be read are skipped with a message.

    Args:
        connection_string: SQLAlchemy database URL passed to pandas.

    Returns:
        None. Prints "There is no new population information" when nothing
        is appended.
    """
    cities_from_sql = pd.read_sql("cities", con=connection_string)

    # Each distinct name is fetched once, even if it occurs in several rows.
    cities_population = []
    for city in cities_from_sql["city"].dropna().unique():
        record = _scrape_city_population(city)
        if record is not None:
            cities_population.append(record)

    if not cities_population:
        # An empty frame has no "city" column and could not be merged.
        print("There is no new population information")
        return

    # Inner join: cities without scraped data produce no row at all, so no
    # NULL population rows reach the table.
    merged_population = cities_from_sql.merge(
        pd.DataFrame(cities_population),
        on="city",
        how="inner",
    )
    population_df = merged_population[_POPULATION_COLUMNS]

    population_from_sql = pd.read_sql("population", con=connection_string)
    new_rows = _select_new_rows(population_df, population_from_sql, _POPULATION_COLUMNS)

    if new_rows.empty:
        print("There is no new population information")
        return

    # Only rows that are not stored yet are appended; existing rows are never
    # re-inserted.
    new_rows.to_sql(
        "population",
        if_exists="append",
        con=connection_string,
        index=False,
    )
