In [1]:
## importing the necessary libraries
## pandas - for creating dataframe for the collected data
## requests - for sending http request to the webpage we want to scrape
## bs4 - for parsing the chosen webpage
## gender_guesser - for guessing genders among the names we extracted

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import gender_guesser.detector as gender

In [2]:
## setting up gender_guesser: one shared Detector instance, used by gov_gender()
## to classify each extracted name via d.get_gender(name)
d = gender.Detector()

In [3]:
## Hungarian Wikipedia pages to scrape, one per cabinet, in the order
## the later cells rely on (positions 0-8 of gov_list follow this order)

url_list = [
    "https://hu.wikipedia.org/wiki/Antall-korm%C3%A1ny",
    "https://hu.wikipedia.org/wiki/Horn-korm%C3%A1ny",
    "https://hu.wikipedia.org/wiki/Els%C5%91_Orb%C3%A1n-korm%C3%A1ny",
    "https://hu.wikipedia.org/wiki/Medgyessy-korm%C3%A1ny",
    "https://hu.wikipedia.org/wiki/Els%C5%91_Gyurcs%C3%A1ny-korm%C3%A1ny",
    "https://hu.wikipedia.org/wiki/M%C3%A1sodik_Gyurcs%C3%A1ny-korm%C3%A1ny",
    "https://hu.wikipedia.org/wiki/M%C3%A1sodik_Orb%C3%A1n-korm%C3%A1ny",
    "https://hu.wikipedia.org/wiki/Harmadik_Orb%C3%A1n-korm%C3%A1ny",
    "https://hu.wikipedia.org/wiki/Negyedik_Orb%C3%A1n-korm%C3%A1ny",
]

In [4]:
## every DataFrame produced by gov_gender() is also collected here, in call order
gov_list = []


def _given_name(full_name):
    """Return the second whitespace-separated token of ``full_name``.

    Falls back to the only token when the name has a single word, so that the
    person still counts as a cabinet member instead of raising IndexError.
    """
    parts = full_name.split()
    return parts[1] if len(parts) > 1 else parts[0]


def gov_gender(url, timeout=30):
    """Scrape one cabinet page and guess the gender of every listed member.

    The page is downloaded, its first ``table`` with class ``wikitable`` is
    located, and the text of the first ``td`` of each row is read. Rows
    without a ``td`` and rows whose first ``td`` contains a ``b`` tag are
    skipped, as are cells with no text. From each remaining name the second
    token is kept (the notebook treats it as the person's first name) and
    passed to the module-level detector ``d``.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        Two columns: one named after the text of the first ``th`` in the
        table's first row (holding the extracted names) and ``"Genders"``
        (holding the detector's answer for each name).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no ``wikitable`` table or the table has no header cell.

    Side effects
    ------------
    Appends the returned DataFrame to the module-level ``gov_list``.
    """
    ## sending request for parsing; fail loudly on timeouts and HTTP errors
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    ## using bs4 to be able to parse the webpage code
    soup = bs(response.content, "html.parser")

    ## locating the table that lists the cabinet members
    table = soup.find("table", class_="wikitable")
    if table is None:
        raise ValueError(f"no 'wikitable' table found at {url}")
    rows = table.find_all("tr")

    ## the header text of the table becomes the name of the names column
    header = rows[0].find("th") if rows else None
    if header is None:
        raise ValueError(f"the 'wikitable' table at {url} has no header cell")
    col_name = header.get_text(" ", strip=True)

    ## first cell of every data row, skipping rows whose first cell holds bold text
    cell_texts = [
        row.td.get_text(" ", strip=True)
        for row in rows
        if row.td is not None and not row.td.find("b")
    ]
    ## an empty cell is not a person, so it must not count towards the cabinet size
    names = [text for text in cell_texts if text]
    given_names = [_given_name(name) for name in names]

    ## guessing the genders among the collected first names
    ## NOTE(review): get_gender also accepts a country argument; passing the
    ## Hungarian one may improve accuracy here - confirm before changing results
    genders = [d.get_gender(name) for name in given_names]

    df = pd.DataFrame({col_name: given_names, "Genders": genders})

    ## adding the created df to the gov_list
    gov_list.append(df)

    return df

In [5]:
## creating our dataframes

## gov_gender() appends to gov_list on every call, so the list is emptied first;
## otherwise re-running this cell would store every cabinet twice and shift the
## positions that the summary cells below depend on
gov_list.clear()

for url in url_list:
    gov_gender(url)

In [6]:
## per-cabinet summary - women in Hungarian administrations 1990-2022

## (start, end) year increments applied after the cabinet at the given position
## in gov_list; after every other cabinet both years simply advance by four
irregular_steps = {2: (4, 2), 3: (2, 2), 4: (2, 4)}

time_span, gender_nums, gov_num, sum_w = [], [], [], []
start, end = 1990, 1994

for i, df in enumerate(gov_list):
    gov_len = len(df)
    sum_female = (df["Genders"] == "female").sum()

    time_span.append(f"{start}-{end}")
    gender_nums.append("{:.0%}".format(sum_female / gov_len))
    gov_num.append(gov_len)
    sum_w.append(sum_female)

    ## move the term window forward for the next cabinet
    start_step, end_step = irregular_steps.get(i, (4, 4))
    start += start_step
    end += end_step

gdf = pd.DataFrame({
    "Term": time_span,
    "Women proportion": gender_nums,
    "Cabinet size": gov_num,
    "Number of women": sum_w,
})

In [7]:
## display the per-cabinet summary table built in the previous cell
gdf

Unnamed: 0,Term,Women proportion,Cabinet size,Number of women
0,1990-1994,3%,37,1
1,1994-1998,4%,27,1
2,1998-2002,3%,30,1
3,2002-2004,15%,26,4
4,2004-2006,9%,22,2
5,2006-2010,15%,33,5
6,2010-2014,6%,16,1
7,2014-2018,0%,16,0
8,2018-2022,19%,16,3


In [8]:
## The Wikipedia pages on US cabinet members do not separate consecutive terms of the same party,
## so for an easier comparison the consecutive same-party Hungarian cabinets are merged here, too.

def _merged_totals(positions):
    """Return (number of women, total cabinet size) summed over the given gov_list positions."""
    cabinets = [gov_list[pos] for pos in positions]
    women = sum((cab["Genders"] == "female").sum() for cab in cabinets)
    size = sum(len(cab) for cab in cabinets)
    return women, size

longspan1, longspan2 = "2002-2010", "2010-2022"

sum_female1, gov_len1 = _merged_totals((3, 4, 5))
sum_female2, gov_len2 = _merged_totals((6, 7, 8))

prop1 = "{:.0%}".format(sum_female1 / gov_len1)
prop2 = "{:.0%}".format(sum_female2 / gov_len2)

adf = pd.DataFrame({
    "Term": [longspan1, longspan2],
    "Women proportion": [prop1, prop2],
    "Cabinet size": [gov_len1, gov_len2],
    "Number of women": [sum_female1, sum_female2],
})

In [9]:
## display the summary table of the merged same-party terms
adf

Unnamed: 0,Term,Women proportion,Cabinet size,Number of women
0,2002-2010,14%,81,11
1,2010-2022,8%,48,4
