# Import


In [1]:
## importing the necessary libraries
## pandas - for creating dataframe for the collected data
## requests - for sending http request to the webpage we want to scrape
## bs4 - for parsing the chosen webpage
## gender_guesser - for guessing genders among the names we extracted

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import gender_guesser.detector as gender

# URL list

In [3]:
## Wikipedia "Presidency of ..." pages we are aiming to parse - Timespan 1989-2025
## (kept in chronological order)

_WIKI_PRESIDENCY_PREFIX = "https://en.wikipedia.org/wiki/Presidency_of_"

url_list = [
    _WIKI_PRESIDENCY_PREFIX + president
    for president in (
        "George_H._W._Bush",
        "Bill_Clinton",
        "George_W._Bush",
        "Barack_Obama",
        "Donald_Trump",
        "Joe_Biden",
    )
]

# Scraping data

In [4]:
## shared container: each scraping call appends its dataframe here
gov_list = []

## gender_guesser detector, created once and reused for every first-name lookup
d = gender.Detector()

def gov_gender(url, header_row=1):
    """Scrape the infobox table of a presidency page and guess each member's gender.

    The page is downloaded, the infobox table with the expected inline style is
    located, and the text of the first ``th`` cell of every row that has one is
    collected. Only the first word of each entry is kept (the first name) and is
    passed to the module-level gender detector ``d``.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    header_row : int, default 1
        Index of the table row whose second ``th`` cell supplies the name of the
        first-name column.

    Returns
    -------
    pandas.DataFrame
        Two columns: the scraped column name (first names) and ``"Genders"``.
        The same dataframe is also appended to the module-level ``gov_list``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If no matching infobox table is present on the page.
    """
    ## sending request for parsing; the timeout keeps a stalled connection from
    ## hanging the notebook and HTTP errors are reported here, not as a parse failure
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    ## using bs4 to be able to parse the webpage code
    soup = bs(response.content, "html.parser")

    ## the table is matched on its exact inline style, so any change in the page
    ## markup makes the lookup return None - fail with a clear message in that case
    table = soup.find("table", class_="infobox", style="width:auto;text-align:left;line-height:1.2em;margin-left:1em; margin-right:0; float:right; clear:right;")
    if table is None:
        raise ValueError("infobox table not found on {}".format(url))

    rows = table.find_all("tr")

    ## column name comes from the second header cell of the chosen header row
    col_name = rows[header_row].find_all("th")[1].get_text(" ", strip=True)

    ## keep the first <th> text of every row that has one and skip the first two
    ## entries (presumably the table title and header rows - TODO confirm)
    names = [row.th.get_text(" ", strip=True) for row in rows if row.th is not None][2:]
    first_names = [name.split(" ")[0] for name in names]

    ## guessing the genders among the collected first names
    genders = [d.get_gender(name) for name in first_names]

    df = pd.DataFrame({col_name: first_names, "Genders": genders})

    gov_list.append(df)

    return df

In [5]:
## exactly the same method with one small adjustment (the header row index)

def gov_gender_p(url, header_row=2):
    """Scrape a presidency page whose infobox header sits one row lower.

    Works like ``gov_gender`` except that the column name is read from row
    ``header_row`` (2 by default) of the infobox table instead of row 1.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    header_row : int, default 2
        Index of the table row whose second ``th`` cell supplies the name of the
        first-name column.

    Returns
    -------
    pandas.DataFrame
        Two columns: the scraped column name (first names) and ``"Genders"``.
        The same dataframe is also appended to the module-level ``gov_list``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If no matching infobox table is present on the page.
    """
    ## the timeout keeps a stalled connection from hanging the notebook and
    ## HTTP errors are reported here rather than as a later parse failure
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = bs(response.content, "html.parser")

    ## the table is matched on its exact inline style; a markup change makes the
    ## lookup return None, so fail with a clear message in that case
    table = soup.find("table", class_="infobox", style="width:auto;text-align:left;line-height:1.2em;margin-left:1em; margin-right:0; float:right; clear:right;")
    if table is None:
        raise ValueError("infobox table not found on {}".format(url))

    rows = table.find_all("tr")

    ## the structure of the parsed part of these pages differs slightly - the
    ## column name is taken from rows[2] by default instead of rows[1]
    col_name = rows[header_row].find_all("th")[1].get_text(" ", strip=True)

    ## keep the first <th> text of every row that has one and skip the first two
    ## entries (presumably the table title and header rows - TODO confirm)
    names = [row.th.get_text(" ", strip=True) for row in rows if row.th is not None][2:]

    ## the first word of each entry is the first name, which is what the
    ## gender detector is fed with
    first_names = [name.split(" ")[0] for name in names]

    genders = [d.get_gender(name) for name in first_names]

    df = pd.DataFrame({col_name: first_names, "Genders": genders})

    gov_list.append(df)

    return df

In [6]:
## creating our dataframes

## both scraping functions append to gov_list, so empty it first - otherwise
## re-running this cell would leave duplicate dataframes in the list
gov_list.clear()

## positions in url_list (3 = Obama, 5 = Biden) whose infobox needs the
## gov_gender_p variant, which reads the column name from a different row
alt_layout_positions = {3, 5}

for i, url in enumerate(url_list):
    scrape = gov_gender_p if i in alt_layout_positions else gov_gender
    scrape(url)

# Women in USA Administrations 1989-2025 

Using the previously created dataframes to answer the question we set out with: how many women served in USA administrations between 1989 and 2025?

In [7]:
## Term labels of the scraped administrations, in the same order as url_list
## (George H. W. Bush, Clinton, George W. Bush, Obama, Trump, Biden).
time_span = ["1989-1993", "1993-2001", "2001-2009", "2009-2017", "2017-2021", "2021-2025"]

## every term needs exactly one scraped dataframe; a mismatch means gov_list is
## stale (e.g. the scraping cell was run more than once or only partially)
if len(gov_list) != len(time_span):
    raise ValueError(
        "expected {} cabinet dataframes in gov_list, found {}; "
        "re-run the scraping cell on a fresh kernel".format(len(time_span), len(gov_list))
    )

## number of people collected for each administration
gov_num = [len(cabinet) for cabinet in gov_list]

## number of first names the detector labelled exactly "female"
## NOTE(review): other labels the detector may return (e.g. "mostly_female")
## are not counted here - confirm this is intended
sum_w = [(cabinet["Genders"] == "female").sum() for cabinet in gov_list]

## share of women per administration, formatted as a whole-number percentage
gender_nums = ["{:.0%}".format(women / size) for women, size in zip(sum_w, gov_num)]

gdf = pd.DataFrame({
    "Term": time_span,
    "Women proportion": gender_nums,
    "Cabinet size": gov_num,
    "Number of women": sum_w,
})

In [8]:
## display the summary table (one row per presidential term)
gdf

Unnamed: 0,Term,Women proportion,Cabinet size,Number of women
0,1989-1993,11%,27,3
1,1993-2001,17%,59,10
2,2001-2009,10%,49,5
3,2009-2017,30%,54,16
4,2017-2021,14%,43,6
5,2021-2025,42%,26,11
