<div style="display: flex; align-items: center;">
    <img src="../figures/Mines Geophysics Black Moon Circle Waves 3.3.png" alt="Example Image" width="10%">
    <div style="margin-left: 10px;">
        <h1>GP100 Alumni Map</h1>
        <h2>Data Analysis</h2>
    </div>
</div>

### Loading the Data

In [12]:
# import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests

In [13]:
# load data into a dataframe
path = '../data/GP_Alumni_List.csv'
df = pd.read_csv(path)

### Cleaning the Data

In [14]:
# clean the dataframe

#rename StateOrProvince to State
df.rename(columns={'StateOrProvince': 'State'}, inplace=True)

#delete rows if Affiliation is not alumni
df = df[df['Affiliation'] == 'Alumni']

#clean Degree column to include only degree level
validDegrees = ['BSc','MSc','PhD']
df['Degrees'] = df['Degrees'].fillna('').apply(lambda x: [deg.strip() for deg in x.split() if deg.strip() in validDegrees])

#split people with multiple degrees into separate rows
df = df.explode('Degrees')

#replace NaN in country column with United States or Canada based on StateorProvince
usStates = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE',
                                   'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS',
                                   'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS',
                                   'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
                                   'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
                                   'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV',
                                   'WI', 'WY','PR']
df['Country'] = df.apply(lambda row: 'United States' if row['State'] in usStates else row['Country'], axis=1)

#remove province from Canada
df['State'] = np.where(df['Country'] == 'Canada', '', df['State'])

df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,Affiliation,PrefClassYear,Degrees,State,Country,Latitude,Longitude
0,Alumni,1983.0,BSc,CO,United States,39.783730,-100.445882
1,Alumni,1995.0,BSc,CO,United States,39.783730,-100.445882
2,Alumni,2013.0,MSc,,Bahrain,26.155125,50.534461
3,Alumni,1999.0,MSc,,United Kingdom,54.702354,-3.276575
4,Alumni,1982.0,BSc,OK,United States,39.783730,-100.445882
...,...,...,...,...,...,...,...
2225,Alumni,2012.0,BSc,CO,United States,39.783730,-100.445882
2226,Alumni,1982.0,BSc,TX,United States,39.783730,-100.445882
2227,Alumni,1981.0,BSc,UT,United States,39.783730,-100.445882
2228,Alumni,2019.0,BSc,CO,United States,39.783730,-100.445882


In [15]:
df.to_csv('../data/GP_Alumni_List_Cleaned.csv', index=False)

In [16]:
#for the purposes of plotting, we need to assign each location a longitude and latitude
'''reference:
https://gis.stackexchange.com/questions/212796/getting-latlon-extent-of-country-by-its-name-using-python

def get_boundingbox_country(country, output_as='boundingbox'):
    """
    get the bounding box of a country in EPSG4326 given a country name

    Parameters
    ----------
    country : str
        name of the country in english and lowercase
    output_as : 'str
        chose from 'boundingbox' or 'center'. 
         - 'boundingbox' for [latmin, latmax, lonmin, lonmax]
         - 'center' for [latcenter, loncenter]

    Returns
    -------
    output : list
        list with coordinates as str
    """
    # create url
    url = '{0}{1}{2}'.format('http://nominatim.openstreetmap.org/search?country=',
                             country,
                             '&format=json&polygon=0')
    response = requests.get(url).json()[0]

    # parse response to list
    if output_as == 'boundingbox':
        lst = response[output_as]
        output = [float(i) for i in lst]
    if output_as == 'center':
        lst = [response.get(key) for key in ['lat','lon']]
        output = [float(i) for i in lst]
    return output
'''

'reference:\nhttps://gis.stackexchange.com/questions/212796/getting-latlon-extent-of-country-by-its-name-using-python\n\ndef get_boundingbox_country(country, output_as=\'boundingbox\'):\n    """\n    get the bounding box of a country in EPSG4326 given a country name\n\n    Parameters\n    ----------\n    country : str\n        name of the country in english and lowercase\n    output_as : \'str\n        chose from \'boundingbox\' or \'center\'. \n         - \'boundingbox\' for [latmin, latmax, lonmin, lonmax]\n         - \'center\' for [latcenter, loncenter]\n\n    Returns\n    -------\n    output : list\n        list with coordinates as str\n    """\n    # create url\n    url = \'{0}{1}{2}\'.format(\'http://nominatim.openstreetmap.org/search?country=\',\n                             country,\n                             \'&format=json&polygon=0\')\n    response = requests.get(url).json()[0]\n\n    # parse response to list\n    if output_as == \'boundingbox\':\n        lst = response[o

In [18]:
def get_boundingbox_country(country, output_as='boundingbox'):
    """
    get the bounding box of a country in EPSG4326 given a country name

    Parameters
    ----------
    country : str
        name of the country in English and lowercase
    output_as : str
        choose from 'boundingbox' or 'center'. 
         - 'boundingbox' for [latmin, latmax, lonmin, lonmax]
         - 'center' for [latcenter, loncenter]

    Returns
    -------
    output : list
        list with coordinates as str
    """
    # Check if country name is not NaN
    if isinstance(country, str) or isinstance(country, np.float64):
        if isinstance(country, np.float64):
            country = str(country)  # Convert NaN float to string
        
        # Create URL
        url = 'http://nominatim.openstreetmap.org/search?country=' + country + '&format=json&polygon=0'
        
        try:
            response = requests.get(url).json()
            if response:
                if output_as == 'boundingbox':
                    lst = response[0][output_as]
                    output = [float(i) for i in lst]
                elif output_as == 'center':
                    lst = [response[0].get(key) for key in ['lat','lon']]
                    output = [float(i) for i in lst]
                else:
                    print("Error: Invalid output format specified")
                    return None
                return output
            else:
                print(f"Error: Unable to find coordinates for {country}")
                return None
        except Exception as e:
            print(f"Error: {e}")
            return None
    else:
        print(f"Error: Invalid country name: {country}")
        return None

# count unique states, provinces, and countries
uniqueStates = df['State'].unique()
uniqueCountries = df['Country'].unique()

# create a dictionary to store country centers
countryCenters = {}

# get latitude and longitude centers for each country
for country in uniqueCountries:
    center = get_boundingbox_country(country, output_as='center')
    if center:
        countryCenters[country] = center

#manually add some latitudes and longtitudes
countryCenters['Taiwan, Province of China'] = ['23.6978','120.9605']
countryCenters['Tanzania, United Republic Of'] = ['-6.3690','34.8888']

# add latitude and longitude centers to the df
df['Latitude'] = df['Country'].map(lambda x: countryCenters.get(x, [None, None])[0])
df['Longitude'] = df['Country'].map(lambda x: countryCenters.get(x, [None, None])[1])

# save updated df to GP_Alumni_List_Cleaned.csv
df.to_csv(path, index=False)

Error: Invalid country name: nan
Error: Unable to find coordinates for Tanzania, United Republic Of
Error: Unable to find coordinates for Taiwan, Province Of China


In [10]:
# actual states in the data, includes Puerto Rico (PR) and Armed Forces Americas (AA) , 'PR', 'AA'
states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 
          'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 
          'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 
          'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 
          'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 
          'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 
          'WI', 'WY']

states = {'AL': (),
          'AK': (),
          'AZ': (),
          'AR': (),
          'CA': (),
          'CO': (),
          'CT': (),
          'DE': (),
          'FL': (),
          'GA': (),
          'HI': (),
          'ID': (),
          'IL': (),
          'IN': (),
          'IA': (),
          'KS': (),
          'KY': (),
          'CO': (),
          'LA': (),
          'ME': (),
          'MD': (),
          'MA': (),
          'MI': (),
          'MN': (),
          'MS': (),
          'MO': (),
          'MT': (),
          'NE': (),
          'NV': (),
          'NH': (),
          'NJ': (),
          'NM': (),
          'NY': (),
          'NC': (),
          'ND': (),
          'OH': (),
          'OK': (),
          'OR': (),
          'PA': (),
          'RI': (),
          'SC': (),
          'SD': (),
          'TN': (),
          'TX': (),
          'UT': (),
          'VT': (),
          'VA': (),
          'WA': (),
          'WV': (),
          'WI': (),
          'WY': (),
          'PR': (),
          'AA': ()}

GeocoderUnavailable: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=AL&format=json&limit=1 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1000)')))