# SERP Level Proportions

In [57]:
import pandas as pd
import csv
from collections import Counter
import os
import json
from datetime import datetime
from urllib.parse import urlparse

In [58]:
# United States of America Python Dictionary to translate States,
# Districts & Territories to Two-Letter codes and vice versa.
#
# Canonical URL: https://gist.github.com/rogerallen/1583593
#
# Dedicated to the public domain.  To the extent possible under law,
# Roger Allen has waived all copyright and related or neighboring
# rights to this code.  Data originally from Wikipedia at the url:
# https://en.wikipedia.org/wiki/ISO_3166-2:US
#
# Automatically Generated 2024-10-08 07:45:06 via Jupyter Notebook from
# https://gist.github.com/rogerallen/d75440e8e5ea4762374dfd5c1ddf84e0 

# Lookup table: full name of a US state, district, or territory -> its
# two-letter code. govAffDomains() uses it to convert full state names found
# in its input CSV into abbreviations.
us_state_to_abbrev = {
    # States
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    # Federal district
    "District of Columbia": "DC",
    # Territories
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "Virgin Islands, U.S.": "VI",
}

In [59]:
def govDomains(path):
    """Map each government domain listed in a CSV to its state.

    path: csv file of government domain location information from
      https://github.com/cisagov/dotgov-data/blob/main/current-full.csv
      The file must have 'Domain name' and 'State' columns.

    Returns a dict whose keys are the 'Domain name' cells exactly as they
    appear in the file and whose values are the matching 'State' cell, or
    None when that cell is empty.

    Raises KeyError if a required column is missing.
    """
    info = {}
    # utf-8-sig transparently drops a leading byte-order mark, so the first
    # header is always 'Domain name' whether or not the file was saved with a
    # BOM; newline="" lets the csv module handle line endings itself.
    with open(path, "r", encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            # city, county, and state/local websites all include state info
            # so None indicates that the domain is federal or native and
            # should not be counted as an incorrectly located domain
            info[row['Domain name']] = row['State'] or None

    return info

In [60]:
def govAffDomains(path):
    """Map each government-affiliated domain listed in a CSV to its state.

    path: csv file of government-affiliated domain location information from
      https://github.com/GSA/govt-urls/blob/main/1_govt_urls_full.csv
      The file must have 'Domain name' and 'State' columns.

    Returns a dict whose keys are the 'Domain name' cells exactly as they
    appear in the file. Each value is the 'State' cell converted to its
    two-letter code when it is a full name found in us_state_to_abbrev,
    the cell unchanged when it is not, or None when the cell is empty.

    Raises KeyError if a required column is missing.
    """
    info = {}
    # utf-8-sig drops a leading byte-order mark if one is present, so the
    # first header reads 'Domain name' instead of '\ufeffDomain name' and the
    # lookup works for files saved with or without a BOM.
    with open(path, "r", encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            # local, county, regional, and state websites all include state
            # info so None indicates that the domain is federal, native, or
            # quasigovernmental and should not be counted as an incorrectly
            # located domain
            state = row['State'] or None
            # changing full names in original csv to abbreviation; values
            # that are not full names pass through untouched
            info[row['Domain name']] = us_state_to_abbrev.get(state, state)

    return info

In [61]:
def analyze(resultsPath, govDict, govAffDict):
    """Annotate every search result row with government-domain information.

    resultsPath: csv file of parsed results; each row needs a 'link' column
      (the result URL) and a 'state' column (compared against the domain's
      state).
    govDict: dict mapping government domains to a state, or None.
    govAffDict: dict mapping government-affiliated domains to a state, or
      None. Only consulted when the domain is not in govDict.

    Returns a list with one dict per input row, in file order. Each dict is
    the original row plus four keys:
      'domain'    - host part of 'link' with a leading 'www.' removed
      'gov_type'  - 'government', 'government affiliated', or None
      'gov_state' - state recorded for the domain, or None
      'accuracy'  - 1 if gov_state equals the row's 'state', 0 if it differs,
                    None when there is no gov_state to compare against
    """
    govResults = []

    # load results
    with open(resultsPath, "r", encoding='utf8', newline='') as f:
        for row in csv.DictReader(f):
            # fixing old parsing errors: re-derive the domain from the link
            domain = urlparse(row['link']).netloc
            # Strip only a *leading* 'www.'; a blanket replace would also
            # mangle hosts that merely contain that text, such as
            # 'mywww.example.org'.
            if domain.startswith('www.'):
                domain = domain[len('www.'):]

            if domain in govDict:
                gov_type = 'government'
                gov_state = govDict[domain]  # state associated with the gov domain
            elif domain in govAffDict:
                gov_type = 'government affiliated'
                gov_state = govAffDict[domain]  # state associated with the affiliated domain
            else:
                gov_type = None
                gov_state = None

            # checking to see if location is state-accurate; domains with no
            # state are left unscored rather than counted as wrong
            accuracy = None
            if gov_state:
                accuracy = 1 if gov_state == row['state'] else 0

            row['domain'] = domain
            row['gov_type'] = gov_type
            row['gov_state'] = gov_state
            row['accuracy'] = accuracy
            govResults.append(row)

    return govResults

In [62]:
def run(
    resultsPath="/Users/enistudent/Desktop/spanish-english-audit/spanishCorrectedStates.csv",
    govPath="/Users/enistudent/Desktop/spanish-english-audit/current-full.csv",
    govAffPath="/Users/enistudent/Desktop/spanish-english-audit/gov_associated_urls_full.csv",
    outPath="/Users/enistudent/Desktop/spanish-english-audit/spanLocAccuracy.csv",
):
    """Load the domain lists, annotate the results, and write them to a CSV.

    resultsPath: CSV of parsed organic results
    govPath: CSV of .gov domains
    govAffPath: CSV of government-affiliated domains
    outPath: CSV file to write the annotated results to (overwritten)

    Every parameter defaults to the path originally hard-coded here, so a
    bare run() behaves as before.

    Raises ValueError (from csv.DictWriter) if a result row contains a column
    that is not listed in the output fieldnames.
    """
    # load government sites into dicts
    govDict = govDomains(govPath)
    govAffDict = govAffDomains(govAffPath)

    govResults = analyze(resultsPath, govDict, govAffDict)

    fieldnames = ['date', 'location', 'query', 'computer', 'domain',
        'title', 'org-position', 'link', 'bolded', 'type', 'state', 'gov_type', 'gov_state', 'accuracy']

    # newline='' stops the text layer from translating the csv writer's own
    # '\r\n' terminator, which would otherwise yield '\r\r\n' on Windows.
    with open(outPath, "w", encoding='utf8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(govResults)

Main

In [63]:
def main():
    """Entry point: delegates to run()."""
    run()

In [64]:
# Kick off the analysis when this cell is executed.
main()