# Seed contributors

By Ben Welsh

Seeds a master list of California Civic Data Coalition participants with open-source contributors drawn from the GitHub API. Last harvested on Dec. 18, 2016, [using a Python script that interacts with GitHub's API](https://github.com/california-civic-data-coalition/django-calaccess-raw-data/blob/master/example/network-analysis/contributors.csv).

In [996]:
import pandas as pd
import numpy as np

## Load in the data

In [999]:
# Load the raw contributor table harvested from the GitHub API.
table = pd.read_csv("./input/contributors.csv")

In [1000]:
table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183 entries, 0 to 182
Data columns (total 9 columns):
repo             183 non-null object
login            183 non-null object
name             141 non-null object
email            93 non-null object
company          93 non-null object
location         118 non-null object
bio              27 non-null object
avatar_url       183 non-null object
contributions    183 non-null int64
dtypes: int64(1), object(8)
memory usage: 12.9+ KB


### Clean up strings

In [1001]:
# Swap every missing value for an empty string so that the str.strip calls
# and the `== ''` gap checks later in this notebook only ever see strings.
table.replace(np.nan, "", inplace=True)

In [1002]:
# Trim surrounding whitespace from the text fields used for matching below;
# logins are additionally lowercased.
for field in ("login", "company", "location", "avatar_url"):
    table[field] = table[field].map(str.strip)
table["login"] = table["login"].str.lower()

### Merge in corrections

In [1003]:
# Load the corrections file; it is joined to the table on `login` and
# supplies the corrected_name/company/location/email columns used below.
corrections = pd.read_csv("./input/contributors-corrections.csv")

In [1004]:
# Left join: every contributor row is kept, with NaN in the corrected_*
# columns wherever no correction exists for that login.
table = pd.merge(table, corrections, how="left", on="login")

In [1005]:
# Prefer the corrected value wherever one was supplied; otherwise keep the
# value that came from the harvested data.
for field in ("name", "company", "location", "email"):
    table[field] = table["corrected_" + field].fillna(table[field])

In [1006]:
# The corrected_* helper columns have been folded into the main columns,
# so remove all four in a single call.
table.drop(
    ['corrected_name', 'corrected_company', 'corrected_location', 'corrected_email'],
    axis=1,
    inplace=True,
)

### Merge some common variations

In [1007]:
# Collapse common spelling variants of the same place into one canonical
# "City, ST" (US) or "City, Country" form. The later state/country cells
# rely on that ", <suffix>" shape.
location_variants = {
    'Los Angeles, CA': ['Los Angeles', 'Los Angeles, California'],
    'Washington, DC': ['Washington D.C.', 'District of Columbia', 'Washington, D.C.'],
    'Chicago, IL': ['Chicago'],
    'San Francisco, CA': ['San Francisco'],
    'Palo Alto, CA': ['Palo Alto'],
    'Spokane, WA': ['Spokane, Wash.'],
    'London, UK': ['Hackney, London'],
    'New York, NY': ['Brooklyn', 'Brooklyn NY', 'Brooklyn, NY', 'NYC', 'New York'],
    'Columbia, MO': ['Columbia, Missouri'],
    'Tucson, AZ': ['Tucson, Arizona'],
    'Toronto, Canada': ['Toronto'],
    'Salt Lake City, UT': ['Salt Lake City, Utah'],
    'Houston, TX': ['Houston'],
    # Previously rewritten to 'Houston, TX' by a copy-paste slip, which moved
    # Orange County contributors out of California.
    'Orange County, CA': ['Orange County, Calif.'],
}
for canonical, variants in location_variants.items():
    table.loc[table.location.isin(variants), 'location'] = canonical

In [1008]:
# Drop the article "The " from company names, then fold known variants of
# the same organisation into a single spelling.
table.company = table.company.str.replace("The ", "")
company_renames = [
    ('Sunnmorsposten', 'Sunnmørsposten'),
    ('Wall Street Journal.', 'Wall Street Journal'),
    ('Northwestern University Knight Lab', 'Northwestern'),
    ('Investigative News Network', 'Institute for Nonprofit News'),
    ('Stanford', 'Stanford University'),
    ('Missouri School of Journalism', 'University of Missouri'),
    ('University of Iowa School of Journalism', 'University of Iowa'),
    ('Knight-Mozilla fellow 2015', 'Mozilla OpenNews'),
    ('Knight-Mozilla Fellow', 'Mozilla OpenNews'),
]
for old_name, new_name in company_renames:
    table.loc[table.company == old_name, 'company'] = new_name

### Identify gaps

#### People missing a name 

In [1009]:
len(table[table.name == ''])

13

In [1010]:
sorted(table[table.name == ''].login.unique())

['cecht',
 'jayelle-o',
 'karkinosw',
 'katbuchholz',
 'malon',
 'mb10',
 'mjlorda',
 'mmhirsch',
 'pumadegit',
 'regirob831',
 'samlo78',
 'soorinkimmm',
 'yujiap']

#### People missing a company

In [1011]:
len(table[table.company == ''])

15

In [1012]:
sorted(table[table.company == ''].login.unique())

['cecht',
 'jayelle-o',
 'karkinosw',
 'katbuchholz',
 'malon',
 'mb10',
 'mjlorda',
 'mmhirsch',
 'pumadegit',
 'regirob831',
 'samlo78',
 'soorinkimmm',
 'tocateunvals',
 'vromney',
 'yujiap']

#### People missing a location 

In [1013]:
len(table[table.location == ''])

14

In [1014]:
sorted(table[table.location == ''].login.unique())

['cecht',
 'jayelle-o',
 'karkinosw',
 'katbuchholz',
 'malon',
 'mb10',
 'mjlorda',
 'mmhirsch',
 'pumadegit',
 'regirob831',
 'samlo78',
 'soorinkimmm',
 'vromney',
 'yujiap']

#### People missing an email 

In [1015]:
len(table[table.email == ''])

32

In [1016]:
sorted(table[table.email == ''].login.unique())

['annkiha',
 'burtherman',
 'carloslemos',
 'caseymm',
 'cecht',
 'chagan',
 'danmit',
 'drtortoise',
 'ebonymarieb',
 'elainewong',
 'fagerlise',
 'jayelle-o',
 'jennbrandel',
 'karkinosw',
 'katbuchholz',
 'livlab',
 'malon',
 'mb10',
 'mijebner',
 'mjlorda',
 'mmhirsch',
 'pumadegit',
 'qstin',
 'regirob831',
 'samlo78',
 'soorinkimmm',
 'tocateunvals',
 'vromney',
 'yhy6f',
 'yujiap']

## Output unique list

In [1017]:
# One output row per distinct person: group on every descriptive column and
# add up that person's contributions across all repos.
columns = ["login", "name", "email", "company", "location", "bio", "avatar_url"]
unique_contributors = (
    table
    .groupby(columns, as_index=False)["contributions"]
    .sum()
)

In [1018]:
# Logins flagged as coalition members; everyone else gets False.
login_list = [
    'palewire', 'gordonje', 'sahilchinoy', 'aboutaaron',
    'armendariz', 'cephillips', 'jlagetz',
]
is_member = unique_contributors['login'].isin(login_list)
unique_contributors['in_coalition'] = is_member

### California v. everybody

In [1019]:
# A location counts as Californian when it ends with the ", CA" suffix.
ends_in_ca = unique_contributors['location'].str.endswith(", CA")
unique_contributors['in_california'] = False
unique_contributors.loc[ends_in_ca, 'in_california'] = True

### Count the different states and countries

In [1020]:
# Classify each location by the text after its final ", ":
#   - empty location           -> unknown (NaN)
#   - two-character suffix     -> treated as a US state code -> True
#   - three or more characters -> treated as a country name  -> False
# Raw strings keep `\w` from being read as an (invalid) string escape, and
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# NOTE(review): two-letter non-US suffixes also match the "state" rule --
# e.g. the 'London, UK' value produced by the location clean-up above ends up
# flagged in_usa=True. Confirm whether that needs special-casing.
unique_contributors.loc[unique_contributors.location == '', 'in_usa'] = np.nan
unique_contributors.loc[unique_contributors.location.str.contains(r", \w{2}$"), 'in_usa'] = True
unique_contributors.loc[unique_contributors.location.str.contains(r", \w{3,}$"), 'in_usa'] = False

In [1021]:
def split_state(val):
    """
    Return the two-character code that follows the city in a location string.

    The location is expected to look like "City, ST". The piece after the
    first ", " is returned when it is exactly two characters long.

    Returns NaN when the value is missing (None, NaN or an empty string),
    when it has no ", " separator, or when the piece after the separator is
    not two characters long (e.g. "Toronto, Canada").
    """
    # NaN never compares equal to anything -- including itself -- so an
    # `== np.nan` test can never succeed; `val != val` is the reliable check.
    if val is None or val != val or val == "":
        return np.nan
    parts = val.split(", ")
    # A bare place name with no ", " has no parent region to report.
    if len(parts) < 2:
        return np.nan
    parent = parts[1]
    if len(parent) == 2:
        return parent
    return np.nan

In [1022]:
# Derive the state column element-by-element from the cleaned location
unique_contributors['state'] = unique_contributors.location.map(split_state)

In [1023]:
def split_country(val):
    """
    Return the country implied by a "City, Parent" location string.

    The piece after the first ", " decides the result:
      - exactly two characters -> "United States of America" (treated as a
        US state code)
      - more than two characters -> that piece itself, taken as the country
      - shorter than two characters -> NaN

    Returns NaN when the value is missing (None, NaN or an empty string) or
    has no ", " separator.
    """
    # NaN never compares equal to anything -- including itself -- so an
    # `== np.nan` test can never succeed; `val != val` is the reliable check.
    if val is None or val != val or val == "":
        return np.nan
    parts = val.split(", ")
    # A bare place name with no ", " carries no country information.
    if len(parts) < 2:
        return np.nan
    parent = parts[1]
    if len(parent) == 2:
        return "United States of America"
    if len(parent) > 2:
        return parent
    return np.nan

In [1024]:
# Derive the country column element-by-element from the cleaned location
unique_contributors['country'] = unique_contributors.location.map(split_country)

## Output data

In [1025]:
# Write the de-duplicated participant list. No `index` argument is passed, so
# pandas' default applies and the row index is written as the first column.
unique_contributors.to_csv("./output/participants.csv")