# Dunham's Data

## Setting up

In [1]:
import numpy as np
import pandas as pd
import xlrd

In [2]:
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

## Loading data

### Checkins

In [3]:
checkins_df = pd.read_csv('data/out/checkins.csv', index_col=0)

In [4]:
checkins_df.columns

Index(['Source', 'Source Type', 'Notes', 'confidence',
       'Comprehensive Check In', 'Aikens, Vanoye', 'Alexander, James',
       'Aul, Ronnie', 'Bradley, Wilbert', 'Brooks, Jay', 'Burton, Miriam',
       'Canto, Umberto', 'Clay, Eddy', 'Destine, Jean Leon', 'Ellis, Lucille',
       'Frasier, Awilda', 'Gomez, Tommy'],
      dtype='object')

In [5]:
N = 5
len(checkins_df.columns[N:])

12

### Personnel

In [6]:
personnel_df = pd.read_excel('data/src/DunhamsData_PersonnelAttributes_DatasetSample.xlsx')
personnel_df.fillna('', inplace=True)

In [7]:
personnel_df.shape

(12, 2)

In [8]:
personnel_df.head()

Unnamed: 0,Name,public akas
0,"Aikens, Vanoye",Van Aikens
1,"Alexander, James",
2,"Aul, Ronnie",Ronne Aul
3,"Bradley, Wilbert",
4,"Brooks, Jay",


In [9]:
personnel_df.tail()

Unnamed: 0,Name,public akas
7,"Clay, Eddy",
8,"Destine, Jean Leon",
9,"Ellis, Lucille",
10,"Frasier, Awilda",
11,"Gomez, Tommy",Tommy Gomez Woosley; Thomas Woosley


## Pre-processing data

In [10]:
set(checkins_df.columns[N:]) == set(personnel_df.Name.unique())

True

In [11]:
def format_fullname(name):
    assert(',' not in name or ', ' in name)
    if ', ' in name:
        last, first = name.split(', ')
        r = first + ' ' + last
    else:
        r = name
    return r

## Processing data

In [12]:
def str2set(s):
    return set(x.strip() for x in s.split(';') if x.split())

def set2str(s):
    return ' / '.join(s)

In [13]:
public_akas_df = pd.DataFrame(columns=['SCREEN NAME', 'PUBLIC AKAS'])
for i, row in personnel_df.iterrows():
    name = row['Name']
    akas = row['public akas']
    screen_name = format_fullname(name)
    public_akas = str2set(akas)
    public_akas = screen_name + ' / ' + set2str(public_akas) if public_akas else screen_name
    public_akas_df.loc[i] = [screen_name, public_akas]

In [14]:
public_akas_df

Unnamed: 0,SCREEN NAME,PUBLIC AKAS
0,Vanoye Aikens,Vanoye Aikens / Van Aikens
1,James Alexander,James Alexander
2,Ronnie Aul,Ronnie Aul / Ronne Aul
3,Wilbert Bradley,Wilbert Bradley
4,Jay Brooks,Jay Brooks
5,Miriam Burton,Miriam Burton / Miriam Greenway / Mimi Burton
6,Umberto Canto,Umberto Canto / Cantigo
7,Eddy Clay,Eddy Clay
8,Jean Leon Destine,Jean Leon Destine
9,Lucille Ellis,Lucille Ellis


## Saving data

In [15]:
public_akas_df.to_csv('web/data/public_akas.csv', index=False)