In [55]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import hypertools as hyp
from glob import glob as lsdir
import os

%matplotlib inline

In [48]:
data_readers = {'xlsx': pd.read_excel, 'xls': pd.read_excel, 'dta': pd.read_stata}
get_extension = lambda x: x.split('.')[-1]

In [58]:
def read_data(datadir, readers):
    files = lsdir(os.path.join(datadir, '*'))
    readable_files = []
    data = []
    for f in files:
        ext = get_extension(f)
        if ext in readers.keys():
            readable_files.append(f)
            data.append(data_readers[ext](f))
    return readable_files, data

In [59]:
fnames, data = read_data('data', data_readers)



In [63]:
data[0].head()

Unnamed: 0,ContactID,State,TownID,Town,LandOwnerTownID,DeceasedDateYN,U_Tot_Amt,U_Tot_Cnt,U200001,U200102,...,E201112,E201213,E201314,E201415,E201516,E201617,E201718,E201819,DeceasedDate,ConservedOwner
0,1544.0,NC,60.0,All Other Towns,0.0,0.0,571.95,6.0,0.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
1,1545.0,NH,20.0,Lebanon,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
2,1546.0,NH,61.0,All Other Towns NH,0.0,0.0,600.0,7.0,25.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
3,1547.0,VT,41.0,Weathersfield,41.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0
4,1548.0,VT,34.0,Springfield,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016.0,0.0


In [64]:
data[1].head()

Unnamed: 0,ContactID,State,TownID,Town,LandOwnerTownID,DeceasedDateYN,U_Tot_Amt,U_Tot_Cnt,U200001,U200102,...,E201819,DeceasedDate,ConservedOwner,Nprojects,Nacres,Nmembers,MedianHHIncome,MeanHHIncome,PercBAplus,PercAge55Plus
0,2903.0,VT,2.0,Bradford,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,,0.0,20.0,2258.32,7.0,48056.0,58716.0,30.6,32.912306
1,11472.0,VT,2.0,Bradford,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,,0.0,20.0,2258.32,7.0,48056.0,58716.0,30.6,32.912306
2,9206.0,VT,2.0,Bradford,0.0,0.0,75.0,2.0,0.0,0.0,...,0.0,,0.0,20.0,2258.32,7.0,48056.0,58716.0,30.6,32.912306
3,12910.0,VT,2.0,Bradford,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,,0.0,20.0,2258.32,7.0,48056.0,58716.0,30.6,32.912306
4,5029.0,VT,2.0,Bradford,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,,0.0,20.0,2258.32,7.0,48056.0,58716.0,30.6,32.912306


In [65]:
data[2].head()

Unnamed: 0,ID,Town,DATE,AMOUNT,CODE,LIST,NOTES,Gave Again FY 19
0,14661.0,28.0,2017-12-01 00:00:00,50.0,170930.0,The New Yorker,,
1,1838.0,16.0,2017-12-04 00:00:00,25.0,170924.0,NWF,,
2,14664.0,9.0,2017-12-04 00:00:00,20.0,170929.0,Sierra Club,,X
3,13889.0,12.0,2017-12-04 00:00:00,50.0,170924.0,NWF,previous-Smith Pond,X
4,14667.0,15.0,2017-12-06 00:00:00,25.0,170926.0,TNC,,X


In [66]:
data[3].head()

Unnamed: 0,ContactID,FirstName,LastName,City,State,ZipCode,TownID,Town,LandOwnerTownID,DeceasedDate,...,E-2010-11,E-2011-12,E-2012-13,E-2013-14,E-2014-15,E-2015-16,E-2016-17,E-2017-18,E-2018-19,E-2019-20
0,1544,Charles ...,Pitman,Chapel Hill,NC,27517,60,All Other Towns,0,NaT,...,0,0,0,0,0,0,0,0,0,0
1,1545,Pat,Reed,Lebanon,NH,3766,20,Lebanon,0,NaT,...,0,0,0,0,0,0,0,0,0,0
2,1546,David,Davenport,New London,NH,3257,61,All Other Towns NH,0,NaT,...,0,0,0,0,0,0,0,0,0,0
3,1547,Patricia,Stevens,Ascutney,VT,5030,41,Weathersfield,41,NaT,...,0,0,0,0,0,0,0,0,0,0
4,1548,Margaret Elizabeth,Stevens,Springfield,VT,5156,34,Springfield,0,2016-05-08,...,0,0,0,0,0,0,0,0,0,0


In [67]:
data[4].head()

Unnamed: 0,TownID,Town,Nprojects,Nacres,Nmembers,MedianHHIncome,MeanHHIncome,PercBAplus,PercAge55Plus
0,1,Bath,9,746.0,0,47386,60413,25.274725,48.844538
1,2,Bradford,20,2258.32,7,48056,58716,30.6,32.912306
2,3,Canaan,9,1676.35,10,58333,68870,25.3,33.86352
3,4,Cavendish,0,0.0,0,48750,69230,31.001727,40.359043
4,5,Charlestown,2,408.9,2,42693,50823,14.3,36.050905


In [68]:
data[5].head()

Unnamed: 0,TownID,Town,Nprojects,Nacres,Nmembers,MedianHHIncome,MeanHHIncome,PercBAplus,PercAge55Plus
0,1,Bath,9,746.0,0,47386,60413,25.274725,48.844538
1,2,Bradford,20,2258.32,7,48056,58716,30.6,32.912306
2,3,Canaan,9,1676.35,10,58333,68870,25.3,33.86352
3,4,Cavendish,0,0.0,0,48750,69230,31.001727,40.359043
4,5,Charlestown,2,408.9,2,42693,50823,14.3,36.050905


In [69]:
data[6].head()

Unnamed: 0,Town #,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Total Housing Units,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,1.0,Bath,366,10,3,589.0,,,
1,2.0,Bradford,822,13,0,1407.0,,,
2,3.0,Canaan,1140,8,0,1867.0,,,
3,4.0,Cavendish,510,40,3,1013.0,,,
4,5.0,Charlestown,1738,0,0,2338.0,,,


In [71]:
list(map(np.shape, data))

[(13927, 90), (13934, 97), (27, 8), (13273, 97), (45, 9), (45, 9), (48, 9)]

In [72]:
fnames

['data/UVLTdata_individual.dta',
 'data/UVLTdata_final.dta',
 'data/Direct Mailing Analysis.xlsx',
 'data/UVLTDataAnalysis.xls',
 'data/TownLevelData.xlsx',
 'data/TownLevelData.dta',
 'data/CensusInfoUpperValley2015JH.xlsx']

Ideas:
- Prediction (regression, deep learning) about donors and volunteers
- Data cleaning (multiple entries per person)
- PDFs --> convert anything useful into computer-readable formats
- Get more data?  How do marketing efforts differ with income?
- Confounds and how to deal with them

- Do donors come from specific towns?
- How many people are actually donating?
- Where do most donations come from?
- Donation amounts by various demographics or characteristics

Libraries or approaches to explore:
- regression (sklearn)
- deep learning
- 3d plots
- images in ads (google cloud image processing?)
- synthetic data, simulations