In [None]:
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import hypertools as hyp
from glob import glob as lsdir
import os
import re
import datetime as dt
import plotly_express as px

from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split

%matplotlib inline



# Read in data

In [None]:
# Load the raw spreadsheet into a DataFrame; the path is relative, so it is
# resolved against the notebook's working directory.
fname = 'data/UVLTDataAnalysis.xls'
data = pd.read_excel(fname)

## Examining the raw data

In [None]:
data.head()

In [None]:
data.columns.values

## How much data are we working with?

In [None]:
data.shape

# Donation timeline analysis

Leading up to a donation, what sorts of patterns do we see?  For example, conditioned on donating in the current year, how much did people give in previous years?

Things to explore:
- Any donation amount vs. weighting bigger donations more vs. splitting into donation "quartiles"
- Looking at (U)nrestricted, (R)estricted, (V)olunteer, and (E)vent attendance "donations"
- Look to see whether the distributions are (roughly) unimodal or multi-modal
- Possibly break this down by geographic area

# Data wrangling

We will form these predictions using the following information from `data/UVLTDataAnalysis.xls`:
- Past history of unrestricted donations, restricted donations, volunteering, and event attendance for all years prior to the "current" year (we'll generate these histories for each year we have data for)
- Whether or not the person owns conserved land.  Assumption: land ownership did not change over the indicated period.
- Where the person's conserved land is located (if applicable). Assumption: land ownership did not change over the indicated period.
- Whether the person is still alive.

In [None]:
years = np.arange(2001, 2021) #years to include in analysis
def get_start_year(x):
    """Extract the four-digit year that sits between two dashes in a label.

    Parameters
    ----------
    x : object
        Usually a column label such as ``'U-2010-11'``.

    Returns
    -------
    int or float
        The year as an ``int`` (``2010`` for the example above), or ``np.nan``
        when `x` is not a string or contains no ``-YYYY-`` fragment.
    """
    if not isinstance(x, str):
        return np.nan  # non-string labels cannot carry a year
    match = re.search(r'-(\d{4})-', x)
    if match is None:
        return np.nan  # no year found
    return int(match.group(1))

def get_deceased_year(x, maxyear=np.inf):
    """Reduce a deceased-date value to a year, hiding years not before `maxyear`.

    Parameters
    ----------
    x : object
        A ``pd.Timestamp`` (its ``.year`` is used) or any scalar that ``int()``
        accepts (e.g. ``2015``, ``2015.0``, ``'2015'``). Anything else,
        including ``NaN``, ``NaT`` and ``None``, is treated as "no date".
    maxyear : number, optional
        Exclusive upper bound. Years greater than or equal to it are reported
        as ``np.nan``. Defaults to ``np.inf`` (no cut-off).

    Returns
    -------
    int or float
        The year as an ``int`` when it is strictly below `maxyear`,
        otherwise ``np.nan``.
    """
    if isinstance(x, pd.Timestamp):
        y = x.year
    elif np.isscalar(x):
        try:
            y = int(x)
        except (TypeError, ValueError, OverflowError):
            # NaN, infinities and non-numeric strings cannot become a year
            return np.nan
    else:
        return np.nan  # not a timestamp and not a scalar (e.g. None, NaT)

    try:
        return y if y < maxyear else np.nan
    except TypeError:
        return np.nan  # maxyear is not comparable with an integer year

In [None]:
# Preprocessing: index the table by contact, remove the name/location fields
# and every column whose label contains "Tot", then reduce DeceasedDate to a
# year (NaN where no usable date is present).
dropped_fields = ['FirstName', 'LastName', 'City', 'TownID', 'Town', 'DeceasedDateYN']
total_fields = [label for label in data.columns.values if 'Tot' in label]
x = (
    data.copy()
    .set_index('ContactID')
    .drop(dropped_fields, axis=1)
    .drop(total_fields, axis=1)
)
x['DeceasedDate'] = x['DeceasedDate'].apply(get_deceased_year)

In [None]:
x.head()

## "Donation triggered average"

Conditioned on having donated in a given year, what does the historical data leading up to the current year look like?  Idea:

- For each year:
  - Split data into current year ("current") and data prior to the current year ("historical")
  - Adjust deceased date according to whether it would have been known (or not) in the current year

In [None]:
def get_current_and_historical_data(df, year):
    """Split a contact table into the data before `year` and the data for `year`.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``State``, ``ZipCode``, ``LandOwnerTownID`` and
        ``DeceasedDate``; the remaining columns are assigned to a year by
        ``get_start_year`` applied to their labels.
    year : int
        The "current" year.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        ``x``: the four static columns plus every column whose start year is
        strictly before `year`. ``y``: the columns whose start year equals `year`.
    """
    # Parse each column's start year once. Labels without a year give NaN,
    # which fails both comparisons below and so lands in neither output.
    column_years = np.array(list(map(get_start_year, df.columns.values)))

    #historical data
    x = df[['State', 'ZipCode', 'LandOwnerTownID', 'DeceasedDate']].copy()
    # Keep only deaths dated strictly before `year`; deaths in or after `year`
    # would not have been known yet, so they are replaced with NaN.
    x['DeceasedDate'] = x['DeceasedDate'].apply(lambda i: get_deceased_year(i, maxyear=year))

    inds = column_years < year
    x[df.columns.values[inds]] = df.iloc[:, inds]

    #current year
    inds = column_years == year
    y = df[df.columns.values[inds]].copy()

    return x, y

In [None]:
historical, current = get_current_and_historical_data(x, 2010)

In [None]:
historical.head()

In [None]:
current.head()

### Formatting data to facilitate donation-triggered averaging

We'll create a dataframe where each ContactID is repeated once per prediction year (nyears-2 times, because the first and last years are skipped).  The `U*`, `R*`, `V*`, and `E*` columns should be renamed to `U-10`, `U-9`, etc., indicating the number of years *prior* to the prediction year (everything before the earliest year with data should be set to NaN).  Also include `U`, `R`, `V`, and `E` columns indicating the values of those columns in the prediction year.

In [None]:
def to_relative_years(df, reference_year=None):
    """Rename year-stamped columns so their labels are relative rather than absolute.

    A label's year is whatever ``get_start_year`` parses from it; labels with
    no parsable year are left unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose year-stamped labels end in a 7-character year suffix
        (e.g. ``'U-2009-10'``); the renaming slices that suffix off by position.
    reference_year : number, optional
        When given, every year-stamped column is renamed to
        ``<prefix><reference_year - year>``, even if only one year is present.
        When omitted, the original behaviour applies:

        * exactly one distinct year: the suffix and the preceding character
          are removed (``'U-2010-11'`` -> ``'U'``);
        * several years: the latest year becomes ``1``, the one before ``2``,
          and so on (``'U-2009-10'`` -> ``'U-1'`` if 2009 is the latest).

    Returns
    -------
    pd.DataFrame
        A renamed copy; `df` itself is not modified.
    """
    df = df.copy()
    labels = df.columns.values
    years = np.array(list(map(get_start_year, labels)), dtype=float)
    has_year = ~np.isnan(years)
    if not has_year.any():
        # Nothing to rename. Returning here also avoids nanmax/nanmin on an
        # all-NaN array (RuntimeWarning) or an empty one (ValueError).
        return df

    maxyear = years[has_year].max()
    minyear = years[has_year].min()

    if reference_year is None and minyear == maxyear:
        # only one year; drop the year from the label entirely
        mapper = {c: c[:-8] for c in labels[has_year]}
    else:
        # With no explicit reference, the year after the latest one is "year 0".
        anchor = maxyear + 1 if reference_year is None else reference_year
        mapper = {
            c: c[:-7] + str(int(anchor - y))
            for c, y in zip(labels[has_year], years[has_year])
        }

    return df.rename(mapper, axis=1)

In [None]:
# Column layout of the combined table: the static contact fields, then for each
# category its lagged columns from the largest lag (len(years)) down to 1, and
# finally one bare column per category for the prediction year itself.
categories = ['U', 'R', 'V', 'E']
columns = ['State', 'ZipCode', 'LandOwnerTownID', 'DeceasedDate']
for c in categories:
    columns += [f'{c}-{lag}' for lag in range(len(years), 0, -1)]
columns += categories

In [None]:
# Seed frame: a single placeholder row labelled 'ContactID' that carries the
# full column set; the yearly frames are appended to it and the placeholder
# row is dropped again afterwards.
df = pd.DataFrame(index=['ContactID'], columns=columns, data=[])

In [None]:
# Build one block of rows per prediction year and stack them with a single
# concat; concatenating inside the loop re-copies the accumulated frame on
# every iteration.
# NOTE(review): to_relative_years strips the year entirely when a frame holds
# exactly one distinct year, so a one-year history would collide with the
# current-year column names — assumes the data hold at least two years before
# years[1]; TODO confirm.
yearly_frames = [df]  # existing frame first, so its column order is kept
for y in years[1:-1]: #skip the first year (no history) and last year (no future data)
    next_historical, next_current = get_current_and_historical_data(x, y)
    next_merged = pd.concat([to_relative_years(next_historical), to_relative_years(next_current)], axis=1)
    yearly_frames.append(next_merged)
df = pd.concat(yearly_frames, axis=0)

In [None]:
# Put the columns in their canonical order, discard the placeholder row
# labelled 'ContactID' (all NaNs), and name the index.
df = (
    df[columns]
    .drop(index=['ContactID'])
    .rename_axis('ContactID')
)

In [None]:
df.head()

In [None]:
df[df.columns.values[['U-' in x for x in df.columns.values]]].head()

In [None]:
# Compute the weighted average donation history (of each type),
# conditioned on having given a U, R, V, or E donation in the current year.
# The results are plotted in a later cell.
# ignore_columns: static contact fields that are not part of any history.
# donation_types: the four category prefixes.
ignore_columns = ['State', 'ZipCode', 'LandOwnerTownID', 'DeceasedDate']
donation_types = ['U', 'R', 'V', 'E']

def weighted_average(x, weights):
    """Weighted mean of `x` that skips entries where value or weight is NaN.

    Parameters
    ----------
    x : array-like
        Values to average; converted to a float array.
    weights : array-like
        One weight per value; converted to a float array.

    Returns
    -------
    float
        ``sum(x * weights) / sum(weights)`` over the positions where
        ``x * weights`` is not NaN. NaN when no position is usable or the
        usable weights sum to zero.
    """
    # np.float was removed in NumPy 1.24; the builtin float is the same dtype.
    x = np.array(x, dtype=float)
    weights = np.array(weights, dtype=float)

    weighted_vals = np.multiply(x, weights)
    usable = ~np.isnan(weighted_vals)

    # Silence the 0/0 warning only inside this call instead of changing the
    # warning filters for the whole kernel.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        return np.divide(np.nansum(weighted_vals), np.sum(weights[usable]))

def donation_triggered_average(data, ignore_columns, donation_types, summary_fun=weighted_average):
    """Summarise each history column, weighted by each current-year column.

    Parameters
    ----------
    data : pd.DataFrame
        Table holding lagged columns whose labels contain ``'<type>-'`` (e.g.
        ``'U-3'``) and one bare column per entry of `donation_types`.
    ignore_columns : list
        Labels that are not part of any donation history.
    donation_types : list
        Category labels. Each is used both as the prefix that selects history
        columns and as the name of the current-year column.
    summary_fun : callable, optional
        Called as ``summary_fun(history_column, data[current_type])``.

    Returns
    -------
    list of pd.DataFrame
        One frame per entry of `donation_types`, in order. Rows are indexed by
        the conditioning type and columns are that type's history columns.
    """
    # Exclude the current-year columns by name rather than by slicing off the
    # last four, which silently assumed both their position and their count.
    history_columns = [
        c for c in data.columns.values
        if c not in ignore_columns and c not in donation_types
    ]

    averages = []
    for d in donation_types:
        donation_columns = [c for c in history_columns if f'{d}-' in c]
        next_df = pd.DataFrame(index=donation_types, columns=donation_columns, data=[])
        history = data[next_df.columns]
        for i in donation_types:
            current = data[i]  # this type's current-year values act as weights
            next_df.loc[i, :] = history.apply(lambda col: summary_fun(col, current))
        averages.append(next_df)
    return averages

In [None]:
dta = donation_triggered_average(df, ignore_columns, donation_types)

In [None]:
# One figure per history type, with one line per current-year type.
# Raw strings keep the backslash for matplotlib without an invalid-escape warning.
units = [r'\$', r'\$', 'hrs', 'count']
for i, d1 in enumerate(donation_types): #plot ID-- what is being plotted leading up to moment of donation
    history = dta[i]
    # Column '<type>-k' holds the value k years before the donation, so it is
    # drawn at x = -k. The previous x-axis, built from `years`, put the k = 1
    # column at 0 and so shifted every point by one year.
    lags = [-int(str(label).rsplit('-', 1)[-1]) for label in history.columns]

    fig, ax = plt.subplots()
    for d2 in donation_types: #plot a line for each type of donation being predicted
        ax.plot(lags, history.loc[d2, :].values)
    ax.set_xlabel('Years relative to donation')
    ax.set_ylabel(f'Donation amount ({units[i]})')
    ax.set_title(f'{d1} donations')
    if i == 0:
        ax.legend([f'Predicting {d}' for d in donation_types])
    plt.show()