In [None]:
import gzip as gz
import os
from io import StringIO
import pandas as pd
import datetime as DT
import numpy as np
import itertools
from scipy import stats

from bokeh.io import show, output_notebook
from bokeh.models import FactorRange
from bokeh.plotting import figure
from bokeh.layouts import column

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, RandomizedSearchCV, GridSearchCV

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

output_notebook()

import warnings
warnings.filterwarnings('ignore')

# Introduction

We put ourselves in the position of a manager in charge of a statewide campaign. As part of voter outreach, state and national campaigns typically include targeted mailings. Let's say that we have the resources to send 100,000 such letters. Our goal is to identify 100,000 registered voters who are most likely to be influenced by our campaign letters.

# Data preparation

The Ohio voter file (https://www6.sos.state.oh.us/ords/f?p=VOTERFTP:STWD:::#stwdVtrFiles) contains a wealth of information about registered voters. We will use that data to develop a prediction model to aid in our voter outreach. The data set contains over 8 million entries and over 100 columns, so it takes some time to load.

In [None]:
#data_path = "Data"
#if not os.path.isdir(data_path):
#    os.mkdir(data_path)
#if len(os.listdir(data_path)) == 0:
#    !wget -O /Data/1.gz https://www6.sos.state.oh.us/ords/f?p=VOTERFTP:DOWNLOAD::FILE:NO:2:P2_PRODUCT_NUMBER:363
#    !wget -O /Data/2.gz https://www6.sos.state.oh.us/ords/f?p=VOTERFTP:DOWNLOAD::FILE:NO:2:P2_PRODUCT_NUMBER:364
#    !wget -O /Data/3.gz https://www6.sos.state.oh.us/ords/f?p=VOTERFTP:DOWNLOAD::FILE:NO:2:P2_PRODUCT_NUMBER:365
#    !wget -O /Data/4.gz https://www6.sos.state.oh.us/ords/f?p=VOTERFTP:DOWNLOAD::FILE:NO:2:P2_PRODUCT_NUMBER:366

In [None]:
# Collect every gzip archive in the data directory. os.listdir returns entries
# in arbitrary order, so the paths are sorted to keep the row order of the
# concatenated table (and anything sampled from it) stable across runs.
data_path = "data"
files = sorted(
    os.path.join(data_path, entry)
    for entry in os.listdir(data_path)
    if entry.endswith(".gz") and os.path.isfile(os.path.join(data_path, entry))
)

In [None]:
# Decompress each archive fully into memory and keep its decoded text.
# Bytes that are not valid UTF-8 are dropped instead of raising.
csv_files = []
for archive_path in files:
    with gz.open(archive_path, "r") as archive:
        raw_bytes = archive.read()
    csv_files.append(raw_bytes.decode("utf-8", errors='ignore'))

In [None]:
# Parse every decompressed CSV (first row is the header) and stack the
# resulting frames into one table with a fresh 0..n-1 index.
df_list = [
    pd.read_csv(StringIO(csv_text), index_col=None, header=0)
    for csv_text in csv_files
]
df = pd.concat(df_list, axis=0, ignore_index=True)

In [None]:
# Keep a subset of columns, selected by position in the raw file:
# column 1, columns 3-18, column 31 and columns 95-107.
kept_positions = [1, *range(3, 19), 31, *range(95, 108)]
df_reduced = df.iloc[:, kept_positions]
df_reduced.head()

In [None]:
# Development only: work on a random subset to keep iteration fast. The seed
# makes the subset reproducible, and the size is capped so a smaller input
# does not make sample() raise.
df_reduced = df_reduced.sample(n=min(1000000, len(df_reduced)), replace=False, random_state=42)

now = pd.Timestamp(DT.datetime.now())
century = pd.DateOffset(years=100)

for date_col in ('DATE_OF_BIRTH', 'REGISTRATION_DATE'):
    # Parse only the sampled rows; the values are still taken from df and
    # matched to df_reduced by row label.
    parsed = pd.to_datetime(df.loc[df_reduced.index, date_col])
    # Dates in the future are shifted back one calendar century
    # (presumably two-digit years resolved to the wrong century - TODO confirm).
    # DateOffset is used because year-unit np.timedelta64 arithmetic is not
    # supported by current pandas.
    df_reduced[date_col] = parsed.where(parsed < now, parsed - century)

# Whole elapsed years / months as floats (NaN where the date is missing).
# 365.2425 days is the mean Gregorian year, the length NumPy uses for its
# 'Y' unit, so this matches the former astype('<m8[Y]') / astype('<m8[M]').
days_per_year = 365.2425
df_reduced['AGE'] = np.floor((now - df_reduced['DATE_OF_BIRTH']).dt.days / days_per_year)
df_reduced['MONTHS_REGISTERED'] = np.floor(
    (now - df_reduced['REGISTRATION_DATE']).dt.days / (days_per_year / 12)
)

# Rows without a ZIP code cannot be joined to the per-ZIP data sets.
# copy() makes the filtered frame independent before a column is overwritten;
# the builtin int replaces the removed np.int alias.
df_reduced = df_reduced[df_reduced['RESIDENTIAL_ZIP'].notnull()].copy()
df_reduced['RESIDENTIAL_ZIP'] = df_reduced['RESIDENTIAL_ZIP'].astype(int)

# Drop ages more than 9 population standard deviations from the mean.
# Computed with pandas (which skips NaN) because scipy's zscore propagates
# NaN by default: one missing age would turn every score into NaN and empty
# the frame. Rows whose age is missing are still dropped by the comparison.
age = df_reduced['AGE']
age_z = (age - age.mean()) / age.std(ddof=0)
df_reduced = df_reduced[age_z.abs() < 9]

'AGE' is the only column that could have outliers. The data contained around 300 voters whose ages were over 9 standard deviations from the mean. These rows have been dropped.

We add the average income in a voter's zip code as a feature. The average income is derived from the following data set: https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-statistics-2016-zip-code-data-soi. While the median income would be a better statistic, it is unfortunately not readily available for recent years.

In [None]:
# Load the IRS per-ZIP table and derive one average income figure per row as
# A00200 * 1000 / N00200 (presumably wages in thousands of dollars divided by
# the number of returns reporting them - confirm against the IRS field guide).
# The result is a Series labelled by ZIPCODE.
income_df = pd.read_csv(f"{data_path}/16zpallnoagi.csv", encoding = "ISO-8859-1")
income_df = income_df.set_index('ZIPCODE', drop=False)
income_df['AVG_INCOME'] = income_df['A00200'].mul(1000).div(income_df['N00200'])
avg_income = income_df['AVG_INCOME']
avg_income.head()

In [None]:
# Keep only voters whose ZIP code appears in the income table, then look up
# the average income for each remaining voter's ZIP code.
zip_has_income = df_reduced["RESIDENTIAL_ZIP"].isin(avg_income.index)
df_reduced = df_reduced[zip_has_income].assign(
    AVG_INCOME=lambda frame: frame["RESIDENTIAL_ZIP"].apply(lambda zip_code: avg_income[zip_code])
)

The population density is certain to be an important feature in predicting someone's political views, as rural residents are in general more conservative and more likely to vote Republican. The population density by zip code data was obtained here: https://blog.splitwise.com/2014/01/06/free-us-population-density-and-unemployment-rate-by-zip-code/.

In [None]:
# Load the ZIP -> population density table and reduce it to a Series of
# 'Density Per Sq Mile' labelled by integer ZIP code.
density_df = pd.read_csv(f"{data_path}/Zipcode-ZCTA-Population-Density-And-Area-Unsorted.csv", encoding = "ISO-8859-1")
# The builtin int replaces np.int, which has been removed from NumPy.
density_df['Zip/ZCTA'] = density_df['Zip/ZCTA'].astype(int)
density_df.index = density_df['Zip/ZCTA']
pop_density = density_df['Density Per Sq Mile']
# Preview only the entries with a positive density.
pop_density[pop_density > 0].head()

In [None]:
# Keep only voters whose ZIP code appears in the density table, then look up
# the population density for each remaining voter's ZIP code.
zip_has_density = df_reduced["RESIDENTIAL_ZIP"].isin(pop_density.index)
df_reduced = df_reduced[zip_has_density].assign(
    POP_DENSITY=lambda frame: frame["RESIDENTIAL_ZIP"].apply(lambda zip_code: pop_density[zip_code])
)

In [None]:
# Voters who took a Republican or Democratic ballot in the May 2018 primary.
# NOTE(review): the original filter AND-ed this same condition with itself;
# if a second, different column was intended, it needs to be added here.
primary_col = 'PRIMARY-05/08/2018'
# copy() so the indicator columns are added to an independent frame rather
# than to a filtered slice of df_reduced.
voted_primary = df_reduced[df_reduced[primary_col].isin(['R', 'D'])].copy()
voted_primary['D'] = voted_primary[primary_col] == 'D'
voted_primary['R'] = voted_primary[primary_col] == 'R'

In [None]:
# Preview the primary-voter subset, including the boolean D / R indicator columns.
voted_primary.head()

# A bit of exploratory analysis

In [None]:
# Bar chart: number of 2018 primary voters per party ballot.
parties = ["Democrat", "Republican"]
colors = ["blue", "red"]
counts = [voted_primary[flag].sum() for flag in ("D", "R")]

p = figure(
    x_range=parties,
    plot_height=350,
    title="Number of Politically Active Voters By Party",
    toolbar_location=None,
    tools="",
)
p.vbar(x=parties, top=counts, width=0.9, alpha=0.5, fill_color=colors)
p.xgrid.grid_line_color = None
p.y_range.start = 0

show(p)

In [None]:
def pairwise(iterable):
    """Return an iterator over consecutive overlapping pairs of `iterable`.

    For items s0, s1, s2, ... this yields (s0, s1), (s1, s2), ...
    Inputs with fewer than two items yield nothing.
    """
    leading, trailing = itertools.tee(iterable, 2)
    # Advance the second copy by one so the two iterators are offset by one item.
    next(trailing, None)
    return zip(leading, trailing)

# Grouped bar chart: primary voters per $10,000 average-income bracket,
# Republican and Democratic bars side by side.
income_levels = list(range(20000, 120000, 10000))
income_bins = list(pairwise(income_levels))
zip_income = voted_primary["AVG_INCOME"]

factors = [(f"{lo}-{hi}", party) for lo, hi in income_bins for party in ("R", "D")]
incomes_R = [int((voted_primary.R & (zip_income > lo) & (zip_income < hi)).sum()) for lo, hi in income_bins]
incomes_D = [int((voted_primary.D & (zip_income > lo) & (zip_income < hi)).sum()) for lo, hi in income_bins]
# Interleave the counts (R, D, R, D, ...) to match the order of `factors`.
incomes = [count for pair in zip(incomes_R, incomes_D) for count in pair]
colors = ["red", "blue"] * len(income_bins)

p = figure(
    x_range=FactorRange(*factors),
    plot_height=500,
    plot_width=1000,
    title="Income By Party",
    toolbar_location=None,
    tools="",
)
p.vbar(x=factors, top=incomes, width=0.9, alpha=0.5, color=colors)
p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xaxis.axis_label = "Incomes ($)"

show(p)

In [None]:
# Grouped bar chart: primary voters per population-density class.
density = voted_primary["POP_DENSITY"]
pop_min, pop_max = density.min(), density.max()
pop_levels = [0, 1000, 3000, np.inf] #np.linspace(pop_min, pop_max, 4)
pop_designations = ["rural", "suburban", "urban"]
pop_bins = list(pairwise(pop_levels))

factors = [(name, party) for name in pop_designations for party in ("R", "D")]
pop_R = [int((voted_primary.R & (density > lo) & (density < hi)).sum()) for lo, hi in pop_bins]
pop_D = [int((voted_primary.D & (density > lo) & (density < hi)).sum()) for lo, hi in pop_bins]
# Interleave the counts (R, D, R, D, ...) to match the order of `factors`.
pops = [count for pair in zip(pop_R, pop_D) for count in pair]
colors = ["red", "blue"] * len(pop_bins)

p = figure(
    x_range=FactorRange(*factors),
    plot_height=500,
    plot_width=1000,
    title="Population Level By Party",
    toolbar_location=None,
    tools="",
)
p.vbar(x=factors, top=pops, width=0.9, alpha=0.5, color=colors)
p.y_range.start = 0
p.x_range.range_padding = 0.1

show(p)

In [None]:
# Grouped bar chart: primary voters per age bracket and party.
age_levels = [18, 25, 35, 45, 55, 65, 75, 85, 95, 100]
age_bins = list(pairwise(age_levels))
voter_age = voted_primary["AGE"]

factors = [(f"{lo}-{hi}", party) for lo, hi in age_bins for party in ("R", "D")]
# AGE holds whole years, so the brackets are half-open [lo, hi). With strict
# inequalities on both sides, voters aged exactly 18, 25, 35, ... were
# counted in no bracket at all.
age_R = [int((voted_primary.R & (voter_age >= lo) & (voter_age < hi)).sum()) for lo, hi in age_bins]
age_D = [int((voted_primary.D & (voter_age >= lo) & (voter_age < hi)).sum()) for lo, hi in age_bins]
# Interleave the counts (R, D, R, D, ...) to match the order of `factors`.
ages = [count for pair in zip(age_R, age_D) for count in pair]
colors = ["red", "blue"] * len(age_bins)

p = figure(x_range=FactorRange(*factors), plot_height=500, plot_width=1000, title="Age by Party", toolbar_location=None, tools="")
p.vbar(x=factors, top=ages, width=0.9, alpha=0.5, color=colors)
p.y_range.start = 0
p.x_range.range_padding = 0.1

show(p)

Let's calculate a couple more interesting statistics from the 2018 midterm election:

In [None]:
def get_turnout(party, election, mask=None, data=None):
    """Return the share of a party's registered voters who voted in an election.

    Parameters
    ----------
    party : value compared against the 'PARTY_AFFILIATION' column.
    election : name of the election column; a non-null entry counts as a vote.
    mask : optional boolean Series, labelled like the voter table, restricting
        both the numerator and the denominator to the rows where it is True.
    data : optional voter table; defaults to the notebook-level `df_reduced`.

    Returns
    -------
    Fraction in [0, 1]; NaN when no voter matches `party` (and `mask`).
    """
    frame = df_reduced if data is None else data
    # A plain boolean selection replaces groupby([...]).get_group(party):
    # it gives the same rows without building every group, and does not
    # depend on how get_group treats a scalar key for a list-like grouper.
    selected = frame["PARTY_AFFILIATION"] == party
    if mask is not None:
        selected = selected & mask
    voted = frame.loc[selected, election].notnull().sum()
    return voted / selected.sum()
    
# Bar chart: share of each party's registered voters with an entry in the
# May 2018 primary column.
parties = ["Democrat", "Republican"]
colors = ["blue", "red"]
primary_turnouts = [get_turnout(code, 'PRIMARY-05/08/2018') for code in ('D', 'R')]

p = figure(
    x_range=parties,
    plot_height=500,
    title="Primary Turnout By Party",
    toolbar_location=None,
    tools="",
)
p.vbar(x=parties, top=primary_turnouts, width=0.9, alpha=0.5, fill_color=colors)
p.xgrid.grid_line_color = None
p.y_range.start = 0

show(p)

In [None]:
# Bar chart: share of each party's registered voters with an entry in the
# November 2018 general election column.
parties = ["Democrat", "Republican"]
colors = ["blue", "red"]
general_turnouts = [get_turnout(code, 'GENERAL-11/06/2018') for code in ('D', 'R')]

p = figure(
    x_range=parties,
    plot_height=500,
    title="General Election Turnout By Party",
    toolbar_location=None,
    tools="",
)
p.vbar(x=parties, top=general_turnouts, width=0.9, alpha=0.5, fill_color=colors)
p.xgrid.grid_line_color = None
p.y_range.start = 0

show(p)

In [None]:
# Grouped bar chart: May 2018 primary turnout per age bracket and party.
age_levels = [18, 25, 35, 45, 55, 65, 75, 85, 95, 100]
age_bins = list(pairwise(age_levels))
voter_age = df_reduced["AGE"]

factors = [(f"{lo}-{hi}", party) for lo, hi in age_bins for party in ("R", "D")]
# AGE holds whole years, so the brackets are half-open [lo, hi). With strict
# inequalities on both sides, voters aged exactly 18, 25, 35, ... were
# left out of every bracket.
age_R = [get_turnout('R', 'PRIMARY-05/08/2018', (voter_age >= lo) & (voter_age < hi)) for lo, hi in age_bins]
age_D = [get_turnout('D', 'PRIMARY-05/08/2018', (voter_age >= lo) & (voter_age < hi)) for lo, hi in age_bins]
# Interleave the rates (R, D, R, D, ...) to match the order of `factors`.
ages = [rate for pair in zip(age_R, age_D) for rate in pair]
colors = ["red", "blue"] * len(age_bins)

p = figure(x_range=FactorRange(*factors), plot_height=500, plot_width=1000, title="Primary Election Turnout By Age", toolbar_location=None, tools="")
p.vbar(x=factors, top=ages, width=0.9, alpha=0.5, color=colors)
p.y_range.start = 0
p.x_range.range_padding = 0.1

show(p)

In [None]:
# Grouped bar chart: November 2018 general election turnout per age bracket and party.
age_levels = [18, 25, 35, 45, 55, 65, 75, 85, 95, 100]
age_bins = list(pairwise(age_levels))
voter_age = df_reduced["AGE"]

factors = [(f"{lo}-{hi}", party) for lo, hi in age_bins for party in ("R", "D")]
# AGE holds whole years, so the brackets are half-open [lo, hi). With strict
# inequalities on both sides, voters aged exactly 18, 25, 35, ... were
# left out of every bracket.
age_R = [get_turnout('R', 'GENERAL-11/06/2018', (voter_age >= lo) & (voter_age < hi)) for lo, hi in age_bins]
age_D = [get_turnout('D', 'GENERAL-11/06/2018', (voter_age >= lo) & (voter_age < hi)) for lo, hi in age_bins]
# Interleave the rates (R, D, R, D, ...) to match the order of `factors`.
ages = [rate for pair in zip(age_R, age_D) for rate in pair]
colors = ["red", "blue"] * len(age_bins)

p = figure(x_range=FactorRange(*factors), plot_height=500, plot_width=1000, title="General Election Turnout By Age", toolbar_location=None, tools="")
p.vbar(x=factors, top=ages, width=0.9, alpha=0.5, color=colors)
p.y_range.start = 0
p.x_range.range_padding = 0.1

show(p)

# Classification model selection

In [None]:
# Predictor columns used by every classifier, and the label column to predict.
features = ["AGE", "AVG_INCOME", "POP_DENSITY"]
target = "D"

In [None]:
# Keep only rows with complete features and labels, then hold out 10% of them
# for the final comparison of the tuned models.
model_df = voted_primary[features + ['R', 'D']].dropna().reset_index(drop=True)
# Seeded so the split, and every metric computed from it, is reproducible.
train_df, holdout_df = train_test_split(model_df, test_size=0.1, random_state=42)

# drop=True discards the shuffled row labels instead of keeping them as an
# extra 'index' column in both frames.
train_df = train_df.reset_index(drop=True)
holdout_df = holdout_df.reset_index(drop=True)

(train_df.shape[0], holdout_df.shape[0])

def get_metrics(classifier, test_data, test_labels):
    """Score a fitted classifier on labelled data.

    Parameters
    ----------
    classifier : fitted estimator exposing `predict`.
    test_data : feature matrix passed to `classifier.predict`.
    test_labels : true labels for `test_data`.

    Returns
    -------
    Tuple `(accuracy, precision, recall)` of the predictions.
    """
    predicted_labels = classifier.predict(test_data)
    accuracy = accuracy_score(test_labels, predicted_labels)
    precision = precision_score(test_labels, predicted_labels)
    recall = recall_score(test_labels, predicted_labels)
    return accuracy, precision, recall

In [None]:
class RocPlot():
    """Accumulates ROC curves and AUC scores for several classifiers.

    Every classifier passed to `add` is fitted on `train_df` and evaluated on
    `test_df`, using the `features` columns as inputs and `target` as label.
    """

    def __init__(self, train_df, test_df, features, target):
        """Store the data split and create an empty ROC figure."""
        self.train_df = train_df
        self.test_df = test_df
        self.features = features
        self.target = target
        # label -> AUC on the test frame, filled in by add()
        self.auc_scores = {}
        self.plot = figure(title="ROC Curves", tools="", width=900)
        self.plot.xaxis.axis_label = "FPR"
        self.plot.yaxis.axis_label = "TPR"

    def add(self, classifier, label, color, scaler = None):
        """Fit `classifier`, draw its ROC curve and record its AUC.

        Parameters
        ----------
        classifier : estimator exposing `fit` and `predict_proba`; it is
            (re)fitted here on the training frame.
        label : legend entry and key under which the AUC is stored.
        color : line color of the curve.
        scaler : optional, already fitted transformer; when given, its
            `transform` is applied to the features of both frames.
        """
        if scaler is not None:
            train_data = scaler.transform(self.train_df[self.features])
            test_data = scaler.transform(self.test_df[self.features])
        else:
            train_data = self.train_df[self.features]
            test_data = self.test_df[self.features]
        train_y = self.train_df[self.target]
        test_y = self.test_df[self.target]

        classifier.fit(train_data, train_y)
        # Second column of predict_proba: score of the second class in classifier.classes_.
        positive_prob = classifier.predict_proba(test_data)[:, 1]

        fpr, tpr, _ = roc_curve(test_y, positive_prob)
        self.plot.line(fpr, tpr, color=color, line_width=2, legend=label)
        # The legend only exists once a glyph with a legend entry has been
        # added; setting its location in __init__ had no effect.
        self.plot.legend.location = "top_left"
        self.auc_scores[label] = roc_auc_score(test_y, positive_prob)

    def get_auc_scores(self):
        """Return the dict mapping each added label to its AUC."""
        return self.auc_scores

    def show(self):
        """Render the figure with all curves added so far."""
        show(self.plot)

In [None]:
# Summary statistics of the model features over the whole voter table (not only primary voters).
df_reduced[features].describe()

In [None]:
def train_classifier(classifier, data, features, target, param_grid, cv=5):
    """Run an exhaustive cross-validated grid search over `param_grid`.

    The search is fitted on `data[features]` against `data[target]` using all
    available cores. Returns `(cv_results_, best_params_)` of the search.
    """
    search = GridSearchCV(
        estimator=classifier,
        param_grid=param_grid,
        cv=cv,
        verbose=2,
        n_jobs=-1,
        return_train_score=True,
    )
    search.fit(data[features], data[target])
    return search.cv_results_, search.best_params_

def train_classifier_random(classifier, data, features, target, param_grid, cv=5, n_iter=10, random_state=42):
    """Run a cross-validated randomized search that samples `n_iter` settings.

    Candidate settings are drawn from `param_grid` with the given
    `random_state`; the search is fitted on `data[features]` against
    `data[target]` using all available cores.
    Returns `(cv_results_, best_params_)` of the search.
    """
    search = RandomizedSearchCV(
        estimator=classifier,
        param_distributions=param_grid,
        n_iter=n_iter,
        cv=cv,
        verbose=2,
        random_state=random_state,
        n_jobs=-1,
        return_train_score=True,
    )
    search.fit(data[features], data[target])
    return search.cv_results_, search.best_params_

In [None]:
# Shared ROC figure: each model added to it is fitted on train_df and scored on holdout_df.
roc_plot = RocPlot(train_df, holdout_df, features, target)

##### Logistic regression

In [None]:
# Grid over the inverse regularization strength C and the penalty type.
c = [0.001,0.01,0.1,1,10,100]
penalty = ['l1', 'l2']

# The solver is pinned to liblinear, which supports both penalties; the
# current default solver (lbfgs) rejects 'l1', so half of the grid would
# fail to fit. Because the solver is part of the grid, it is also carried in
# the returned best parameters and reused when the model is rebuilt from them.
grid = {'C': c,
        'penalty': penalty,
        'solver': ['liblinear']}
classifier = LogisticRegression()
results, logistic_regression_best_params = train_classifier(classifier, train_df, features, target, grid)
logistic_regression_best_params

In [None]:
# Refit with the best parameters on the full training frame and report
# (accuracy, precision, recall) on the holdout frame.
classifier = LogisticRegression(**logistic_regression_best_params).fit(
    train_df[features], train_df[target]
)
get_metrics(classifier, holdout_df[features], holdout_df[target])

In [None]:
# add() refits the classifier on the training frame before scoring the holdout frame.
roc_plot.add(classifier, "Logistic Regression", "blue")

##### Decision Tree

In [None]:
# Exhaustive grid over tree depth and the minimum node size required to split.
max_depth = list(range(3, 11))
min_samples_split = [2, 5, 7, 10, 15, 20, 50, 60, 70, 80, 90, 100, 120, 150]

grid = {
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
}
classifier = DecisionTreeClassifier()
results, decision_tree_best_params = train_classifier(classifier, train_df, features, target, grid)
decision_tree_best_params

In [None]:
# Refit with the best parameters on the full training frame and report
# (accuracy, precision, recall) on the holdout frame.
classifier = DecisionTreeClassifier(**decision_tree_best_params).fit(
    train_df[features], train_df[target]
)
get_metrics(classifier, holdout_df[features], holdout_df[target])

In [None]:
# add() refits the classifier on the training frame before scoring the holdout frame.
roc_plot.add(classifier, "Decision Tree", "red")

##### Random Forest

In [None]:
# Randomized search space for the random forest.
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 2000, num = 10)]
max_depth = [3, 4, 5, 6, 7, 8, 9, 10, None]  # None: grow until the other limits stop the tree
# 'auto' was an alias of 'sqrt' for classifiers and is no longer accepted by
# current scikit-learn, so only 'sqrt' is kept; the effective search space is unchanged.
max_features = ['sqrt']
min_samples_split = [500, 750, 1000, 1250, 1500]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
classifier = RandomForestClassifier()
results, random_forest_best_params = train_classifier_random(classifier, train_df, features, target, grid, cv=3, n_iter=10, random_state=42)
random_forest_best_params

In [None]:
# Refit with the best parameters on the full training frame and report
# (accuracy, precision, recall) on the holdout frame.
classifier = RandomForestClassifier(**random_forest_best_params).fit(
    train_df[features], train_df[target]
)
get_metrics(classifier, holdout_df[features], holdout_df[target])

In [None]:
# add() refits the classifier on the training frame before scoring the holdout frame.
roc_plot.add(classifier, "Random Forest", "green")

##### Gradient Boosting

In [None]:
# Randomized search space for gradient boosting.
learning_rate = [1, 0.5, 0.25, 0.1, 0.05, 0.01]
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
# Depths must be integers; np.linspace produced the floats 1.0 ... 32.0,
# which current scikit-learn rejects for max_depth.
max_depth = list(range(1, 33))
# Floats here are fractions of the training samples.
min_samples_split = np.linspace(0.1, 1.0, 10, endpoint=True)
min_samples_leaf = np.linspace(0.1, 0.5, 5, endpoint=True)
# Include len(features) itself so that "consider every feature at each split"
# is part of the search; range(1, len(features)) stopped one short.
max_features = list(range(1, len(features) + 1))

grid = {'learning_rate': learning_rate,
               'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'max_features': max_features}
classifier = GradientBoostingClassifier()
results, gradient_boosting_best_params = train_classifier_random(classifier, train_df, features, target, grid, cv=3, n_iter=100, random_state=42)
gradient_boosting_best_params

In [None]:
# Refit with the best parameters on the full training frame and report
# (accuracy, precision, recall) on the holdout frame.
classifier = GradientBoostingClassifier(**gradient_boosting_best_params).fit(
    train_df[features], train_df[target]
)
get_metrics(classifier, holdout_df[features], holdout_df[target])

In [None]:
# add() refits the classifier on the training frame before scoring the holdout frame.
roc_plot.add(classifier, "Gradient Boosting", "purple")

##### Multilayer Perceptron

In [None]:
# Randomized search over the MLP settings. The features are standardized
# first, with a scaler fitted on the training frame only.
grid = {
    'hidden_layer_sizes': [(100,100,100)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
}
hidden_layer_sizes = grid['hidden_layer_sizes']
activation = grid['activation']
solver = grid['solver']
learning_rate = grid['learning_rate']

scaler = StandardScaler()
transformed_train_df = scaler.fit_transform(train_df[features])

classifier = MLPClassifier()
cv = RandomizedSearchCV(
    estimator=classifier,
    param_distributions=grid,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    return_train_score=True,
)
cv.fit(transformed_train_df, train_df[target])
mlp_best_params = cv.best_params_
mlp_best_params

In [None]:
# Refit with the best parameters on the standardized training features and
# report (accuracy, precision, recall) on the holdout frame, transformed
# with the same scaler.
transformed_holdout_df = scaler.transform(holdout_df[features])
classifier = MLPClassifier(**mlp_best_params).fit(transformed_train_df, train_df[target])
get_metrics(classifier, transformed_holdout_df, holdout_df[target])

In [None]:
# The fitted scaler is passed along so add() transforms both frames before refitting and scoring.
roc_plot.add(classifier, "Multilayer Perceptron", "orange", scaler)

##### ROC plot

In [None]:
# Render the ROC curves of every model added above.
roc_plot.show()

In [None]:
# Holdout AUC per model, keyed by the label given to add().
roc_plot.get_auc_scores()

Conclusion: 

Logistic regression appears to be the worst-performing model. Decision trees, random forests and gradient boosting appear to exhibit the same performance, so we are going to choose the decision tree classifier for our task. The accuracy is around 68%, which is not terrible given the limited number of features and the assumptions that are being made.

# Voter Selection

Before using our classification model, we need to narrow our data to a pool of voters who are most likely to be swayed by our campaign letters. It makes no sense to target voters who are registered with the opposite party, since most are not even going to read them. Similarly, it makes little sense to target voters who are registered with our party and vote regularly, since their votes are most likely already secured. Therefore, we are going to include those voters who are not affiliated with either party and the voters who are registered with our party but have not voted for at least two election cycles.

In [None]:
# Election columns inspected to decide whether someone voted recently.
recent_elections = [
    'PRIMARY-03/15/2016', 'GENERAL-06/07/2016', 'PRIMARY-09/13/2016',
    'GENERAL-11/08/2016', 'PRIMARY-05/02/2017', 'PRIMARY-09/12/2017',
    'GENERAL-11/07/2017', 'PRIMARY-05/08/2018', 'GENERAL-08/07/2018',
    'GENERAL-11/06/2018',
]
# True where every one of those columns is empty for the voter.
not_recent_voter = df_reduced[recent_elections].isnull().all(axis=1)
target_party = df_reduced['PARTY_AFFILIATION'].eq(target)
not_affiliated = df_reduced['PARTY_AFFILIATION'].isnull()

# Candidate pool: unaffiliated voters, plus members of the target party who
# did not vote in any of the listed elections.
possible_choices = df_reduced[not_affiliated | (target_party & not_recent_voter)]
possible_choices.shape

In [None]:
# Refit the chosen model on every primary voter with complete features.
# Model selection ran on a frame built with dropna(); fitting on the
# unfiltered frame here could pass missing values to the estimator.
final_train = voted_primary[features + [target]].dropna()
classifier = DecisionTreeClassifier(**decision_tree_best_params)
classifier.fit(final_train[features], final_train[target])

# Candidates with a missing feature cannot be scored and are dropped. copy()
# makes the frame independent, so the new column is not written onto a slice
# of df_reduced.
possible_choices = possible_choices.dropna(subset=features).copy()
predicted = classifier.predict_proba(possible_choices[features])
# Second column of predict_proba: score of the second class in classifier.classes_.
possible_choices[f"P({target})"] = predicted[:, 1]
possible_choices.head()

In [None]:
# Pick the 100,000 candidates with the highest predicted probability and show
# the mean of each feature among them.
score_column = f"P({target})"
selected = possible_choices.nlargest(100000, [score_column])
feature_summary = selected[features].describe()
feature_summary.loc['mean', :]

In [None]:
def make_histogram(data, title, x_axis_label, bins):
    """Build a density-normalized histogram figure of `data`.

    Parameters
    ----------
    data : values to bin.
    title : figure title.
    x_axis_label : label of the x axis.
    bins : bin specification forwarded to `np.histogram`.

    Returns
    -------
    The figure, with its y axis hidden; it is not shown here.
    """
    densities, bin_edges = np.histogram(data, bins=bins, density=True)
    histogram = figure(title=title, tools="")
    # One rectangle per bin, spanning consecutive edges.
    histogram.quad(
        left=bin_edges[:-1],
        right=bin_edges[1:],
        bottom=0,
        top=densities,
        line_color="white",
        alpha=0.5,
    )
    histogram.xaxis.axis_label = x_axis_label
    histogram.yaxis.visible = False
    return histogram

In [None]:
# Distribution of the predicted probabilities among the selected voters.
p = make_histogram(
    data=selected[f"P({target})"],
    title=f"Prediction Probabilities ({target})",
    x_axis_label="Probability",
    bins=10,
)
show(p)

Not bad - almost all of the selected voters were classified with a probability of over 75%. We can be confident that we are not wasting our resources by sending them letters.

In [None]:
# Age distribution of the selected voters.
age_p = make_histogram(
    data=selected["AGE"],
    title=f"Age distribution ({target})",
    x_axis_label="Age",
    bins=30,
)
show(age_p)

In [None]:
# Population-density distribution of the selected voters' ZIP codes.
pop_density_p = make_histogram(
    data=selected["POP_DENSITY"],
    title=f"Population density distribution ({target})",
    x_axis_label="Population density",
    bins=20,
)
show(pop_density_p)

In [None]:
# Average-income distribution of the selected voters; values of 150000 and
# above are left out of the plot.
plotted_income = selected.loc[selected["AVG_INCOME"] < 150000, "AVG_INCOME"]
income_p = make_histogram(
    data=plotted_income,
    title=f"Income distribution ({target})",
    x_axis_label="Income",
    bins=30,
)
show(income_p)

It looks like if we target Democrats, our model selects mostly younger, less affluent voters who reside in suburban and urban areas, which agrees with what we learned from our exploratory analysis.