# Ecological Inference through Tsallis Regularized Optimal Transport (TROT)
This notebook presents the pipeline used in (cite our paper) to perform ecological inference on the Florida dataset.

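You will first need to download the dataset from (url to the dataset) and save it as `Fl_Data.csv` in the parent directory of this notebook (it is read from `../Fl_Data.csv`).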

In [9]:
import pandas as pd
import numpy as np
import pickle
from matplotlib import pyplot as plt
from matplotlib.pylab import savefig

import sys
sys.path.append('..')
sys.path.append('../Trot')
sys.path.append('../Data')

from Trot import Distances
from Trot.Evaluation import KL
from Trot.Florida_inference import CV_Local_Inference, Local_Inference

# Data Loading and Processing

In [10]:
# Only the columns used by the pipeline are read from the CSV; any row with a
# missing value in one of them is discarded.
columns_to_load = [
    'District', 'County', 'Voters_Age', 'Voters_Gender', 'PID', 'vote08',
    'SR.WHI', 'SR.BLA', 'SR.HIS', 'SR.ASI', 'SR.NAT', 'SR.OTH',
]
FlData = pd.read_csv('../Fl_Data.csv', usecols=columns_to_load).dropna()

KeyboardInterrupt: 

Change gender values to numerical values

In [None]:
# Encode gender numerically. Values other than 'M'/'F' become NaN, and so does
# every value if this cell is run a second time on already-encoded data.
gender_codes = {'M': 1, 'F': 0}
FlData['Voters_Gender'] = FlData['Voters_Gender'].map(gender_codes)

Renormalize the age so that it takes values between 0 and 1

In [None]:
# Min-max scaling: the youngest voter maps to 0 and the oldest to 1.
age = FlData['Voters_Age']
youngest, oldest = age.min(), age.max()
FlData['Voters_Age'] = (age - youngest) / (oldest - youngest)


One-hot encode the party affiliation (PID)

In [None]:
# Build one indicator column per distinct PID value.
one_hot = pd.get_dummies(FlData['PID'])

# Swap the raw PID column for its indicator columns and give them readable
# names (PID 0 -> 'Other', 1 -> 'Democrat', 2 -> 'Republican').
FlData = (
    FlData
    .drop('PID', axis=1)
    .join(one_hot)
    .rename(columns={0: 'Other', 1: 'Democrat', 2: 'Republican'})
)

In [None]:
# Display summary statistics of the preprocessed frame as a quick sanity check
# (e.g. Voters_Age should now lie in [0, 1]).
FlData.describe()

# Compute Marginals and Joint Distributions

Create a dictionary mapping each county to the rows of its voters

In [None]:
# Map every county identifier to the sub-frame holding that county's voters.
all_counties = FlData.County.unique()
Voters_By_County = {}
for county in all_counties:
    in_county = FlData['County'] == county
    Voters_By_County[county] = FlData[in_county]

Compute the ground truth joint distribution

In [None]:
# Ground-truth joint distribution per county.
#
# J[county] is a 6 x 3 matrix whose rows follow `ethnicity_columns` and whose
# columns follow `party_columns`. Entry (r, c) is the fraction of the county's
# voters flagged with both ethnicity r and party c, so each matrix sums to 1.
ethnicity_columns = ['SR.WHI', 'SR.BLA', 'SR.HIS', 'SR.ASI', 'SR.NAT', 'SR.OTH']
party_columns = ['Other', 'Democrat', 'Republican']

J = {}
for county in all_counties:
    # Look the county frame up once instead of once per matrix entry.
    county_voters = Voters_By_County[county]
    counts = np.zeros((len(ethnicity_columns), len(party_columns)))

    for row, ethnicity in enumerate(ethnicity_columns):
        is_ethnicity = county_voters[ethnicity] == 1
        for col, party in enumerate(party_columns):
            is_party = county_voters[party] == 1
            # Number of voters carrying both indicator flags.
            counts[row, col] = (is_party & is_ethnicity).sum()

    # Turn raw counts into a probability table.
    J[county] = counts / counts.sum()

In [None]:
# Inspect the joint distribution of a single county as a spot check.
# NOTE(review): assumes a county keyed by the integer 12 exists in J — confirm.
print(J[12])

Compute the party marginals

In [None]:
# Party marginal of each county: the column sums of its joint matrix,
# labelled with the party names.
parties = ['Other', 'Democrat', 'Republican']
Party_Marginals = {}
for county in all_counties:
    column_totals = [J[county][:, k].sum() for k in range(3)]
    Party_Marginals[county] = pd.Series(column_totals, index=parties)

Compute the ethnicity marginals

In [None]:
# Ethnicity marginal of each county: the row sums of its joint matrix,
# labelled with the ethnicity column names.
ethnies = ['SR.WHI', 'SR.BLA', 'SR.HIS', 'SR.ASI', 'SR.NAT', 'SR.OTH']
Ethnicity_Marginals = {}
for county in all_counties:
    row_totals = [J[county][r, :].sum() for r in range(6)]
    Ethnicity_Marginals[county] = pd.Series(row_totals, index=ethnies)

# Compute the cost matrix
Using only age, gender, and 2008 vote or abstention

In [None]:
# Cost matrix M (ethnic groups x parties): distance between the average
# feature vector of each ethnic group and that of each party, computed over
# the whole state.
features = ['Voters_Age', 'Voters_Gender', 'vote08']
e_len, p_len = len(ethnies), len(parties)

# The per-party averages do not depend on the ethnic group, so compute them
# once rather than once per (ethnicity, party) pair.
party_averages = [
    FlData[FlData[p] == 1.0][features].mean(axis=0) for p in parties
]

M = np.zeros((e_len, p_len))
for i, e in enumerate(ethnies):
    average_by_e = FlData[FlData[e] == 1.0][features].mean(axis=0)
    for j, average_by_p in enumerate(party_averages):
        # The distance helpers are imported as `Distances`; the name `dist`
        # used previously was never defined and raised a NameError.
        # NOTE(review): assumes Trot.Distances exposes dist_2 — confirm.
        M[i, j] = np.array(Distances.dist_2(average_by_e, average_by_p))

# Start the inference

Use a specific county or district to select the best parameters

In [None]:
# Counties of district 3 serve as the cross-validation set.
in_district_3 = FlData['District'] == 3
CV_counties = FlData.loc[in_district_3, 'County'].unique()

Find the best parameters

In [None]:
# Parameter grid explored by the cross-validation.
# NOTE(review): presumably q is the Tsallis entropic index and l the
# regularisation strength — confirm against Trot.Florida_inference.
q = np.arange(0.5, 2.1, 0.1)
l = [0.01, 0.1, 1., 10., 100., 1000.]

cv_outcome = CV_Local_Inference(
    Voters_By_County, M, J,
    Ethnicity_Marginals, Party_Marginals,
    CV_counties, q, l,
)
best_score, best_q, best_l = cv_outcome

Use selected parameters on the rest of the dataset

In [None]:
# Infer the joint distribution of every county with the selected parameters,
# then score the inferred joints against the ground truth.
J_inferred = Local_Inference(
    Voters_By_County, M, J,
    Ethnicity_Marginals, Party_Marginals,
    all_counties, best_q, best_l,
)
kl, std = KL(
    J, J_inferred, all_counties,
    save_to_file=False, compute_abs_err=True,
)

# Plot the results

In [None]:
# Points of the x = y reference line drawn on the scatter plots.
diag = np.linspace(-0.1, 1.0, 100)

# Load previously pickled results.
# NOTE(review): pickle.load can execute arbitrary code; only load files you
# produced yourself or otherwise trust.
with open('joints_gallup.pkl', 'rb') as f:
    # The second pickled object is deliberately NOT bound to `J`: `J` holds the
    # ground-truth joints computed above and is reused by the survey-based
    # cells below, so overwriting it here would corrupt them.
    J_true, _J_pickled = pickle.load(f)

with open('baseline.pkl', 'rb') as f:
    J_baseline = pickle.load(f)

# Flatten the per-county matrices into one long vector per method, keeping the
# same county order in all three so that entries line up element-wise.
j_true, j, j_baseline = [], [], []
for c in all_counties:
    j_true.append(np.array(J_true[c]).flatten())
    j.append(np.array(J_inferred[c]).flatten())
    j_baseline.append(np.array(J_baseline[c]).flatten())

j_true = np.array(j_true).flatten()
j = np.array(j).flatten()
j_baseline = np.array(j_baseline).flatten()

Plot the correlation between the ground-truth joint distribution and the inferred distribution (the closer to the $x = y$ diagonal, the better).

In [None]:
# Scatter of inferred vs. true joint probabilities; the dashed red line marks
# perfect agreement.
fig, ax = plt.subplots()
ax.scatter(j_true, j, alpha=0.5)
ax.plot(diag, diag, 'r--')
ax.set_xlabel('Ground truth')
ax.set_ylabel('TROT (RBF)')

plt.show()

Plot the distribution of the error (the more packed around the origin of the $x$-axis, the better)

In [None]:
# Overlaid histograms of the element-wise error (ground truth minus estimate)
# for TROT and for the baseline, on a shared set of bins.
bins = np.arange(-.3, .6, 0.01)
errors_by_label = [
    ('TROT', j_true - j),
    ('Florida-average', j_true - j_baseline),
]

fig, ax = plt.subplots()
for label, errors in errors_by_label:
    ax.hist(errors, bins=bins, alpha=0.5, label=label)
ax.legend()
ax.set_xlabel('Difference between inference and ground truth')

plt.show()

# Survey-based ecological inference
Same pipeline, but using a cost matrix computed thanks to the 2013 Gallup survey. (http://www.gallup.com/poll/160373/democrats-racially-diverse-republicans-mostly-white.aspx)

We assume that Gallup's Other = {Native, Other}

The cost matrix $M$ is computed as $1 - p_{ij}$, where $p_{ij}$ is the proportion of people in ethnic group $i$ who are affiliated with party $j$ (each row of the table below sums to approximately 1).

In [None]:
# Survey party shares, one row per group and one column per party.
# NOTE(review): rows presumably follow the ethnicity order used above (WHI,
# BLA, HIS, ASI, NAT, OTH) and columns the party order (Other, Democrat,
# Republican) — confirm. The last two rows are identical.
survey_shares = [
    [.38, .26, .35],
    [.29, .64, .05],
    [.50, .32, .13],
    [.46, .36, .17],
    [.49, .32, .18],
    [.49, .32, .18],
]

# A high share means a low transport cost, hence the complement.
M_sur = 1. - np.array(survey_shares)

Once again, find the best parameters

In [None]:
# Repeat the parameter search over the same (q, l) grid and the same
# cross-validation counties, this time with the survey-based cost matrix.
cv_outcome_sur = CV_Local_Inference(
    Voters_By_County, M_sur, J,
    Ethnicity_Marginals, Party_Marginals,
    CV_counties, q, l,
)
best_score, best_q, best_l = cv_outcome_sur

Using these parameters, run the inference on the rest of the dataset

In [None]:
# Survey-based inference on every county, scored against the ground truth.
J_sur = Local_Inference(
    Voters_By_County, M_sur, J,
    Ethnicity_Marginals, Party_Marginals,
    all_counties, best_q, best_l,
)
kl, std = KL(
    J, J_sur, all_counties,
    save_to_file=False, compute_abs_err=True,
)

Plot correlation with ground truth

In [None]:
# Flatten the survey-based joints in the same county order as j_true.
j_sur = np.array(
    [np.array(J_sur[c]).flatten() for c in all_counties]
).flatten()

# Scatter of survey-based estimates vs. ground truth, with the x = y line.
fig, ax = plt.subplots()
ax.scatter(j_true, j_sur, alpha=0.5)
ax.plot(diag, diag, 'r--')
ax.set_xlabel('Ground truth')
ax.set_ylabel('TROT (survey)')

plt.show()
    

Plot error distribution (compared with Florida average)

In [None]:
# Overlaid histograms of the element-wise error (ground truth minus estimate)
# for the survey-based TROT and for the baseline, on a shared set of bins.
bins = np.arange(-.3, .6, 0.01)
errors_by_label = [
    ('TROT (survey)', j_true - j_sur),
    ('Florida-average', j_true - j_baseline),
]

fig, ax = plt.subplots()
for label, errors in errors_by_label:
    ax.hist(errors, bins=bins, alpha=0.5, label=label)
ax.legend()
ax.set_xlabel('Difference between inference and ground truth')

plt.show()