# Robotic Survey Responses
A simple application to generate _real looking_ survey response data for testing and demonstration purposes.

### Note
Generating millions of `Respondent` objects takes a long time. The task is easy to distribute across multiple computers. When using really large datasets, I recommend breaking the work into chunks.

### Requires
* Pandas
* Numpy
* Altair

### Usage: 
1. load a demographic dataset that your _respondents_ will emulate. 
2. create a `groupby` dataset that has the breakdown of responses that you want. 
3. Generate a list of respondents.
4. Give each survey respondent a question.

### Applying bias to responses

You need to define the `bias` and the `bias_amount` as shown here:
```
bias = {"Race/Ethnicity": "White", "Sex": "M"}
bias_amount = 0.3
```


In [1]:
import pandas as pd
import numpy as np
import altair as alt

import os
import getpass

print(getpass.getuser())

pd.set_option("display.max_columns", 10)
pd.set_option("display.max_rows", 10)
%load_ext lab_black

william.harding


For my baseline, I'm using [City of Seattle Employee Demographics](https://catalog.data.gov/dataset?q=seattle&sort=views_recent+desc&res_format=CSV&tags=demographics&as_sfid=AAAAAAWCMQU1RhRCAgBiIrVJm0Xy3JnWTC6g7qsigwrXpF4vPKcUbrVN_ebcPetlcNx5rgddKFH5g03n-59YZ-fl3FmTErVFZN4oeIkVs2SHbl-T8pMzONWM91kV8Mtdv_WrnDg%3D&as_fid=4c15d50fccbbc3feaff898fda1ca3723c0b00d7f&ext_location=&ext_bbox=&ext_prev_extent=-142.03125%2C8.754794702435618%2C-59.0625%2C61.77312286453146)

In [2]:
# Read in the demographic dataset that the generated respondents will mimic.
# NOTE(review): the path below is an absolute, machine-specific location;
# point it at your own copy of the CSV before running this cell.
df_demos = pd.read_csv(
    r"C:\Users\william.harding\Documents\Python Scripts\City_of_Seattle_Staff_Demographics.csv"
)
df_demos

Unnamed: 0,Race/Ethnicity,Sex,Department,Age,Hourly Rate,Regular/Temporary,Employee Status
0,Hispanic or Latino,F,Seattle Dept of Human Resource,50,45.3100,R,A
1,Hispanic or Latino,F,Parks & Recreation,23,18.2300,T,A
2,Hispanic or Latino,M,Seattle City Light,55,43.5500,R,A
3,Hispanic or Latino,M,Fire Department,47,52.6300,R,A
4,Hispanic or Latino,M,Police Department,48,59.5300,R,A
...,...,...,...,...,...,...,...
13638,Not Specified,M,Parks & Recreation,40,30.4800,R,A
13639,Not Specified,F,Legislative Department,24,31.1303,R,A
13640,Not Specified,M,Seattle Dept of Transportation,21,17.8400,T,A
13641,Not Specified,M,Seattle City Light,38,29.5200,R,A


I want the demographics of my survey respondents to have the same breakdown as I have in my dataset. 

In [3]:
def make_dist_table(df_demos, demo_cols, count_col):
    """Build a distribution table for the given demographic columns.

    Groups ``df_demos`` by ``demo_cols`` and counts the non-null values of
    ``count_col`` in every group.

    Parameters
    ----------
    df_demos : pandas.DataFrame
        Source data with one row per person.
    demo_cols : list of str
        Columns to group by.
    count_col : str
        Column whose non-null values are counted per group.

    Returns
    -------
    pandas.DataFrame
        One row per group with the ``demo_cols`` plus ``count`` (group size),
        ``dist`` (share of the total count) and ``cumsum`` (running total of
        ``dist``). Rows are sorted by ``dist`` in descending order and
        re-indexed from 0.
    """
    grouped = df_demos.groupby(demo_cols).count()[count_col].reset_index()

    # Share of the overall count that falls into each group.
    total = grouped[count_col].sum()
    grouped["dist"] = grouped[count_col] / total

    table = grouped.rename(columns={count_col: "count"})
    table = table.sort_values(by="dist", ascending=False)

    # Running total over the sorted shares.
    table["cumsum"] = table["dist"].cumsum()
    table = table.reset_index(drop=True)
    return table


# Break the population down by race/ethnicity and sex; the group size is the
# number of non-null "Employee Status" values in each group.
demos = make_dist_table(df_demos, ["Race/Ethnicity", "Sex"], "Employee Status")
demos

Unnamed: 0,Race/Ethnicity,Sex,count,dist,cumsum
0,White,M,5004,0.366781,0.366781
1,White,F,2814,0.206260,0.573041
2,Asian,M,1128,0.082680,0.655721
3,Asian,F,982,0.071978,0.727699
4,Black or African American,M,972,0.071245,0.798945
...,...,...,...,...,...
16,Asian,N,3,0.000220,0.999487
17,White,N,3,0.000220,0.999707
18,Not Specified,N,2,0.000147,0.999853
19,Two or More Races,N,1,0.000073,0.999927


In [4]:
# Bar chart of the source data: one bar per sex showing the group count,
# with one column of bars per race/ethnicity.
baseline = (
    alt.Chart(demos)
    .mark_bar()
    .encode(x="Sex", y="count:Q", column="Race/Ethnicity:N")
    .properties(width=75, title="this is what the datafile looks like.")
)
baseline

In [5]:
class Respondent:
    """A simulated survey respondent drawn from a demographic distribution.

    Parameters
    ----------
    demos : pandas.DataFrame
        Distribution table such as the one returned by ``make_dist_table``.
        It must contain the columns ``count``, ``dist`` and ``cumsum``, with
        ``cumsum`` non-decreasing from the first row to the last. Every other
        column is treated as a demographic attribute.

    Attributes
    ----------
    pop_demos : pandas.DataFrame
        The distribution table this respondent was drawn from.
    demos : pandas.Series
        The demographic values chosen for this respondent.
    """

    def __init__(self, demos):
        self.pop_demos = demos
        self.demos = self.chose_demos()

    def __repr__(self):
        return f"respondent: {list(self.demos)}"

    def dieroll(self):
        """Return a uniform random float in the half-open interval [0, 1)."""
        return np.random.uniform(0, 1)

    def chose_demos(self):
        """Pick one row of ``self.pop_demos`` at random, weighted by ``dist``.

        Returns
        -------
        pandas.Series
            The chosen row without the ``dist``, ``cumsum`` and ``count``
            bookkeeping columns.
        """
        # Use the table handed to this instance, not a module-level global.
        table = self.pop_demos
        roll = self.dieroll()

        # Number of rows whose cumulative share is <= roll; that count is the
        # position of the first row whose cumulative share exceeds the roll.
        position = int(
            np.searchsorted(table["cumsum"].to_numpy(), roll, side="right")
        )

        # Rounding can leave the final cumsum slightly below 1, so a roll
        # above it would point one past the end; clamp to the last row.
        position = min(position, len(table) - 1)

        return table.iloc[position].drop(["dist", "cumsum", "count"])

    def answer_question(self, scale, bias=None, bias_amount=0):
        """Answer a Likert-style question.

        A value is drawn from a normal distribution centred on the mean of
        ``scale`` (standard deviation 1). If ``bias`` is given and this
        respondent matches it, ``bias_amount`` is added to the draw.

        Parameters
        ----------
        scale : iterable of numbers
            The scale points, e.g. ``range(5)``. It is iterated more than
            once, so pass a re-iterable such as a ``range`` or a list.
        bias : dict, optional
            Mapping of demographic name to the value that triggers the bias.
            Falsy values (``None``, ``{}``, ``0``) disable the bias.
        bias_amount : number, optional
            Shift applied to the draw of a biased respondent.

        Returns
        -------
        The largest ``point + 1`` among the scale points below the draw, or
        ``1`` when no scale point is below the draw.
        """
        rnd = np.random.normal(np.mean(scale))

        # Only shift the draw when a bias is defined and applies to us.
        if bias and self.is_biased(bias):
            rnd += bias_amount

        choices = [i + 1 for i in scale if i < rnd]
        return max(choices) if choices else 1

    def is_biased(self, bias):
        """Return True when every key/value pair in ``bias`` matches ``self.demos``."""
        checks = [bias[key] == self.demos[key] for key in bias]
        return all(checks)


r = Respondent(demos)
r

respondent: ['Nat Hawaiian/Oth Pac Islander', 'M']

In [6]:
# Number of synthetic respondents to generate.
# Use len(df_demos) instead to match the size of the source dataset.
n = 1000

# Each Respondent draws its own demographics from the `demos` table.
# The loop variable is a throwaway so it does not shadow `n`.
pop = [Respondent(demos) for _ in range(n)]

In [7]:
pop[:10]

[respondent: ['White', 'M'],
 respondent: ['Asian', 'M'],
 respondent: ['White', 'M'],
 respondent: ['Asian', 'F'],
 respondent: ['White', 'F'],
 respondent: ['White', 'M'],
 respondent: ['White', 'F'],
 respondent: ['White', 'F'],
 respondent: ['White', 'F'],
 respondent: ['White', 'M']]

In [8]:
# One row per respondent holding that respondent's demographic values.
sample = pd.DataFrame([list(person.demos) for person in pop])
sample = sample.rename(columns={0: "Race/Ethnicity", 1: "Sex"})
sample

Unnamed: 0,Race/Ethnicity,Sex
0,White,M
1,Asian,M
2,White,M
3,Asian,F
4,White,F
...,...,...
995,Asian,F
996,White,F
997,White,M
998,Hispanic or Latino,M


In [9]:
# Chart the generated sample with the same encoding as the baseline chart and
# combine the two with Altair's `&` operator (vertical concatenation) so the
# generated breakdown can be compared with the source data.
alt.Chart(sample).mark_bar().encode(
    alt.X("Sex"), y="count()", column="Race/Ethnicity"
).properties(
    width=75, title="Randomly generated, but distributed by demographics"
) & baseline.properties(
    title="Compared to the origional dataset"
)

Now have the survey respondents complete a *Likert* question for a specified range.

In [10]:
# Ask every respondent an unbiased question on the scale range(5); answers
# come back as integers from 1 to 5. No bias is passed: the second positional
# parameter of answer_question is `bias` (a dict), not `bias_amount`.
sample["q1"] = [person.answer_question(range(5)) for person in pop]
sample

Unnamed: 0,Race/Ethnicity,Sex,q1
0,White,M,2
1,Asian,M,2
2,White,M,3
3,Asian,F,4
4,White,F,2
...,...,...,...
995,Asian,F,2
996,White,F,2
997,White,M,4
998,Hispanic or Latino,M,2


In [11]:
# Area chart of the q1 answers: binned answer value on x, number of
# respondents per bin on y.
sample_chart = (
    alt.Chart(sample)
    .mark_area()
    .encode(alt.X("q1:Q", bin=True), y="count()",)
    .properties(title="The survey respondents follow a normal distribution")
)
sample_chart

However, in `q2` we are going to introduce a _bias_ such that some respondents will tend to answer this question more positively.

In [12]:
# A respondent is biased only when every key/value pair here matches its
# demographics.
bias = {"Race/Ethnicity": "White", "Sex": "M"}
# Amount added to a biased respondent's random draw before it is mapped onto
# the answer scale.
bias_amount = 1

In [13]:
r.is_biased(bias)

False

In [14]:
r.demos

Race/Ethnicity    Nat Hawaiian/Oth Pac Islander
Sex                                           M
Name: 11, dtype: object

In [15]:
# Ask a second question on the same scale, this time shifting the draw of
# every respondent that matches `bias` by `bias_amount`.
sample["q2"] = [
    r.answer_question(range(5), bias=bias, bias_amount=bias_amount) for r in pop
]
sample

Unnamed: 0,Race/Ethnicity,Sex,q1,q2
0,White,M,2,4
1,Asian,M,2,1
2,White,M,3,3
3,Asian,F,4,2
4,White,F,2,2
...,...,...,...,...
995,Asian,F,2,2
996,White,F,2,1
997,White,M,4,5
998,Hispanic or Latino,M,2,1


In [20]:
sample[["q1", "q2"]].describe()

Unnamed: 0,q1,q2
count,1000.0,1000.0
mean,2.52,2.909
std,0.983141,1.077906
min,1.0,1.0
25%,2.0,2.0
50%,2.0,3.0
75%,3.0,4.0
max,5.0,5.0


In [17]:
# Layer (`+`) the q1 distribution as a translucent area under the q2
# distribution drawn as a line, so the shift between the two is visible.
sample_chart.mark_area(opacity=0.3) + sample_chart.mark_line().encode(
    alt.X("q2:Q", bin=True), tooltip="count()"
)

In [18]:
# Facet the q1 distribution by demographics: one row per sex and one column
# per race/ethnicity, each panel 100x100.
bias_compare = (
    sample_chart.mark_area(opacity=0.3)
    .encode(alt.X("q1:Q", bin=True), row="Sex", column="Race/Ethnicity")
    .properties(
        width=100,
        height=100,
        title="Here, all of the respondents follow a normal distribution.",
    )
)

bias_compare

In [19]:
# Facet the q2 (biased) distribution by demographics: one row per sex and one
# column per race/ethnicity, each panel 100x100.
# Note: this rebinds the name `bias_compare`.
bias_compare = (
    sample_chart.mark_area(opacity=0.3)
    .encode(alt.X("q2:Q", bin=True), row="Sex", column="Race/Ethnicity")
    .properties(
        width=100,
        height=100,
        title="you can see how, in this case, white males have a large bias for this question.",
    )
)

bias_compare