# Efficacy of Vocational Rehabilitation Services in America 

> Dataset: [Current Population Survey, July 2021: Disability Supplement](https://api.census.gov/data/2021/cps/disability/jul.html)
* The universe consists of all persons in the civilian non-institutional population of the United States living in households. 
* The probability sample selected to represent the universe consists of approximately 50,000 households.

### Imports

In [8]:
import requests
import pandas as pd
import numpy as np

### Data Dictionary
> Include information such as exact variable name in API, variable name in analysis, and variable name in visualization, description, measurement units, expected values, expected min/max


### Query Texas Records
> Querying individual records for **every** county in Texas

In [9]:
HOST = "https://api.census.gov/data"
year = "2021"
dataset = "cps/disability/jul"
base_url = "/".join([HOST, year, dataset])

predicates = {}
get_vars = ["PEMLR", 
            "PESD6A", 
            "PESD6B", 
            "PESD6C", 
            "PESD6D", 
            "PESD6E", 
            "PESD6F", 
            "PESD6G", 
            "PESD7A", 
            "PESD7B", 
            "PESD7C",
            "PESD7E", 
            "PESD7G", 
            "PRDISFLG", 
            "PESD41", 
            "PESD42", 
            "PESD43", 
            "PESD44", 
            "PESD45", 
            "PESD46", 
            "PESD47",
            "PESD48", 
            "PESD49",
            "PTDTRACE",
            "PESEX",
            "PRTAGE",
            "HEFAMINC"
            ]
predicates["get"] = ",".join(get_vars)
predicates["for"] = "county:*"
predicates["in"] = "state:48"

r = requests.get(base_url, params=predicates)

### Format Data Frame and turn to CSV

In [10]:
col_names = [
    "labor_force_employment_status",
    "used_vocational_rehabilitation_agencies",
    "used_one_stop_career_centers",
    "used_the_ticket_to_work_program",
    "used_assistive_technology_act_prog",
    "used_ctr_for_indpt_living_for_ind_w_dis",
    "used_the_client_assistance_program",
    "used_any_other_employment_assistance_program",
    "how_helpful_vocational_rehab_agency",
    "how_helpful_one_stop_career_centers",
    "the_ticket_to_work_program_helpfulness",
    "ctr_for_indpdt_living_for_ind_w_dis_helpful",
    "other_employment_assist_program_helpful",
    "does_this_person_have_any_of_these_disability_conditions",
    "barrier_lack_of_education_or_training",
    "barrier_lack_of_job_counseling",
    "barrier_lack_of_transportation",
    "barrier_loss_of_government_assistance",
    "barrier_need_for_special_features",
    "barrier_employer_or_coworker_attitudes",
    "barrier_your_difficulty_with_disability",
    "barrier_other",
    "barrier_none",
    "demographics_race_of_respondent",
    "demographics_sex",
    "demographics_age",
    "household_total_family_income_past_12_months",
    "state",
    "county"
]
df = pd.DataFrame(columns=col_names, data=r.json()[1:])
df.to_csv("raw-data.csv")

[['PEMLR', 'PESD6A', 'PESD6B', 'PESD6C', 'PESD6D', 'PESD6E', 'PESD6F', 'PESD6G', 'PESD7A', 'PESD7B', 'PESD7C', 'PESD7E', 'PESD7G', 'PRDISFLG', 'PESD41', 'PESD42', 'PESD43', 'PESD44', 'PESD45', 'PESD46', 'PESD47', 'PESD48', 'PESD49', 'PTDTRACE', 'PESEX', 'PRTAGE', 'HEFAMINC', 'state', 'county'], ['5', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '2', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '2', '2', '66', '7', '48', '139'], ['-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '2', '1', '10', '7', '48', '139'], ['-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '2', '2', '12', '7', '48', '139'], ['1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '2', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '1', '1', '33', '16', '48', '0'], ['1', '-1', '

### Clean Data and Correct Output

In [11]:
data = pd.read_csv('raw-data.csv',header=0)
# drop all records that are not in the universe of labor_force_employment_status
filteredData = data[data["labor_force_employment_status"] != -1]
filteredData = filteredData.loc[:,  ~filteredData.columns.str.contains('^Unnamed')]

# categorize output variable into binary options
filteredData["labor_force_employment_status"] = filteredData["labor_force_employment_status"].apply(lambda x: 1 if x == 1 or x == 2 else 0)

filteredData.to_csv("cleaned-data.csv")

### Find significant variables
> Note: Solving a classification problem via inference

#### Options:
- Logistic Regression - commonly used for classification problems
- Stepwise
- Decision Trees
- Random forrest
- Neural Network

### Logistic Regression
* [Documentation](https://pytorch.org/tutorials/beginner/nn_tutorial.html#neural-net-from-scratch-no-torch-nn)
* Train minimal neural network (logsitic regression, since there are no hidden layers)

#### Step 1: Turn df to tensors

In [12]:
import numpy as np
import torch
from sklearn.model_selection import train_test_split

data = pd.read_csv('cleaned-data.csv',header=0)

# 80% training, 20% validation
x_var = data.loc[:, data.columns != "labor_force_employment_status"]
y_var = data.loc[:, data.columns == "labor_force_employment_status"]

x_train, x_valid, y_train, y_valid = train_test_split(
    x_var,
    y_var,
    test_size=0.2,
    random_state=42
)

# Convert DataFrames to NumPy arrays and then to tensors
x_train = torch.tensor(x_train.values, dtype=torch.float32)
x_valid = torch.tensor(x_valid.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32)
y_valid = torch.tensor(y_valid.values, dtype=torch.float32)
n, c = x_train.shape

#### Step 2: Create model

In [13]:
import math

weights = torch.randn(c, 2) / math.sqrt(c)
weights.requires_grad_()

bias = torch.zeros(2, requires_grad=True)

def log_softmax(x):
    return x - x.exp().sum(-1).log().unsqueeze(-1)

def model(xb):
    return log_softmax(xb @ weights + bias)

bs = 64  # batch size

xb = x_train[0:bs]  # a mini-batch from x
preds = model(xb)  # predictions

def nll(input, target):
    return -input[range(target.shape[0]), target].mean()

loss_func = nll

y_train = y_train.squeeze().long()  # Ensure it's 1D and long type for classification
yb = y_train[0:bs]

def accuracy(out, yb):
    preds = torch.argmax(out, dim=1)
    return (preds == yb).float().mean()

print(accuracy(preds, yb))

lr = 0.5  # learning rate
epochs = 2  # how many epochs to train for

for epoch in range(epochs):
    for i in range((n - 1) // bs + 1):
        start_i = i * bs
        end_i = start_i + bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        loss = loss_func(pred, yb)

        loss.backward()
        with torch.no_grad():
            weights -= weights.grad * lr
            bias -= bias.grad * lr
            weights.grad.zero_()
            bias.grad.zero_()

print(loss_func(model(xb), yb), accuracy(model(xb), yb))


tensor(0.4531)
tensor(nan, grad_fn=<NegBackward0>) tensor(0.2759)
