# File description:

This file computes the log odds ratios with Dirichlet priors comparing word frequency across patient insurance type (low-income insurance vs other). Low-income insurance patient notes are compared to other-insurance patient notes.

The log odds ratios are computed from lists of all words used in the notes for patients of each insurance type. These lists are created by the Create_Word_Lists file.

In [7]:
### Load required packages ###
import pickle
import csv
import argparse
import logging
import math
import numpy as np
import operator
from typing import Iterator
from collections import Counter
from collections import Counter, OrderedDict

## Load lists of words from pickle files

### Load lists of MEDICAL words

In [8]:
# Load BROADBAND lists
# NOTE(review): pickle can run arbitrary code while loading; only open files that were
# produced by a trusted step (presumably Create_Word_Lists — confirm).
# `with` closes each file handle as soon as its list has been read.
with open('Path_to_list', 'rb') as f:
    medical_words_lowincome_broadband = pickle.load(f)
with open('Path_to_list', 'rb') as f:
    medical_words_nonlowincome_broadband = pickle.load(f)

In [9]:
# Load Thoracic lists
# NOTE(review): pickle can run arbitrary code while loading; only open trusted files.
# `with` closes each file handle as soon as its list has been read.
with open('Path_to_list', 'rb') as f:
    medical_words_lowincome_thor = pickle.load(f)
with open('Path_to_list', 'rb') as f:
    medical_words_nonlowincome_thor = pickle.load(f)

In [10]:
# Combine the lists of words into one large list for non-low-income patients and one for low-income patients
# (the two sources are concatenated in a different order for each group; presumably harmless
# because later cells only count, sample from, or measure these lists — confirm)
medical_words_lowincome = medical_words_lowincome_broadband + medical_words_lowincome_thor
medical_words_nonlowincome = medical_words_nonlowincome_thor + medical_words_nonlowincome_broadband

### Load lists of NON-MEDICAL words

In [11]:
# Load BROADBAND lists
# NOTE(review): pickle can run arbitrary code while loading; only open trusted files.
# `with` closes each file handle as soon as its list has been read.
with open('Path_to_list', 'rb') as f:
    nonmedical_words_lowincome_broadband = pickle.load(f)
with open('Path_to_list', 'rb') as f:
    nonmedical_words_nonlowincome_broadband = pickle.load(f)

In [12]:
# Load Thoracic lists
# NOTE(review): pickle can run arbitrary code while loading; only open trusted files.
# `with` closes each file handle as soon as its list has been read.
with open('Path_to_list', 'rb') as f:
    nonmedical_words_lowincome_thor = pickle.load(f)
with open('Path_to_list', 'rb') as f:
    nonmedical_words_nonlowincome_thor = pickle.load(f)

In [13]:
# Combine the lists of words into one large list for non-low-income patients and one for low-income patients
# (the two sources are concatenated in a different order for each group; presumably harmless
# because later cells only count, sample from, or measure these lists — confirm)
nonmedical_words_lowincome = nonmedical_words_lowincome_broadband + nonmedical_words_lowincome_thor
nonmedical_words_nonlowincome = nonmedical_words_nonlowincome_thor + nonmedical_words_nonlowincome_broadband

## Run log-odds ratio with Dirichlet prior

In [14]:
### The following code computes the Log Odds Ratio Informative Dirichlet Prior ###
def _size(corpus: dict) -> int:
    """Return the total number of words in ``corpus``, counting every repetition.

    ``corpus`` maps each word to its number of occurrences; the result is the
    sum of those occurrence counts (0 for an empty mapping).
    """
    total = 0
    for occurrences in corpus.values():
        total += occurrences
    return total

In [15]:
def _z_score(
    word: str,
    c1: dict,
    c2: dict,
    bg: dict,
    size1: int,
    size2: int,
    size3: int,
) -> float:
    """Return the z-score of the prior-smoothed log odds ratio of ``word`` (c1 vs. c2).

    Args:
        word: Token to score; must be a key of ``bg``.
        c1: Word counts of the first corpus.
        c2: Word counts of the second corpus.
        bg: Word counts of the background corpus; the count of ``word`` is
            added to its count in each corpus as the prior.
        size1: Total word count of ``c1``.
        size2: Total word count of ``c2``.
        size3: Total word count of ``bg``.

    Returns:
        The log odds ratio divided by the square root of its estimated
        variance, ``1/numerator_1 + 1/numerator_2``. Positive values mean the
        smoothed odds of ``word`` are higher in ``c1`` than in ``c2``.

    Raises:
        KeyError: If ``word`` is not in ``bg``.
    """
    prior = bg[word]

    # dict.get gives 0 for a word absent from a corpus without rebuilding a set
    # of the whole vocabulary on every call.
    numerator_1 = c1.get(word, 0) + prior
    numerator_2 = c2.get(word, 0) + prior

    # Smoothed count of every *other* word in each corpus.
    denom_1 = size1 + size3 - numerator_1
    denom_2 = size2 + size3 - numerator_2

    raw_logodds = math.log(numerator_1 / denom_1) - math.log(
        numerator_2 / denom_2
    )

    variance = (1 / numerator_1) + (1 / numerator_2)
    return raw_logodds / math.sqrt(variance)



In [16]:
def _log_odds(
    word: str,
    c1: dict,
    c2: dict,
    bg: dict,
    size1: int,
    size2: int,
    size3: int,
) -> float:
    """Return the prior-smoothed log odds ratio of ``word`` (c1 vs. c2).

    Args:
        word: Token to score; must be a key of ``bg``.
        c1: Word counts of the first corpus.
        c2: Word counts of the second corpus.
        bg: Word counts of the background corpus; the count of ``word`` is
            added to its count in each corpus as the prior.
        size1: Total word count of ``c1``.
        size2: Total word count of ``c2``.
        size3: Total word count of ``bg``.

    Returns:
        ``log(odds of word in c1) - log(odds of word in c2)`` computed on the
        smoothed counts. Positive values mean higher odds in ``c1``.

    Raises:
        KeyError: If ``word`` is not in ``bg``.
    """
    prior = bg[word]

    # dict.get gives 0 for a word absent from a corpus without rebuilding a set
    # of the whole vocabulary on every call.
    numerator_1 = c1.get(word, 0) + prior
    numerator_2 = c2.get(word, 0) + prior

    # Smoothed count of every *other* word in each corpus.
    denom_1 = size1 + size3 - numerator_1
    denom_2 = size2 + size3 - numerator_2

    return math.log(numerator_1 / denom_1) - math.log(numerator_2 / denom_2)


In [17]:
def _log_odds_CI(
    word: str,
    c1: dict,
    c2: dict,
    bg: dict,
    size1: int,
    size2: int,
    size3: int,
) -> list:
    """Return the 95% confidence interval of the smoothed log odds ratio of ``word``.

    Args:
        word: Token to score; must be a key of ``bg``.
        c1: Word counts of the first corpus.
        c2: Word counts of the second corpus.
        bg: Word counts of the background corpus; the count of ``word`` is
            added to its count in each corpus as the prior.
        size1: Total word count of ``c1``.
        size2: Total word count of ``c2``.
        size3: Total word count of ``bg``.

    Returns:
        A two-element list ``[lower, upper]``: the log odds ratio minus/plus
        1.96 standard deviations, each rounded to 3 decimal places.

    Raises:
        KeyError: If ``word`` is not in ``bg``.
    """
    prior = bg[word]

    # dict.get gives 0 for a word absent from a corpus without rebuilding a set
    # of the whole vocabulary on every call.
    numerator_1 = c1.get(word, 0) + prior
    numerator_2 = c2.get(word, 0) + prior

    # Smoothed count of every *other* word in each corpus.
    denom_1 = size1 + size3 - numerator_1
    denom_2 = size2 + size3 - numerator_2

    raw_logodds = math.log(numerator_1 / denom_1) - math.log(
        numerator_2 / denom_2
    )

    variance = (1 / numerator_1) + (1 / numerator_2)
    half_width = 1.96 * math.sqrt(variance)
    return [round(raw_logodds - half_width, 3), round(raw_logodds + half_width, 3)]

# 0: Create background corpus

The Dirichlet prior will shrink the log odds ratios toward their "global values" from the background corpus. Therefore, it's important to ensure that each insurance type (low-income/non-low-income) is adequately represented in the background corpus. We will use all medical and nonmedical words from both insurance types as the background corpus.

See Monroe paper: http://languagelog.ldc.upenn.edu/myl/Monroe.pdf

In [22]:
# Import required packages
from random import choices
from collections import Counter, OrderedDict

# Note: Since there were more words in the non-low-income lists than the low-income lists,
# oversample (with replacement) words from the low-income lists so that both groups
# contribute the same number of words to the background corpus.
# NOTE(review): `choices` draws from the unseeded global RNG, so c_bg and every statistic
# derived from it vary between runs; seed the `random` module first if reproducibility matters.
_bg_nonlowincome_words = medical_words_nonlowincome + nonmedical_words_nonlowincome  # built once, reused below
_bg_lowincome_sample = choices(medical_words_lowincome + nonmedical_words_lowincome, k=len(_bg_nonlowincome_words))
# Word -> count, ordered from most to least frequent.
c_bg = dict(sorted(Counter(_bg_nonlowincome_words + _bg_lowincome_sample).items(), key=lambda x: x[1], reverse=True))
del _bg_nonlowincome_words, _bg_lowincome_sample  # release the large temporary lists

# **1a:** Compute log-odds ratios for medical and non-medical words combined

### Compare low-income-insurance vs. non-low-income-insurance
A log odds ratio > 0 means the odds of the word occurring for a low-income-insurance patient are greater than the odds of the word occurring for a non-low-income-insurance patient.

In [23]:
# Word -> count tables for each group (c_1: low-income, c_2: non-low-income),
# ordered from most to least frequent.
c_1 = dict(Counter(medical_words_lowincome + nonmedical_words_lowincome).most_common())
c_2 = dict(Counter(medical_words_nonlowincome + nonmedical_words_nonlowincome).most_common())

# Total word counts of both groups and of the background corpus.
size1, size2, size3 = _size(c_1), _size(c_2), _size(c_bg)

# Only words present in both groups and in the background corpus are scored.
supported_tokens = set(c_1.keys()) & c_2.keys() & c_bg.keys()

In [24]:
# (word, log odds ratio) for every supported word, largest ratio first.
ratios = sorted(
    ((tok, _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=operator.itemgetter(1),
    reverse=True,
)

In [25]:
# The 50 largest log odds ratios: these words had higher odds of occurring for low-income-insurance patients
ratios[:50]

[('slurring', 0.47707027113547795),
 ('coarctation', 0.4686213282709968),
 ('osteosclerosis', 0.46701986144724117),
 ('decannulation', 0.4600517871263037),
 ('condylomata', 0.45955490124038967),
 ('jain', 0.4587994578660588),
 ('feline', 0.4582782300717483),
 ('implanon', 0.45822788413540394),
 ('pest', 0.45669710847180056),
 ('hamate', 0.4564828368349243),
 ('immunizations', 0.4558469130255993),
 ('syria', 0.45565837774450735),
 ('duodenectomy', 0.45459809337672397),
 ('risperidal', 0.4545977852217167),
 ('kenyan', 0.45322685457397327),
 ('deferasirox', 0.4525974873689549),
 ('zometa', 0.4525065464055853),
 ('subutex', 0.45233336864082574),
 ('axid', 0.452048566950058),
 ('lescol', 0.4518930950765494),
 ('ixabepilone', 0.4512051429337074),
 ('epithelioma', 0.45033381105341164),
 ('exponential', 0.44988113191735124),
 ('nizatidine', 0.4496303170595173),
 ('haart', 0.44955225053637626),
 ('informatic', 0.4493203354745354),
 ('eagle', 0.446921208528023),
 ('anchoring', 0.4460871750993380

In [26]:
# The following words had higher odds of occurring for non-low-income-insurance patients
# (the last 50 entries; a negative-index slice returns the whole list when it holds fewer than 50)
ratios[-50:]

[('genu', -0.3121622620716469),
 ('hypervascular', -0.31339647472321097),
 ('pnh', -0.3135151412432524),
 ('iliacus', -0.3136068302276982),
 ('complexion', -0.313757805660428),
 ('adenosine', -0.3138587255345655),
 ('century', -0.3140578100379443),
 ('lumen', -0.3140580368076922),
 ('subcapsular', -0.3145317508146288),
 ('ivp', -0.31455872224767667),
 ('marathon', -0.31470543389156447),
 ('kuwait', -0.3157402430982401),
 ('retrosternal', -0.3159650335480002),
 ('thrombophilia', -0.31605384836298356),
 ('hypogammaglobulinemia', -0.31633814003891025),
 ('decortication', -0.31645570231440523),
 ('ureterectomy', -0.3165129152241146),
 ('scoring', -0.3166062776484022),
 ('angiomyolipoma', -0.31724325943211795),
 ('methylcellulose', -0.3174000439805855),
 ('chicago', -0.3174992459013364),
 ('arthrosis', -0.3175934337978781),
 ('chb', -0.3178387681463235),
 ('cystoprostatectomy', -0.3180855887807752),
 ('chondromalacia', -0.3187750599165291),
 ('electrocardiogram', -0.319096150195298),
 ('ser

In [None]:
# Save the log odds ratios for medical and non-medical words combined.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in ratios:
        # each (word, value) tuple is written as one cell
        writer.writerow([val])

# **1b:** Compute z-scores for medical and non-medical words combined

### Compare low-income-insurance vs. non-low-income-insurance
Account for uncertainty about the log odds ratios by using the z-score (logodds/standarddev(logodds)), per Monroe et al.: http://languagelog.ldc.upenn.edu/myl/Monroe.pdf

Compare the z-score to some threshold (commonly, 1.96). If |z-score| > 1.96, the word is significant.

In [27]:
# Word -> count tables for each group (c_1: low-income, c_2: non-low-income),
# ordered from most to least frequent.
c_1 = dict(Counter(medical_words_lowincome + nonmedical_words_lowincome).most_common())
c_2 = dict(Counter(medical_words_nonlowincome + nonmedical_words_nonlowincome).most_common())

# Total word counts of both groups and of the background corpus.
size1, size2, size3 = _size(c_1), _size(c_2), _size(c_bg)

# Only words present in both groups and in the background corpus are scored.
supported_tokens = set(c_1.keys()) & c_2.keys() & c_bg.keys()

In [28]:
# (word, z-score) for every supported word, largest z-score first.
scores = [
    (tok, _z_score(tok, c_1, c_2, c_bg, size1, size2, size3))
    for tok in supported_tokens
]
scores.sort(key=lambda pair: pair[1], reverse=True)

In [29]:
# The 50 largest z-scores: these words had higher odds of occurring for low-income-insurance patients
scores[:50]

[('she', 34.83775763932472),
 ('pt', 26.29719201070946),
 ('her', 25.434937236810395),
 ('pain', 22.109552545264197),
 ('brain', 21.397244889628656),
 ('axitinib', 21.31956964935049),
 ('fibromyalgia', 20.893193955686225),
 ('ie', 20.743793157575627),
 ('avelumab', 19.174588133873012),
 ('methadone', 19.038897366029218),
 ('disorder', 18.508099092522876),
 ('bipolar', 17.836066355859693),
 ('cabozantinib', 17.482693710124227),
 ('immunodeficiency', 17.14861406001928),
 ('docetaxel', 17.010619464785645),
 ('folfirinox', 16.51013325341809),
 ('pharmacy', 16.488481908199688),
 ('pontine', 16.021198371040267),
 ('oxycodone', 15.880819249672887),
 ('pertuzumab', 15.678057711894372),
 ('breast', 15.541109980926485),
 ('lung', 15.144235982030823),
 ('goal', 14.503856035298863),
 ('outside', 14.463667850406166),
 ('shoulder', 14.318263981357667),
 ('haart', 14.288457039161832),
 ('blast', 14.263096885176873),
 ('transaminitis', 14.25393241216663),
 ('cocaine', 14.14441882181075),
 ('depakote',

In [30]:
# The following words had higher odds of occurring for non-low-income-insurance patients
# (the last 50 entries; a negative-index slice returns the whole list when it holds fewer than 50)
scores[-50:]

[('casodex', -10.728513073988928),
 ('scan', -10.749927751277221),
 ('lymphoma', -10.74997960341594),
 ('urgency', -10.869252592996533),
 ('procedure', -10.878817181384047),
 ('frequency', -10.955755625888484),
 ('perineural', -11.079231724913097),
 ('scalp', -11.152268608972674),
 ('variant', -11.160692544586906),
 ('image', -11.284993740141097),
 ('adaptive', -11.521206832009051),
 ('nasal', -11.55868711628875),
 ('aortic', -11.61424166650554),
 ('lymphocytic', -11.665422242156309),
 ('glioblastoma', -11.673264802350953),
 ('apalutamide', -11.773969060349213),
 ('cll', -11.823941306455279),
 ('mesothelioma', -11.84616265972496),
 ('flap', -11.954041289745541),
 ('margin', -12.043988169244344),
 ('avastin', -12.174474606442457),
 ('hot', -12.19784890661794),
 ('adenocarcinoma', -12.231563843021465),
 ('neck', -12.648636203661319),
 ('nocturia', -12.755093681345777),
 ('femoral', -12.911354424046994),
 ('normal', -12.952018548780913),
 ('treatment', -13.153549577871669),
 ('marry', -13

In [None]:
# Save the z-scores computed in this section: export `scores`, not `ratios`,
# which still holds the log odds ratios from section 1a.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in scores:
        # each (word, value) tuple is written as one cell
        writer.writerow([val])

# 1c: Compute the 95% confidence intervals for the log odds ratios for medical and nonmedical words combined

In [31]:
# Word -> count tables for each group (c_1: low-income, c_2: non-low-income),
# ordered from most to least frequent.
c_1 = dict(Counter(medical_words_lowincome + nonmedical_words_lowincome).most_common())
c_2 = dict(Counter(medical_words_nonlowincome + nonmedical_words_nonlowincome).most_common())

# Total word counts of both groups and of the background corpus.
size1, size2, size3 = _size(c_1), _size(c_2), _size(c_bg)

# Only words present in both groups and in the background corpus are scored.
supported_tokens = set(c_1.keys()) & c_2.keys() & c_bg.keys()

In [32]:
# Word -> 95% confidence interval of its log odds ratio.
CIs = {
    tok: _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    for tok in supported_tokens
}

# List the intervals in the order of the z-score ranking held in `scores`.
CIs_sorted = [(tok, CIs.get(tok)) for tok in dict(scores)]

    

In [33]:
# Confidence intervals for the first 50 words of the z-score ranking
# (words with higher odds of occurring for low-income patients)
CIs_sorted[:50]

[('she', [0.035, 0.039]),
 ('pt', [0.091, 0.106]),
 ('her', [0.039, 0.045]),
 ('pain', [0.027, 0.032]),
 ('brain', [0.058, 0.07]),
 ('axitinib', [0.28, 0.336]),
 ('fibromyalgia', [0.327, 0.395]),
 ('ie', [0.286, 0.346]),
 ('avelumab', [0.324, 0.397]),
 ('methadone', [0.233, 0.287]),
 ('disorder', [0.078, 0.097]),
 ('bipolar', [0.276, 0.344]),
 ('cabozantinib', [0.18, 0.226]),
 ('immunodeficiency', [0.349, 0.439]),
 ('docetaxel', [0.131, 0.166]),
 ('folfirinox', [0.135, 0.171]),
 ('pharmacy', [0.059, 0.075]),
 ('pontine', [0.358, 0.458]),
 ('oxycodone', [0.078, 0.1]),
 ('pertuzumab', [0.244, 0.314]),
 ('breast', [0.032, 0.041]),
 ('lung', [0.031, 0.04]),
 ('goal', [0.113, 0.148]),
 ('outside', [0.047, 0.062]),
 ('shoulder', [0.065, 0.086]),
 ('haart', [0.388, 0.511]),
 ('blast', [0.139, 0.183]),
 ('transaminitis', [0.226, 0.298]),
 ('cocaine', [0.35, 0.463]),
 ('depakote', [0.319, 0.423]),
 ('meningioma', [0.157, 0.208]),
 ('chronic', [0.05, 0.066]),
 ('vaginal', [0.09, 0.12]),
 ('depre

In [34]:
# The following had higher odds of occurring for non-low-income patients
# (the last 50 entries; a negative-index slice returns the whole list when it holds fewer than 50)
CIs_sorted[-50:]

[('casodex', [-0.144, -0.099]),
 ('scan', [-0.039, -0.027]),
 ('lymphoma', [-0.099, -0.068]),
 ('urgency', [-0.103, -0.072]),
 ('procedure', [-0.051, -0.036]),
 ('frequency', [-0.067, -0.046]),
 ('perineural', [-0.154, -0.107]),
 ('scalp', [-0.129, -0.09]),
 ('variant', [-0.188, -0.132]),
 ('image', [-0.052, -0.037]),
 ('adaptive', [-0.224, -0.159]),
 ('nasal', [-0.096, -0.068]),
 ('aortic', [-0.137, -0.097]),
 ('lymphocytic', [-0.295, -0.21]),
 ('glioblastoma', [-0.229, -0.163]),
 ('apalutamide', [-0.355, -0.253]),
 ('cll', [-0.287, -0.205]),
 ('mesothelioma', [-0.338, -0.242]),
 ('flap', [-0.222, -0.159]),
 ('margin', [-0.071, -0.051]),
 ('avastin', [-0.203, -0.147]),
 ('hot', [-0.101, -0.073]),
 ('adenocarcinoma', [-0.055, -0.04]),
 ('neck', [-0.041, -0.03]),
 ('nocturia', [-0.132, -0.097]),
 ('femoral', [-0.18, -0.133]),
 ('normal', [-0.033, -0.025]),
 ('treatment', [-0.035, -0.026]),
 ('marry', [-0.129, -0.095]),
 ('read', [-0.107, -0.079]),
 ('urinary', [-0.066, -0.049]),
 ('radi

In [35]:
# Save the confidence intervals computed in this section: export `CIs_sorted`, not `ratios`,
# which still holds the log odds ratios from section 1a.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in CIs_sorted:
        # each (word, [lower, upper]) tuple is written as one cell
        writer.writerow([val])

# **2a:** Compute log OR for medical words

### Redefine background corpus to include only medical words

In [36]:
# Background corpus restricted to medical words: every non-low-income word plus an equally
# sized random sample (with replacement) of low-income words, most frequent first.
c_bg = dict(
    Counter(
        medical_words_nonlowincome
        + choices(medical_words_lowincome, k=len(medical_words_nonlowincome))
    ).most_common()
)
size3 = _size(c_bg)

### Compare for low-income-insurance vs. non-low-income-insurance

In [37]:
# Word -> count tables for medical words (c_1: low-income, c_2: non-low-income), most frequent first.
c_1 = dict(sorted(Counter(medical_words_lowincome).items(), key=lambda x: x[1], reverse=True))
c_2 = dict(sorted(Counter(medical_words_nonlowincome).items(), key=lambda x: x[1], reverse=True))

size1 = _size(c_1)
size2 = _size(c_2)

# Score every word seen in either group (union), not only the words shared by both.
supported_tokens = set(list(c_1.keys()) +list(c_2.keys()))

ratios = []

for tok in supported_tokens:
    try:
        rat = _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3)
    except (KeyError, ValueError, ZeroDivisionError):
        # KeyError: the word is missing from the background corpus (low-income words enter it
        # only through random sampling); ValueError/ZeroDivisionError: the odds are not computable.
        # Report the word and keep going; KeyboardInterrupt/SystemExit are no longer swallowed.
        print(tok)
    else:
        ratios.append((tok, rat))

# Largest log odds ratio first.
ratios.sort(key=operator.itemgetter(1), reverse=True)

In [38]:
# The 50 largest log odds ratios: these medical words had higher odds of occurring for low-income patients
ratios[:50]

[('quixin', 0.7547767493038471),
 ('consulted', 0.6369937210782481),
 ('oligouria', 0.5724552073714602),
 ('tocopherol', 0.5724552073714602),
 ('astaxanthin', 0.54998241387724),
 ('mhc', 0.5316332826398327),
 ('panacinar', 0.5316332202819893),
 ('bezlotoxumab', 0.5316332202819893),
 ('triclosan', 0.5316332202819893),
 ('heaves', 0.5316332202819893),
 ('pricing', 0.5316332202819893),
 ('lda', 0.5316332202819893),
 ('cephalgia', 0.51636581793983),
 ('synergist', 0.5034624205347118),
 ('aortogram', 0.50346235074608),
 ('immunoblot', 0.50346235074608),
 ('basophilia', 0.50346235074608),
 ('hygromas', 0.50346235074608),
 ('cent', 0.50346235074608),
 ('cupping', 0.50346235074608),
 ('neurotomy', 0.50346235074608),
 ('lorcaserin', 0.50346235074608),
 ('catheterisation', 0.50346235074608),
 ('nailing', 0.50346235074608),
 ('ovide', 0.50346235074608),
 ('uvuloplasty', 0.49591529211877017),
 ('inspir', 0.49591529211877017),
 ('succinylcholine', 0.49591529211877017),
 ('aneuploid', 0.492412591778

In [39]:
# The following medical words had higher odds of occurring for non-low-income patients
# (the last 50 entries; a negative-index slice returns the whole list when it holds fewer than 50)
ratios[-50:]

[('pharyngectomy', -0.3438372445546207),
 ('lotrel', -0.34383728592439056),
 ('naris', -0.34383731695172415),
 ('ceritinib', -0.34383732729416927),
 ('apresoline', -0.34383732729416927),
 ('mononucleosis', -0.34383735832151174),
 ('snare', -0.3438374307186649),
 ('ulcerating', -0.34383744106111713),
 ('necitumumab', -0.3438374824309349),
 ('pinguecula', -0.3438374927733907),
 ('avapro', -0.3438374927733907),
 ('retropubic', -0.34383753414322094),
 ('glypican', -0.34383758585552116),
 ('adenosarcoma', -0.34383763756783914),
 ('epoch', -0.34383772030757953),
 ('neurosarcoidosis', -0.3438377720199366),
 ('dutasteride', -0.3438377823624119),
 ('hyzaar', -0.34383779270488546),
 ('mtc', -0.3438378547597427),
 ('tambocor', -0.34383792715710726),
 ('sotalol', -0.3438379478420739),
 ('momelotinib', -0.3438379581845581),
 ('paricalcitol', -0.34383798921201425),
 ('lomustine', -0.34383798921201425),
 ('tremelimumab', -0.3438380305819635),
 ('cs', -0.34383809263690956),
 ('arginase', -0.3438382581

In [None]:
# Save the log odds ratios for medical words.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in ratios:
        # each (word, value) tuple is written as one cell
        writer.writerow([val])

# 2b: Compute z-scores for medical words

### Compare low-income-insurance vs. non-low-income-insurance
Account for uncertainty about the log odds ratios by using the z-score (logodds/standarddev(logodds)), per Monroe et al.: http://languagelog.ldc.upenn.edu/myl/Monroe.pdf

Compare the z-score to some threshold (commonly, 1.96). If |z-score| > 1.96, the word is significant.

In [40]:
# Word -> count tables for medical words (c_1: low-income, c_2: non-low-income), most frequent first.
c_1 = dict(Counter(medical_words_lowincome).most_common())
c_2 = dict(Counter(medical_words_nonlowincome).most_common())

# Total word counts of both groups and of the background corpus.
size1, size2, size3 = _size(c_1), _size(c_2), _size(c_bg)

# Only words present in both groups and in the background corpus are scored.
supported_tokens = set(c_1.keys()) & c_2.keys() & c_bg.keys()

In [41]:
# (word, z-score) for every supported medical word, largest z-score first.
scores = sorted(
    ((tok, _z_score(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=operator.itemgetter(1),
    reverse=True,
)

In [42]:
# Words with z-scores > 1.96 had higher odds of occurring for low-income insurance patients at the 5% level of significance
scores[:50]

[('pt', 26.131029201545427),
 ('pain', 22.63095678225238),
 ('brain', 21.524038665666207),
 ('axitinib', 21.260563782196243),
 ('fibromyalgia', 20.936965557059395),
 ('ie', 20.42303179287087),
 ('avelumab', 19.35384588642298),
 ('methadone', 19.13494035369599),
 ('disorder', 18.494215297168523),
 ('bipolar', 17.873754718031464),
 ('cabozantinib', 17.258987446694114),
 ('immunodeficiency', 17.112276518891328),
 ('docetaxel', 16.98925077326428),
 ('folfirinox', 16.376720694355715),
 ('pontine', 16.12817323412844),
 ('oxycodone', 15.951542292362255),
 ('pertuzumab', 15.821089973605382),
 ('breast', 15.511475394466924),
 ('lung', 15.081237276954958),
 ('outside', 14.825118617403085),
 ('haart', 14.407141271450591),
 ('transaminitis', 14.266973743325448),
 ('cocaine', 14.241682556480438),
 ('shoulder', 14.130623281493135),
 ('depakote', 14.009657291523467),
 ('meningioma', 13.752227422710746),
 ('axillary', 13.602751192104543),
 ('depression', 13.488157206432163),
 ('opioid', 12.72735262647

In [43]:
# Words with z-scores < -1.96 had higher odds of occurring for non-low-income insurance patients at the 5% level of significance
# (the last 50 entries; a negative-index slice returns the whole list when it holds fewer than 50)
scores[-50:]

[('stomach', -8.674461682495764),
 ('mid', -8.79295837226487),
 ('thigh', -8.8930146245618),
 ('rp', -8.903753673807138),
 ('aspirin', -9.09089552442337),
 ('cetirizine', -9.152540298642663),
 ('bcc', -9.269568900353963),
 ('stricture', -9.368533456813775),
 ('hyperplasia', -9.52536160353449),
 ('carcinoma', -9.613112354465521),
 ('zyrtec', -9.689816402739037),
 ('lambda', -9.866086215227424),
 ('tumor', -9.937075108703503),
 ('salvage', -9.96030800247736),
 ('crizotinib', -9.98440947632699),
 ('desensitization', -10.142849545600534),
 ('melanoma', -10.20797388931448),
 ('focal', -10.255048045478537),
 ('nivolumab', -10.60741837565935),
 ('lymphoma', -10.613871818570292),
 ('scan', -10.732791870805576),
 ('casodex', -10.832346517187984),
 ('scalp', -10.91185704240561),
 ('skin', -10.920510039337247),
 ('perineural', -10.921073712176968),
 ('mesothelioma', -11.669714310187343),
 ('glioblastoma', -11.669757739901364),
 ('nasal', -11.803396811957041),
 ('apalutamide', -11.813436080329097)

In [None]:
# Save the z-scores for medical words: export `scores`, not `ratios`,
# which still holds the log odds ratios from section 2a.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in scores:
        # each (word, value) tuple is written as one cell
        writer.writerow([val])

# 2c: Compute 95% confidence intervals for the log odds ratios for medical words

In [44]:
# Word -> count tables for medical words (c_1: low-income, c_2: non-low-income), most frequent first.
c_1 = dict(Counter(medical_words_lowincome).most_common())
c_2 = dict(Counter(medical_words_nonlowincome).most_common())

# Total word counts of both groups and of the background corpus.
size1, size2, size3 = _size(c_1), _size(c_2), _size(c_bg)

# Only words present in both groups and in the background corpus are scored.
supported_tokens = set(c_1.keys()) & c_2.keys() & c_bg.keys()

In [46]:
# Word -> 95% confidence interval of its log odds ratio.
CIs = {}
for tok in supported_tokens:
    CI = _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    CIs[tok] = CI

# List the intervals in the order of the z-score ranking held in `scores`.
CIs_sorted = [(ranked_tok, CIs.get(ranked_tok)) for ranked_tok in dict(scores)]

    

In [47]:
# Confidence intervals for the first 50 words of the z-score ranking
# (medical words with higher odds of occurring for low-income patients)
CIs_sorted[:50]

[('pt', [0.091, 0.106]),
 ('pain', [0.027, 0.033]),
 ('brain', [0.058, 0.07]),
 ('axitinib', [0.279, 0.336]),
 ('fibromyalgia', [0.328, 0.395]),
 ('ie', [0.285, 0.346]),
 ('avelumab', [0.324, 0.397]),
 ('methadone', [0.234, 0.287]),
 ('disorder', [0.078, 0.097]),
 ('bipolar', [0.276, 0.344]),
 ('cabozantinib', [0.179, 0.224]),
 ('immunodeficiency', [0.349, 0.44]),
 ('docetaxel', [0.131, 0.166]),
 ('folfirinox', [0.134, 0.17]),
 ('pontine', [0.357, 0.456]),
 ('oxycodone', [0.078, 0.1]),
 ('pertuzumab', [0.245, 0.315]),
 ('breast', [0.032, 0.041]),
 ('lung', [0.031, 0.04]),
 ('outside', [0.049, 0.063]),
 ('haart', [0.386, 0.508]),
 ('transaminitis', [0.226, 0.298]),
 ('cocaine', [0.35, 0.462]),
 ('shoulder', [0.064, 0.085]),
 ('depakote', [0.319, 0.423]),
 ('meningioma', [0.154, 0.205]),
 ('axillary', [0.058, 0.078]),
 ('depression', [0.062, 0.083]),
 ('opioid', [0.128, 0.174]),
 ('virus', [0.191, 0.261]),
 ('mediastinoscopy', [0.129, 0.178]),
 ('gluteus', [0.254, 0.352]),
 ('anxiety', [

In [48]:
# The following had higher odds of occurring for non-low-income patients
# (the last 50 entries; a negative-index slice returns the whole list when it holds fewer than 50)
CIs_sorted[-50:]

[('stomach', [-0.11, -0.07]),
 ('mid', [-0.069, -0.044]),
 ('thigh', [-0.099, -0.064]),
 ('rp', [-0.143, -0.091]),
 ('aspirin', [-0.083, -0.054]),
 ('cetirizine', [-0.207, -0.134]),
 ('bcc', [-0.229, -0.149]),
 ('stricture', [-0.226, -0.148]),
 ('hyperplasia', [-0.148, -0.097]),
 ('carcinoma', [-0.037, -0.024]),
 ('zyrtec', [-0.203, -0.135]),
 ('lambda', [-0.29, -0.194]),
 ('tumor', [-0.037, -0.025]),
 ('salvage', [-0.179, -0.12]),
 ('crizotinib', [-0.329, -0.221]),
 ('desensitization', [-0.385, -0.26]),
 ('melanoma', [-0.084, -0.057]),
 ('focal', [-0.064, -0.043]),
 ('nivolumab', [-0.111, -0.076]),
 ('lymphoma', [-0.097, -0.067]),
 ('scan', [-0.039, -0.027]),
 ('casodex', [-0.145, -0.101]),
 ('scalp', [-0.126, -0.088]),
 ('skin', [-0.032, -0.022]),
 ('perineural', [-0.151, -0.105]),
 ('mesothelioma', [-0.332, -0.237]),
 ('glioblastoma', [-0.229, -0.163]),
 ('nasal', [-0.098, -0.07]),
 ('apalutamide', [-0.356, -0.255]),
 ('cll', [-0.288, -0.206]),
 ('adenocarcinoma', [-0.054, -0.039]),

In [49]:
# Save the confidence intervals for medical words: export `CIs_sorted`, not `ratios`,
# which still holds the log odds ratios from section 2a.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in CIs_sorted:
        # each (word, [lower, upper]) tuple is written as one cell
        writer.writerow([val])

# **3a:** Compute log OR for nonmedical words

### Redefine background corpus to only include nonmedical words

In [50]:
# Background corpus restricted to nonmedical words: every non-low-income word plus an equally
# sized random sample (with replacement) of low-income words, most frequent first.
c_bg = dict(
    Counter(
        nonmedical_words_nonlowincome
        + choices(nonmedical_words_lowincome, k=len(nonmedical_words_nonlowincome))
    ).most_common()
)
size3 = _size(c_bg)

### Compare nonmedical words for low-income vs. non-low-income

In [51]:
# Word -> count tables for nonmedical words (c_1: low-income, c_2: non-low-income), most frequent first.
c_1 = dict(sorted(Counter(nonmedical_words_lowincome).items(), key=lambda x: x[1], reverse=True))
c_2 = dict(sorted(Counter(nonmedical_words_nonlowincome).items(), key=lambda x: x[1], reverse=True))

size1 = _size(c_1)
size2 = _size(c_2)

# Score every word seen in either group (union), not only the words shared by both.
supported_tokens = set(list(c_1.keys()) +list(c_2.keys()))

ratios = []

for tok in supported_tokens:
    try:
        rat = _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3)
    except (KeyError, ValueError, ZeroDivisionError):
        # KeyError: the word is missing from the background corpus (low-income words enter it
        # only through random sampling); ValueError/ZeroDivisionError: the odds are not computable.
        # Report the word and keep going; KeyboardInterrupt/SystemExit are no longer swallowed.
        print(tok)
    else:
        ratios.append((tok, rat))

# Largest log odds ratio first.
ratios.sort(key=operator.itemgetter(1), reverse=True)

In [52]:
# The 50 largest log odds ratios: these nonmedical words had higher odds of occurring for low-income patients
ratios[:50]

[('teens', 0.7546333768100837),
 ('ipecac', 0.6368503479596654),
 ('taught', 0.6368503479596654),
 ('interfered', 0.5723118336280564),
 ('groups', 0.5723118336280564),
 ('gross', 0.5723118336280564),
 ('clubhouse', 0.5498390348983779),
 ('coparent', 0.5498390348983779),
 ('palms', 0.5314899030361477),
 ('performer', 0.531489845913768),
 ('planes', 0.531489845913768),
 ('cognac', 0.523521854437746),
 ('blocking', 0.5033191036097566),
 ('quarrel', 0.5033189757530341),
 ('panacea', 0.5033189757530341),
 ('chive', 0.5033189757530341),
 ('underemployment', 0.5033189757530341),
 ('msq', 0.5033189757530341),
 ('kratom', 0.5033189757530341),
 ('sphinx', 0.5033189757530341),
 ('hyperpigmented', 0.49226921030077087),
 ('malpractice', 0.48893037976959697),
 ('compleat', 0.48893037976959697),
 ('mechanics', 0.48574418460333924),
 ('oldwoman', 0.48269976609058673),
 ('stepsister', 0.48269976609058673),
 ('soldier', 0.4826996953562652),
 ('nutcracker', 0.4826996953562652),
 ('deportation', 0.4826996

In [53]:
# The following nonmedical words had higher odds of occurring for non-low-income patients
# (the last 50 entries; a negative-index slice returns the whole list when it holds fewer than 50)
ratios[-50:]

[('racquetball', -0.34397945103615335),
 ('brake', -0.3439794605168025),
 ('obstetrician', -0.3439794794781026),
 ('prophase', -0.3439794889587535),
 ('cafe', -0.3439794889587535),
 ('pagan', -0.34397951740070987),
 ('osteopath', -0.34397954584266976),
 ('photography', -0.34397954584266976),
 ('sow', -0.34397955532332425),
 ('grass', -0.34397955532332425),
 ('submarine', -0.34397956480397873),
 ('competitive', -0.3439795837652895),
 ('mango', -0.3439795837652895),
 ('louisiana', -0.3439795837652895),
 ('uncaria', -0.34397962168791985),
 ('periodontist', -0.3439796311685779),
 ('collagenous', -0.3439796501298975),
 ('lymphangiogram', -0.3439796501298975),
 ('sir', -0.3439796880525403),
 ('manufacturing', -0.34397970701386527),
 ('pellet', -0.34397971649452863),
 ('pilocytic', -0.3439797259751938),
 ('wisconsin', -0.34397974493652406),
 ('latvia', -0.34397974493652406),
 ('venezuela', -0.3439797544171892),
 ('compressible', -0.34397976389785434),
 ('scotch', -0.3439797923398551),
 ('wall

In [None]:
# Save the log odds ratios for nonmedical words.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in ratios:
        # each (word, value) tuple is written as one cell
        writer.writerow([val])

# 3b: Compute z-scores for nonmedical words

In [54]:
# Word -> count tables for nonmedical words (c_1: low-income, c_2: non-low-income), most frequent first.
c_1 = dict(sorted(Counter(nonmedical_words_lowincome).items(), key=lambda x: x[1], reverse=True))
c_2 = dict(sorted(Counter(nonmedical_words_nonlowincome).items(), key=lambda x: x[1], reverse=True))

size1 = _size(c_1)
size2 = _size(c_2)

# Score every word seen in either group (union), not only the words shared by both.
supported_tokens = set(list(c_1.keys()) +list(c_2.keys()))

scores = []

for tok in supported_tokens:
    try:
        score = _z_score(tok, c_1, c_2, c_bg, size1, size2, size3)
    except (KeyError, ValueError, ZeroDivisionError):
        # KeyError: the word is missing from the background corpus (low-income words enter it
        # only through random sampling); ValueError/ZeroDivisionError: the score is not computable.
        # Report the word and keep going; KeyboardInterrupt/SystemExit are no longer swallowed.
        print(tok)
    else:
        scores.append((tok, score))

# Largest z-score first.
scores.sort(key=operator.itemgetter(1), reverse=True)

In [55]:
# Words with z-scores > 1.96 had higher odds of occurring for low-income insurance patients at the 5% level of significance
scores[:50]

[('she', 35.33183693395091),
 ('her', 25.306341575806265),
 ('pharmacy', 16.37677993871482),
 ('blast', 14.327420859048829),
 ('goal', 14.038604170859022),
 ('sister', 13.870593494604451),
 ('vaginal', 13.75926596640333),
 ('chronic', 13.727551830560754),
 ('human', 12.679326643579254),
 ('hour', 12.34895733052445),
 ('insurance', 12.028551975540404),
 ('access', 11.983841920301966),
 ('help', 11.571302982745022),
 ('intrapelvic', 11.366389788538964),
 ('intrathecal', 11.024469776192877),
 ('interpreter', 10.867635524857128),
 ('viral', 10.78217949684937),
 ('ox', 10.650653648910522),
 ('like', 10.64060070719339),
 ('housing', 10.638174299188245),
 ('secondary', 10.61911932502813),
 ('divorce', 10.613479630660986),
 ('provide', 10.268577549464998),
 ('peptic', 10.185858890145337),
 ('partial', 10.015121518702117),
 ('hospital', 10.000228175060919),
 ('polypoid', 9.853228568051609),
 ('ideation', 9.840965545269244),
 ('myeloid', 9.621261604631155),
 ('refill', 9.578903604938327),
 ('ind

In [56]:
# Words with z-scores < -1.96 had higher odds of occurring for non-low-income insurance patients at the 5% level of significance
# (the last 50 entries; a negative-index slice returns the whole list when it holds fewer than 50)
scores[-50:]

[('motion', -7.486094858165483),
 ('set', -7.51342020856957),
 ('assay', -7.617202005168795),
 ('form', -7.650294872783311),
 ('medium', -7.716088558778836),
 ('microwave', -7.8006895143749455),
 ('replacement', -7.869752195084501),
 ('golf', -7.9469470772197095),
 ('undifferentiated', -7.958935468523697),
 ('medical', -8.014625529090498),
 ('calcify', -8.056620867550425),
 ('well', -8.083973082648814),
 ('actinic', -8.101424650798883),
 ('toxicity', -8.2634032646828),
 ('hormonal', -8.386429301517506),
 ('often', -8.467885739347656),
 ('date', -8.546821526491604),
 ('body', -8.568114691321718),
 ('direction', -8.571377778216421),
 ('myxoid', -8.58937154575766),
 ('serous', -9.008205398591832),
 ('analysis', -9.177753847216248),
 ('stenosis', -9.216180681857061),
 ('mild', -9.257567600517287),
 ('male', -9.616427078087249),
 ('husband', -9.925070381277916),
 ('adjuvant', -9.963158009254276),
 ('pathology', -10.29858573820105),
 ('urgency', -10.551943906449912),
 ('benign', -10.69972782

In [None]:
# Save the z-scores for nonmedical words: export `scores`, not `ratios`,
# which still holds the log odds ratios from section 3a.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in scores:
        # each (word, value) tuple is written as one cell
        writer.writerow([val])

# 3c: Compute 95% confidence intervals for the log odds ratios for nonmedical words

In [57]:
# Word -> count tables for nonmedical words (c_1: low-income, c_2: non-low-income), most frequent first.
c_1 = dict(Counter(nonmedical_words_lowincome).most_common())
c_2 = dict(Counter(nonmedical_words_nonlowincome).most_common())

# Total word counts of the two groups.
size1, size2 = _size(c_1), _size(c_2)

# Every word seen in either group (union of the two vocabularies).
supported_tokens = set(list(c_1.keys()) + list(c_2.keys()))

In [58]:
# Word -> 95% confidence interval of its log odds ratio.
# supported_tokens is the union of both vocabularies, so it can contain words that are not in
# the background corpus; `_log_odds_CI` looks each word up in c_bg and would raise KeyError,
# aborting the cell. Such words are skipped here (the 3a/3b loops tolerate them via try/except).
CIs = {
    tok: _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    for tok in supported_tokens
    if tok in c_bg
}

# List the intervals in the order of the z-score ranking held in `scores`.
CIs_sorted = [(tok, CIs.get(tok)) for tok in dict(scores)]

    

In [59]:
# Confidence intervals for the first 50 words of the z-score ranking
# (nonmedical words with higher odds of occurring for low-income patients)
CIs_sorted[:50]

[('she', [0.035, 0.039]),
 ('her', [0.038, 0.045]),
 ('pharmacy', [0.058, 0.074]),
 ('blast', [0.14, 0.184]),
 ('goal', [0.11, 0.145]),
 ('sister', [0.096, 0.128]),
 ('vaginal', [0.09, 0.121]),
 ('chronic', [0.049, 0.066]),
 ('human', [0.224, 0.306]),
 ('hour', [0.035, 0.049]),
 ('insurance', [0.131, 0.182]),
 ('access', [0.147, 0.205]),
 ('help', [0.053, 0.075]),
 ('intrapelvic', [0.34, 0.482]),
 ('intrathecal', [0.247, 0.354]),
 ('interpreter', [0.252, 0.363]),
 ('viral', [0.13, 0.188]),
 ('ox', [0.132, 0.192]),
 ('like', [0.049, 0.071]),
 ('housing', [0.265, 0.384]),
 ('secondary', [0.061, 0.089]),
 ('divorce', [0.143, 0.207]),
 ('provide', [0.058, 0.086]),
 ('peptic', [0.228, 0.336]),
 ('partial', [0.057, 0.085]),
 ('hospital', [0.038, 0.056]),
 ('polypoid', [0.184, 0.276]),
 ('ideation', [0.118, 0.177]),
 ('myeloid', [0.14, 0.212]),
 ('refill', [0.052, 0.079]),
 ('indicator', [0.193, 0.293]),
 ('inpatient', [0.126, 0.193]),
 ('graduate', [0.204, 0.312]),
 ('friend', [0.087, 0.134]

In [60]:
# The following had higher odds of occurring for non-low-income patients
# (the last 50 entries; a negative-index slice returns the whole list when it holds fewer than 50)
CIs_sorted[-50:]

[('motion', [-0.057, -0.033]),
 ('set', [-0.046, -0.027]),
 ('assay', [-0.295, -0.174]),
 ('form', [-0.06, -0.035]),
 ('medium', [-0.093, -0.055]),
 ('microwave', [-0.418, -0.25]),
 ('replacement', [-0.105, -0.063]),
 ('golf', [-0.241, -0.146]),
 ('undifferentiated', [-0.357, -0.216]),
 ('medical', [-0.025, -0.015]),
 ('calcify', [-0.218, -0.133]),
 ('well', [-0.017, -0.01]),
 ('actinic', [-0.284, -0.174]),
 ('toxicity', [-0.058, -0.036]),
 ('hormonal', [-0.153, -0.095]),
 ('often', [-0.105, -0.066]),
 ('date', [-0.027, -0.017]),
 ('body', [-0.053, -0.033]),
 ('direction', [-0.107, -0.067]),
 ('myxoid', [-0.355, -0.223]),
 ('serous', [-0.221, -0.142]),
 ('analysis', [-0.224, -0.145]),
 ('stenosis', [-0.117, -0.076]),
 ('mild', [-0.035, -0.023]),
 ('male', [-0.055, -0.036]),
 ('husband', [-0.088, -0.059]),
 ('adjuvant', [-0.075, -0.051]),
 ('pathology', [-0.054, -0.037]),
 ('urgency', [-0.101, -0.069]),
 ('benign', [-0.083, -0.057]),
 ('available', [-0.08, -0.055]),
 ('procedure', [-0.0

In [61]:
# Save the confidence intervals for nonmedical words: export `CIs_sorted`, not `ratios`,
# which still holds the log odds ratios from section 3a.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in CIs_sorted:
        # each (word, [lower, upper]) tuple is written as one cell
        writer.writerow([val])