# File description:

This file computes log odds ratios with Dirichlet priors to compare word frequencies across patient race. Notes for each other group (e.g., Black, Hispanic, Asian, Other) are compared with notes for non-Hispanic White patients.

The log odds ratios are computed using lists of all words used in the notes for patients of each race. These lists are computed using the Create_Word_Lists file

In [None]:
# Load required packages 
import pickle
import csv
import argparse
import logging
import math
import numpy as np
import operator
from typing import Iterator
from collections import Counter
from collections import Counter, OrderedDict

## Load lists of words from pickle files

### Load lists of MEDICAL words

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``, closing the file afterwards."""
    # NOTE(review): pickle.load can run arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# The analysis cells below refer to these lists as ``medical_<Group>``; the
# original ``medical_words_<Group>`` names are kept as aliases of the same list.
medical_Asian = medical_words_Asian = _load_pickle('Path_to_list')
medical_Hispanic = medical_words_Hispanic = _load_pickle('Path_to_list')
medical_Black = medical_words_Black = _load_pickle('Path_to_list')
medical_Unknown = medical_words_Unknown = _load_pickle('Path_to_list')
medical_Other = medical_words_Other = _load_pickle('Path_to_list')
medical_White = medical_words_White = _load_pickle('Path_to_list')


### Load lists of NON-MEDICAL words

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``, closing the file afterwards."""
    # NOTE(review): pickle.load can run arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# The analysis cells below refer to these lists as ``nonmedical_<Group>``; the
# original ``nonmedical_words_<Group>`` names are kept as aliases of the same list.
nonmedical_Asian = nonmedical_words_Asian = _load_pickle('Path_to_list')
nonmedical_Hispanic = nonmedical_words_Hispanic = _load_pickle('Path_to_list')
nonmedical_Black = nonmedical_words_Black = _load_pickle('Path_to_list')
nonmedical_Unknown = nonmedical_words_Unknown = _load_pickle('Path_to_list')
nonmedical_Other = nonmedical_words_Other = _load_pickle('Path_to_list')
nonmedical_White = nonmedical_words_White = _load_pickle('Path_to_list')

## Run log-odds ratio with Dirichlet prior

In [None]:
def _size(corpus: dict) -> int:
    """Return the total number of words in ``corpus``, counting every repetition.

    Args:
        corpus: Word -> count mapping.

    Returns:
        The sum of all counts (0 for an empty mapping).
    """
    total = 0
    for count in corpus.values():
        total += count
    return total

In [None]:
def _log_odds(
    word: str,
    c1: dict,
    c2: dict,
    bg: dict,
    size1: int,
    size2: int,
    size3: int,
) -> float:
    """Return the prior-smoothed log odds ratio of ``word`` in ``c1`` versus ``c2``.

    The background count ``bg[word]`` is added to the word's count in each
    corpus (a word missing from a corpus counts as 0) before the odds are
    formed.

    Args:
        word: Token to score.
        c1: Word -> count mapping for the first corpus.
        c2: Word -> count mapping for the second corpus.
        bg: Word -> count mapping for the background corpus (the prior).
        size1: Total word count used for ``c1``.
        size2: Total word count used for ``c2``.
        size3: Total word count used for ``bg``.

    Returns:
        log(odds of ``word`` in c1) - log(odds of ``word`` in c2); positive
        when the smoothed odds are higher in ``c1``.

    Raises:
        KeyError: If ``word`` is not a key of ``bg``.
    """
    # dict.get is a constant-time lookup; rebuilding set(c.keys()) on every
    # call made each evaluation linear in the vocabulary size.
    numerator_1 = c1.get(word, 0) + bg[word]
    numerator_2 = c2.get(word, 0) + bg[word]

    # Smoothed count of every *other* word in each corpus.
    denom_1 = size1 + size3 - numerator_1
    denom_2 = size2 + size3 - numerator_2

    return math.log(numerator_1 / denom_1) - math.log(numerator_2 / denom_2)



In [None]:
def _z_score(
    word: str,
    c1: dict,
    c2: dict,
    bg: dict,
    size1: int,
    size2: int,
    size3: int,
) -> float:
    """Return the z-score of the prior-smoothed log odds ratio of ``word``.

    The background count ``bg[word]`` is added to the word's count in each
    corpus (a word missing from a corpus counts as 0). The resulting log odds
    ratio is divided by its estimated standard deviation,
    sqrt(1 / numerator_1 + 1 / numerator_2).

    Args:
        word: Token to score.
        c1: Word -> count mapping for the first corpus.
        c2: Word -> count mapping for the second corpus.
        bg: Word -> count mapping for the background corpus (the prior).
        size1: Total word count used for ``c1``.
        size2: Total word count used for ``c2``.
        size3: Total word count used for ``bg``.

    Returns:
        The log odds ratio divided by its standard deviation.

    Raises:
        KeyError: If ``word`` is not a key of ``bg``.
    """
    # dict.get is a constant-time lookup; rebuilding set(c.keys()) on every
    # call made each evaluation linear in the vocabulary size.
    numerator_1 = c1.get(word, 0) + bg[word]
    numerator_2 = c2.get(word, 0) + bg[word]

    # Smoothed count of every *other* word in each corpus.
    denom_1 = size1 + size3 - numerator_1
    denom_2 = size2 + size3 - numerator_2

    raw_logodds = math.log(numerator_1 / denom_1) - math.log(numerator_2 / denom_2)

    variance = (1 / numerator_1) + (1 / numerator_2)
    return raw_logodds / math.sqrt(variance)



In [None]:
def _log_odds_CI(
    word: str,
    c1: dict,
    c2: dict,
    bg: dict,
    size1: int,
    size2: int,
    size3: int,
    z_crit: float = 1.96,
) -> list:
    """Return a confidence interval for the prior-smoothed log odds ratio of ``word``.

    The background count ``bg[word]`` is added to the word's count in each
    corpus (a word missing from a corpus counts as 0). The interval is the log
    odds ratio plus/minus ``z_crit`` estimated standard deviations.

    Args:
        word: Token to score.
        c1: Word -> count mapping for the first corpus.
        c2: Word -> count mapping for the second corpus.
        bg: Word -> count mapping for the background corpus (the prior).
        size1: Total word count used for ``c1``.
        size2: Total word count used for ``c2``.
        size3: Total word count used for ``bg``.
        z_crit: Half-width of the interval in standard deviations; the
            default 1.96 gives the 95% interval used in this notebook.

    Returns:
        ``[lower, upper]``, each rounded to 3 decimal places.

    Raises:
        KeyError: If ``word`` is not a key of ``bg``.
    """
    # dict.get is a constant-time lookup; rebuilding set(c.keys()) on every
    # call made each evaluation linear in the vocabulary size.
    numerator_1 = c1.get(word, 0) + bg[word]
    numerator_2 = c2.get(word, 0) + bg[word]

    # Smoothed count of every *other* word in each corpus.
    denom_1 = size1 + size3 - numerator_1
    denom_2 = size2 + size3 - numerator_2

    raw_logodds = math.log(numerator_1 / denom_1) - math.log(numerator_2 / denom_2)

    variance = (1 / numerator_1) + (1 / numerator_2)
    margin = z_crit * math.sqrt(variance)
    return [round(raw_logodds - margin, 3), round(raw_logodds + margin, 3)]

# 0: Create background corpus

The Dirichlet prior will shrink odds ratios toward their global values from the background corpus. Therefore, it's important to ensure that each race is equally represented in the background corpus. We will oversample words from the non-White word lists to match the number of words in the White word lists.

For the medical word analysis, the background corpus will include only medical words. For the nonmedical word analysis, the background corpus will include only nonmedical words. For the all-words analysis, the background corpus will include both medical and nonmedical words

See Monroe paper: http://languagelog.ldc.upenn.edu/myl/Monroe.pdf

In [None]:
# Import required packages
from random import choices
from collections import Counter, OrderedDict

# Background corpus (c_bg) for the all-words analysis. Each non-White list is
# resampled with replacement up to the length of the matching White list, so
# every group contributes the same number of words.
nonmedical_words_list = [
    word
    for group in (nonmedical_Black, nonmedical_Hispanic, nonmedical_Asian, nonmedical_Other, nonmedical_Unknown)
    for word in choices(group, k=len(nonmedical_White))
] + nonmedical_White
medical_words_list = [
    word
    for group in (medical_Black, medical_Hispanic, medical_Asian, medical_Other, medical_Unknown)
    for word in choices(group, k=len(medical_White))
] + medical_White

# Word -> count mapping, ordered from most to least frequent.
c_bg = dict(Counter(nonmedical_words_list + medical_words_list).most_common())

# **1a:** Compute log-odds ratios for medical and non-medical words combined

### Compare Black vs. White
A log odds ratio > 0 (equivalently, an odds ratio > 1) means the odds of the word occurring for a Black patient are greater than the odds of the word occurring for a White patient.

In [None]:
# Oversample Black-patient words (with replacement) to the White list sizes.
c_1 = dict(
    Counter(
        choices(medical_Black, k=len(medical_White))
        + choices(nonmedical_Black, k=len(nonmedical_White))
    ).most_common()
)
c_2 = dict(Counter(medical_White + nonmedical_White).most_common())

# Total word counts of both corpora and of the background corpus.
size1, size2, size3 = _size(c_1), _size(c_2), _size(c_bg)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# (word, log odds ratio) pairs, ordered from the largest ratio to the smallest.
ratios = sorted(
    ((tok, _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# The following words had higher odds of occurring for Black patients
ratios[:50]

In [None]:
# The following words had higher odds of occurring for White patients
ratios[len(ratios)-50: len(ratios)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, log odds ratio) tuple, as before.
    writer.writerows([val] for val in ratios)

### Compare Hispanic vs. White
A log odds ratio > 0 (equivalently, an odds ratio > 1) means the odds of the word occurring for a Hispanic patient are greater than the odds of the word occurring for a White patient.

In [None]:
# Oversample Hispanic-patient words (with replacement) to the White list sizes.
c_1 = dict(
    Counter(
        choices(medical_Hispanic, k=len(medical_White))
        + choices(nonmedical_Hispanic, k=len(nonmedical_White))
    ).most_common()
)
c_2 = dict(Counter(medical_White + nonmedical_White).most_common())

# Total word counts of both corpora and of the background corpus.
size1, size2, size3 = _size(c_1), _size(c_2), _size(c_bg)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# (word, log odds ratio) pairs, ordered from the largest ratio to the smallest.
ratios = sorted(
    ((tok, _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# The following words had higher odds of occurring for Hispanic patients
ratios[:50]

In [None]:
# The following words had higher odds of occurring for White patients
ratios[len(ratios)-50: len(ratios)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, log odds ratio) tuple, as before.
    writer.writerows([val] for val in ratios)

### Compare Asian vs. White
A log odds ratio > 0 (equivalently, an odds ratio > 1) means the odds of the word occurring for an Asian patient are greater than the odds of the word occurring for a White patient.

In [None]:
# Oversample Asian-patient words (with replacement) to the White list sizes.
c_1 = dict(
    Counter(
        choices(medical_Asian, k=len(medical_White))
        + choices(nonmedical_Asian, k=len(nonmedical_White))
    ).most_common()
)
c_2 = dict(Counter(medical_White + nonmedical_White).most_common())

# Total word counts of both corpora and of the background corpus.
size1, size2, size3 = _size(c_1), _size(c_2), _size(c_bg)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# (word, log odds ratio) pairs, ordered from the largest ratio to the smallest.
ratios = sorted(
    ((tok, _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# The following words had higher odds of occurring for Asian patients
ratios[:50]

In [None]:
# The following words had higher odds of occurring for White patients
ratios[len(ratios)-50: len(ratios)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, log odds ratio) tuple, as before.
    writer.writerows([val] for val in ratios)

### Compare Other vs. White
A log odds ratio > 0 (equivalently, an odds ratio > 1) means the odds of the word occurring for an Other patient are greater than the odds of the word occurring for a White patient.

In [None]:
# Oversample Other-patient words (with replacement) to the White list sizes.
c_1 = dict(
    Counter(
        choices(medical_Other, k=len(medical_White))
        + choices(nonmedical_Other, k=len(nonmedical_White))
    ).most_common()
)
c_2 = dict(Counter(medical_White + nonmedical_White).most_common())

# Total word counts of both corpora and of the background corpus.
size1, size2, size3 = _size(c_1), _size(c_2), _size(c_bg)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# (word, log odds ratio) pairs, ordered from the largest ratio to the smallest.
ratios = sorted(
    ((tok, _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# The following words had higher odds of occurring for Other patients
ratios[:50]

In [None]:
# The following words had higher odds of occurring for White patients
ratios[len(ratios)-50: len(ratios)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, log odds ratio) tuple, as before.
    writer.writerows([val] for val in ratios)

# 1b: Compute z-scores and 95% confidence intervals for the log odds ratios for medical and nonmedical words combined

### Compare Black vs. White
Account for uncertainty about the log odds ratios by using the z-score (logodds/standarddev(logodds)), per Monroe et al.: http://languagelog.ldc.upenn.edu/myl/Monroe.pdf

Compare the z-score to some threshold (commonly, 1.96). If |z-score| > 1.96, the word is significant.

In [None]:
# Oversample Black-patient words (with replacement) to the White list sizes.
c_1 = dict(
    Counter(
        choices(medical_Black, k=len(medical_White))
        + choices(nonmedical_Black, k=len(nonmedical_White))
    ).most_common()
)
c_2 = dict(Counter(medical_White + nonmedical_White).most_common())

# Total word counts of both corpora and of the background corpus.
size1, size2, size3 = _size(c_1), _size(c_2), _size(c_bg)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# (word, z-score) pairs, ordered from the largest z-score to the smallest.
scores = sorted(
    ((tok, _z_score(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# The following words had higher odds of occurring for Black patients
scores[:50]

In [None]:
# The following words had higher odds of occurring for White patients
scores[len(scores)-50: len(scores)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, z-score) tuple, as before.
    writer.writerows([val] for val in scores)

In [None]:
# Reuse the c_1 / c_2 counts built for the z-scores above. Calling choices()
# again here would draw a different random oversample, so the intervals would
# describe a different sample than the z-scores they are aligned with, and a
# word missing from the new draw would end up with a CI of None.
size1 = _size(c_1)
size2 = _size(c_2)
size3 = _size(c_bg)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# Confidence interval of the log odds ratio for every supported word.
CIs = {
    tok: _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    for tok in supported_tokens
}

# Re-order to follow the z-score ranking above; a word without an interval
# is paired with None.
CIs_sorted = [(tok, CIs.get(tok)) for tok in dict(scores)]

    

In [None]:
CIs_sorted[:50]

In [None]:
CIs_sorted[len(CIs_sorted)-50: len(CIs_sorted)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, [lower, upper]) tuple, as before.
    writer.writerows([val] for val in CIs_sorted)

### Compare Hispanic vs White

In [None]:
# Oversample Hispanic-patient words (with replacement) to the White list sizes.
c_1 = dict(
    Counter(
        choices(medical_Hispanic, k=len(medical_White))
        + choices(nonmedical_Hispanic, k=len(nonmedical_White))
    ).most_common()
)
c_2 = dict(Counter(medical_White + nonmedical_White).most_common())

# Total word counts of both corpora and of the background corpus.
size1, size2, size3 = _size(c_1), _size(c_2), _size(c_bg)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# (word, z-score) pairs, ordered from the largest z-score to the smallest.
scores = sorted(
    ((tok, _z_score(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# The following words had higher odds of occurring for Hispanic patients
scores[:50]

In [None]:
# The following words had higher odds of occurring for White patients
scores[len(scores)-50: len(scores)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, z-score) tuple, as before.
    writer.writerows([val] for val in scores)

In [None]:
# Reuse the c_1 / c_2 counts built for the z-scores above. Calling choices()
# again here would draw a different random oversample, so the intervals would
# describe a different sample than the z-scores they are aligned with, and a
# word missing from the new draw would end up with a CI of None.
size1 = _size(c_1)
size2 = _size(c_2)
size3 = _size(c_bg)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# Confidence interval of the log odds ratio for every supported word.
CIs = {
    tok: _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    for tok in supported_tokens
}

# Re-order to follow the z-score ranking above; a word without an interval
# is paired with None.
CIs_sorted = [(tok, CIs.get(tok)) for tok in dict(scores)]

    

In [None]:
CIs_sorted[:50]

In [None]:
CIs_sorted[len(CIs_sorted)-50: len(CIs_sorted)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, [lower, upper]) tuple, as before.
    writer.writerows([val] for val in CIs_sorted)

### Compare Asian vs. White
Account for uncertainty about the log odds ratios by using the z-score (logodds/standarddev(logodds)), per Monroe et al.: http://languagelog.ldc.upenn.edu/myl/Monroe.pdf

Compare the z-score to some threshold (commonly, 1.96). If |z-score| > 1.96, the word is significant.

In [None]:
# Oversample Asian-patient words (with replacement) to the White list sizes.
c_1 = dict(
    Counter(
        choices(medical_Asian, k=len(medical_White))
        + choices(nonmedical_Asian, k=len(nonmedical_White))
    ).most_common()
)
c_2 = dict(Counter(medical_White + nonmedical_White).most_common())

# Total word counts of the two corpora.
size1, size2 = _size(c_1), _size(c_2)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# (word, z-score) pairs, ordered from the largest z-score to the smallest.
scores = sorted(
    ((tok, _z_score(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# The following words had higher odds of occurring for Asian patients
scores[:50]

In [None]:
# The following words had higher odds of occurring for White patients
scores[len(scores)-50: len(scores)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, z-score) tuple, as before.
    writer.writerows([val] for val in scores)

In [None]:
# Reuse the c_1 / c_2 counts built for the z-scores above. Calling choices()
# again here would draw a different random oversample, so the intervals would
# describe a different sample than the z-scores they are aligned with, and a
# word missing from the new draw would end up with a CI of None.
size1 = _size(c_1)
size2 = _size(c_2)
size3 = _size(c_bg)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# Confidence interval of the log odds ratio for every supported word.
CIs = {
    tok: _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    for tok in supported_tokens
}

# Re-order to follow the z-score ranking above; a word without an interval
# is paired with None.
CIs_sorted = [(tok, CIs.get(tok)) for tok in dict(scores)]

    

In [None]:
CIs_sorted[:50]

In [None]:
CIs_sorted[len(CIs_sorted)-50: len(CIs_sorted)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, [lower, upper]) tuple, as before.
    writer.writerows([val] for val in CIs_sorted)

### Compare Other vs. White
Account for uncertainty about the log odds ratios by using the z-score (logodds/standarddev(logodds)), per Monroe et al.: http://languagelog.ldc.upenn.edu/myl/Monroe.pdf

Compare the z-score to some threshold (commonly, 1.96). If |z-score| > 1.96, the word is significant.

In [None]:
# Oversample Other-patient words (with replacement) to the White list sizes.
c_1 = dict(
    Counter(
        choices(medical_Other, k=len(medical_White))
        + choices(nonmedical_Other, k=len(nonmedical_White))
    ).most_common()
)
c_2 = dict(Counter(medical_White + nonmedical_White).most_common())

# Total word counts of the two corpora.
size1, size2 = _size(c_1), _size(c_2)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# (word, z-score) pairs, ordered from the largest z-score to the smallest.
scores = sorted(
    ((tok, _z_score(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# The following words had higher odds of occurring for Other patients
scores[:50]

In [None]:
# The following words had higher odds of occurring for White patients
scores[len(scores)-50: len(scores)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, z-score) tuple, as before.
    writer.writerows([val] for val in scores)

In [None]:
# Reuse the c_1 / c_2 counts built for the z-scores above. Calling choices()
# again here would draw a different random oversample, so the intervals would
# describe a different sample than the z-scores they are aligned with, and a
# word missing from the new draw would end up with a CI of None.
size1 = _size(c_1)
size2 = _size(c_2)
size3 = _size(c_bg)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# Confidence interval of the log odds ratio for every supported word.
CIs = {
    tok: _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    for tok in supported_tokens
}

# Re-order to follow the z-score ranking above; a word without an interval
# is paired with None.
CIs_sorted = [(tok, CIs.get(tok)) for tok in dict(scores)]

    

In [None]:
CIs_sorted[:50]

In [None]:
CIs_sorted[len(CIs_sorted)-50: len(CIs_sorted)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, [lower, upper]) tuple, as before.
    writer.writerows([val] for val in CIs_sorted)

# **2a:** Compute log OR for medical words

### Redefine the background corpus to only include medical words

In [None]:
# Background corpus restricted to medical words, most frequent word first.
c_bg = dict(Counter(medical_words_list).most_common())
# Total word count of the new background corpus.
size3 = _size(c_bg)

### Compare for Black vs. White

In [None]:
# Oversample Black-patient medical words (with replacement) to the White list size.
c_1 = dict(Counter(choices(medical_Black, k=len(medical_White))).most_common())
c_2 = dict(Counter(medical_White).most_common())

size1 = _size(c_1)
size2 = _size(c_2)

# Score every word seen in either corpus (union rather than intersection).
supported_tokens = set(c_1) | set(c_2)

ratios = []
for tok in supported_tokens:
    try:
        rat = _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3)
    except (KeyError, ValueError, ZeroDivisionError):
        # The word cannot be scored (e.g. it is absent from the background
        # corpus): report it and continue. A bare `except:` would also
        # swallow KeyboardInterrupt and hide unrelated bugs.
        print(tok)
    else:
        ratios.append((tok, rat))

ratios.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# The following medical words had higher odds of occurring for Black patients
ratios[:50]

In [None]:
# The following medical words had higher odds of occurring for White patients
ratios[len(ratios)-50: len(ratios)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, log odds ratio) tuple, as before.
    writer.writerows([val] for val in ratios)

### Compare Hispanic vs. White

In [None]:
# Oversample Hispanic-patient medical words (with replacement) to the White list size.
c_1 = dict(Counter(choices(medical_Hispanic, k=len(medical_White))).most_common())
c_2 = dict(Counter(medical_White).most_common())

size1 = _size(c_1)
size2 = _size(c_2)

# Score every word seen in either corpus (union rather than intersection).
supported_tokens = set(c_1) | set(c_2)

ratios = []
for tok in supported_tokens:
    try:
        rat = _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3)
    except (KeyError, ValueError, ZeroDivisionError):
        # The word cannot be scored (e.g. it is absent from the background
        # corpus): report it and continue. A bare `except:` would also
        # swallow KeyboardInterrupt and hide unrelated bugs.
        print(tok)
    else:
        ratios.append((tok, rat))

ratios.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# The following medical words had higher odds of occurring for Hispanic patients
ratios[:50]

In [None]:
# The following medical words had higher odds of occurring for White patients
ratios[len(ratios)-50: len(ratios)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, log odds ratio) tuple, as before.
    writer.writerows([val] for val in ratios)

### Compare for Asian vs. White

In [None]:
# Oversample Asian-patient medical words (with replacement) to the White list size.
c_1 = dict(Counter(choices(medical_Asian, k=len(medical_White))).most_common())
c_2 = dict(Counter(medical_White).most_common())

size1 = _size(c_1)
size2 = _size(c_2)

# Score every word seen in either corpus (union rather than intersection).
supported_tokens = set(c_1) | set(c_2)

ratios = []
for tok in supported_tokens:
    try:
        rat = _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3)
    except (KeyError, ValueError, ZeroDivisionError):
        # The word cannot be scored (e.g. it is absent from the background
        # corpus): report it and continue. A bare `except:` would also
        # swallow KeyboardInterrupt and hide unrelated bugs.
        print(tok)
    else:
        ratios.append((tok, rat))

ratios.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# The following medical words had higher odds of occurring for Asian patients
ratios[:50]

In [None]:
# The following medical words had higher odds of occurring for White patients
ratios[len(ratios)-50: len(ratios)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, log odds ratio) tuple, as before.
    writer.writerows([val] for val in ratios)

### Compare for Other vs. White

In [None]:
# Oversample Other-patient medical words (with replacement) to the White list size.
c_1 = dict(Counter(choices(medical_Other, k=len(medical_White))).most_common())
c_2 = dict(Counter(medical_White).most_common())

size1 = _size(c_1)
size2 = _size(c_2)

# Score every word seen in either corpus (union rather than intersection).
supported_tokens = set(c_1) | set(c_2)

ratios = []
for tok in supported_tokens:
    try:
        rat = _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3)
    except (KeyError, ValueError, ZeroDivisionError):
        # The word cannot be scored (e.g. it is absent from the background
        # corpus): report it and continue. A bare `except:` would also
        # swallow KeyboardInterrupt and hide unrelated bugs.
        print(tok)
    else:
        ratios.append((tok, rat))

ratios.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# The following medical words had higher odds of occurring for Other patients
ratios[:50]

In [None]:
# The following medical words had higher odds of occurring for White patients
ratios[len(ratios)-50: len(ratios)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, log odds ratio) tuple, as before.
    writer.writerows([val] for val in ratios)

# 2b: Compute z-scores and 95% confidence intervals for the log odds ratios for medical words

### Compare Black vs. White
Account for uncertainty about the log odds ratios by using the z-score (logodds/standarddev(logodds)), per Monroe et al.: http://languagelog.ldc.upenn.edu/myl/Monroe.pdf

Compare the z-score to some threshold (commonly, 1.96). If |z-score| > 1.96, the word is significant.

In [None]:
# Oversample Black-patient medical words (with replacement) to the White list size.
c_1 = dict(Counter(choices(medical_Black, k=len(medical_White))).most_common())
c_2 = dict(Counter(medical_White).most_common())

# Total word counts of the two corpora.
size1, size2 = _size(c_1), _size(c_2)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# (word, z-score) pairs, ordered from the largest z-score to the smallest.
scores = sorted(
    ((tok, _z_score(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Words with z-scores > 1.96 had higher odds of occurring for Black patients at the 5% level of significance
scores[:50]

In [None]:
# Words with z-scores < -1.96 had higher odds of occurring for White patients at the 5% level of significance
scores[len(scores)-50: len(scores)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, z-score) tuple, as before.
    writer.writerows([val] for val in scores)

In [None]:
# Reuse the c_1 / c_2 counts built for the z-scores above. Calling choices()
# again here would draw a different random oversample, so the intervals would
# describe a different sample than the z-scores they are aligned with, and a
# word missing from the new draw would end up with a CI of None.
size1 = _size(c_1)
size2 = _size(c_2)
size3 = _size(c_bg)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# Confidence interval of the log odds ratio for every supported word.
CIs = {
    tok: _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    for tok in supported_tokens
}

# Re-order to follow the z-score ranking above; a word without an interval
# is paired with None.
CIs_sorted = [(tok, CIs.get(tok)) for tok in dict(scores)]

    

In [None]:
CIs_sorted[:50]

In [None]:
CIs_sorted[len(CIs_sorted)-50: len(CIs_sorted)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, [lower, upper]) tuple, as before.
    writer.writerows([val] for val in CIs_sorted)

### Compare Hispanic vs. White

In [None]:
# Oversample Hispanic-patient medical words (with replacement) to the White list size.
c_1 = dict(Counter(choices(medical_Hispanic, k=len(medical_White))).most_common())
c_2 = dict(Counter(medical_White).most_common())

# Total word counts of the two corpora.
size1, size2 = _size(c_1), _size(c_2)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# (word, z-score) pairs, ordered from the largest z-score to the smallest.
scores = sorted(
    ((tok, _z_score(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Words with z-scores > 1.96 had higher odds of occurring for Hispanic patients at the 5% level of significance
scores[:50]

In [None]:
# Words with z-scores < -1.96 had higher odds of occurring for White patients at the 5% level of significance
scores[len(scores)-50: len(scores)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, z-score) tuple, as before.
    writer.writerows([val] for val in scores)

In [None]:
# Reuse the c_1 / c_2 counts built for the z-scores above. Calling choices()
# again here would draw a different random oversample, so the intervals would
# describe a different sample than the z-scores they are aligned with, and a
# word missing from the new draw would end up with a CI of None.
size1 = _size(c_1)
size2 = _size(c_2)
size3 = _size(c_bg)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# Confidence interval of the log odds ratio for every supported word.
CIs = {
    tok: _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    for tok in supported_tokens
}

# Re-order to follow the z-score ranking above; a word without an interval
# is paired with None.
CIs_sorted = [(tok, CIs.get(tok)) for tok in dict(scores)]

    

In [None]:
CIs_sorted[:50]

In [None]:
CIs_sorted[len(CIs_sorted)-50: len(CIs_sorted)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, [lower, upper]) tuple, as before.
    writer.writerows([val] for val in CIs_sorted)

### Compare Asian vs. White
Account for uncertainty about the log odds ratios by using the z-score (logodds/standarddev(logodds)), per Monroe et al.: http://languagelog.ldc.upenn.edu/myl/Monroe.pdf

Compare the z-score to some threshold (commonly, 1.96). If |z-score| > 1.96, the word is significant.

In [None]:
# Oversample Asian-patient medical words (with replacement) to the White list size.
c_1 = dict(Counter(choices(medical_Asian, k=len(medical_White))).most_common())
c_2 = dict(Counter(medical_White).most_common())

# Total word counts of the two corpora.
size1, size2 = _size(c_1), _size(c_2)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# (word, z-score) pairs, ordered from the largest z-score to the smallest.
scores = sorted(
    ((tok, _z_score(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Words with z-scores > 1.96 had higher odds of occurring for Asian patients at the 5% level of significance
scores[:50]

In [None]:
# Words with z-scores < -1.96 had higher odds of occurring for White patients at the 5% level of significance
scores[len(scores)-50: len(scores)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, z-score) tuple, as before.
    writer.writerows([val] for val in scores)

In [None]:
# Reuse the c_1 / c_2 counts built for the z-scores above. Calling choices()
# again here would draw a different random oversample, so the intervals would
# describe a different sample than the z-scores they are aligned with, and a
# word missing from the new draw would end up with a CI of None.
size1 = _size(c_1)
size2 = _size(c_2)
size3 = _size(c_bg)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# Confidence interval of the log odds ratio for every supported word.
CIs = {
    tok: _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    for tok in supported_tokens
}

# Re-order to follow the z-score ranking above; a word without an interval
# is paired with None.
CIs_sorted = [(tok, CIs.get(tok)) for tok in dict(scores)]

    

In [None]:
CIs_sorted[:50]

In [None]:
CIs_sorted[len(CIs_sorted)-50: len(CIs_sorted)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, [lower, upper]) tuple, as before.
    writer.writerows([val] for val in CIs_sorted)

### Compare Other vs. White
Account for uncertainty about the log odds ratios by using the z-score (logodds/standarddev(logodds)), per Monroe et al.: http://languagelog.ldc.upenn.edu/myl/Monroe.pdf

Compare the z-score to some threshold (commonly, 1.96). If |z-score| > 1.96, the word is significant.

In [None]:
# Oversample Other-patient medical words (with replacement) to the White list size.
c_1 = dict(Counter(choices(medical_Other, k=len(medical_White))).most_common())
c_2 = dict(Counter(medical_White).most_common())

# Total word counts of the two corpora.
size1, size2 = _size(c_1), _size(c_2)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# (word, z-score) pairs, ordered from the largest z-score to the smallest.
scores = sorted(
    ((tok, _z_score(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Words with z-scores > 1.96 had higher odds of occurring for Other patients at the 5% level of significance
scores[:50]

In [None]:
# Words with z-scores < -1.96 had higher odds of occurring for White patients at the 5% level of significance
scores[len(scores)-50: len(scores)]

In [None]:
# newline='' lets the csv module control line endings; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One cell per row holding the (word, z-score) tuple, as before.
    writer.writerows([val] for val in scores)

In [None]:
# Reuse the c_1 / c_2 counts built for the z-scores above. Calling choices()
# again here would draw a different random oversample, so the intervals would
# describe a different sample than the z-scores they are aligned with, and a
# word missing from the new draw would end up with a CI of None.
size1 = _size(c_1)
size2 = _size(c_2)
size3 = _size(c_bg)

# Only score words present in both corpora and in the background corpus.
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# Confidence interval of the log odds ratio for every supported word.
CIs = {
    tok: _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    for tok in supported_tokens
}

# Re-order to follow the z-score ranking above; a word without an interval
# is paired with None.
CIs_sorted = [(tok, CIs.get(tok)) for tok in dict(scores)]

    

In [None]:
# CIs_sorted follows the order of `scores`, so these are the entries for the
# 50 highest z-scores (Other vs. White, medical words).
CIs_sorted[:50]

In [None]:
# Entries for the 50 lowest z-scores. The negative slice start returns the
# whole list when it holds fewer than 50 entries; len(CIs_sorted)-50 would go
# negative in that case and silently drop the leading entries.
CIs_sorted[-50:]

In [None]:
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (e.g. Windows) emit a blank line after every row.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One row per (token, CI) pair; each row is a single cell holding the
    # whole pair, exactly as before.
    writer.writerows([val] for val in CIs_sorted)

# **3a:** Compute log OR for nonmedical words

### Redefine the background corpus to only include nonmedical words

In [None]:
# Background counts are rebuilt from the nonmedical word list only, most
# frequent token first; size3 is the total word count of that background.
nonmedical_counts = Counter(nonmedical_words_list)
c_bg = dict(nonmedical_counts.most_common())
size3 = _size(c_bg)

### Compare nonmedical words for Black vs. White

In [None]:
# Token counts, most frequent first. The Black corpus is drawn with `choices`
# so that it has as many words as the White corpus.
c_1 = dict(Counter(choices(nonmedical_Black, k=len(nonmedical_White))).most_common())
c_2 = dict(Counter(nonmedical_White).most_common())

size1 = _size(c_1)
size2 = _size(c_2)

# Score every token seen in either corpus (union, not intersection).
supported_tokens = set(list(c_1.keys()) + list(c_2.keys()))

ratios = []

for tok in supported_tokens:
    try:
        rat = _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3)
    except Exception:
        # Best effort: print the token that could not be scored and move on.
        # This used to be a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit, so the loop could not be interrupted.
        print(tok)
    else:
        ratios.append((tok, rat))

# Largest log odds ratio first.
ratios.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# The following nonmedical words had higher odds of occurring for Black patients
# (`ratios` is sorted by log odds ratio, largest first; this shows the top 50).
ratios[:50]

In [None]:
# The following nonmedical words had higher odds of occurring for White patients
# (last 50 entries of `ratios`, i.e. the lowest log odds ratios). The negative
# slice start returns the whole list when it holds fewer than 50 entries;
# len(ratios)-50 would go negative in that case and silently drop the leading
# entries.
ratios[-50:]

In [None]:
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (e.g. Windows) emit a blank line after every row.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One row per (token, log odds ratio) pair; each row is a single cell
    # holding the whole pair, exactly as before.
    writer.writerows([val] for val in ratios)

### Compare nonmedical words for Hispanic vs. White

In [None]:
# Token counts, most frequent first. The Hispanic corpus is drawn with
# `choices` so that it has as many words as the White corpus.
c_1 = dict(Counter(choices(nonmedical_Hispanic, k=len(nonmedical_White))).most_common())
c_2 = dict(Counter(nonmedical_White).most_common())

size1 = _size(c_1)
size2 = _size(c_2)

# Score every token seen in either corpus (union, not intersection).
supported_tokens = set(list(c_1.keys()) + list(c_2.keys()))

ratios = []

for tok in supported_tokens:
    try:
        rat = _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3)
    except Exception:
        # Best effort: print the token that could not be scored and move on.
        # This used to be a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit, so the loop could not be interrupted.
        print(tok)
    else:
        ratios.append((tok, rat))

# Largest log odds ratio first.
ratios.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# The following nonmedical words had higher odds of occurring for Hispanic patients
# (`ratios` is sorted by log odds ratio, largest first; this shows the top 50).
ratios[:50]

In [None]:
# The following nonmedical words had higher odds of occurring for White patients
# (last 50 entries of `ratios`, i.e. the lowest log odds ratios). The negative
# slice start returns the whole list when it holds fewer than 50 entries;
# len(ratios)-50 would go negative in that case and silently drop the leading
# entries.
ratios[-50:]

In [None]:
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (e.g. Windows) emit a blank line after every row.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One row per (token, log odds ratio) pair; each row is a single cell
    # holding the whole pair, exactly as before.
    writer.writerows([val] for val in ratios)

### Compare nonmedical words for Asian vs. White

In [None]:
# Token counts, most frequent first. The Asian corpus is drawn with `choices`
# so that it has as many words as the White corpus.
c_1 = dict(Counter(choices(nonmedical_Asian, k=len(nonmedical_White))).most_common())
c_2 = dict(Counter(nonmedical_White).most_common())

size1 = _size(c_1)
size2 = _size(c_2)

# Score every token seen in either corpus (union, not intersection).
supported_tokens = set(list(c_1.keys()) + list(c_2.keys()))

ratios = []

for tok in supported_tokens:
    try:
        rat = _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3)
    except Exception:
        # Best effort: print the token that could not be scored and move on.
        # This used to be a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit, so the loop could not be interrupted.
        print(tok)
    else:
        ratios.append((tok, rat))

# Largest log odds ratio first.
ratios.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# The following nonmedical words had higher odds of occurring for Asian patients
# (`ratios` is sorted by log odds ratio, largest first; this shows the top 50).
ratios[:50]

In [None]:
# The following nonmedical words had higher odds of occurring for White patients
# (last 50 entries of `ratios`, i.e. the lowest log odds ratios). The negative
# slice start returns the whole list when it holds fewer than 50 entries;
# len(ratios)-50 would go negative in that case and silently drop the leading
# entries.
ratios[-50:]

In [None]:
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (e.g. Windows) emit a blank line after every row.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One row per (token, log odds ratio) pair; each row is a single cell
    # holding the whole pair, exactly as before.
    writer.writerows([val] for val in ratios)

### Compare nonmedical words for Other vs. White

In [None]:
# Token counts, most frequent first. The Other corpus is drawn with `choices`
# so that it has as many words as the White corpus.
c_1 = dict(Counter(choices(nonmedical_Other, k=len(nonmedical_White))).most_common())
c_2 = dict(Counter(nonmedical_White).most_common())

size1 = _size(c_1)
size2 = _size(c_2)

# Score every token seen in either corpus (union, not intersection).
supported_tokens = set(list(c_1.keys()) + list(c_2.keys()))

ratios = []

for tok in supported_tokens:
    try:
        rat = _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3)
    except Exception:
        # Best effort: print the token that could not be scored and move on.
        # This used to be a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit, so the loop could not be interrupted.
        print(tok)
    else:
        ratios.append((tok, rat))

# Largest log odds ratio first.
ratios.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# The following nonmedical words had higher odds of occurring for Other patients
# (`ratios` is sorted by log odds ratio, largest first; this shows the top 50).
ratios[:50]

In [None]:
# The following nonmedical words had higher odds of occurring for White patients
# (last 50 entries of `ratios`, i.e. the lowest log odds ratios). The negative
# slice start returns the whole list when it holds fewer than 50 entries;
# len(ratios)-50 would go negative in that case and silently drop the leading
# entries.
ratios[-50:]

In [None]:
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (e.g. Windows) emit a blank line after every row.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One row per (token, log odds ratio) pair; each row is a single cell
    # holding the whole pair, exactly as before.
    writer.writerows([val] for val in ratios)

# **3b:** Compute z-scores and 95% confidence intervals for the log odds ratios for nonmedical words

### Black vs White

In [None]:
# Token counts, most frequent first. The Black corpus is drawn with `choices`
# so that it has as many words as the White corpus.
c_1 = dict(Counter(choices(nonmedical_Black, k=len(nonmedical_White))).most_common())
c_2 = dict(Counter(nonmedical_White).most_common())

size1 = _size(c_1)
size2 = _size(c_2)

# Score every token seen in either corpus (union, not intersection).
supported_tokens = set(list(c_1.keys()) + list(c_2.keys()))

scores = []

for tok in supported_tokens:
    try:
        score = _z_score(tok, c_1, c_2, c_bg, size1, size2, size3)
    except Exception:
        # Best effort: print the token that could not be scored and move on.
        # This used to be a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit, so the loop could not be interrupted.
        print(tok)
    else:
        scores.append((tok, score))

# Largest z-score first.
scores.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# Words with z-scores > 1.96 had higher odds of occurring for Black patients at the 5% level of significance.
# `scores` is sorted by z-score, largest first, so this shows the 50 highest.
scores[:50]

In [None]:
# Words with z-scores < -1.96 had higher odds of occurring for White patients at the 5% level of significance.
# Last 50 entries of `scores` (the lowest z-scores). The negative slice start
# returns the whole list when it holds fewer than 50 entries; len(scores)-50
# would go negative in that case and silently drop the leading entries.
scores[-50:]

In [None]:
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (e.g. Windows) emit a blank line after every row.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One row per (token, z-score) pair; each row is a single cell holding the
    # whole pair, exactly as before.
    writer.writerows([val] for val in scores)

In [None]:
# NOTE(review): c_1 is rebuilt here with a new call to `choices`. If that is
# random.choices (its import is not visible in this part of the file), the
# sample differs from the one the z-scores above were computed on - confirm
# this is intended before lining the CIs up with those z-scores.
c_1 = dict(Counter(choices(nonmedical_Black, k=len(nonmedical_White))).most_common())
c_2 = dict(Counter(nonmedical_White).most_common())

size1, size2 = _size(c_1), _size(c_2)

# Union of both vocabularies; no intersection with c_2 or c_bg is applied.
supported_tokens = set([*c_1.keys(), *c_2.keys()])

In [None]:
# Map each token to the value returned by _log_odds_CI. Tokens for which the
# call raises are printed and skipped, the same way the z-score loop for this
# comparison handles them; previously a single failing token aborted the cell.
CIs = {}
for tok in supported_tokens:
    try:
        CIs[tok] = _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    except Exception:
        print(tok)

# Sort to match the ordering of the z-scores above; a token without a
# computed CI gets None.
CIs_sorted = [(tok, CIs.get(tok)) for tok, _ in scores]

    

In [None]:
# CIs_sorted follows the order of `scores`, so these are the entries for the
# 50 highest z-scores (Black vs. White, nonmedical words).
CIs_sorted[:50]

In [None]:
# Entries for the 50 lowest z-scores. The negative slice start returns the
# whole list when it holds fewer than 50 entries; len(CIs_sorted)-50 would go
# negative in that case and silently drop the leading entries.
CIs_sorted[-50:]

In [None]:
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (e.g. Windows) emit a blank line after every row.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One row per (token, CI) pair; each row is a single cell holding the
    # whole pair, exactly as before.
    writer.writerows([val] for val in CIs_sorted)

### Hispanic vs White

In [None]:
# Token counts, most frequent first. The Hispanic corpus is drawn with
# `choices` so that it has as many words as the White corpus.
c_1 = dict(Counter(choices(nonmedical_Hispanic, k=len(nonmedical_White))).most_common())
c_2 = dict(Counter(nonmedical_White).most_common())

size1 = _size(c_1)
size2 = _size(c_2)

# Score every token seen in either corpus (union, not intersection).
supported_tokens = set(list(c_1.keys()) + list(c_2.keys()))

scores = []

for tok in supported_tokens:
    try:
        score = _z_score(tok, c_1, c_2, c_bg, size1, size2, size3)
    except Exception:
        # Best effort: print the token that could not be scored and move on.
        # This used to be a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit, so the loop could not be interrupted.
        print(tok)
    else:
        scores.append((tok, score))

# Largest z-score first.
scores.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# Words with z-scores > 1.96 had higher odds of occurring for Hispanic patients at the 5% level of significance.
# `scores` is sorted by z-score, largest first, so this shows the 50 highest.
scores[:50]

In [None]:
# Words with z-scores < -1.96 had higher odds of occurring for White patients at the 5% level of significance.
# Last 50 entries of `scores` (the lowest z-scores). The negative slice start
# returns the whole list when it holds fewer than 50 entries; len(scores)-50
# would go negative in that case and silently drop the leading entries.
scores[-50:]

In [None]:
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (e.g. Windows) emit a blank line after every row.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One row per (token, z-score) pair; each row is a single cell holding the
    # whole pair, exactly as before.
    writer.writerows([val] for val in scores)

In [None]:
# NOTE(review): c_1 is rebuilt here with a new call to `choices`. If that is
# random.choices (its import is not visible in this part of the file), the
# sample differs from the one the z-scores above were computed on - confirm
# this is intended before lining the CIs up with those z-scores.
c_1 = dict(Counter(choices(nonmedical_Hispanic, k=len(nonmedical_White))).most_common())
c_2 = dict(Counter(nonmedical_White).most_common())

size1, size2 = _size(c_1), _size(c_2)

# Union of both vocabularies; no intersection with c_2 or c_bg is applied.
supported_tokens = set([*c_1.keys(), *c_2.keys()])

In [None]:
# Map each token to the value returned by _log_odds_CI. Tokens for which the
# call raises are printed and skipped, the same way the z-score loop for this
# comparison handles them; previously a single failing token aborted the cell.
CIs = {}
for tok in supported_tokens:
    try:
        CIs[tok] = _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    except Exception:
        print(tok)

# Sort to match the ordering of the z-scores above; a token without a
# computed CI gets None.
CIs_sorted = [(tok, CIs.get(tok)) for tok, _ in scores]

    

In [None]:
# CIs_sorted follows the order of `scores`, so these are the entries for the
# 50 highest z-scores (Hispanic vs. White, nonmedical words).
CIs_sorted[:50]

In [None]:
# Entries for the 50 lowest z-scores. The negative slice start returns the
# whole list when it holds fewer than 50 entries; len(CIs_sorted)-50 would go
# negative in that case and silently drop the leading entries.
CIs_sorted[-50:]

In [None]:
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (e.g. Windows) emit a blank line after every row.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One row per (token, CI) pair; each row is a single cell holding the
    # whole pair, exactly as before.
    writer.writerows([val] for val in CIs_sorted)

### Asian vs White

In [None]:
# Token counts, most frequent first. The Asian corpus is drawn with `choices`
# so that it has as many words as the White corpus.
c_1 = dict(Counter(choices(nonmedical_Asian, k=len(nonmedical_White))).most_common())
c_2 = dict(Counter(nonmedical_White).most_common())

size1 = _size(c_1)
size2 = _size(c_2)

# Score every token seen in either corpus (union, not intersection).
supported_tokens = set(list(c_1.keys()) + list(c_2.keys()))

scores = []

for tok in supported_tokens:
    try:
        score = _z_score(tok, c_1, c_2, c_bg, size1, size2, size3)
    except Exception:
        # Best effort: print the token that could not be scored and move on.
        # This used to be a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit, so the loop could not be interrupted.
        print(tok)
    else:
        scores.append((tok, score))

# Largest z-score first.
scores.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# Words with z-scores > 1.96 had higher odds of occurring for Asian patients at the 5% level of significance.
# `scores` is sorted by z-score, largest first, so this shows the 50 highest.
scores[:50]

In [None]:
# Words with z-scores < -1.96 had higher odds of occurring for White patients at the 5% level of significance.
# Last 50 entries of `scores` (the lowest z-scores). The negative slice start
# returns the whole list when it holds fewer than 50 entries; len(scores)-50
# would go negative in that case and silently drop the leading entries.
scores[-50:]

In [None]:
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (e.g. Windows) emit a blank line after every row.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One row per (token, z-score) pair; each row is a single cell holding the
    # whole pair, exactly as before.
    writer.writerows([val] for val in scores)

In [None]:
# NOTE(review): c_1 is rebuilt here with a new call to `choices`. If that is
# random.choices (its import is not visible in this part of the file), the
# sample differs from the one the z-scores above were computed on - confirm
# this is intended before lining the CIs up with those z-scores.
c_1 = dict(Counter(choices(nonmedical_Asian, k=len(nonmedical_White))).most_common())
c_2 = dict(Counter(nonmedical_White).most_common())

size1, size2 = _size(c_1), _size(c_2)

# Union of both vocabularies; no intersection with c_2 or c_bg is applied.
supported_tokens = set([*c_1.keys(), *c_2.keys()])

In [None]:
# Map each token to the value returned by _log_odds_CI. Tokens for which the
# call raises are printed and skipped, the same way the z-score loop for this
# comparison handles them; previously a single failing token aborted the cell.
CIs = {}
for tok in supported_tokens:
    try:
        CIs[tok] = _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    except Exception:
        print(tok)

# Sort to match the ordering of the z-scores above; a token without a
# computed CI gets None.
CIs_sorted = [(tok, CIs.get(tok)) for tok, _ in scores]

    

In [None]:
# CIs_sorted follows the order of `scores`, so these are the entries for the
# 50 highest z-scores (Asian vs. White, nonmedical words).
CIs_sorted[:50]

In [None]:
# Entries for the 50 lowest z-scores. The negative slice start returns the
# whole list when it holds fewer than 50 entries; len(CIs_sorted)-50 would go
# negative in that case and silently drop the leading entries.
CIs_sorted[-50:]

In [None]:
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (e.g. Windows) emit a blank line after every row.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One row per (token, CI) pair; each row is a single cell holding the
    # whole pair, exactly as before.
    writer.writerows([val] for val in CIs_sorted)

### Other vs White

In [None]:
# Token counts, most frequent first. The Other corpus is drawn with `choices`
# so that it has as many words as the White corpus.
c_1 = dict(Counter(choices(nonmedical_Other, k=len(nonmedical_White))).most_common())
c_2 = dict(Counter(nonmedical_White).most_common())

size1 = _size(c_1)
size2 = _size(c_2)

# Score every token seen in either corpus (union, not intersection).
supported_tokens = set(list(c_1.keys()) + list(c_2.keys()))

scores = []

for tok in supported_tokens:
    try:
        score = _z_score(tok, c_1, c_2, c_bg, size1, size2, size3)
    except Exception:
        # Best effort: print the token that could not be scored and move on.
        # This used to be a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit, so the loop could not be interrupted.
        print(tok)
    else:
        scores.append((tok, score))

# Largest z-score first.
scores.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# Words with z-scores > 1.96 had higher odds of occurring for Other patients at the 5% level of significance.
# `scores` is sorted by z-score, largest first, so this shows the 50 highest.
scores[:50]

In [None]:
# Words with z-scores < -1.96 had higher odds of occurring for White patients at the 5% level of significance.
# Last 50 entries of `scores` (the lowest z-scores). The negative slice start
# returns the whole list when it holds fewer than 50 entries; len(scores)-50
# would go negative in that case and silently drop the leading entries.
scores[-50:]

In [None]:
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (e.g. Windows) emit a blank line after every row.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One row per (token, z-score) pair; each row is a single cell holding the
    # whole pair, exactly as before.
    writer.writerows([val] for val in scores)

In [None]:
# NOTE(review): c_1 is rebuilt here with a new call to `choices`. If that is
# random.choices (its import is not visible in this part of the file), the
# sample differs from the one the z-scores above were computed on - confirm
# this is intended before lining the CIs up with those z-scores.
c_1 = dict(Counter(choices(nonmedical_Other, k=len(nonmedical_White))).most_common())
c_2 = dict(Counter(nonmedical_White).most_common())

size1, size2 = _size(c_1), _size(c_2)

# Union of both vocabularies; no intersection with c_2 or c_bg is applied.
supported_tokens = set([*c_1.keys(), *c_2.keys()])

In [None]:
# Map each token to the value returned by _log_odds_CI. Tokens for which the
# call raises are printed and skipped, the same way the z-score loop for this
# comparison handles them; previously a single failing token aborted the cell.
CIs = {}
for tok in supported_tokens:
    try:
        CIs[tok] = _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    except Exception:
        print(tok)

# Sort to match the ordering of the z-scores above; a token without a
# computed CI gets None.
CIs_sorted = [(tok, CIs.get(tok)) for tok, _ in scores]

    

In [None]:
# CIs_sorted follows the order of `scores`, so these are the entries for the
# 50 highest z-scores (Other vs. White, nonmedical words).
CIs_sorted[:50]

In [None]:
# Entries for the 50 lowest z-scores. The negative slice start returns the
# whole list when it holds fewer than 50 entries; len(CIs_sorted)-50 would go
# negative in that case and silently drop the leading entries.
CIs_sorted[-50:]

In [None]:
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (e.g. Windows) emit a blank line after every row.
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    # One row per (token, CI) pair; each row is a single cell holding the
    # whole pair, exactly as before.
    writer.writerows([val] for val in CIs_sorted)