# File description:

This file computes the log odds ratios with Dirichlet priors comparing word frequency across patient gender. Gender in this dataset is limited to male/female. Female patient notes are compared to male patient notes.

The log odds ratios are computed using lists of all words used in the notes for patients of each gender. These lists are computed using the Create_Word_Lists file.

In [None]:
### Load required packages ###
import pickle
import csv
import argparse
import logging
import math
import numpy as np
import operator
from typing import Iterator
from collections import Counter
from collections import Counter, OrderedDict

## Load lists of words from pickle files

### Load lists of MEDICAL words

In [None]:
# Load BROADBAND lists
# Each file is opened in a `with` block so the handle is closed after loading.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('Path_to_list', 'rb') as fh:
    medical_words_Female_broadband = pickle.load(fh)
with open('Path_to_list', 'rb') as fh:
    medical_words_Male_broadband = pickle.load(fh)

In [None]:
# Load Thoracic lists
# Each file is opened in a `with` block so the handle is closed after loading.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('Path_to_list', 'rb') as fh:
    medical_words_Female_thor = pickle.load(fh)
with open('Path_to_list', 'rb') as fh:
    medical_words_Male_thor = pickle.load(fh)

In [None]:
# Pool the broadband and thoracic MEDICAL word lists, one pooled list per gender
medical_Male = medical_words_Male_broadband + medical_words_Male_thor
medical_Female = medical_words_Female_broadband + medical_words_Female_thor

### Load lists of NON-MEDICAL words

In [None]:
# Load BROADBAND lists
# Each file is opened in a `with` block so the handle is closed after loading.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('Path_to_list', 'rb') as fh:
    nonmedical_words_Female_broadband = pickle.load(fh)
with open('Path_to_list', 'rb') as fh:
    nonmedical_words_Male_broadband = pickle.load(fh)

In [None]:
# Load Thoracic lists
# Each file is opened in a `with` block so the handle is closed after loading.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('Path_to_list', 'rb') as fh:
    nonmedical_words_Female_thor = pickle.load(fh)
with open('Path_to_list', 'rb') as fh:
    nonmedical_words_Male_thor = pickle.load(fh)

In [None]:
# Pool the broadband and thoracic NON-MEDICAL word lists, one pooled list per gender
nonmedical_Male = nonmedical_words_Male_broadband + nonmedical_words_Male_thor
nonmedical_Female = nonmedical_words_Female_broadband + nonmedical_words_Female_thor

## Run log-odds ratio with Dirichlet prior

In [None]:
### The following code computes the Log Odds Ratio Informative Dirichlet Prior ###
def _size(corpus: dict) -> int:
    """Return the total number of words in a corpus, counting every repetition.

    `corpus` maps each word to its count; the result is the sum of the counts.
    """
    total = 0
    for count in corpus.values():
        total += count
    return total

In [None]:
def _log_odds(
    word: str,
    c1: dict,
    c2: dict,
    bg: dict,
    size1: int,
    size2: int,
    size3: int,
) -> float:
    """Compute the log odds ratio (with a Dirichlet prior) of a given word.

    Args:
        word: Token to score.
        c1: Word counts of the first corpus.
        c2: Word counts of the second corpus.
        bg: Word counts of the background corpus; its count for `word` is
            added to both corpus counts as the prior.
        size1: Total word count of `c1`.
        size2: Total word count of `c2`.
        size3: Total word count of `bg`.

    Returns:
        log-odds of `word` in corpus 1 minus log-odds of `word` in corpus 2.

    Raises:
        KeyError: If `word` has no entry in `bg`.
    """
    prior = bg[word]

    # A word absent from a corpus contributes a count of 0, leaving only the
    # prior. dict.get avoids building a set of all keys on every call.
    numerator_1 = c1.get(word, 0) + prior
    numerator_2 = c2.get(word, 0) + prior

    denom_1 = size1 + size3 - numerator_1
    denom_2 = size2 + size3 - numerator_2

    return math.log(numerator_1 / denom_1) - math.log(numerator_2 / denom_2)



In [None]:
def _z_score(
    word: str,
    c1: dict,
    c2: dict,
    bg: dict,
    size1: int,
    size2: int,
    size3: int,
) -> float:
    """Compute the z-score of a word's log odds ratio (with a Dirichlet prior).

    Args:
        word: Token to score.
        c1: Word counts of the first corpus.
        c2: Word counts of the second corpus.
        bg: Word counts of the background corpus; its count for `word` is
            added to both corpus counts as the prior.
        size1: Total word count of `c1`.
        size2: Total word count of `c2`.
        size3: Total word count of `bg`.

    Returns:
        The log odds ratio divided by its estimated standard deviation,
        where the variance is 1/numerator_1 + 1/numerator_2.

    Raises:
        KeyError: If `word` has no entry in `bg`.
    """
    prior = bg[word]

    # A word absent from a corpus contributes a count of 0, leaving only the
    # prior. dict.get avoids building a set of all keys on every call.
    numerator_1 = c1.get(word, 0) + prior
    numerator_2 = c2.get(word, 0) + prior

    denom_1 = size1 + size3 - numerator_1
    denom_2 = size2 + size3 - numerator_2

    raw_logodds = math.log(numerator_1 / denom_1) - math.log(
        numerator_2 / denom_2
    )

    variance = (1 / numerator_1) + (1 / numerator_2)
    return raw_logodds / math.sqrt(variance)



In [None]:
def _log_odds_CI(
    word: str,
    c1: dict,
    c2: dict,
    bg: dict,
    size1: int,
    size2: int,
    size3: int,
) -> list:
    """Compute the 95% confidence interval of a word's log odds ratio.

    Args:
        word: Token to score.
        c1: Word counts of the first corpus.
        c2: Word counts of the second corpus.
        bg: Word counts of the background corpus; its count for `word` is
            added to both corpus counts as the prior.
        size1: Total word count of `c1`.
        size2: Total word count of `c2`.
        size3: Total word count of `bg`.

    Returns:
        A two-element list `[lower, upper]`: the log odds ratio minus/plus
        1.96 standard deviations, each rounded to 3 decimal places.

    Raises:
        KeyError: If `word` has no entry in `bg`.
    """
    prior = bg[word]

    # A word absent from a corpus contributes a count of 0, leaving only the
    # prior. dict.get avoids building a set of all keys on every call.
    numerator_1 = c1.get(word, 0) + prior
    numerator_2 = c2.get(word, 0) + prior

    denom_1 = size1 + size3 - numerator_1
    denom_2 = size2 + size3 - numerator_2

    raw_logodds = math.log(numerator_1 / denom_1) - math.log(
        numerator_2 / denom_2
    )

    variance = (1 / numerator_1) + (1 / numerator_2)
    half_width = 1.96 * math.sqrt(variance)
    return [round(raw_logodds - half_width, 3), round(raw_logodds + half_width, 3)]

# 0: Create background corpus

The Dirichlet prior will shrink odds ratios toward their "global values" from the background corpus. Therefore, it's important to ensure that each gender is adequately represented in the background corpus. We will use all medical and nonmedical words from both genders as a background corpus.

See Monroe paper: http://languagelog.ldc.upenn.edu/myl/Monroe.pdf

### New oversampling scheme for background corpus

Oversample the nonmedical words in the female group to match the number of nonmedical words in the male group.

Do the same for all medical words

In [None]:
# Import required packages
from random import choices
from collections import Counter, OrderedDict

# Randomly resample (with replacement) the Female words so that each gender
# contributes the same number of words, then append the Male words.
nonmedical_words_list = choices(nonmedical_Female, k=len(nonmedical_Male)) + nonmedical_Male
medical_words_list = choices(medical_Female, k=len(medical_Male)) + medical_Male

# Background word counts, ordered from most to least frequent
c_bg = dict(Counter(nonmedical_words_list + medical_words_list).most_common())

# **1a:** Compute log-odds ratios for medical and non-medical words combined

### Compare Female vs. Male
A log odds ratio > 0 (equivalently, an odds ratio > 1) means the odds of the word occurring for a Female patient are greater than the odds of the word occurring for a Male patient.

In [None]:
# Word counts per gender (medical + nonmedical), ordered from most to least frequent
c_1 = dict(Counter(medical_Female + nonmedical_Female).most_common())
c_2 = dict(Counter(medical_Male + nonmedical_Male).most_common())

# Total word counts of the Female, Male and background corpora
size1, size2, size3 = _size(c_1), _size(c_2), _size(c_bg)

# Only tokens present in both gender corpora and in the background are scored
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# (token, log odds ratio) pairs, largest ratio first
ratios = sorted(
    ((tok, _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=operator.itemgetter(1),
    reverse=True,
)

In [None]:
# First 50 entries of the ranking: the words with the highest log odds ratios (Female side)
ratios[0:50]

In [None]:
# Last 50 entries of the ranking: the words with the lowest log odds ratios (Male side).
# A negative start index is used so a ranking with fewer than 50 entries is shown in full.
ratios[-50:]

In [None]:
# Save the ranked log odds ratios; each row is one cell holding the (token, ratio) tuple.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in ratios:
        writer.writerow([val])

# **1b:** Compute z-scores for medical and non-medical words combined

### Compare Female vs. Male
Account for uncertainty about the log odds ratios by using the z-score (logodds/standarddev(logodds)), per Monroe et al.: http://languagelog.ldc.upenn.edu/myl/Monroe.pdf

Compare the z-score to some threshold (commonly, 1.96). If |z-score| > 1.96, the word is significant.

In [None]:
# Female (c_1) and Male (c_2) word counts over medical + nonmedical words, most frequent first
c_1 = dict(Counter(medical_Female + nonmedical_Female).most_common())
c_2 = dict(Counter(medical_Male + nonmedical_Male).most_common())

# Total word counts of the two gender corpora and of the background corpus
size1, size2, size3 = _size(c_1), _size(c_2), _size(c_bg)

# Tokens shared by both gender corpora and the background corpus
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# (token, z-score) pairs, largest z-score first
scores = sorted(
    ((tok, _z_score(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=operator.itemgetter(1),
    reverse=True,
)

In [None]:
# First 50 entries of the ranking: the words with the highest z-scores (Female side)
scores[0:50]

In [None]:
# Last 50 entries of the ranking: the words with the lowest z-scores (Male side).
# A negative start index is used so a ranking with fewer than 50 entries is shown in full.
scores[-50:]

In [None]:
# Save the ranked z-scores computed in this section (previously the log odds
# ratios from step 1a were written here by mistake).
# Each row is one cell holding the (token, z-score) tuple.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in scores:
        writer.writerow([val])

# 1c: Compute the 95% confidence intervals for the log odds ratios for medical and nonmedical words combined

### Female vs Male

In [None]:
# Per-gender word counts (medical + nonmedical), sorted by descending frequency
c_1 = dict(Counter(medical_Female + nonmedical_Female).most_common())
c_2 = dict(Counter(medical_Male + nonmedical_Male).most_common())

# Corpus sizes: Female, Male, background
size1, size2, size3 = _size(c_1), _size(c_2), _size(c_bg)

# Restrict to tokens that occur in all three corpora
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# Map each supported token to the 95% confidence interval of its log odds ratio
CIs = {
    tok: _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    for tok in supported_tokens
}

# Sort to match the ordering of the z-scores above;
# a token without a computed interval is paired with None.
CIs_sorted = [(tok, CIs.get(tok)) for tok in dict(scores)]

    

In [None]:
# Confidence intervals for the first 50 entries of the z-score ranking (Female side)
CIs_sorted[0:50]

In [None]:
# Confidence intervals for the last 50 entries of the z-score ranking (Male side).
# A negative start index is used so a ranking with fewer than 50 entries is shown in full.
CIs_sorted[-50:]

In [None]:
# Save the confidence intervals computed in this section (previously the log
# odds ratios from step 1a were written here by mistake).
# Each row is one cell holding the (token, [lower, upper]) tuple.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in CIs_sorted:
        writer.writerow([val])

# **2a:** Compute log OR for medical words

### Re-define the background corpus to include only medical words

In [None]:
# Background counts restricted to the (oversampled) medical words, most frequent first
c_bg = dict(Counter(medical_words_list).most_common())

### Compare for Female vs. Male

In [None]:
# Female (c_1) and Male (c_2) medical word counts, most frequent first
c_1 = dict(Counter(medical_Female).most_common())
c_2 = dict(Counter(medical_Male).most_common())

size1 = _size(c_1)
size2 = _size(c_2)
# The background corpus was just redefined to medical words only, so its size
# must be recomputed; otherwise the size of the combined background is reused.
size3 = _size(c_bg)

# Score every token seen for either gender
supported_tokens = set(c_1) | set(c_2)

# Tokens missing from the background corpus have no prior count and are
# skipped explicitly, instead of swallowing every exception with a bare except.
ratios = [
    (tok, _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3))
    for tok in supported_tokens
    if tok in c_bg
]

ratios.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# First 50 entries of the ranking: the medical words with the highest log odds ratios (Female side)
ratios[0:50]

In [None]:
# Last 50 entries of the ranking: the medical words with the lowest log odds ratios (Male side).
# A negative start index is used so a ranking with fewer than 50 entries is shown in full.
ratios[-50:]

In [None]:
# Save the ranked medical-word log odds ratios; each row is one cell holding the (token, ratio) tuple.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in ratios:
        writer.writerow([val])

# 2b: Compute z-scores for medical words

### Compare Female vs. Male
Account for uncertainty about the log odds ratios by using the z-score (logodds/standarddev(logodds)), per Monroe et al.: http://languagelog.ldc.upenn.edu/myl/Monroe.pdf

Compare the z-score to some threshold (commonly, 1.96). If |z-score| > 1.96, the word is significant.

In [None]:
# Female (c_1) and Male (c_2) medical word counts, most frequent first
c_1 = dict(Counter(medical_Female).most_common())
c_2 = dict(Counter(medical_Male).most_common())

# Total word counts of the two gender corpora and of the background corpus
size1, size2, size3 = _size(c_1), _size(c_2), _size(c_bg)

# Tokens shared by both gender corpora and the background corpus
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# (token, z-score) pairs for medical words, largest z-score first
scores = sorted(
    ((tok, _z_score(tok, c_1, c_2, c_bg, size1, size2, size3)) for tok in supported_tokens),
    key=operator.itemgetter(1),
    reverse=True,
)

In [None]:
# Words with z-scores > 1.96 had higher odds of occurring for Female patients at the 5% level of significance
scores[0:50]

In [None]:
# Words with z-scores < -1.96 had higher odds of occurring for Male patients at the 5% level of significance.
# A negative start index is used so a ranking with fewer than 50 entries is shown in full.
scores[-50:]

In [None]:
# Save the ranked medical-word z-scores computed in this section (previously
# the log odds ratios from step 2a were written here by mistake).
# Each row is one cell holding the (token, z-score) tuple.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in scores:
        writer.writerow([val])

# 2c: Compute 95% confidence intervals for the log odds ratios for medical words

### Female vs Male

In [None]:
# Per-gender medical word counts, sorted by descending frequency
c_1 = dict(Counter(medical_Female).most_common())
c_2 = dict(Counter(medical_Male).most_common())

# Corpus sizes: Female, Male, background
size1, size2, size3 = _size(c_1), _size(c_2), _size(c_bg)

# Restrict to tokens that occur in all three corpora
supported_tokens = c_1.keys() & c_2.keys() & c_bg.keys()

In [None]:
# Map each supported medical token to the 95% confidence interval of its log odds ratio
CIs = {
    tok: _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    for tok in supported_tokens
}

# Sort to match the ordering of the z-scores above;
# a token without a computed interval is paired with None.
CIs_sorted = [(tok, CIs.get(tok)) for tok in dict(scores)]

    

In [None]:
# Confidence intervals for the first 50 entries of the z-score ranking (Female side)
CIs_sorted[0:50]

In [None]:
# Confidence intervals for the last 50 entries of the z-score ranking (Male side).
# A negative start index is used so a ranking with fewer than 50 entries is shown in full.
CIs_sorted[-50:]

In [None]:
# Save the medical-word confidence intervals computed in this section
# (previously the log odds ratios from step 2a were written here by mistake).
# Each row is one cell holding the (token, [lower, upper]) tuple.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in CIs_sorted:
        writer.writerow([val])

# **3a:** Compute log OR for nonmedical words

### Redefine the background corpus to only include nonmedical words

In [None]:
from random import choices
from collections import Counter, OrderedDict

# Draw a fresh random resample (with replacement) of the Female nonmedical words,
# the same length as the Male list, and append the Male nonmedical words.
nonmedical_words_list = choices(nonmedical_Female, k=len(nonmedical_Male)) + nonmedical_Male

# Background counts restricted to nonmedical words (most frequent first), and their total
c_bg = dict(Counter(nonmedical_words_list).most_common())
size3 = _size(c_bg)

### Compare nonmedical words for Female vs. Male

In [None]:
# Female (c_1) and Male (c_2) nonmedical word counts, most frequent first
c_1 = dict(Counter(nonmedical_Female).most_common())
c_2 = dict(Counter(nonmedical_Male).most_common())

size1 = _size(c_1)
size2 = _size(c_2)

# Score every token seen for either gender
supported_tokens = set(c_1) | set(c_2)

# Tokens missing from the background corpus have no prior count and are
# skipped explicitly, instead of swallowing every exception with a bare except.
ratios = [
    (tok, _log_odds(tok, c_1, c_2, c_bg, size1, size2, size3))
    for tok in supported_tokens
    if tok in c_bg
]

ratios.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# First 50 entries of the ranking: the nonmedical words with the highest log odds ratios (Female side)
ratios[0:50]

In [None]:
# Last 50 entries of the ranking: the nonmedical words with the lowest log odds ratios (Male side).
# A negative start index is used so a ranking with fewer than 50 entries is shown in full.
ratios[-50:]

In [None]:
# Save the ranked nonmedical-word log odds ratios; each row is one cell holding the (token, ratio) tuple.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in ratios:
        writer.writerow([val])

# 3b: Compute z-scores for nonmedical words

### Female vs Male

In [None]:
# Female (c_1) and Male (c_2) nonmedical word counts, most frequent first
c_1 = dict(Counter(nonmedical_Female).most_common())
c_2 = dict(Counter(nonmedical_Male).most_common())

size1 = _size(c_1)
size2 = _size(c_2)

# Score every token seen for either gender
supported_tokens = set(c_1) | set(c_2)

# Tokens missing from the background corpus have no prior count and are
# skipped explicitly, instead of swallowing every exception with a bare except.
scores = [
    (tok, _z_score(tok, c_1, c_2, c_bg, size1, size2, size3))
    for tok in supported_tokens
    if tok in c_bg
]

scores.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# Words with z-scores > 1.96 had higher odds of occurring for Female patients at the 5% level of significance
scores[0:50]

In [None]:
# Words with z-scores < -1.96 had higher odds of occurring for Male patients at the 5% level of significance.
# A negative start index is used so a ranking with fewer than 50 entries is shown in full.
scores[-50:]

In [None]:
# Save the ranked nonmedical-word z-scores computed in this section (previously
# the log odds ratios from step 3a were written here by mistake).
# Each row is one cell holding the (token, z-score) tuple.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in scores:
        writer.writerow([val])

# 3c: Compute 95% confidence intervals for the log odds ratios for nonmedical words

### Female vs Male

In [None]:
# Female (c_1) and Male (c_2) nonmedical word counts, most frequent first
c_1 = dict(Counter(nonmedical_Female).most_common())
c_2 = dict(Counter(nonmedical_Male).most_common())

size1 = _size(c_1)
size2 = _size(c_2)

# Use the same token set as the z-score step (3b): every token seen for either
# gender that also has a background count. Intersecting the two gender corpora
# here would leave tokens ranked in 3b without an interval (None) once the
# intervals are re-ordered to follow the z-score ranking.
supported_tokens = (c_1.keys() | c_2.keys()) & c_bg.keys()

In [None]:
# Map each supported nonmedical token to the 95% confidence interval of its log odds ratio
CIs = {
    tok: _log_odds_CI(tok, c_1, c_2, c_bg, size1, size2, size3)
    for tok in supported_tokens
}

# Sort to match the ordering of the z-scores above;
# a token without a computed interval is paired with None.
CIs_sorted = [(tok, CIs.get(tok)) for tok in dict(scores)]

# Keep the z-scores ordered from largest to smallest
scores.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# Confidence intervals for the first 50 entries of the z-score ranking (Female side)
CIs_sorted[0:50]

In [None]:
# Confidence intervals for the last 50 entries of the z-score ranking (Male side).
# A negative start index is used so a ranking with fewer than 50 entries is shown in full.
CIs_sorted[-50:]

In [None]:
# Save the nonmedical-word confidence intervals computed in this section
# (previously the log odds ratios from step 3a were written here by mistake).
# Each row is one cell holding the (token, [lower, upper]) tuple.
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('<Path to save this csv>', 'w', newline='') as f:
    writer = csv.writer(f)
    for val in CIs_sorted:
        writer.writerow([val])