# NLST Demographic Info

Handle the demographic information columns (there are so many) before examining performance.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import json

from utilities.info import *
from utilities.predictions import sybil_label

# Path to the participant-level CSV. INPUT_DIR is not defined in this notebook;
# presumably it is provided by the star import from utilities.info -- confirm.
DICT_PATH = f"{INPUT_DIR}/participant_d040722.csv"
# Bare expression: echoes FILE_DIR as the cell output so the working directory is visible.
FILE_DIR

## Load Demographic Info and Choose Rows Based on Patients

Venk21 data. Our main dataset, including validation predictions.

In [None]:
# Input: predictions table whose 'PatientID' values select the participants to keep.
PREDS_PATH = f"{FILE_DIR}/nlst_allmodels.csv"
# Switch read by later cells to decide which extra columns are created/recorded.
INCLUDES_VENK21_DATA = True

# Outputs: selected participant-level columns, predictions merged with those columns,
# and the JSON description of the chosen column groups.
OUTDEMOS_PATH = f"{FILE_DIR}/nlst_demos.csv"
OUTMERGE_PATH = f"{FILE_DIR}/nlst_allmodels_demos.csv"
OUTCOLS_PATH = f"{FILE_DIR}/nlst_democols.json"

Sybil training data splits. For prevalence checks.

In [None]:
# Alternate configuration, disabled by default. Uncomment every assignment below to
# override the paths and the INCLUDES_VENK21_DATA flag set in the previous cell.
# PREDS_PATH = f"{INPUT_DIR}/sybil-nlst-info/sybil-nlst-splitinfo.csv"
# INCLUDES_VENK21_DATA = False

# OUTDEMOS_PATH = f"{FILE_DIR}/nlst_sybil_demos_patientlevel.csv"
# OUTMERGE_PATH = f"{FILE_DIR}/nlst_sybil_demos.csv"
# OUTCOLS_PATH = f"{FILE_DIR}/nlst_sybil_democols.json"

In [None]:
# Load the predictions table and print its column/dtype/non-null summary.
nlst_preds = pd.read_csv(PREDS_PATH)
nlst_preds.info()

In [None]:
# Load the full participant table and print its column/dtype/non-null summary.
nlst_dict_full = pd.read_csv(DICT_PATH)
nlst_dict_full.info()

In [None]:
# Keep only participants whose pid appears as a PatientID in the predictions table,
# then renumber the rows from 0.
in_preds = nlst_dict_full['pid'].isin(nlst_preds['PatientID'])
df = nlst_dict_full.loc[in_preds].reset_index(drop=True)
df.info()

## Choose Demographic Columns

In [None]:
# Registry of chosen columns: numeric and categorical groups keyed by topic,
# plus a flat list of identifier/bookkeeping columns.
cols = {"num": {}, "cat": {}, "info": ['PatientID', 'study']}
# Mirror pid under the name used by the predictions table.
df['PatientID'] = df['pid']
df['study'].value_counts(dropna=False)

Next, we analyze and aggregate the columns, since that still leaves us with about 140 at the end.

### Demographic columns

In [None]:
# Raw demographic columns, split into categorical and numeric.
col_demo_cat = ['race', 'gender', 'ethnic', 'marital', 'educat']
col_demo_num = ['age', 'height', 'weight']
# Missing-value count for each of them.
df[[*col_demo_cat, *col_demo_num]].isna().sum()

In [None]:
# Recode education codes 8, 95, 98 and 99 as missing (presumably "other"/unknown
# codes -- confirm against the NLST data dictionary); counts are printed before and after.
# The result is assigned back rather than calling replace(..., inplace=True) on
# df['educat']: that chained inplace call acts on a temporary Series, which pandas 2.2
# warns about and which leaves df unchanged under Copy-on-Write.
print(df['educat'].value_counts(dropna=False))
df['educat'] = df['educat'].replace([8, 95, 98, 99], np.nan)
print(df['educat'].value_counts(dropna=False))

In [None]:
def _hs_or_more(educat_code):
    """Return 1 for codes >= 3, 0 for codes < 3, and None when neither holds (NaN)."""
    if educat_code >= 3:
        return 1
    if educat_code < 3:
        return 0
    return None


df['HS-or-more'] = df['educat'].apply(_hs_or_more)
col_demo_cat.append('HS-or-more')
df['HS-or-more'].value_counts(dropna=False)

In [None]:
def _more_than_hs(educat_code):
    """Return 1 for codes >= 4, 0 for codes < 4, and None when neither holds (NaN)."""
    if educat_code >= 4:
        return 1
    if educat_code < 4:
        return 0
    return None


df['more-than-HS'] = df['educat'].apply(_more_than_hs)
col_demo_cat.append('more-than-HS')
df['more-than-HS'].value_counts(dropna=False)

In [None]:
# Recode ethnicity codes 7, 95, 98 and 99 as missing (presumably unknown/refused
# codes -- confirm against the NLST data dictionary); counts are printed before and after.
# Assign the result back: an inplace replace on df['ethnic'] is a chained inplace call
# that does not update df under pandas Copy-on-Write.
print(df['ethnic'].value_counts(dropna=False))
df['ethnic'] = df['ethnic'].replace([7, 95, 98, 99], np.nan)
print(df['ethnic'].value_counts(dropna=False))

In [None]:
# Recode marital-status codes 7 and 9 as missing (presumably unknown/refused codes --
# confirm against the NLST data dictionary); counts are printed before and after.
# Assign the result back: an inplace replace on df['marital'] is a chained inplace call
# that does not update df under pandas Copy-on-Write.
print(df['marital'].value_counts(dropna=False))
df['marital'] = df['marital'].replace([7, 9], np.nan)
print(df['marital'].value_counts(dropna=False))

In [None]:
# Recode race codes 7, 95, 96, 98 and 99 as missing (presumably unknown/refused
# codes -- confirm against the NLST data dictionary); counts are printed before and after.
# Assign the result back: an inplace replace on df['race'] is a chained inplace call
# that does not update df under pandas Copy-on-Write.
print(df['race'].value_counts(dropna=False))
df['race'] = df['race'].replace([7, 95, 96, 98, 99], np.nan)
print(df['race'].value_counts(dropna=False))

In [None]:
# Race distribution (missing included) among rows whose ethnic code is 1.
df.loc[df['ethnic'] == 1, 'race'].value_counts(dropna=False)

In [None]:
# True only when race code is 1 and ethnic code is 2. A missing race or ethnic value
# compares unequal, so those rows come out False rather than missing.
race_is_1 = df['race'].eq(1.0)
ethnic_is_2 = df['ethnic'].eq(2.0)
df['NonHispanicWhite'] = race_is_1 & ethnic_is_2
col_demo_cat.append('NonHispanicWhite')
df['NonHispanicWhite'].value_counts(dropna=False)

In [None]:
# True for every education code other than 1.
# NOTE(review): a missing educat also compares "not equal to 1", so those rows are True here.
df['HighSchoolPlus'] = df['educat'].ne(1.0)
col_demo_cat.append('HighSchoolPlus')
df['HighSchoolPlus'].value_counts(dropna=False)

In [None]:
# True when the marital code is 2; a missing marital value yields False.
df['Married'] = df['marital'].eq(2.0)
col_demo_cat.append('Married')
df['Married'].value_counts(dropna=False)

In [None]:
def bmi_calc(height, weight):
    """Compute body-mass index as ``703 * weight / height**2``.

    The 703 factor is the conventional one for height in inches and weight in
    pounds; the units of the inputs are assumed, not checked.

    Args:
        height: scalar or array-like (e.g. a pandas Series) of heights.
        weight: scalar or array-like of weights, aligned with ``height``.

    Returns:
        The BMI value(s), with the same shape as the inputs.
    """
    height_squared = height * height
    scaled_weight = weight * 703
    return scaled_weight / height_squared

# Element-wise BMI from the height and weight columns; registered as a numeric demo column.
df['BMI'] = bmi_calc(height=df['height'], weight=df['weight'])
col_demo_num.append('BMI')
# Summary statistics as a sanity check on the computed values.
df['BMI'].describe()

In [None]:
# Copy of race with codes 3-6 blanked out, leaving only the remaining codes
# (non-mutating: df['race'] itself is untouched).
df['WhiteOrBlack'] = df['race'].replace([3, 4, 5, 6], np.nan)
col_demo_cat.append('WhiteOrBlack')
df['WhiteOrBlack'].value_counts()

In [None]:
# Flag BMI of 25 or above; a missing BMI compares False.
df['Overweight'] = df['BMI'].ge(25)
col_demo_cat.append('Overweight')

In [None]:
# Flag education codes 2 and 5; any other code, including missing, yields False.
df['Unfinished_ed'] = df['educat'].isin([2, 5])
col_demo_cat.append('Unfinished_ed')

In [None]:
# Missing-value count per demographic column, now including the derived ones.
df[col_demo_cat + col_demo_num].isna().sum()

Age and gender are already included in the Venk21 sheet, so we don't need them here as well.

In [None]:
# Register the demographic columns, leaving out 'gender' and 'age'.
# dict.fromkeys de-duplicates (the append cells add a name again if re-run) while keeping
# first-seen order. The previous set difference produced an order that can change
# between kernel sessions, which made the saved JSON and CSV column order unstable.
cols['cat']['demo'] = [c for c in dict.fromkeys(col_demo_cat) if c != 'gender']
cols['num']['demo'] = [c for c in dict.fromkeys(col_demo_num) if c != 'age']

In [None]:
# Show the column registry built so far.
cols

### Smoking columns

In [None]:
# Smoking-related columns, split into categorical and numeric.
col_smoke_cat = ['cigar', 'cigsmok', 'pipe', 'smokelive', 'smokework']
col_smoke_num = ['pkyr', 'smokeage', 'smokeday', 'smokeyr']
# Missing-value count for each of them.
df[[*col_smoke_cat, *col_smoke_num]].isna().sum()

Let's not include `age_quit` - too many None values.

In [None]:
# Value counts (missing included) of each categorical smoking column, side by side;
# drop_duplicates() removes rows whose counts repeat an earlier row.
pd.DataFrame({c: df[c].value_counts(dropna=False) for c in col_smoke_cat}).drop_duplicates()

In [None]:
# Count of rows with flag == 1 for each smoking column, split by gender code
# (row 'M' is gender code 1, row 'F' is gender code 2).
pd.DataFrame(
    {
        c: [int(((df['gender'] == g) & (df[c] == 1)).sum()) for g in (1, 2)]
        for c in col_smoke_cat
    },
    index=['M', 'F'],
)

In [None]:
# Register the smoking columns under the 'smoke' group of the column registry.
cols['cat']['smoke'] = col_smoke_cat
cols['num']['smoke'] = col_smoke_num

### Work history columns

In [None]:
# Work-history columns are named <facet><type>, e.g. 'wrkasbe'.
workfacets = ['res', 'wrk', 'yrs']
worktypes = [
    'asbe', 'baki', 'butc', 'chem', 'coal', 'cott', 'farm',
    'fire', 'flou', 'foun', 'hard', 'pain', 'sand', 'weld',
]
# Map each facet prefix to the list of its full column names.
workcols = {facet: [facet + kind for kind in worktypes] for facet in workfacets}

In [None]:
# Per work type: number of rows with wrk == 1, and number of non-missing res / yrs values.
for kind in worktypes:
    n_worked = int((df[f'wrk{kind}'] == 1).sum())
    n_res = int(df[f'res{kind}'].notna().sum())
    n_yrs = int(df[f'yrs{kind}'].notna().sum())
    print(kind, n_worked, n_res, n_yrs)

In [None]:
# Row-wise sum of the wrk* columns, largest first.
df[workcols['wrk']].sum(axis=1).sort_values(ascending=False)

In [None]:
# Value counts (missing included) of every wrk* column, one column per work type.
pd.DataFrame({t: df[f'wrk{t}'].value_counts(dropna=False) for t in worktypes})

In [None]:
# Count of rows with wrk<type> == 1 per work type, split by gender code
# (row 'M' is gender code 1, row 'F' is gender code 2).
pd.DataFrame(
    {
        kind: [int(((df['gender'] == g) & (df[f'wrk{kind}'] == 1)).sum()) for g in (1, 2)]
        for kind in worktypes
    },
    index=['M', 'F'],
)

In [None]:
# Value counts (missing included) of every res* column; drop_duplicates() removes rows
# whose counts repeat an earlier row.
pd.DataFrame({t: df[f'res{t}'].value_counts(dropna=False) for t in worktypes}).drop_duplicates()

In [None]:
# One boolean column per work type marking res<type> == 0 (presumably "no mask or
# respirator used" -- confirm the coding), then a combined flag that is True when any
# of them is True.
nomask_cols = []
for kind in worktypes:
    flag_col = f'wrknomask{kind}'
    df[flag_col] = df[f'res{kind}'].eq(0)
    nomask_cols.append(flag_col)

df['wrknomask'] = df[nomask_cols].any(axis=1)

In [None]:
# Count of rows with a true wrknomask<type> flag per work type, split by gender code
# (row 'M' is gender code 1, row 'F' is gender code 2).
pd.DataFrame(
    {
        kind: [int(((df['gender'] == g) & (df[f'wrknomask{kind}'] == 1)).sum()) for g in (1, 2)]
        for kind in worktypes
    },
    index=['M', 'F'],
)

In [None]:
# Distribution of the combined wrknomask flag.
df['wrknomask'].value_counts()

In [None]:
# Spot-check the res* values of a single row, selected by position.
# NOTE(review): 3156 is a hard-coded row position; this raises IndexError whenever df has
# fewer rows than that (e.g. with a different PREDS_PATH).
df[workcols['res']].iloc[3156]

In [None]:
# Summary statistics of the yrs* columns.
df[workcols['yrs']].describe()

In [None]:
# Horizontal boxen plot of the yrs* columns, one row per work type.
sns.boxenplot(data=df[workcols['yrs']], orient='h')

In [None]:
# Only the wrk* indicators are registered; the res* and yrs* columns are not added here.
cols['cat']['work'] = workcols['wrk']

### Disease history columns

In [None]:
# Disease-history columns are named <facet><type>, e.g. 'diagcopd' and 'agecopd'.
# diseasefacets is not referenced by the cells below; they spell the prefixes out directly.
diseasefacets = ['age', 'diag']
diseasetypes = ['adas', 'asbe', 'bron', 'chas', 'chro', 'copd', 'diab', 'emph', 'fibr', 'hear', 'hype', 'pneu', 'sarc', 'sili', 'stro', 'tube']

In [None]:
# Value counts (missing included) of every diag* column; drop_duplicates() removes rows
# whose counts repeat an earlier row.
pd.DataFrame({t: df[f'diag{t}'].value_counts(dropna=False) for t in diseasetypes}).drop_duplicates()

In [None]:
# Count of rows with diag<type> == 1 per disease type, split by gender code
# (row 'M' is gender code 1, row 'F' is gender code 2).
pd.DataFrame(
    {
        kind: [int(((df['gender'] == g) & (df[f'diag{kind}'] == 1)).sum()) for g in (1, 2)]
        for kind in diseasetypes
    },
    index=['M', 'F'],
)

In [None]:
# Summary statistics of the age<type> columns for the disease types.
df[[f"age{t}" for t in diseasetypes]].describe()

In [None]:
# Horizontal boxen plot of the age<type> columns, one row per disease type.
sns.boxenplot(data=df[[f"age{t}" for t in diseasetypes]], orient='h')

In [None]:
# Register the diag* indicator columns under the 'disease' group.
cols['cat']['disease'] = ['diag' + kind for kind in diseasetypes]

### Personal cancer history columns

In [None]:
# Personal cancer-history columns are named <facet><type>, e.g. 'canclung' and 'agelung'.
# pcancerfacets is not referenced by the cells below; they spell the prefixes out directly.
pcancerfacets = ['age', 'canc']
pcancertypes = ['blad', 'brea', 'cerv', 'colo', 'esop', 'kidn', 'lary', 'lung', 'nasa', 'oral', 'panc', 'phar', 'stom', 'thyr', 'tran']

In [None]:
# Value counts (missing included) of every canc* column; drop_duplicates() removes rows
# whose counts repeat an earlier row.
pd.DataFrame({t: df[f'canc{t}'].value_counts(dropna=False) for t in pcancertypes}).drop_duplicates()

In [None]:
# Count of rows with canc<type> == 1 per cancer type, split by gender code
# (row 'M' is gender code 1, row 'F' is gender code 2).
pd.DataFrame(
    {
        kind: [int(((df['gender'] == g) & (df[f'canc{kind}'] == 1)).sum()) for g in (1, 2)]
        for kind in pcancertypes
    },
    index=['M', 'F'],
)

In [None]:
# Summary statistics of the age<type> columns for the personal-cancer types.
df[[f"age{t}" for t in pcancertypes]].describe()

In [None]:
# Horizontal boxen plot of the age<type> columns, one row per personal-cancer type.
sns.boxenplot(data=df[[f"age{t}" for t in pcancertypes]], orient='h')

In [None]:
# Register the canc* indicator columns under the 'canchist' group.
cols['cat']['canchist'] = ['canc' + kind for kind in pcancertypes]

In [None]:
# True when any canc* column is truthy for the row (missing values are skipped by any()).
canc_cols = [f"canc{kind}" for kind in pcancertypes]
df['PersonalCancerHist'] = df[canc_cols].any(axis=1)
df['PersonalCancerHist'].value_counts(dropna=False)

### Family history Lung cancer columns
Unused because it doesn't match the existing `FamilyHistoryLungCa` column.

In [None]:
# Family-history columns are named fam<relative>.
relatives = ['brother', 'sister', 'child', 'father', 'mother']
col_famhist = [f'fam{member}' for member in relatives]
# Value counts (missing included) per relative; drop_duplicates() removes rows whose
# counts repeat an earlier row.
famhist_counts = {member: df['fam' + member].value_counts(dropna=False) for member in relatives}
pd.DataFrame(famhist_counts).drop_duplicates()

In [None]:
# True when any fam* column is truthy for the row (missing values are skipped by any()).
df['famhist'] = df[col_famhist].any(axis=1)
# Single-column frame indexed by pid, then the distribution of the flag.
df_famhist = df.loc[:, ['pid', 'famhist']].set_index('pid')
df_famhist.value_counts(dropna=False)

### Alcohol columns - UNUSED

In [None]:
# Alcohol questionnaire columns, grouped by name prefix (acrin_* and lss_*).
alcohol_acrin = ['acrin_alc_curr', 'acrin_alc_ever', 'acrin_drink24hr', 'acrin_drinknum_curr', 'acrin_drinknum_form', 'acrin_drinkyrs_curr', 'acrin_drinkyrs_form', 'acrin_lastdrink']
alcohol_lss = ['lss_alcohol_freq', 'lss_alcohol_num']

In [None]:
# Number of rows per study code.
df['study'].value_counts() #1 is LSS, 2 and 3 are ACRIN

In [None]:
# Distribution of acrin_alc_curr, missing values included.
df['acrin_alc_curr'].value_counts(dropna=False)

In [None]:
# Distribution of lss_alcohol_freq, missing values included.
df['lss_alcohol_freq'].value_counts(dropna=False)

In [None]:
# Summary statistics across all alcohol columns from both groups.
df[alcohol_acrin + alcohol_lss].describe()

I don't know how to aggregate these columns, if I'm gonna be honest.

### Lung Cancer columns

In [None]:
# Readable names for histology_cat codes 1-9, listed in code order.
histology_cat_key = dict(
    enumerate(
        (
            "Small_cell_carcinoma",
            "Squamous_cell_carcinoma",
            "Adenocarcinoma",
            "Bronchiolo-alveolar_carcinoma",
            "Large_cell_carcinoma",
            "Adenosquamous_carcinoma",
            "Pleomorphic/sarcomatoid",
            "Carcinoid_tumor",
            "Unclassified_carcinoma",
        ),
        start=1,
    )
)

In [None]:
# Translate histology codes to readable names; codes that are not keys of
# histology_cat_key are left unchanged by replace().
df['LC_type'] = df['histology_cat'].replace(histology_cat_key)
# One indicator column per LC_type value present in the data. The empty prefix and
# prefix_sep make each new column name equal to the value itself, and get_dummies
# drops the LC_type column.
# NOTE(review): not idempotent -- re-running this cell adds a second set of indicator
# columns with the same names.
df = pd.get_dummies(df, columns=['LC_type'], prefix='', prefix_sep='')

In [None]:
# Copy of de_stag_7thed with the value 999 blanked out (presumably an "unknown" code -- confirm).
df['LC_stage'] = df['de_stag_7thed'].replace([999], np.nan)

In [None]:
# Register every histology indicator except "Pleomorphic/sarcomatoid", followed by LC_stage.
kept_histologies = [name for name in histology_cat_key.values() if name != "Pleomorphic/sarcomatoid"]
cols['cat']['lungcanc'] = kept_histologies + ['LC_stage']

In [None]:
# Carry lung_cancer along as a bookkeeping column and show its distribution, missing included.
# Note: re-running this cell appends 'lung_cancer' a second time.
cols["info"].append('lung_cancer')
df['lung_cancer'].value_counts(dropna=False)

In [None]:
# Carry cancyr along as a bookkeeping column and show its distribution, missing included.
# Note: re-running this cell appends 'cancyr' a second time.
cols["info"].append('cancyr')
df['cancyr'].value_counts(dropna=False)

In [None]:
# Add scr_days0, scr_days1 and scr_days2 to the bookkeeping columns.
cols["info"].extend(f"scr_days{round_idx}" for round_idx in (0, 1, 2))

In [None]:
# Add the fup_days and candx_days columns to the bookkeeping columns.
cols["info"].extend(["fup_days", "candx_days"])

### Other columns

In [None]:
# Derived flags that are always registered under the 'other' group.
other_cat_cols = ['wrknomask', 'PersonalCancerHist']

if not INCLUDES_VENK21_DATA:
    # Without the Venk21 sheet, create Gender / Age / FamilyHistoryLungCa from the
    # participant table and register them so they are exported with the rest.
    df['Gender'] = df['gender']
    cols['cat']['demo'].append('Gender')

    df['Age'] = df['age']
    cols['num']['demo'].append('Age')

    df['FamilyHistoryLungCa'] = df['famhist']
    other_cat_cols.append('FamilyHistoryLungCa')

cols['cat']['other'] = other_cat_cols

## Save Columns

In [None]:
# Show the full column registry before flattening it.
cols

In [None]:
# Flatten the grouped registry into flat lists: numeric, categorical, then everything
# (numeric + categorical + info) in that order.
cols_num = [name for group in cols['num'].values() for name in group]
cols_cat = [name for group in cols['cat'].values() for name in group]
cols_list = [*cols_num, *cols_cat, *cols['info']]
len(cols_list)

In [None]:
# These names are added to the registry only after cols_list was built, so they are
# written to the JSON but are not selected from df for the CSV export or the merge.
# Presumably they already exist in the predictions table -- confirm.
# Note: the two append calls are not idempotent; re-running this cell duplicates
# 'Gender' and 'Age' in the registry.
if INCLUDES_VENK21_DATA:
    cols['cat']['demo'].append('Gender')
    cols['num']['demo'].append('Age')
    cols['cat']['other'] = ['wrknomask', 'PersonalCancerHist', 'FamilyHistoryLungCa', 'Emphysema']
    cols['cat']['nodule'] = ['Solid', 'GroundGlassOpacity', 'Perifissural', 'NonSolid',
       'PartSolid', 'SemiSolid', 'Calcified', 'NoduleInUpperLung', 'Spiculation']
    cols['num']['nodule'] = ["CoordX", "CoordY", "CoordZ"]
    cols['num']['other'] = ['Diameter_mm', 'NoduleCounts']

In [None]:
# Write the column registry to OUTCOLS_PATH as JSON.
with open(OUTCOLS_PATH, 'w') as f:
    json.dump(cols, f)

In [None]:
# Write the selected participant-level columns to OUTDEMOS_PATH, without the row index.
df[cols_list].to_csv(OUTDEMOS_PATH, index=False)

In [None]:
# Attach the selected participant-level columns to every prediction row. The left join
# keeps all prediction rows, including those with no matching participant.
# validate='many_to_one' makes pandas raise a MergeError if df holds more than one row
# for a PatientID, instead of silently duplicating prediction rows in the merged table.
nlst_preds_demos = nlst_preds.merge(
    df[cols_list],
    on='PatientID',
    how='left',
    validate='many_to_one',
)
nlst_preds_demos.info()

### Label check

In [None]:
# Number of prediction rows per timepoint value.
nlst_preds['timepoint'].value_counts()

In [None]:
# Distribution of lung_cancer in the merged table, broken down by split when the
# Venk21 data is not included.
lung_cancer_cols = 'lung_cancer' if INCLUDES_VENK21_DATA else ['split', 'lung_cancer']
display(nlst_preds_demos[lung_cancer_cols].value_counts(dropna=False))

In [None]:
# Compute sybil_label row by row with the imported helper, then show its distribution,
# broken down by split when the Venk21 data is not included.
nlst_preds_demos['sybil_label'] = nlst_preds_demos.apply(sybil_label, axis=1)
sybil_label_cols = 'sybil_label' if INCLUDES_VENK21_DATA else ['split', 'sybil_label']
display(nlst_preds_demos[sybil_label_cols].value_counts(dropna=False))

In [None]:
# With the Venk21 data: joint counts of the three label columns, to see how they agree.
# Otherwise: number of rows per split.
agreement_cols = ['label', 'lung_cancer', 'sybil_label'] if INCLUDES_VENK21_DATA else 'split'
display(nlst_preds_demos[agreement_cols].value_counts(dropna=False))

### Save whole CSV

In [None]:
# Write the merged predictions-plus-demographics table, including the sybil_label
# column added above, to OUTMERGE_PATH without the row index.
nlst_preds_demos.to_csv(OUTMERGE_PATH, index=False)