# Linear Regression with Predictions and Demographics
*Shaurya Gaur*

Are our models' predictions correlated with demographic variables, both the ones they should be correlated with and the ones they should not?

Adapted from [this lab exercise](https://github.com/Harshita0109/Sales-Prediction/blob/master/2048035_Lab3.ipynb).

In [None]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from scipy.stats import pearsonr, spearmanr
from IPython.display import display, Markdown

import sys
sys.path.append('../')

import utils
from evalutils.roc import get_bootstrapped_roc_ci_curves
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LinearRegression, LogisticRegression

## Directories where results are stored.
# Experiment directory. It must stay defined even while the Teams backup is
# used for NLST, because the DLCST loading cell further down reads from it.
EXPERIMENT_DIR = "/data/bodyct/experiments/lung-malignancy-fairness-shaurya"

# Teams/OneDrive backup copy of the NLST results.
TEAMS_DIR = "/mnt/c/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/nlst"

# NLST results currently come from the Teams backup. When not using the backup
# (aka Chansey is up), switch this to f"{EXPERIMENT_DIR}/nlst" instead.
NLST_PREDS = TEAMS_DIR

In [None]:
def calc_corr(df0, demo_col, models, regressor=None):
    """Correlate one column with each model's prediction column.

    For every entry of ``models`` this computes the Spearman and Pearson
    correlation (with p-values) between ``demo_col`` and the model's
    prediction column, and fits a single-feature regression of the
    predictions on ``demo_col``.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Table holding ``demo_col`` and every prediction column named in
        ``models``. It is not modified.
    demo_col : str
        Name of the column to correlate with the predictions.
    models : dict
        Maps a display name (used as the row index of the result) to the
        name of the corresponding prediction column in ``df0``.
    regressor : object, optional
        Estimator exposing ``fit(X, y)`` and, after fitting, ``intercept_``
        and ``coef_``. When omitted, a new ``LinearRegression`` is created on
        every call so that no fitted state is shared between calls.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``SRC``, ``SRC p-val``, ``PCC``,
        ``PCC p-val``, ``regressor intercept`` and ``regressor coefficient``.
    """
    if regressor is None:
        # Built here rather than in the signature: a default created in the
        # signature is a single instance shared by every call.
        regressor = LinearRegression()

    df = df0.dropna(subset=[demo_col], axis=0)

    allstats = []
    for pred_name in models.values():
        # Only keep rows where this model actually produced a prediction;
        # NaN targets would otherwise break the fit / poison the correlations.
        paired = df.loc[df[pred_name].notna()]
        demo_values = paired[demo_col]
        predcol = paired[pred_name]

        # NOTE: unpacking yields plain floats. A Pearson confidence interval
        # has to be requested from the un-unpacked result object instead.
        pearson_result, pearson_pval = pearsonr(demo_values, predcol)
        spearman_result, spearman_pval = spearmanr(demo_values, predcol)

        # The regressor expects a 2-D feature matrix with a single column.
        regressor.fit(demo_values.to_numpy().reshape(-1, 1), predcol)

        allstats.append({
            'SRC': spearman_result,
            'SRC p-val': spearman_pval,
            'PCC': pearson_result,
            'PCC p-val': pearson_pval,
            'regressor intercept': regressor.intercept_,
            'regressor coefficient': regressor.coef_[0],
        })

    return pd.DataFrame(allstats, index=list(models))

## NLST: Load data

In [11]:
# Read the combined NLST prediction table and hand it straight to the project
# helper that prepares it (scan-level, with the Sybil and Tijmen options on).
nlst_preds = utils.prep_nlst_preds(
    pd.read_csv(f"{NLST_PREDS}/nlst_demov2_allmodels.csv"),
    scanlevel=True,
    sybil=True,
    tijmen=True,
)
nlst_preds.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1172 entries, 4740 to 10150
Data columns (total 96 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   PatientID              1172 non-null   int64  
 1   StudyDate              1172 non-null   int64  
 2   SeriesInstanceUID      1172 non-null   object 
 3   LesionID               1172 non-null   int64  
 4   Spiculation            1172 non-null   bool   
 5   Diameter [mm]          1172 non-null   float64
 6   Age                    1172 non-null   int64  
 7   Gender                 1172 non-null   int64  
 8   FamilyHistoryLungCa    1172 non-null   bool   
 9   Emphysema              1172 non-null   bool   
 10  NoduleInUpperLung      1172 non-null   bool   
 11  NoduleCounts           1172 non-null   int64  
 12  SCT_EPI_LOC            1119 non-null   float64
 13  xie_gc_gclobe150       53 non-null     float64
 14  loclup                 127 non-null    float64
 15  locru

In [12]:
# Load the JSON description of the NLST demographic columns. The `with` block
# closes the file on exit, so no explicit close() call is needed.
with open(f'{NLST_PREDS}/nlst_demo_v1_cols.json') as json_data:
    nlst_democols = json.load(json_data)

In [13]:
# Show the numerical columns, grouped by category.
nlst_democols["num"]

{'demo': ['height', 'weight', 'Age'],
 'smoke': ['pkyr', 'smokeage', 'smokeday', 'smokeyr']}

In [14]:
# Display name of each model -> name of its prediction column in `nlst_preds`.
# The last entry maps the reference standard to the `label` column so that it
# is analysed alongside the model outputs.
MODEL_TO_COL = {
    "Venkadesh": "DL",
    "de Haas Combined": "Thijmen_mean",
    # Currently excluded variants:
    # "de Haas Local": "Thijmen_local",
    # "de Haas Global (hidden nodule)": "Thijmen_global_hidden",
    "de Haas Global (shown nodule)": "Thijmen_global_show",
    "Sybil": "sybil_year1",
    "PanCan2b": "PanCan2b",
    "Reference Standard": "label",
}

## NLST: How are numerical demographic columns related with predictions?

### Demographic columns

In [15]:
# Prediction columns shown on the x-axes; `label` is removed because it is
# only used to colour the points.
scatplot_list = list(MODEL_TO_COL.values())
scatplot_list.remove('label')

# Columns used to split the scatter plots into subgroups.
subgroup_cols = ['NonHispanicWhite', 'Gender', 'HighSchoolPlus', 'Married', 'Emphysema', 'PersonalCancerHist', 'FamilyHistoryLungCa']

# Settings shared by every pair plot in this cell.
pairplot_opts = dict(x_vars=scatplot_list, kind='scatter', height=4, aspect=1, hue='label', plot_kws={'alpha': 0.5})

for c in nlst_democols['num']['demo']:
    display(Markdown(f"#### {c}"))

    # Correlation table, distribution of the variable, then predictions vs. variable.
    display(calc_corr(nlst_preds, c, MODEL_TO_COL))
    sns.histplot(nlst_preds, x=c, bins=25, hue="label", multiple='stack')
    sns.pairplot(nlst_preds, y_vars=c, **pairplot_opts)
    plt.show()

    # Repeat the scatter plots separately for each value of every subgroup column.
    for c2 in subgroup_cols:
        display(Markdown(f"#### {c} by {c2}"))
        for group_value, df_group in nlst_preds.groupby(c2):
            print(group_value)
            sns.pairplot(df_group, y_vars=c, **pairplot_opts)
            plt.show()

#### height

AttributeError: 'numpy.float64' object has no attribute 'confidence_interval'

In [None]:
# Multiple linear regression: regress each model's prediction column on all
# numerical demographic columns at once and tabulate the coefficients
# (intercepts are not kept).
demo_features = nlst_democols['num']['demo']

mlr = LinearRegression()
mlr_df = nlst_preds.dropna(subset=demo_features, axis=0)

coefs = {}
for model_name, pred_col in MODEL_TO_COL.items():
    # fit() returns the estimator itself, so the coefficients can be read directly.
    coefs[model_name] = mlr.fit(mlr_df[demo_features], mlr_df[pred_col]).coef_

# Rows: models, columns: regressed variables.
pd.DataFrame(coefs, index=demo_features).T

### Smoking columns

In [None]:
# Prediction columns shown on the x-axes; `label` is removed because it is
# only used to colour the points.
scatplot_list = list(MODEL_TO_COL.values())
scatplot_list.remove('label')

# Columns used to split the scatter plots into subgroups.
subgroup_cols = ['NonHispanicWhite', 'Gender', 'HighSchoolPlus', 'Married', 'Emphysema', 'PersonalCancerHist', 'FamilyHistoryLungCa']

# Settings shared by every pair plot in this cell.
pairplot_opts = dict(x_vars=scatplot_list, kind='scatter', height=4, aspect=1, hue='label', plot_kws={'alpha': 0.5})

for c in nlst_democols['num']['smoke']:
    display(Markdown(f"#### {c}"))

    # Correlation table, distribution of the variable, then predictions vs. variable.
    display(calc_corr(nlst_preds, c, MODEL_TO_COL))
    sns.histplot(nlst_preds, x=c, bins=25, hue="label", multiple='stack')
    sns.pairplot(nlst_preds, y_vars=c, **pairplot_opts)
    plt.show()

    # Repeat the scatter plots separately for each value of every subgroup column.
    for c2 in subgroup_cols:
        display(Markdown(f"#### {c} by {c2}"))
        for group_value, df_group in nlst_preds.groupby(c2):
            print(group_value)
            sns.pairplot(df_group, y_vars=c, **pairplot_opts)
            plt.show()

In [None]:
# Multiple linear regression: regress each model's prediction column on all
# numerical smoking columns at once and tabulate the coefficients
# (intercepts are not kept).
smoke_features = nlst_democols['num']['smoke']

mlr = LinearRegression()
mlr_df = nlst_preds.dropna(subset=smoke_features, axis=0)

coefs = {}
for model_name, pred_col in MODEL_TO_COL.items():
    # fit() returns the estimator itself, so the coefficients can be read directly.
    coefs[model_name] = mlr.fit(mlr_df[smoke_features], mlr_df[pred_col]).coef_

# Rows: models, columns: regressed variables.
pd.DataFrame(coefs, index=smoke_features).T

## DLCST: Load data

In [None]:
# The DLCST results are read from the experiment directory, so EXPERIMENT_DIR
# has to be defined in the configuration cell before this cell is run.
dlcst_csv_path = f"{EXPERIMENT_DIR}/dlcst/dlcst_thijmen_kiran_sybil_malignancy_estimation_results.csv"
dlcst_preds = pd.read_csv(dlcst_csv_path, header=0)
dlcst_preds.info()

In [None]:
# Display name of each model -> name of its prediction column in `dlcst_preds`.
MODEL_TO_COL_DLCST = dict([
    ("Venkadesh", "Ensemble_Kiran"),
    ("de Haas", "thijmen_mean"),
    ("Sybil", "sybil_year1"),
    ("PanCan2b", "PanCan2b"),
])

In [None]:
# Columns of `dlcst_preds` that are compared against the model predictions.
dlcst_democols = [
    'Age',
    'Sex',
    'FamilyHistoryLungCa',
    'Emphysema',
    'NoduleCountPerScan',
]

## DLCST: How are the columns related with predictions?

In [None]:
# For every DLCST column: report missing values, show the correlation table,
# the distribution of the column and the scatter of predictions against it.
for demo_col in dlcst_democols:
    n_missing = dlcst_preds[demo_col].isna().sum()
    print(f"{demo_col}: {n_missing} null values")

    corr_table = calc_corr(dlcst_preds, demo_col, MODEL_TO_COL_DLCST)
    display(corr_table)

    sns.histplot(dlcst_preds, x=demo_col, bins=25, hue="label", multiple='stack')
    sns.pairplot(
        dlcst_preds,
        x_vars=MODEL_TO_COL_DLCST.values(),
        y_vars=demo_col,
        kind='scatter',
        height=4,
        aspect=1,
    )
    plt.show()

In [None]:
# Multiple linear regression: regress each model's prediction column on all
# DLCST columns at once and tabulate the coefficients (intercepts are not kept).
mlr = LinearRegression()
mlr_df = dlcst_preds.dropna(subset=dlcst_democols, axis=0)

coefs = {}
for model_name, pred_col in MODEL_TO_COL_DLCST.items():
    # fit() returns the estimator itself, so the coefficients can be read directly.
    coefs[model_name] = mlr.fit(mlr_df[dlcst_democols], mlr_df[pred_col]).coef_

# Rows: models, columns: regressed variables.
pd.DataFrame(coefs, index=dlcst_democols).T