# Head and Neck Cancer Data Analysis
Jonny Edwards and Jim Moor

This notebook contains all the analysis performed on the Head and Neck cancer diagnosis data.

In [71]:
import polars as pl
# Print every row of a DataFrame (a negative value disables row truncation).
# Use the dedicated setter rather than instantiating a Config object, which is
# intended for context-manager use; the trailing semicolon keeps the cell from
# echoing a return value.
pl.Config.set_tbl_rows(-1);

<polars.config.Config at 0x7fc909af74d0>

## 1. Read in the data and provide summary statistics

Let's get all the data scoped for analysis - the following breaks the data into symptoms, metadata and final diagnosis.

In [153]:
# Load the source CSV into a polars DataFrame; the other frames in this
# notebook are derived from `df`.
df = pl.read_csv(source="../../jim3.csv")

In [115]:
# Columns treated as metadata (summarised below and used as PCA input).
meta_columns = ["Age", "Ht", "Weight", "BMI", "TotalNoSymptoms"]
df_meta = df.select(meta_columns)

In [116]:
# Single-column frame holding the final diagnosis label.
df_diagnosis = df.select("Diagnostic group")

In [117]:
# Symptom indicator columns, in the order they are analysed.
symptom_columns = [
    "HoarseVoice",
    "DiffSwal",
    "PainSwal",
    "Otalgia",
    "UnilatNO",
    "NeckLump",
    "ThroatLump",
    "FOSIT",
    "SoreThroat",
]
df_symptoms = df.select(symptom_columns)

## 1a. Summary Statistics for symptoms

In [118]:
# Reshape to long form (one row per variable/value pair), then count how many
# times each distinct pair occurs. Null values are counted as their own group.
symptom_summary = (
    df_symptoms
    .unpivot()
    .group_by(pl.all())
    .len()
)

In [119]:
# group_by does not guarantee row order, so sort before printing.
print(symptom_summary.sort("variable", "value"))

shape: (27, 3)
┌─────────────┬───────┬─────┐
│ variable    ┆ value ┆ len │
│ ---         ┆ ---   ┆ --- │
│ str         ┆ i64   ┆ u32 │
╞═════════════╪═══════╪═════╡
│ DiffSwal    ┆ null  ┆ 1   │
│ DiffSwal    ┆ 0     ┆ 185 │
│ DiffSwal    ┆ 1     ┆ 86  │
│ FOSIT       ┆ null  ┆ 2   │
│ FOSIT       ┆ 0     ┆ 148 │
│ FOSIT       ┆ 1     ┆ 122 │
│ HoarseVoice ┆ null  ┆ 1   │
│ HoarseVoice ┆ 0     ┆ 158 │
│ HoarseVoice ┆ 1     ┆ 113 │
│ NeckLump    ┆ null  ┆ 1   │
│ NeckLump    ┆ 0     ┆ 188 │
│ NeckLump    ┆ 1     ┆ 83  │
│ Otalgia     ┆ null  ┆ 2   │
│ Otalgia     ┆ 0     ┆ 229 │
│ Otalgia     ┆ 1     ┆ 41  │
│ PainSwal    ┆ null  ┆ 1   │
│ PainSwal    ┆ 0     ┆ 214 │
│ PainSwal    ┆ 1     ┆ 57  │
│ SoreThroat  ┆ null  ┆ 1   │
│ SoreThroat  ┆ 0     ┆ 181 │
│ SoreThroat  ┆ 1     ┆ 90  │
│ ThroatLump  ┆ null  ┆ 1   │
│ ThroatLump  ┆ 0     ┆ 219 │
│ ThroatLump  ┆ 1     ┆ 52  │
│ UnilatNO    ┆ null  ┆ 1   │
│ UnilatNO    ┆ 0     ┆ 231 │
│ UnilatNO    ┆ 1     ┆ 40  │
└─────────────┴───────┴──

## 1b. Summary of the metadata statistics

In [121]:
# Per-column summary statistics for the metadata columns.
meta_description = df_meta.describe()
print(meta_description)

shape: (9, 6)
┌────────────┬───────────┬──────────┬───────────┬───────────┬─────────────────┐
│ statistic  ┆ Age       ┆ Ht       ┆ Weight    ┆ BMI       ┆ TotalNoSymptoms │
│ ---        ┆ ---       ┆ ---      ┆ ---       ┆ ---       ┆ ---             │
│ str        ┆ f64       ┆ f64      ┆ f64       ┆ f64       ┆ f64             │
╞════════════╪═══════════╪══════════╪═══════════╪═══════════╪═════════════════╡
│ count      ┆ 271.0     ┆ 271.0    ┆ 271.0     ┆ 271.0     ┆ 272.0           │
│ null_count ┆ 1.0       ┆ 1.0      ┆ 1.0       ┆ 1.0       ┆ 0.0             │
│ mean       ┆ 53.749739 ┆ 1.66903  ┆ 78.612731 ┆ 28.162249 ┆ 2.514706        │
│ std        ┆ 17.539717 ┆ 0.102715 ┆ 18.971041 ┆ 6.200982  ┆ 1.683369        │
│ min        ┆ 9.130732  ┆ 1.41     ┆ 28.95     ┆ 14.561642 ┆ 0.0             │
│ 25%        ┆ 41.64271  ┆ 1.59     ┆ 65.4      ┆ 23.620288 ┆ 1.0             │
│ 50%        ┆ 53.36345  ┆ 1.66     ┆ 74.55     ┆ 27.155556 ┆ 2.0             │
│ 75%        ┆ 66.992471 ┆

## 1c. Summary of the diagnosis data

In [93]:
# Frequency of each raw diagnosis label.
diagnosis_counts = df_diagnosis.unpivot().group_by(pl.all()).len()
print(diagnosis_counts)

# Binary view: True only where the label is exactly "Cancer"; every other
# label becomes False.
# NOTE(review): the exact match also maps the "Cancer (diagnosed prior to 2ww…"
# label to False — confirm that is intended.
df_diagnosis_updated = df_diagnosis.select(pl.all() == "Cancer")
binary_counts = df_diagnosis_updated.unpivot().group_by(pl.all()).len()
print(binary_counts)

shape: (6, 3)
┌──────────────────┬─────────────────────────────────┬─────┐
│ variable         ┆ value                           ┆ len │
│ ---              ┆ ---                             ┆ --- │
│ str              ┆ str                             ┆ u32 │
╞══════════════════╪═════════════════════════════════╪═════╡
│ Diagnostic group ┆ ?                               ┆ 1   │
│ Diagnostic group ┆ Cancer                          ┆ 12  │
│ Diagnostic group ┆ RIP                             ┆ 1   │
│ Diagnostic group ┆ Not confirmed                   ┆ 1   │
│ Diagnostic group ┆ Non-cancer                      ┆ 256 │
│ Diagnostic group ┆ Cancer (diagnosed prior to 2ww… ┆ 1   │
└──────────────────┴─────────────────────────────────┴─────┘
shape: (2, 3)
┌──────────────────┬───────┬─────┐
│ variable         ┆ value ┆ len │
│ ---              ┆ ---   ┆ --- │
│ str              ┆ bool  ┆ u32 │
╞══════════════════╪═══════╪═════╡
│ Diagnostic group ┆ false ┆ 260 │
│ Diagnostic group ┆ true  ┆ 1

## 2. Average statistics per diagnostic group

In [151]:
# Add a boolean `Cancer` column: True where "Diagnostic group" is exactly "Cancer".
df = df.with_columns(
    (pl.col("Diagnostic group") == "Cancer").alias("Cancer")
)

In [113]:
# Mean and standard deviation of Age within each Cancer group.
age_by_cancer = df.group_by("Cancer").agg(
    Means=pl.col("Age").mean(),
    Stds=pl.col("Age").std(),
)
print(age_by_cancer)

shape: (2, 3)
┌────────┬───────────┬───────────┐
│ Cancer ┆ Means     ┆ Stds      │
│ ---    ┆ ---       ┆ ---       │
│ bool   ┆ f64       ┆ f64       │
╞════════╪═══════════╪═══════════╡
│ true   ┆ 55.011864 ┆ 18.841348 │
│ false  ┆ 53.691262 ┆ 17.513925 │
└────────┴───────────┴───────────┘


In [126]:
# Mean and standard deviation of BMI within each Cancer group.
bmi_by_cancer = df.group_by("Cancer").agg(
    Means=pl.col("BMI").mean(),
    Stds=pl.col("BMI").std(),
)
print(bmi_by_cancer)

shape: (2, 3)
┌────────┬───────────┬──────────┐
│ Cancer ┆ Means     ┆ Stds     │
│ ---    ┆ ---       ┆ ---      │
│ bool   ┆ f64       ┆ f64      │
╞════════╪═══════════╪══════════╡
│ true   ┆ 26.355514 ┆ 4.064065 │
│ false  ┆ 28.245959 ┆ 6.275156 │
└────────┴───────────┴──────────┘


In [141]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Project the metadata columns onto their principal components.
# Nulls are replaced with 0 so that every row of `df_meta` is kept and
# `components` stays row-aligned with `df`.
# NOTE(review): 0 is not a plausible Age/Ht/Weight/BMI, so any imputed row
# becomes an outlier, and the columns are not standardised before the PCA even
# though they are on different scales — confirm both choices are intended.
pca_x = df_meta.fill_null(0).to_numpy()

# One component per input column (df_meta has five columns).
pca = PCA(n_components=5)

# fit_transform() fits the model and returns the projected rows in one step,
# so a separate fit() call beforehand is redundant.
components = pca.fit_transform(pca_x)

# Fraction of the total variance explained by each component.
var_values = pca.explained_variance_ratio_

In [150]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook # enables plot interface in J notebook
# Route Bokeh output to the notebook so show() renders inline.
output_notebook()

# Yellow marks rows whose Cancer flag is truthy; all other rows are navy.
point_colors = ["yellow" if is_cancer else "navy" for is_cancer in df["Cancer"]]

p = figure(
    width=400,
    height=400,
    title="First two principal components",
    x_axis_label="First Principal Component",
    y_axis_label="Second Principal Component",
)

# One translucent marker per row, positioned by its first two PCA scores.
p.scatter(
    components[:, 0],
    components[:, 1],
    size=20,
    color=point_colors,
    alpha=0.3,
)

# Render the figure.
show(p)


In [152]:
# Explained variance per principal component, expressed as a percentage.
100 * var_values

array([5.59198905e+01, 4.24822965e+01, 1.22012855e+00, 3.76390434e-01,
       1.29403314e-03])