# Head and Neck Cancer Data Analysis
Jonny Edwards and Jim Moor

This notebook contains all the analysis performed on the Head and Neck cancer diagnosis data.

In [86]:
import polars as pl
# Show every row when printing polars tables (a negative row limit means "no limit").
# The trailing semicolon stops the notebook echoing the Config object's repr.
pl.Config(tbl_rows=-1);

<polars.config.Config at 0x7f13b1957500>

### 1. Read in the data and provide summary statistics

Let's get all the data scoped for analysis - the following breaks the data into symptoms, metadata and final diagnosis.

In [87]:
# Load the full dataset; the path is relative to the notebook's working directory.
df = pl.read_csv("maindata.csv")

In [88]:
# Patient metadata columns, kept separate from the symptoms and the diagnosis.
df_meta = df.select(
    ["Age", "Ht", "Weight", "BMI", "TotalNoSymptoms"]
)

In [89]:
# Single-column frame holding the final diagnostic label for each row.
df_diagnosis = df.select(["Diagnostic group"])

In [90]:
# The nine symptom indicator columns.
df_symptoms = df.select(
    [
        "HoarseVoice",
        "DiffSwal",
        "PainSwal",
        "Otalgia",
        "UnilatNO",
        "NeckLump",
        "ThroatLump",
        "FOSIT",
        "SoreThroat",
    ]
)

## 1a. Summary Statistics for symptoms

In [91]:
# Reshape to long form (one row per symptom/value pair), then count how often
# each value occurs for each symptom. Nulls form their own group.
symptoms_long = df_symptoms.unpivot()
symptom_summary = symptoms_long.group_by("variable", "value").len()

In [92]:
# Order by symptom name, then by value, so each symptom's counts sit together.
print(symptom_summary.sort("variable", "value"))

shape: (27, 3)
┌─────────────┬───────┬─────┐
│ variable    ┆ value ┆ len │
│ ---         ┆ ---   ┆ --- │
│ str         ┆ i64   ┆ u32 │
╞═════════════╪═══════╪═════╡
│ DiffSwal    ┆ null  ┆ 1   │
│ DiffSwal    ┆ 0     ┆ 281 │
│ DiffSwal    ┆ 1     ┆ 162 │
│ FOSIT       ┆ null  ┆ 2   │
│ FOSIT       ┆ 0     ┆ 228 │
│ FOSIT       ┆ 1     ┆ 214 │
│ HoarseVoice ┆ null  ┆ 1   │
│ HoarseVoice ┆ 0     ┆ 237 │
│ HoarseVoice ┆ 1     ┆ 206 │
│ NeckLump    ┆ null  ┆ 1   │
│ NeckLump    ┆ 0     ┆ 297 │
│ NeckLump    ┆ 1     ┆ 146 │
│ Otalgia     ┆ null  ┆ 2   │
│ Otalgia     ┆ 0     ┆ 337 │
│ Otalgia     ┆ 1     ┆ 105 │
│ PainSwal    ┆ null  ┆ 1   │
│ PainSwal    ┆ 0     ┆ 325 │
│ PainSwal    ┆ 1     ┆ 118 │
│ SoreThroat  ┆ null  ┆ 1   │
│ SoreThroat  ┆ 0     ┆ 255 │
│ SoreThroat  ┆ 1     ┆ 188 │
│ ThroatLump  ┆ null  ┆ 1   │
│ ThroatLump  ┆ 0     ┆ 343 │
│ ThroatLump  ┆ 1     ┆ 100 │
│ UnilatNO    ┆ null  ┆ 1   │
│ UnilatNO    ┆ 0     ┆ 348 │
│ UnilatNO    ┆ 1     ┆ 95  │
└─────────────┴───────┴──

## 1b. Summary of the metadata statistics

In [93]:
# Count, null count, mean, std and quantiles for each metadata column.
print(df_meta.describe())

shape: (9, 6)
┌────────────┬───────────┬──────────┬───────────┬───────────┬─────────────────┐
│ statistic  ┆ Age       ┆ Ht       ┆ Weight    ┆ BMI       ┆ TotalNoSymptoms │
│ ---        ┆ ---       ┆ ---      ┆ ---       ┆ ---       ┆ ---             │
│ str        ┆ f64       ┆ f64      ┆ f64       ┆ f64       ┆ f64             │
╞════════════╪═══════════╪══════════╪═══════════╪═══════════╪═════════════════╡
│ count      ┆ 271.0     ┆ 443.0    ┆ 443.0     ┆ 443.0     ┆ 272.0           │
│ null_count ┆ 173.0     ┆ 1.0      ┆ 1.0       ┆ 1.0       ┆ 172.0           │
│ mean       ┆ 53.762696 ┆ 1.667409 ┆ 82.342551 ┆ 28.378281 ┆ 2.514706        │
│ std        ┆ 17.537169 ┆ 0.098936 ┆ 52.996724 ┆ 6.315923  ┆ 1.683369        │
│ min        ┆ 9.130732  ┆ 1.39     ┆ 28.95     ┆ 14.561642 ┆ 0.0             │
│ 25%        ┆ 41.64271  ┆ 1.6      ┆ 65.7      ┆ 23.977078 ┆ 1.0             │
│ 50%        ┆ 53.36345  ┆ 1.66     ┆ 75.4      ┆ 27.290303 ┆ 2.0             │
│ 75%        ┆ 66.992471 ┆

## 1c. Summary of the diagnosis data

In [94]:
# Tally of every raw diagnostic label.
print(df_diagnosis.unpivot().group_by("variable", "value").len())

# Collapse the labels to a boolean: True only where the label is exactly "Cancer".
# NOTE(review): this is an exact string match, so the "Cancer (diagnosed prior to
# 2ww…" row, along with "RIP", "Not confirmed" and "?", all count as False.
# Confirm that this grouping is intended.
df_diagnosis_updated = df_diagnosis.select(pl.all() == "Cancer")
print(df_diagnosis_updated.unpivot().group_by("variable", "value").len())

shape: (6, 3)
┌──────────────────┬─────────────────────────────────┬─────┐
│ variable         ┆ value                           ┆ len │
│ ---              ┆ ---                             ┆ --- │
│ str              ┆ str                             ┆ u32 │
╞══════════════════╪═════════════════════════════════╪═════╡
│ Diagnostic group ┆ Non-cancer                      ┆ 420 │
│ Diagnostic group ┆ RIP                             ┆ 1   │
│ Diagnostic group ┆ Not confirmed                   ┆ 1   │
│ Diagnostic group ┆ Cancer                          ┆ 20  │
│ Diagnostic group ┆ ?                               ┆ 1   │
│ Diagnostic group ┆ Cancer (diagnosed prior to 2ww… ┆ 1   │
└──────────────────┴─────────────────────────────────┴─────┘
shape: (2, 3)
┌──────────────────┬───────┬─────┐
│ variable         ┆ value ┆ len │
│ ---              ┆ ---   ┆ --- │
│ str              ┆ bool  ┆ u32 │
╞══════════════════╪═══════╪═════╡
│ Diagnostic group ┆ true  ┆ 20  │
│ Diagnostic group ┆ false ┆ 4

## 2. Average statistics per diagnostic group

In [95]:
# Add a boolean "Cancer" flag: True where the diagnostic label is exactly "Cancer".
is_cancer = (pl.col("Diagnostic group") == "Cancer").alias("Cancer")
df = df.with_columns(is_cancer)

In [96]:
# Mean and standard deviation of Age within each Cancer group.
print(
    df.group_by("Cancer").agg(
        Means=pl.col("Age").mean(),
        Stds=pl.col("Age").std(),
    )
)

shape: (2, 3)
┌────────┬───────────┬───────────┐
│ Cancer ┆ Means     ┆ Stds      │
│ ---    ┆ ---       ┆ ---       │
│ bool   ┆ f64       ┆ f64       │
╞════════╪═══════════╪═══════════╡
│ true   ┆ 55.011864 ┆ 18.841348 │
│ false  ┆ 53.70482  ┆ 17.5113   │
└────────┴───────────┴───────────┘


In [97]:
# Mean and standard deviation of BMI within each Cancer group.
print(
    df.group_by("Cancer").agg(
        Means=pl.col("BMI").mean(),
        Stds=pl.col("BMI").std(),
    )
)

shape: (2, 3)
┌────────┬───────────┬──────────┐
│ Cancer ┆ Means     ┆ Stds     │
│ ---    ┆ ---       ┆ ---      │
│ bool   ┆ f64       ┆ f64      │
╞════════╪═══════════╪══════════╡
│ true   ┆ 27.210646 ┆ 4.215683 │
│ false  ┆ 28.433488 ┆ 6.396375 │
└────────┴───────────┴──────────┘


In [98]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# --- PCA on the patient metadata (Age, Ht, Weight, BMI, TotalNoSymptoms) ---
# Impute missing values with the column mean rather than 0. Filling with 0
# fabricates implausible observations (e.g. Age = 0 for the 173 rows with no
# recorded age), which would show up as an artificial cluster in the components.
# The cast to Float64 ensures the mean is not truncated for integer columns.
meta_imputed = df_meta.cast(pl.Float64).fill_null(strategy="mean")
pca_x = meta_imputed.to_numpy()

# Standardise every column to zero mean and unit variance. PCA is driven by
# variance, so without this the components are dominated by whichever column
# has the largest numeric spread (Weight, std ~53) and Ht (std ~0.1) is ignored.
pca_x = (pca_x - pca_x.mean(axis=0)) / pca_x.std(axis=0)

pca = PCA(n_components=5)
# fit_transform both fits the model and projects the data, so no separate
# pca.fit() call is needed beforehand.
components = pca.fit_transform(pca_x)
# Fraction of total variance explained by each principal component.
var_values = pca.explained_variance_ratio_

In [99]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook # enables plot interface in J notebook
output_notebook()

p = figure(width=400, height=400)
p.title = "First two principal components"
p.xaxis.axis_label = "First Principal Component"
p.yaxis.axis_label = "Second Principal Component"

# Draw one scatter renderer per class so each gets a legend entry; without a
# legend the reader cannot tell which colour marks the cancer cases.
# Null flags are treated as non-cancer. The cancer points are drawn last so the
# smaller group is not hidden beneath the non-cancer points.
is_cancer = df["Cancer"].fill_null(False).to_numpy()
for label, mask, colour in (
    ("Non-cancer", ~is_cancer, "navy"),
    ("Cancer", is_cancer, "yellow"),
):
    p.scatter(
        components[mask, 0],
        components[mask, 1],
        size=20,
        color=colour,
        alpha=0.3,
        legend_label=label,
    )

# show the results
show(p)
