# Biol 359A  | Descriptive statistics and comparing groups
### Spring 2022, Week 2
<hr>

Objectives:
-  Read basic python syntax
-  Run and interpret a t-test
-  Gain intuition about statistical tests and sample sizes


A couple of bash commands to make google colab work:

In [None]:
# Clone the course repository that the copy commands below read from.
!git clone https://github.com/BIOL359A-FoundationsOfQBio-Spr22/week2_statisticaltests
# Copy the data files and the clean_data.py helper into the working directory.
!mkdir ./data
!cp week2_statisticaltests/data/* ./data
!cp week2_statisticaltests/clean_data.py ./

### Import statements

Import statements are used to integrate *external code or packages* into our analysis. 

- `pandas`: Represents data as tables
- `seaborn`: Data exploration visualization tool
- `ipywidgets`: Notebook widgets that add user interfaces to notebooks
- `random`: Generate random numbers
- `numpy`: General math/matrices package
- `matplotlib`: Data visualization software
- `scipy`: General scientific computing

Using `as` will alias the package in the code.
`matplotlib.pyplot` is importing the submodule `pyplot` from `matplotlib`. 
`from scipy.stats` is telling python where to find `ttest_ind`. 

In [None]:
import pandas as pd
import seaborn as sns
import ipywidgets as widgets
import random
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind as ttest

# Set seaborn's plotting context and axes style once, before any figure is drawn.
sns.set_context("notebook")
sns.set_style("whitegrid")

### Global Variables

These variables are designed to be accessible from any part of the code. By convention they are written in `UPPER_SNAKE_CASE`.

In [None]:
#You can adjust these font sizes safely if you have a hard time reading
TITLE_FONT = 20  # font size used for plot titles
LABEL_FONT = 16  # font size used for axis labels and text annotations
TICK_FONT = 16  # font size used for tick labels
FIG_SIZE = (15,15)  # passed as `figsize` when figures are created
COLORS=[ "#1e81b0","#e28743"]  # [data color, annotation color] in the plotting helpers


### Common data visualizations

First we are going to create a toy dataset using a random number generator. 
Normal distribution function:


$$
  f(x) = \frac{1}{\sigma\sqrt{2\pi}} 
  \exp\left( -\frac{1}{2}\left(\frac{x-\mu}{\sigma}\right)^{\!2}\,\right)
$$

In [None]:
def generate_normal_distribution(n=100, mu=6, sigma=2):
    """Draw ``n`` random values from a normal (gaussian) distribution.

    Every value comes from one call to ``random.gauss(mu, sigma)``; the draws
    are returned as a list in the order they were generated.
    """
    samples = []
    for _ in range(n):
        samples.append(random.gauss(mu, sigma))
    return samples

In [None]:
def plot_a_histogram(x, n, mu, sigma):
    """Generate annotated histogram based on normal (gaussian) distribution

    Args:
        x: values to draw in the histogram.
        n: number of generated values (not used by this function).
        mu: mean of the generating distribution; marked with a dashed line.
        sigma: standard deviation of the generating distribution; ``mu + sigma``
            and ``mu - sigma`` are marked with dotted lines.
    """
    data_color = COLORS[0]
    annotation_color = COLORS[1]
    plt.figure(figsize=FIG_SIZE)
    ax = sns.histplot(x, color=data_color, kde=True, stat="probability")
    _, _, _, ymax = plt.axis()
    plt.title("Histogram of random values generated from a normal distribution", fontsize=TITLE_FONT)

    plt.axvline(mu, linestyle='--', color=annotation_color, lw=3)
    # Same font size as the two sigma labels below.
    plt.text(mu, .97*ymax, r' $\mu$', color=annotation_color, fontsize=LABEL_FONT, ma="left")

    plt.axvline(mu+sigma, linestyle=':', color=annotation_color, lw=3)
    plt.text(mu+sigma, .97*ymax, r' $\mu+\sigma$', color=annotation_color, fontsize=LABEL_FONT, ma="left")

    plt.axvline(mu-sigma, linestyle=':', color=annotation_color, lw=3)
    plt.text(mu-sigma, .97*ymax, r' $\mu-\sigma$', color=annotation_color, fontsize=LABEL_FONT, ma="right")

    # Place the color legend in axes-fraction coordinates. A position derived
    # from the right x-limit in data units lands outside the plot whenever that
    # limit is negative (for example with a strongly negative mu).
    plt.text(.8, .9, 'Data', color=data_color, fontsize=LABEL_FONT, weight="bold",
             transform=ax.transAxes)
    plt.text(.8, .93, 'Underlying distribution', color=annotation_color, fontsize=LABEL_FONT,
             weight="bold", transform=ax.transAxes)

    plt.show()
 
@widgets.interact_manual(n=(3,1000), mu=(-10, 10), sigma=(0,10))
def create_histplot(n=100, mu=6, sigma=2):
    """Sample a toy normal dataset and show it as an annotated histogram.

    Registered with ``widgets.interact_manual`` using the ranges
    n: 3 to 1000, mu: -10 to 10 and sigma: 0 to 10.
    """
    samples = generate_normal_distribution(n=n, mu=mu, sigma=sigma)
    plot_a_histogram(samples, n, mu, sigma)

__Histograms__ use discrete bins (range of values) to categorize data, and are used primarily to visualize probability or proportions. Most visualizations of probability are based on histogram-like structures, and the kernel density estimate (KDE) line is the best guess at an underlying distribution. The higher the bar in a bin, the more times a value occurs in that bin within a dataset.

In [None]:
def plot_a_boxplot(x, n, mu, sigma):
    """Generate annotated boxplot based on normal (gaussian) distribution

    Args:
        x: values to summarize in the boxplot.
        n: number of generated values (not used by this function).
        mu: mean of the generating distribution; marked with a dashed line.
        sigma: standard deviation of the generating distribution; ``mu + sigma``
            and ``mu - sigma`` are marked with dotted lines.
    """
    data_color = COLORS[0]
    annotation_color = COLORS[1]
    plt.figure(figsize=FIG_SIZE)
    # Passing the values as `x` draws the box horizontally, so the values lie
    # along the x-axis and the vertical marker lines below sit at real values.
    ax = sns.boxplot(x=x, color=data_color)
    plt.title("Boxplot of random values generated from a normal distribution", fontsize=TITLE_FONT)

    # x in data units, y as a fraction of the axes height: the y-axis of this
    # plot is categorical, so label heights are given relative to the axes.
    marker_transform = ax.get_xaxis_transform()

    plt.axvline(mu, linestyle='--', color=annotation_color, lw=3)
    plt.text(mu, .97, r' $\mu$', color=annotation_color, fontsize=LABEL_FONT, ma="left",
             transform=marker_transform)

    plt.axvline(mu+sigma, linestyle=':', color=annotation_color, lw=3)
    plt.text(mu+sigma, .97, r' $\mu+\sigma$', color=annotation_color, fontsize=LABEL_FONT, ma="left",
             transform=marker_transform)

    plt.axvline(mu-sigma, linestyle=':', color=annotation_color, lw=3)
    plt.text(mu-sigma, .97, r' $\mu-\sigma$', color=annotation_color, fontsize=LABEL_FONT, ma="right",
             transform=marker_transform)

    # Color legend, positioned in axes-fraction coordinates.
    plt.text(.75, .9, 'Data', color=data_color, fontsize=LABEL_FONT, weight="bold",
             transform=ax.transAxes)
    plt.text(.75, .93, 'Underlying distribution', color=annotation_color, fontsize=LABEL_FONT,
             weight="bold", transform=ax.transAxes)

    plt.show()
    
    
@widgets.interact_manual(n=(3,1000), mu=(-10, 10), sigma=(0,10))
def create_boxplot(n=100, mu=6, sigma=2):
    """Sample a toy normal dataset and show it as an annotated boxplot.

    Registered with ``widgets.interact_manual`` using the ranges
    n: 3 to 1000, mu: -10 to 10 and sigma: 0 to 10.
    """
    samples = generate_normal_distribution(n=n, mu=mu, sigma=sigma)
    plot_a_boxplot(samples, n, mu, sigma)

In [None]:
def plot_a_scatterplot(toy_dataset_x, toy_dataset_y, n, mu_x, sigma_x, mu_y, sigma_y):
    """Placeholder for a scatter plot of two toy datasets; currently draws nothing."""
    # NOTE(review): the body is only `pass` - presumably left to be filled in; confirm.
    pass
    
@widgets.interact_manual(n=(3,1000), mu_x=(-10, 10), sigma_x=(0,10), mu_y=(-10, 10), sigma_y=(0,10))
def create_scatterplot(n=100, mu_x=6, sigma_x=2, mu_y=6, sigma_y=2):
    """Sample two toy normal datasets and pass them to the scatter plot helper.

    Args:
        n: number of values drawn for each dataset.
        mu_x, sigma_x: mean and standard deviation used for the x values.
        mu_y, sigma_y: mean and standard deviation used for the y values.
    """
    toy_dataset_x = generate_normal_distribution(n=n, mu=mu_x, sigma=sigma_x)
    # The y sample needs its own name: assigning it to `toy_dataset_x` would
    # overwrite the x sample and leave `toy_dataset_y` undefined in the call below.
    toy_dataset_y = generate_normal_distribution(n=n, mu=mu_y, sigma=sigma_y)
    plot_a_scatterplot(toy_dataset_x, toy_dataset_y, n, mu_x, sigma_x, mu_y, sigma_y)

### Understanding our data

We are going to import a dataset using another python script. The python script is loading the file and doing some basic cleaning of parts of the dataset we aren't using. It can be found in `clean_data.py`.

In [None]:
import clean_data  # helper script (clean_data.py) that loads and cleans the dataset

# Build the cleaned table, then display it as the cell's output.
cancer_dataset = clean_data.generate_clean_dataframe()
cancer_dataset

### We have a basic understanding of the structure of the data now. 

From the data source: Wisconsin Diagnostic Breast Cancer (WDBC)

```
	Features are computed from a digitized image of a fine needle
	aspirate (FNA) of a breast mass.  They describe
	characteristics of the cell nuclei present in the image.
	A few of the images can be found at
	http://www.cs.wisc.edu/~street/images/

	Separating plane described above was obtained using
	Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree
	Construction Via Linear Programming." Proceedings of the 4th
	Midwest Artificial Intelligence and Cognitive Science Society,
	pp. 97-101, 1992], a classification method which uses linear
	programming to construct a decision tree.  Relevant features
	were selected using an exhaustive search in the space of 1-4
	features and 1-3 separating planes.

	The actual linear program used to obtain the separating plane
	in the 3-dimensional space is that described in:
	[K. P. Bennett and O. L. Mangasarian: "Robust Linear
	Programming Discrimination of Two Linearly Inseparable Sets",
	Optimization Methods and Software 1, 1992, 23-34].
    
    Source:
    W.N. Street, W.H. Wolberg and O.L. Mangasarian 
	Nuclear feature extraction for breast tumor diagnosis.
	IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science
	and Technology, volume 1905, pages 861-870, San Jose, CA, 1993.
```

What do all the column names mean?

- ID number
- Diagnosis (M = malignant, B = benign)

Ten real-valued features are computed for each cell nucleus:

- radius (mean of distances from center to points on the perimeter)
- texture (standard deviation of gray-scale values)
- perimeter
- area
- smoothness (local variation in radius lengths)
- compactness (perimeter^2 / area - 1.0)
- concavity (severity of concave portions of the contour)
- concave points (number of concave portions of the contour)
- symmetry 
- fractal dimension ("coastline approximation" - 1)


Category distribution: 357 benign, 212 malignant

If we wanted to show the first five values of one column of the table (here `mean_area`):

In [None]:
# First five entries of the "mean_area" column.
cancer_dataset["mean_area"].head(5)

If we wanted to show the first five values from the groups in the diagnosis column:

In [None]:
# First five "mean_area" entries within each "diagnosis" group.
cancer_dataset["mean_area"].groupby("diagnosis").head(5)

If we wanted to calculate basic descriptive statistics:

In [None]:
def calculate_count(x):
    """Return the number of observations in ``x``."""
    n_observations = len(x)
    return n_observations

def calculate_mean(x):
    """Return the arithmetic mean of ``x``: the sum of its values divided by their count."""
    total = np.sum(x)
    count = len(x)
    return total / count

def calculate_variance(x):
    """Return the mean squared deviation of ``x`` from its mean (divides by ``len(x)``)."""
    center = calculate_mean(x)
    squared_deviations = (x - center) ** 2
    return calculate_mean(squared_deviations)

def calculate_std(x):
    """Return the standard deviation of ``x`` as the square root of ``calculate_variance(x)``."""
    variance = calculate_variance(x)
    return np.sqrt(variance)

# Column used for the hand-written descriptive statistics.
area = cancer_dataset["mean_area"]

# Print each statistic: the count with no decimals, the others with two.
print(f"count =  {calculate_count(area):.0f}")
print(f"mean =  {calculate_mean(area):.2f}")
print(f"var =  {calculate_variance(area):.2f}")
print(f"std =  {calculate_std(area):.2f}")


`pandas` has us covered:

In [None]:
# Summary statistics of the "mean_area" column as computed by pandas.
cancer_dataset["mean_area"].describe()

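Notice a difference? The standard deviation is being calculated differently: `pandas` divides the sum of squared deviations by `n - 1` (the sample estimate), while our function above divides by `n`.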

In [None]:
def calculate_std_est(x):
    """Return the standard deviation of ``x`` computed with an ``n - 1`` denominator."""
    deviations = x - calculate_mean(x)
    degrees_of_freedom = len(x) - 1
    return np.sqrt(np.sum(deviations ** 2) / degrees_of_freedom)

# Print the n - 1 based standard deviation of the area column with two decimals.
print(f"unbiased std =  {calculate_std_est(area):.2f}")

What if we're interested in the relationship between variables?

In [None]:
###########################################
#                                         #
# You don't need to understand this code  #
#                                         #
###########################################


# Create scatter plots of the various features
def calculate_correlation(x,y):
    """Return the correlation coefficient of ``x`` and ``y``.

    Computed as the mean product of the deviations of ``x`` and ``y`` from
    their means, divided by the square root of the product of their variances.
    """
    x_centered = x - calculate_mean(x)
    y_centered = y - calculate_mean(y)
    covariance = calculate_mean(x_centered.transpose() * y_centered)
    scale = np.sqrt(calculate_variance(x) * calculate_variance(y))
    return covariance / scale
    
@widgets.interact(x=list(cancer_dataset), y=list(cancer_dataset))
def make_scatterplot(x="mean_radius",y="mean_area"):
    """Scatter two columns of ``cancer_dataset`` and show their correlation in the title.

    The title and the points share one color, chosen by whether the
    correlation is above 0.5.
    """
    corr = calculate_correlation(cancer_dataset[x], cancer_dataset[y])
    # Blue above the 0.5 threshold, orange otherwise.
    color = "#1e81b0" if corr > 0.5 else "#e28743"
    plt.title(r"correlation: $\rho = ${:.3f}".format(corr), color=color, size=TITLE_FONT)
    sns.scatterplot(data=cancer_dataset, x=x, y=y, alpha=0.5, color=color)


### Let's split the variables up by their category (also called its label).

Based on our available data, we're not that interested in what the descriptive statistics are on the individual columns. 

In [None]:
def compare_diagnoses_by_variable(variable: str, dataframe: pd.DataFrame = cancer_dataset):
    """Describe one column of ``dataframe`` separately for each diagnosis.

    Selects ``variable``, groups it by "diagnosis" and returns the result of
    ``describe()`` on those groups.
    """
    column = dataframe[variable]
    per_diagnosis = column.groupby("diagnosis")
    return per_diagnosis.describe()

In [None]:
@widgets.interact(variable=list(cancer_dataset))
def comparison_wrapper(variable="mean_radius"):
    """Return the per-diagnosis description of the selected column."""
    summary = compare_diagnoses_by_variable(variable)
    return summary

### Let's run a t-test on these categories

In [None]:
@widgets.interact(variable=list(cancer_dataset))
def run_ttest(variable="mean_radius"):
    """Run a t-test on ``variable`` between the "M" and "B" rows and print the p-value."""
    malignant_values = cancer_dataset.xs("M", level=1)[variable]
    benign_values = cancer_dataset.xs("B", level=1)[variable]

    _, pvalue = ttest(malignant_values, benign_values)
    print(f"p-value: {pvalue:.2e}")

In [None]:
@widgets.interact(variable=list(cancer_dataset), n=(3,100))
def run_ttest(variable="mean_radius", n=3):
    """Run a t-test on ``variable`` using ``n`` sampled rows from each diagnosis.

    Args:
        variable: column of ``cancer_dataset`` to compare.
        n: number of rows sampled from the "M" rows and from the "B" rows.

    Prints the p-value in scientific notation.
    """
    seed = 1
    # Seed both draws so the same `variable` and `n` always give the same
    # p-value; seeding only one group leaves the result random.
    cat1 = cancer_dataset.xs("M", level=1).sample(n, random_state=seed)
    cat2 = cancer_dataset.xs("B", level=1).sample(n, random_state=seed)

    _, pvalue = ttest(cat1[variable], cat2[variable])
    print(f"p-value: {pvalue:.2e}")

### Considering the assumptions of the Student's T-Test, when is it not appropriate?

### Intuition: Central limit theorem

You can ignore the code below! It is setting up an experiment.

In [None]:
def get_random_datasets(n, rng, repetitions=200):
    """Generate ``repetitions`` random samples of size ``n`` and their averages.

    Returns a pair ``(all_nums, averages)``: every generated number in one
    flat list, and the mean of each individual sample.
    """
    all_nums = []
    averages = []
    for _ in range(repetitions):
        sample = [generate_random_numbers(rng) for _ in range(n)]
        all_nums.extend(sample)
        averages.append(np.mean(sample))
    return all_nums, averages
    
def generate_histograms_clt(axs, n=10, rng="uniform"):
    """Fill a 2x2 grid of histograms: column 0 uses sample size 10, column 1 uses ``n``."""
    for column, sample_size in enumerate((10, n)):
        all_nums, averages = get_random_datasets(sample_size, rng)
        build_paired_histograms(averages, all_nums, sample_size, rng, axs, column=column)
    
def build_paired_histograms(averages, all_nums, n, rng, axs, column):
    """Draw one column of the grid: raw samples on top, sample averages below.

    Both axes get a title and an "n:<n>" label in the column's color; the
    lower axis is limited to the x range 0 to 10.
    """
    color = ("#1e81b0", "#e28743")[column]
    samples_ax = axs[0, column]
    averages_ax = axs[1, column]

    for ax, title in ((samples_ax, "random samples"), (averages_ax, "sample averages")):
        ax.set_title(title)
        ax.text(
            0.9, 0.9, f"n:{n}",
            verticalalignment='bottom', horizontalalignment='right',
            transform=ax.transAxes,
            color=color, fontsize=LABEL_FONT,
        )

    sns.histplot(all_nums, ax=samples_ax, color=color, stat="probability", kde=True)
    sns.histplot(averages, bins=10, ax=averages_ax, color=color, stat="probability", kde=True)
    averages_ax.set_xlim(0, 10)
    
    
def generate_random_numbers(generator = "uniform"):
    """generate random numbers with a mean of 5

    Args:
        generator: name of the distribution to draw from: "uniform"
            (between 0 and 10), "exponential" (rate 1/5) or "normal"
            (mean 5, standard deviation 2).

    Returns:
        A single random float from the chosen distribution.

    Raises:
        ValueError: if ``generator`` is not one of the supported names.
    """
    if generator == "uniform":
        return random.uniform(0, 10)
    if generator == "exponential":
        return random.expovariate(1 / 5)
    if generator == "normal":
        return random.gauss(5, 2)
    # Fail here with a clear message instead of returning None and breaking
    # later, when the caller averages the generated values.
    raise ValueError(
        f"unknown generator {generator!r}; expected 'uniform', 'exponential' or 'normal'"
    )
    
def format_plots(axs):
    """Apply the shared title weight, axis labels and font sizes to every axis in ``axs``."""
    for ax in axs.flat:
        # Re-apply the existing title so it picks up the bold weight and size.
        ax.set_title(ax.get_title(), fontweight="bold", size=LABEL_FONT)
        ax.set_xlabel('Number', fontsize=LABEL_FONT)
        ax.set_ylabel('Proportion (Probability)', fontsize=LABEL_FONT)
        ax.tick_params(labelsize=TICK_FONT)
        

In [None]:
@widgets.interact_manual(n=(3,100), generator=["uniform","exponential","normal"])
def demonstrate_clt(n=10, generator="exponential"):
    """Draw the 2x2 central-limit-theorem figure for the chosen generator and sample size.

    The random module is re-seeded with 10 on every call before any numbers
    are generated.
    """
    random.seed(10)
    figure, axes_grid = plt.subplots(nrows=2, ncols=2, figsize=FIG_SIZE, constrained_layout=True)
    figure.suptitle(
        f"Random number generator (distribution): {generator}",
        fontweight="bold",
        size=TITLE_FONT,
    )
    generate_histograms_clt(axes_grid, n=n, rng=generator)
    format_plots(axes_grid)