# Part 1: Motivation for Statistical Analysis

 - Most industry projects promise returns of ~15%, however:
  - One-eighth of offshore developments fall into 'disaster' category due to cost growth, time slippage, underproduction
 - Record worse for megaprojects (~62% cost overrun for mining megaprojects).
 
## Extra motivator
Data analytics are at the centre of all major industries. Being somewhat fluent in this field will be advantageous for any chosen career path.

## Example: Dissolved oxygen curve

In this example, we want to approximate the dissolved oxygen concentration in water from experimental data. We can apply the following equation to try and describe what's happening.

$$C_l \, = \, C_s ( 1 - e^{-kLa \cdot t}) $$

Where $C_l$ is the concentration of oxygen in the liquid (ppm), $C_s$ is the saturation concentration (the maximum concentration of oxygen in the liquid **under the particular experimental conditions**, e.g. for a certain temperature, atmospheric pressure and water purity), $kLa$ is the overall mass transfer coefficient ($s^{-1}$) and $t$ is time (s).

We need to find the values of $C_s$ and $kLa$ which minimise the error between the model and experimental values of $C_l$.

In [1]:
from ipywidgets import interact, interactive, fixed, interact_manual, FloatSlider, IntSlider
import ipywidgets as widgets
from ipywidgets.embed import embed_minimal_html
from IPython.display import display

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Global matplotlib font settings applied to every figure drawn afterwards.
font = dict(family='DejaVu Sans', weight='bold', size=20)

plt.rc('font', **font)

import pandas as pd
from scipy.stats import norm

In [5]:
import random
# Slider for the trial mass transfer coefficient kLa.
# continuous_update=False is passed so the value is not pushed while dragging.
kLa_slider = FloatSlider(
    description='kLa value:',
    value=0.01,
    min=0.01,
    max=0.05,
    step=0.01,
    continuous_update=False,
)

# Slider for the trial saturation concentration Cs (no explicit description).
Cs_slider = FloatSlider(
    value=5,
    min=5.0,
    max=10.0,
    step=0.1,
    continuous_update=False,
)

def DOModel(t, kLa, Cs):
    """Evaluate the dissolved-oxygen model ``Cs * (1 - exp(-kLa * t))``.

    Args:
        t: Time value(s); a scalar or a NumPy array.
        kLa: Overall mass transfer coefficient.
        Cs: Saturation concentration.

    Returns:
        The modelled concentration, with the same shape as ``t``.
    """
    saturation_fraction = 1 - np.exp(-kLa * t)
    return Cs * saturation_fraction

@interact(kLa=kLa_slider, Cs=Cs_slider)
def interactive_nlreg(kLa, Cs):
    """Plot the DO model for the slider values against synthetic 'raw' data.

    The figure shows the model curve, the noisy reference data and the sum of
    squared errors (SSE) between them, so the two parameters can be tuned by
    hand to minimise the SSE.

    Args:
        kLa: Trial overall mass transfer coefficient (1/s).
        Cs: Trial saturation concentration (ppm).
    """
    # Synthetic 'raw' data: the model at Cs = 8.10, kLa = 0.03 plus uniform
    # noise in [-0.5, 0.5]. The generator is seeded on every call so the data
    # stay identical between slider moves; with unseeded noise the SSE would
    # change even when the parameters did not.
    t = np.linspace(0, 200, 200)
    rng = np.random.default_rng(0)
    noise = rng.uniform(-0.5, 0.5, size=t.shape)
    y = 8.10 * (1 - np.exp(-0.03 * t)) + noise

    # Model prediction for the current slider values and its misfit.
    model_output = DOModel(t, kLa, Cs)
    sse = np.sum((model_output - y) ** 2)

    plt.figure(figsize=(12, 9))
    plt.plot(t, model_output, c='red', lw=3, label='DO model')
    plt.plot(t, y, c='black', lw=2, label='Raw DO data')
    plt.xlim([0, 200])
    plt.ylim([0, 9])
    plt.title("Dissolved oxygen concentration (ppm)")
    plt.ylabel("DO (ppm)", fontsize=16, fontweight='bold')
    plt.xlabel("time (s)", fontsize=16, fontweight='bold')
    plt.legend(fontsize=16)

    # NOTE: a fixed output height has to be set on the layout of a widget
    # *instance*. Assigning to `interactive.layout` on the class has no effect
    # on the displayed widget, so that assignment is not made here.

    # y = -1.5 lies below the y-limits, so the label is drawn under the axes.
    plt.text(0, -1.5, f"SSE = {int(sse)}", fontsize=20, fontweight='bold')

interactive(children=(FloatSlider(value=0.01, continuous_update=False, description='kLa value:', max=0.05, min…