# Analysis

## Dependencies and data standardisation

In [1]:
import numpy as np
import pandas as pd
from linear_regression import LinearRegression as lr

data = pd.read_csv("../data/Small-diameter-flow.csv", index_col=0)
data_std = data.copy()
for column in ["Flow", "Kinematic", "Geometric", "Inertial"]:
    mean = np.mean(data[column])
    sample_std_dev = np.std(data[column], ddof=1) # since this is not the population
    data_std[column] = (data[column] - mean) / sample_std_dev

## Creating prediction sets

In [2]:
data_shuffled = data_std.sample(frac=1, random_state=42)  # setting the seed for reproducibility

train_indices = round(0.8 * len(data_shuffled))
val_indices = round(0.25 * train_indices)
test_indices = round(0.2 * len(data_shuffled))

test_df = pd.DataFrame(data_shuffled[:test_indices])
train_df = pd.DataFrame(data_shuffled[test_indices:])
val_df = pd.DataFrame(train_df[:val_indices])
train_df = pd.DataFrame(train_df[val_indices:])

X_train = np.column_stack([np.ones(len(train_df)), train_df["Kinematic"], train_df["Geometric"], train_df["Inertial"], train_df["Observer"]])
y_train = train_df["Flow"]

X_val = np.column_stack([np.ones(len(val_df)), val_df["Kinematic"], val_df["Geometric"], val_df["Inertial"], val_df["Observer"]])
y_val = val_df["Flow"]

X_test = np.column_stack([np.ones(len(test_df)), test_df["Kinematic"], test_df["Geometric"], test_df["Inertial"], test_df["Observer"]])
y_test = test_df["Flow"]

## Running the model and printing outputs

In [3]:
model = lr()
model.fit(X_train, y_train)
predictions = model.predict(X_val)

print(f"Number of features: {model.d}")
print(f"Number of rows: {model.n}") # from X_train
print()
print(f"Variance: {model.variance(X_train, y_train):.2e}")
print(f"Standard deviation: {model.standard_deviation(X_train, y_train):.2e}")
print()
sig = model.significance(X_train, y_train) # returns dictionary with f_pvalue and ti_pvalues
print(f"Significance of f-statistic: {sig["f_pvalue"]:.2e}")
print(f"Significance of t-statistic for ...\n... Kinematic: {sig["ti_pvalues"][0]:.2e}\n... Geometric: {sig["ti_pvalues"][1]:.2e}\n... Inertial:  {sig["ti_pvalues"][2]:.2e}")
print()
print(f"Relevance (R²): {model.r_squared(X_train, y_train):.2e}")
print()
t_rel = model.relevance(X_train, y_train) # returns dictionary with RSE, MSE and RMSE
t_r2 = model.r_squared(X_train, y_train)
v_rel = model.relevance(X_val, y_val)
v_r2 = model.r_squared(X_val, y_val)
t_rel = {key: value.item() for key, value in t_rel.items()}
v_rel = {key: value.item() for key, value in v_rel.items()}
print(f"Training relevance ...\n... RSE:  {t_rel["RSE"]:.2e}\n... MSE:  {t_rel["MSE"]:.2e}\n... RMSE: {t_rel["RMSE"]:.2e}")
print()
print(f"Validation relevance ...\n... RSE:  {v_rel["RSE"]:.2e}\n... MSE:  {v_rel["MSE"]:.2e}\n... RMSE: {v_rel["RMSE"]:.2e}")
print()
r = model.pearson(X_train) # returns a correlation matrix
cols = ["Intercept", "Kinematic", "Geometric", "Inertial", "Observer"]
print(f"Correlation pairs ...")
for i in range(len(cols)):
    if i == 0:
        continue
    else:
        for j in range(i + 1, len(cols)):
            print(f"... {cols[i]} - {cols[j]}: {r[i, j]:.4f}")
print()
ci = model.confidence_intervals(X_train, y_train) # returns the confidence intervals for all parameters
print("Confidence intervals for each coefficient ...")
for i, margin in enumerate(ci):
    print(f"... β{i}: {model.b[i]:.4f} ± {margin:.4f} | Lower: {model.b[i] - margin:.4f}, Upper: {model.b[i] + margin:.4f}")

Number of features: 4
Number of rows: 118

Variance: 2.96e-03
Standard deviation: 5.44e-02

Significance of f-statistic: 3.93e-143
Significance of t-statistic for ...
... Kinematic: 1.75e-75
... Geometric: 3.60e-157
... Inertial:  2.06e-245

Relevance (R²): 9.97e-01

Training relevance ...
... RSE:  5.37e-02
... MSE:  2.84e-03
... RMSE: 5.32e-02

Validation relevance ...
... RSE:  3.19e-02
... MSE:  9.99e-04
... RMSE: 3.16e-02

Correlation pairs ...
... Kinematic - Geometric: 0.8688
... Kinematic - Inertial: 0.9720
... Kinematic - Observer: 0.1783
... Geometric - Inertial: 0.9196
... Geometric - Observer: 0.2242
... Inertial - Observer: 0.1774

Confidence intervals for each coefficient ...
... β0: -0.0175 ± 0.0011 | Lower: -0.0187, Upper: -0.0164
... β1: 0.2968 ± 0.0036 | Lower: 0.2932, Upper: 0.3003
... β2: 1.1006 ± 0.0022 | Lower: 1.0984, Upper: 1.1028
... β3: -0.3989 ± 0.0045 | Lower: -0.4034, Upper: -0.3944
... β4: 0.0191 ± 0.0017 | Lower: 0.0174, Upper: 0.0208


  c /= stddev[:, None]
  c /= stddev[None, :]


# To do

1. jag får fel på self._n när jag växlar mellan träningsdata och valideringsdata (t ex i relevance())

def _update_n_d(self, X, y):
    self._n = y.shape[0]
    self._d = X.shape[1] - 1

En okej lösning på problemet, I guess. Borde kolla vad som är conventional. 

2. dubbelkolla data leakage så att inte samma rader finns test, validering, test 

3. Normalisera datan? 

4. Analysera observer bias 

5. Lös RuntimeWarning

def pearson(self, X):
    epsilon = 1e-10  # Small value to avoid division by zero
    corr_matrix = np.corrcoef(X, rowvar=False)
    stddev = np.sqrt(np.diag(corr_matrix))
    stddev[stddev < epsilon] = epsilon  # Replace small stddev values with epsilon
    corr_matrix /= stddev[:, None]
    corr_matrix /= stddev[None, :]
    return corr_matrix

Något med att noll eller nära noll värden ger varningen så att jag kan addera en epsilon för att det inte ska bli så

6. Plotta

7. SSE återkommer flera gånger - DRY

# Test

In [4]:
# ökorgsdg
ökorgsdg_df = data.copy()

X_ökorgsdg = np.column_stack([np.ones(len(ökorgsdg_df)), ökorgsdg_df["Kinematic"], ökorgsdg_df["Geometric"], ökorgsdg_df["Inertial"], ökorgsdg_df["Observer"]])
y_ökorgsdg = ökorgsdg_df["Flow"]

model = lr()
model.fit(X_ökorgsdg, y_ökorgsdg)

ci = model.confidence_intervals(X_ökorgsdg, y_ökorgsdg) # returns the confidence intervals for all parameters
print("Confidence intervals for each coefficient ...")
for i, margin in enumerate(ci):
    print(f"... β{i}: {model.b[i]:.4f} ± {margin:.4f} | Lower: {model.b[i] - margin:.4f}, Upper: {model.b[i] + margin:.4f}")

Confidence intervals for each coefficient ...
... β0: -2.5838 ± 0.1004 | Lower: -2.6842, Upper: -2.4834
... β1: 0.8700 ± 0.0116 | Lower: 0.8584, Upper: 0.8816
... β2: 3.6032 ± 0.0079 | Lower: 3.5953, Upper: 3.6110
... β3: -0.7519 ± 0.0093 | Lower: -0.7612, Upper: -0.7426
... β4: 0.0169 ± 0.0028 | Lower: 0.0141, Upper: 0.0196
