# Script

In [42]:
import numpy as np
import scipy.stats as stats


class lr:

    def __init__(self):
        self._d = None
        self._n = None
        self._con_lvl = 0.95
        self.b = None

    @property
    def d(self):
        return self._d
    
    @property
    def n(self):
        return self._n

    @property
    def confidence_level(self):
        return self._con_lvl

    def fit(self, X, y):
        self.b = np.linalg.pinv(X.T @ X) @ X.T @ y
        self._d = len(self.b) - 1
        self._n = y.shape[0]

    def predict(self, X):
        return X @ self.b

    def variance(self, X, y):
        SSE = np.sum(np.square(y - X @ self.b))
        return SSE / (self._n - self._d - 1)

    def standard_deviation(self, X, y):
        var = self.variance(X, y)
        return np.sqrt(var)

    def significance(self, X, y):
        var = self.variance(X, y)
        std_dev = np.sqrt(var)
        SSE = np.sum(np.square(y - X @ self.b))
        SST = np.sum(np.square(y - np.mean(y)))
        SSR = SST - SSE
        f_stat = (SSR / self._d) / var
        f_pvalue = stats.f.sf(f_stat, self._d, self._n - self._d - 1)
        cov_matrix = np.linalg.pinv(X.T @ X) * var
        ti_stat = [self.b[i] / (std_dev * np.sqrt(cov_matrix[i, i])) for i in range(self._d + 1)]
        ti_pvalues = [2 * min(stats.t.cdf(i, self._n - self._d - 1), stats.t.sf(i, self._n - self._d - 1)) for i in ti_stat]
        return {
            "f_pvalue": f_pvalue, 
            "ti_pvalues": ti_pvalues
        }

    def relevance(self, X, y):
        SSE = np.sum(np.square(y - X @ self.b))
        SST = np.sum(np.square(y - np.mean(y)))
        SSR = SST - SSE
        R_squared = SSR / SST
        return R_squared
    
    def test_relevance(self, X, y):
        SSE = np.sum(np.square(y - X @ self.b))
        RSE = np.sqrt(SSE / (self._n - 2))
        MSE = (1 / self._n) * SSE
        RMSE = np.sqrt(MSE)
        return {
            "RSE": RSE,
            "MSE": MSE, 
            "RMSE": RMSE
        }

    def pearson(self, X, y):
        kin_geo = stats.pearsonr(X[:,1], X[:,2])
        kin_ine = stats.pearsonr(X[:,1], X[:,3])
        geo_ine = stats.pearsonr(X[:,2], X[:,3])
        return {
            "kin_geo": kin_geo,
            "kin_ine": kin_ine,
            "geo_ine": geo_ine
        }

# Analysis

In [43]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
# from linear_regression import LinearRegression as lr

data = pd.read_csv("../data/Small-diameter-flow.csv", index_col=0)


# Creating sets

data_shuffled = data.sample(frac=1, random_state=42)  # setting the seed for reproducibility

train_indices = round(0.8 * len(data_shuffled))
val_indices = round(0.25 * train_indices)
test_indices = round(0.2 * len(data_shuffled))

test_df = pd.DataFrame(data_shuffled[:test_indices])
train_df = pd.DataFrame(data_shuffled[test_indices:])
val_df = pd.DataFrame(train_df[:val_indices])
train_df = pd.DataFrame(train_df[val_indices:])

X_train = np.column_stack([np.ones(len(train_df)), train_df["Kinematic"], train_df["Geometric"], train_df["Inertial"], train_df["Observer"]])
y_train = train_df["Flow"]

X_val = np.column_stack([np.ones(len(val_df)), val_df["Kinematic"], val_df["Geometric"], val_df["Inertial"], val_df["Observer"]])
y_val = val_df["Flow"]

X_test = np.column_stack([np.ones(len(test_df)), test_df["Kinematic"], test_df["Geometric"], test_df["Inertial"], test_df["Observer"]])
y_test = test_df["Flow"]


# Running the model

model = lr()
model.fit(X_train, y_train)
predictions = model.predict(X_val)

print(f"Number of features: {model.d}")
print(f"Number of rows: {model.n}")     # from X_train
print(f"Variance: {model.variance(X_train, y_train):.2e}")
print(f"Standard deviation: {model.standard_deviation(X_train, y_train):.2e}")

# returns dictionary with f_pvalue and ti_pvalues
sig = model.significance(X_train, y_train)
print(f"Significance of f-statistic: {sig["f_pvalue"]:.2e}")
print(f"Significance of t-statistic for ...\n... Kinematic: {sig["ti_pvalues"][0]:.2e}\n... Geometric: {sig["ti_pvalues"][1]:.2e}\n... Inertial: {sig["ti_pvalues"][2]:.2e}")

print(f"Relevance (R²): {model.relevance(X_train, y_train):.2e}")

# returns dictionary with RSE, MSE and RMSE
t_rel = model.test_relevance(X_train, y_train)
v_rel = model.test_relevance(X_val, y_val)
t_rel = {key: value.item() for key, value in t_rel.items()}
v_rel = {key: value.item() for key, value in v_rel.items()}
print(f"Training relevance ...\n... RSE: {t_rel["RSE"]:.2e}\n... MSE: {t_rel["MSE"]:.2e}\n... RMSE:  {t_rel["RMSE"]:.2e}")
print(f"Validation relevance ...\n... RSE: {v_rel["RSE"]:.2e}\n... MSE: {v_rel["MSE"]:.2e}\n... RMSE:  {v_rel["RMSE"]:.2e}")

# returns Pearson dictionary
r = model.pearson(X_train, y_train)
print(f"Kinematic - Geometric: {r["kin_geo"][0]:.2e} (correlation), {r["kin_geo"][1]:.2e} (p-value)")
print(f"Kinematic - Inertial: {r["kin_ine"][0]:.2e} (correlation), {r["kin_ine"][1]:.2e} (p-value)")
print(f"Geometric - Inertial: {r["geo_ine"][0]:.2e} (correlation), {r["geo_ine"][1]:.2e} (p-value)")

Number of features: 4
Number of rows: 118
Variance: 6.39e-03
Standard deviation: 7.99e-02
Significance of f-statistic: 3.93e-143
Significance of t-statistic for ...
... Kinematic: 1.24e-90
... Geometric: 2.41e-138
... Inertial: 1.54e-226
Relevance (R²): 9.97e-01
Training relevance ...
... RSE: 7.89e-02
... MSE: 6.12e-03
... RMSE:  7.82e-02
Validation relevance ...
... RSE: 4.68e-02
... MSE: 2.16e-03
... RMSE:  4.64e-02
Kinematic - Geometric: 8.69e-01 (correlation), 3.34e-37 (p-value)
Kinematic - Inertial: 9.72e-01 (correlation), 7.72e-75 (p-value)
Geometric - Inertial: 9.20e-01 (correlation), 6.91e-49 (p-value)


# To do

In [55]:
# These should be in the class

n = model.n
SSX = np.sum(np.square(X_train - np.mean(X_train))) / (n - 1)
SST = np.sum(np.square(y_train - np.mean(y_train)))
SXY = np.cov(X_train[:, 1], y_train)[0, 1]
# konfidensintervall β1 ±2 ·SE(ˆβ1)
# kolla signifikansen på båda observers för sista VG-frågan
# standardisera variablerna 

# Har frågat Raphael om detta

In [45]:
Syy = (n * np.sum(np.square(y_train)) - np.square(np.sum(y_train))) / n             # från Raphaels code-along
TSS = np.sum(np.square(y_train - np.mean(y_train)))                                 # från andra källor

Sxx1 = (n * np.sum(np.square(X_train)) - np.square(np.sum(X_train))) / n            # från Raphaels code-along
Sxx2 = (n * np.sum(np.square(X_train)) - np.square(np.sum(X_train))) / (n * (n-1))  # från handledning
SSX = np.sum(np.square(X_train - np.mean(X_train)))                                 # från andra källor

Sxx_ISLP = np.sum(np.square(X_train - np.mean(X_train))) / (n - 1)  # page 183 in ISLP

print(f"Syy: {Syy}\nTSS: {TSS}")
print()
print(f"Sxx1: {Sxx1}\nSxx2: {Sxx2}\nSSX: {SSX}")
print(f"ISLP: {Sxx_ISLP}")


Syy: 257.13475005176133
TSS: 257.13475005176053

Sxx1: -8789.660897107407
Sxx2: -75.12530681288384
SSX: 16640.532278483977
ISLP: 142.22677161097417


In [54]:
def calculate_covariance(X, y):
    n = len(y)
    mean_X = np.mean(X, axis=0)
    mean_y = np.mean(y)
    y = np.array(y)  # Convert y to numpy array
    covariance = np.sum((X - mean_X) * (y[:, np.newaxis] - mean_y), axis=0) / (n - 1)
    return covariance

covariance_X_y = calculate_covariance(X_train, y_train)
print("Covariance between X and y:", covariance_X_y)

Covariance between X and y: [0.         0.64369374 0.67668221 1.06690326 0.17733483]
