In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
from linear_regression import LinearRegression as lr

data = pd.read_csv("../data/Small-diameter-flow.csv", index_col=0)


# Creating sets

# Columns used as predictors (in design-matrix order) and the response column.
FEATURE_COLUMNS = ["Kinematic", "Geometric", "Inertial", "Observer"]
TARGET_COLUMN = "Flow"


def design_matrix(frame):
    """Build the regression design matrix for one data split.

    Parameters
    ----------
    frame : pandas.DataFrame
        A split containing every column named in ``FEATURE_COLUMNS``.

    Returns
    -------
    numpy.ndarray
        One row per row of ``frame``: a leading column of ones (intercept
        term) followed by the feature columns in ``FEATURE_COLUMNS`` order.
    """
    return np.column_stack(
        [np.ones(len(frame)), *(frame[column] for column in FEATURE_COLUMNS)]
    )


# Shuffle every row once; the fixed seed keeps the split reproducible.
data_shuffled = data.sample(frac=1, random_state=42)

# NOTE: despite the "_indices" suffix these are row COUNTS, not index labels.
# The validation size is 25 % of the nominal 80 % training share, i.e. about
# 20 % of all rows, which leaves roughly 60 % for the final training set.
train_indices = round(0.8 * len(data_shuffled))
val_indices = round(0.25 * train_indices)
test_indices = round(0.2 * len(data_shuffled))

# Explicit positional slicing: test rows first, then validation rows, and
# whatever remains is the training set. `.copy()` gives each split its own
# data so later edits to a split cannot leak back into `data_shuffled`.
val_end = test_indices + val_indices
test_df = data_shuffled.iloc[:test_indices].copy()
val_df = data_shuffled.iloc[test_indices:val_end].copy()
train_df = data_shuffled.iloc[val_end:].copy()

# Sanity check: the three splits partition the shuffled data.
assert len(train_df) + len(val_df) + len(test_df) == len(data_shuffled)

X_train = design_matrix(train_df)
y_train = train_df[TARGET_COLUMN]

X_val = design_matrix(val_df)
y_val = val_df[TARGET_COLUMN]

X_test = design_matrix(test_df)
y_test = test_df[TARGET_COLUMN]


# Running the model

# Fit on the training split, then predict on the validation split.
model = lr()
model.fit(X_train, y_train)
predictions = model.predict(X_val)

# Basic facts about the fitted model, printed one per line.
print(f"Number of features: {model.d}")
print(f"Number of rows: {model.n}")  # per the original note: taken from X_train

train_variance = model.variance(X_train, y_train)
print(f"Variance: {train_variance:.2e}")

train_std = model.standard_deviation(X_train, y_train)
print(f"Standard deviation: {train_std:.2e}")

# significance() returns a dictionary with the keys "f_pvalue" and "ti_pvalues".
sig = model.significance(X_train, y_train)

# Keys are single-quoted inside the f-strings: reusing the enclosing quote
# character in a replacement field is only valid syntax on Python 3.12+.
print(f"Significance of f-statistic: {sig['f_pvalue']:.2e}")

# NOTE(review): X_train holds an intercept column plus four features, yet only
# entries 0-2 of "ti_pvalues" are reported here. Confirm that entry 0 belongs
# to Kinematic (not the intercept) and whether Observer should be listed too.
print(
    "Significance of t-statistic for ...\n"
    f"... Kinematic: {sig['ti_pvalues'][0]:.2e}\n"
    f"... Geometric: {sig['ti_pvalues'][1]:.2e}\n"
    f"... Inertial: {sig['ti_pvalues'][2]:.2e}"
)

print(f"Relevance (R²): {model.relevance(X_train, y_train):.2e}")

# test_relevance() returns a dictionary with the keys "RSE", "MSE" and "RMSE".
t_rel = model.test_relevance(X_train, y_train)
v_rel = model.test_relevance(X_val, y_val)

# Unwrap each metric into a plain Python scalar via .item() so it can be
# formatted directly.
t_rel = {key: value.item() for key, value in t_rel.items()}
v_rel = {key: value.item() for key, value in v_rel.items()}

# One report per split. Keys are single-quoted inside the f-strings because
# reusing the enclosing quote character requires Python 3.12+.
for split_label, metrics in (("Training", t_rel), ("Validation", v_rel)):
    print(
        f"{split_label} relevance ...\n"
        f"... RSE: {metrics['RSE']:.2e}\n"
        f"... MSE: {metrics['MSE']:.2e}\n"
        f"... RMSE:  {metrics['RMSE']:.2e}"
    )

# pearson() returns a dictionary keyed by feature pair; for each pair, element
# 0 is printed as the correlation and element 1 as the p-value.
r = model.pearson(X_train, y_train)

# Looping over (label, key) pairs replaces three copy-pasted print statements
# and keeps dictionary keys out of the f-string, so the line also parses on
# Python versions before 3.12 (no nested same-type quotes).
for pair_label, pair_key in (
    ("Kinematic - Geometric", "kin_geo"),
    ("Kinematic - Inertial", "kin_ine"),
    ("Geometric - Inertial", "geo_ine"),
):
    pair_result = r[pair_key]
    print(f"{pair_label}: {pair_result[0]:.2e} (correlation), {pair_result[1]:.2e} (p-value)")

# To do

In [None]:
# These should be in the class

# Closed-form slope of a simple least-squares fit of y_train on one predictor:
# column 1 of X_train (Kinematic; column 0 is the column of ones).
n = model.n  # assumed to equal the number of rows in X_train — TODO confirm
x_kin = X_train[:, 1]

# Sxx: sum of squared deviations of x about its mean (n times its variance).
# It must use the same single column as SXY; summing over the whole design
# matrix would mix the intercept and every other feature into the slope.
SSX = (n * np.sum(np.square(x_kin)) - np.square(np.sum(x_kin))) / n
# Sxy: sum of cross-deviations of x and y (n times their covariance).
SXY = (n * (x_kin @ y_train) - np.sum(x_kin) * np.sum(y_train)) / n
slope = SXY / SSX
