In [799]:
# LinearRegression.py
import numpy as np
from scipy import stats

class LinearRegression: 
    def __init__(self, data_set, response):
        self.data_set = data_set 
        self.response = response 
    
    @property 
    def X(self):
        X = self.data_set.drop(columns=[self.response])
        X = np.column_stack([np.ones(X.shape[0]), X])
        return X
    
    @property
    def Y(self):
        Y = self.data_set[self.response]
        Y = np.array(Y)
        return Y
    
    @property
    def n(self):
        return self.X.shape[0]
    
    @property
    def d(self):
        return self.X.shape[1]-1
    
    @property
    def column_names(self):
        features = list(self.data_set.drop(columns=[self.response]).columns)
        column_names = { "X":features, "Y":[self.response], }
        return column_names
    
    # Ordinary Least Square 
    @property 
    def b(self):
        X = self.X 
        Y = self.Y
        return np.linalg.pinv( X.T @ X) @ X.T @ Y
        
    # SSE | Residual Sum of Squares | Sum of Squared Error
    # The closer this is to zero the more accuret the prediction (in theory)
    @property 
    def SSE(self):
        X, Y, b = self.X, self.Y, self.b
        return  sum( np.square( Y - X @ b ) )
    
    # SSR | Explained Sum of Squares | Regression Sum of Squares
    @property
    def SSR(self):
        X, Y_mean, b = self.X, self.Y.mean(), self.b
        return sum( np.square( X @ b - Y_mean ) )
    
    # SST | Syy | Total Sum of Squares
    @property 
    def SST(self):
        Y = self.Y
        Y_mean = Y.mean() 
        return sum( np.square(Y-Y_mean) )

    def print_all(self):
        all = f"""
G
1. Number of Features
   {self.d}

2. Sample Size
   {self.n}

3. Variance  
   {self.var()}

4. Standard Deviation 
    {self.std()}

5. Significance of the Regression 
    {self.sig().split("\n")[0]}

6. Relevance of the Regression
    {self.rel()}

VG
1. Significance tests on Individual Variables
    { str().join( f"{row}\n{str():<6}" for row in self.sig_var().split("\n") ) }

2. A function or method that calculates the Pearson number
    { str().join( f"{row}\n{str():<4}" for row in self.r().split("\n") ) }
         """
        print(all)

    # The Method that Calculates The Variance 
    # sigma^2 == SEE divded by the Degrees of Freedom
    # On average how far our alculated responses are from the regression line
    def var(self):
        SSE, n, d = self.SSE, self.n, self.d 
        return SSE/(n - d - 1)

    
    # The Method that Calculated The Standard Deviation | S | sigma
    # Meassure the same thing as the Variances but in a smaller unit
    def std(self):
        var = self.var()
        return np.sqrt(var)
    
    # The Method that reports The Significance of the Regression
    # The closer the pvalue is to zero the more 
    # confidently we can reject the null-hypothesis (H0)
    # If the null-hypothesis is true then it means that 
    # there is no relationship between the features and the response
    # In order to reject the H0-hypothesis we want the p-value of at least 0.05 aka 5%
    # this would mean that there is a 95% chances that our features effect the response
    def sig(self): 
        SSR, d, n, std = self.SSR, self.d, self.n, self.std()
        
        # This is probably and inbetween value, it doesn't mean anything
        # it shouldn't be part of the printout
        sig_statistic = (SSR/d)/std 
     
        # Survival Function of the F-Distrubution
        p_significance = stats.f.sf(sig_statistic, d, n-d-1)
        
        # Pearsons deal with the T-Distrubution

        #X, Y, b = self.X, self.Y, self.b
        #pearson = stats.pearsonr(np.square(X @ b),Y)
        #return f"statistic:{sig_statistic}, pvalue:{p_significance}\n{pearson}"
        result = str() 
        if p_significance < 0.0001: 
            result = "The p-value low enough to confidently reject the null-hypothesis"
        elif p_significance < 0.001: 
            result = "The p-value low enough to, with some confidence reject the null-hypothesis "    
        elif p_significance < 0.05: 
            result = "The p-value is just low enough to reject the null-hypothesis"
        else:
            result = "The p-value is to high to confidetly reject the null-hypothesis"

        return f"pvalue: {p_significance}\n{result}"
    
    # The method that reports The Relevance of Regression | R2 
    # Reports how big percent of the calculated responses that falls withn 
    # our normal distribution aka how big of a range of responses that our model can 
    # reliably predict. So if our R2 value is 0.90 than that means that of the test
    # cases the normal distribution of our model covers 90% of the responses 
    def rel(self):
        SSR, SST = self.SSR, self.SST
        return SSR/SST


    # VG 

    # Significance of the Variables(Coefficients? + Bias?)
    def sig_var(self):   
        X, b, d, n, std, var, X_names = self.X, self.b, self.d, self.n, self.std(), self.var(), self.column_names["X"]
        X_names.insert(0,"Bias")

        # Variance/Covariance Matrix
        c = np.linalg.pinv( (X.T @ X) )*var

        # It doesn't matter that we get nan values because 
        # where only intrested in the center values aka c[i,i] values 
        # Significans Statisitca Array(?)
        ssa = [ b[i]/(std * np.sqrt(c[i,i])) for i in range(c.shape[1])]
        cdf = stats.t.cdf(ssa, n-d-1)
        sf =  stats.t.sf(ssa, n-d-1)
        p = [ 2 * min(cdf[idx], sf[idx]) for idx in range(len(ssa)) ]
        result = str().join( f"p-value {X_names[idx]:<10}: {p[idx]}\n" for idx in range(len(p))  )
        return result

    # The Method that calculates the Pearson number between all pairs of parameters
    def r(self):
        X, Y, column_names = self.X, self.Y, self.column_names 
        
        result = list()
        
        # Include Y (change X to XY)
        # Remvoes the bias, add the response
        #XY = np.column_stack( [X[:,1:], Y])
        #names = [*column_names["X"], *column_names["Y"]]

        #Remvoes the bias
        X = X[:,1:]
        names = column_names["X"]

  
        for idx in range(len(names)):
            for idy in range(idx):
                if idy == idx:
                    continue 
                p = stats.pearsonr(X.T[idx], X.T[idy])      
                result.append(f"{names[idx]:>10} VS {names[idy]:<10} : Statistics = {p[0]:<20} pvalue = {p[1]}\n")
            
        return str().join(result[::-1])

    # The method that calculates the Confidence Interval
    
    def con_int(self): 
        X, b, n, d, var, rel = self.X, self.b, self.n, self.d, self.var(), self.rel()
        # ska vi välja confidence leven baserat på R2 ???
        # är α == R2? 
        # βˆ i ± tα/2σˆ 2√cii
        
        c = np.linalg.pinv( (X.T @ X) )*var
        σ2 = var
        α = 1-rel
        i = 1
        t = n-d-1
        #t = stats.t(rel/2, n-d-1)
        
        
        return b[i] + t*(α/2) * σ2 * np.sqrt(c[i,i])
        # confidence level sigma 
        # p_value == nummer av extreme värden?

    # The method that calculates the Confidence Interval V2 
    
    def con_int_2(self):
        X, n, var = self.X, self.n, self.var()
        #X=X[:,1]
        Sxx = (n*np.sum(np.square(X)) - np.square(np.sum(X)))/n
        X_mean = X.mean()
        se_intercept = var * ((1/n)+np.square(X_mean)/Sxx)
        return np.sqrt(se_intercept)
    
   

        

In [800]:
# main.ipynb
import pandas as pd
path = "../Resources/" 
data_set = pd.read_csv(path+"Advertising.csv")
data_set.drop(columns="Unnamed: 0", inplace=True)
lr = LinearRegression(data_set, "sales")

# testa med e istället f
print(lr.con_int())
print(lr.con_int_2())
lr.Y


0.08568360240228182
0.10220685815302728


array([22.1, 10.4,  9.3, 18.5, 12.9,  7.2, 11.8, 13.2,  4.8, 10.6,  8.6,
       17.4,  9.2,  9.7, 19. , 22.4, 12.5, 24.4, 11.3, 14.6, 18. , 12.5,
        5.6, 15.5,  9.7, 12. , 15. , 15.9, 18.9, 10.5, 21.4, 11.9,  9.6,
       17.4,  9.5, 12.8, 25.4, 14.7, 10.1, 21.5, 16.6, 17.1, 20.7, 12.9,
        8.5, 14.9, 10.6, 23.2, 14.8,  9.7, 11.4, 10.7, 22.6, 21.2, 20.2,
       23.7,  5.5, 13.2, 23.8, 18.4,  8.1, 24.2, 15.7, 14. , 18. ,  9.3,
        9.5, 13.4, 18.9, 22.3, 18.3, 12.4,  8.8, 11. , 17. ,  8.7,  6.9,
       14.2,  5.3, 11. , 11.8, 12.3, 11.3, 13.6, 21.7, 15.2, 12. , 16. ,
       12.9, 16.7, 11.2,  7.3, 19.4, 22.2, 11.5, 16.9, 11.7, 15.5, 25.4,
       17.2, 11.7, 23.8, 14.8, 14.7, 20.7, 19.2,  7.2,  8.7,  5.3, 19.8,
       13.4, 21.8, 14.1, 15.9, 14.6, 12.6, 12.2,  9.4, 15.9,  6.6, 15.5,
        7. , 11.6, 15.2, 19.7, 10.6,  6.6,  8.8, 24.7,  9.7,  1.6, 12.7,
        5.7, 19.6, 10.8, 11.6,  9.5, 20.8,  9.6, 20.7, 10.9, 19.2, 20.1,
       10.4, 11.4, 10.3, 13.2, 25.4, 10.9, 10.1, 16

In [801]:
data_set = pd.read_csv(path+"Small-diameter-flow.csv") 
data_set.drop(columns=["Unnamed: 0","Observer"], inplace=True)
data_set
lr = LinearRegression(data_set, "Flow")
print( lr.con_int() )
lr.print_all()


0.8688009956311592

G
1. Number of Features
   3

2. Sample Size
   198

3. Variance  
   0.006308685487583493

4. Standard Deviation 
    0.07942723391622984

5. Significance of the Regression 
    pvalue: 7.998510997422736e-141

6. Relevance of the Regression
    0.9971212473220574

VG
1. Significance tests on Individual Variables
    p-value Bias      : 1.3694429113257573e-146
      p-value Kinematic : 2.2799778946075336e-236
      p-value Geometric : 0.0
      p-value Inertial  : 1.9192831125684836e-242
      
      

2. A function or method that calculates the Pearson number
      Inertial VS Geometric  : Statistics = 0.9183300308547001   pvalue = 7.951572627158216e-81
      Inertial VS Kinematic  : Statistics = 0.9686707504997814   pvalue = 1.588545639896567e-120
     Geometric VS Kinematic  : Statistics = 0.8631350761065918   pvalue = 4.5604633624399433e-60
    
    
         


In [802]:
data_set = pd.read_csv(path+"Small-diameter-flow.csv") 
data_set.drop(columns=["Unnamed: 0"], inplace=True)
data_set
lr = LinearRegression(data_set, "Flow")
#print( lr.con_int() )
lr.print_all()


G
1. Number of Features
   4

2. Sample Size
   198

3. Variance  
   0.006272292538356673

4. Standard Deviation 
    0.07919780639864132

5. Significance of the Regression 
    pvalue: 1.7265182348384377e-139

6. Relevance of the Regression
    0.9971526073277638

VG
1. Significance tests on Individual Variables
    p-value Bias      : 3.2273690263899853e-147
      p-value Kinematic : 5.730580151466907e-236
      p-value Geometric : 0.0
      p-value Inertial  : 1.1628066959545507e-241
      p-value Observer  : 2.3422411107265474e-44
      
      

2. A function or method that calculates the Pearson number
      Observer VS Inertial   : Statistics = 0.12198107336291035  pvalue = 0.08690459468332266
      Observer VS Geometric  : Statistics = 0.17519913369993184  pvalue = 0.013557203955629581
      Observer VS Kinematic  : Statistics = 0.10322658943843983  pvalue = 0.14784118487116096
      Inertial VS Geometric  : Statistics = 0.9183300308547001   pvalue = 7.951572627158216e-81
    