In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import scipy . linalg as lng # linear algebra from scipy library
from scipy . spatial import distance # load distance function
from sklearn import preprocessing as preproc # load preprocessing function
import math
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.datasets import make_regression

# seaborn can be used to "prettify" default matplotlib plots by importing and setting as default
import seaborn as sns
sns.set() # Set searborn as default

In [2]:
data = pd.read_csv('case1Data.csv')
y = data['y']
X = data.drop('y', axis=1)
# Get number of observations (n) and number of independent variables (p)
[n, p] = np.shape(X)

print('Missing values in X:', X.isnull().sum().sum())

Missing values in X: 1489


In [3]:
# impute missing values with mean
X_imputed_mean = X.fillna(X.mean())
#X_imputed_mean = X.fillna(X.median())


#imputer = IterativeImputer(max_iter=10, random_state=0)
#X_imputed_mean = imputer.fit_transform(X)
#print("\nImputed Data:\n", X_imputed[:5])


In [4]:
def ols_solver(X, y):
    betas, res, rnk, s = lng.lstsq(X, y)
    
    return betas, res, rnk, s

def ols_analytical(X, y):
    inner_product = np.linalg.inv(X.T @ X)
    outer_product = X.T @ y
    betas = inner_product @ outer_product
    return betas

In [5]:
# linear solver
beta_solv, res, rnk, s = ols_solver(X_imputed_mean, y)
yhat_solv = X_imputed_mean @ beta_solv

In [6]:
# numerical solution
beta_ana = ols_analytical(X_imputed_mean,y)
yhat_num = X_imputed_mean @ beta_ana

In [7]:
# Include offset / intercept
off = np.ones(n) 
M = np.c_[off, X_imputed_mean]

# linear solver
beta_solv, res_solv, rnk, s = ols_solver(M, y)
yhat_solv = M @ beta_solv

# analytical solution
beta_ana = ols_analytical(M,y)
yhat_ana = M @ beta_ana


In [8]:
# Manual calculation of residuals for solver
res_solv_man = (y - yhat_solv) ** 2    
mse_solv_man = np.mean(res_solv_man)

# Manual calculation of residuals for analytical
res_ana = (y - yhat_ana) ** 2    
mse_ana = np.mean(res_ana)

print(f'mse from lstsq: {res_solv/len(y)}') # lng.lstsq returns sum of squared residuals, so we divide by num of obs to get mean
print(f'mse from solver: {mse_solv_man}')
print(f'mse from nummerical: {mse_ana}')

mse from lstsq: []
mse from solver: 1.559893301548539e-24
mse from nummerical: 913344.1361461278


mean:
mse from lstsq: []
mse from solver: 1.559893301548539e-24
mse from nummerical: 913344.1361461278

median:
mse from lstsq: []
mse from solver: 4.275424157906148e-25
mse from nummerical: 275387.5226166055

iterative:
mse from lstsq: []
mse from solver: 2.933700066389306e-20
mse from nummerical: 119649.32292734919



In [9]:
# analytical version
rss = np.sum(res_ana)
print(f'RSS: {rss}')
tss = np.sum((y - np.mean(y))** 2)
print(f'TSS: {tss}')
r2 = (1 - rss / tss) * 100
print(f'R2: {r2}')

RSS: 91334413.61461279
TSS: 520031.4508322642
R2: -17463.24804364047
