# HW 4. Linear Regression

In [27]:
import numpy as np
import scipy as sci
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

## One-variable regression 

In [28]:
# Column labels for the whitespace-delimited height/weight table; the file is
# read with header=None, so the labels are supplied here.
# NOTE(review): "Ib" (capital i) is presumably a typo for "lb"; it is kept as-is
# because other cells index the frame by these exact labels.
cols = ["person", "height, inch", "weight, Ib"]
# Raw string so that "\s" reaches the regex engine untouched instead of being
# parsed as an (invalid) Python string escape, which triggers a warning on
# recent Python versions.
df = pd.read_table("data/T3_1_HEIGHTWT.DAT", header=None, sep=r"\s+", names=cols)

In [29]:
# Conversion factors for the derived metric columns.
CM_PER_INCH = 2.54
KG_PER_POUND = 0.45359237

# Add metric versions of the height and weight columns next to the originals.
df['height, cm'] = df['height, inch'] * CM_PER_INCH
df['weight, kg'] = df['weight, Ib'] * KG_PER_POUND

In [30]:
# Display the frame to check the parsed columns and the derived metric columns.
df

Unnamed: 0,person,"height, inch","weight, Ib","height, cm","weight, kg"
0,1,69,153,175.26,69.399633
1,2,74,175,187.96,79.378665
2,3,68,155,172.72,70.306817
3,4,70,135,177.8,61.23497
4,5,72,172,182.88,78.017888
5,6,67,150,170.18,68.038855
6,7,66,115,167.64,52.163123
7,8,70,137,177.8,62.142155
8,9,76,200,193.04,90.718474
9,10,68,130,172.72,58.967008


In [31]:
# Predictor (height in inches) and response (weight, "Ib" column) for the
# one-variable regression.
x, y = df['height, inch'], df['weight, Ib']

In [32]:
# Put an intercept column in front of the predictor, then fit ordinary least
# squares of y on the resulting design matrix.
x = sm.add_constant(x, prepend=True)
model = sm.OLS(endog=y, exog=x).fit()

In [33]:
# Show the fitted model's summary tables (fit statistics, coefficients, diagnostics).
model.summary()

0,1,2,3
Dep. Variable:,"weight, Ib",R-squared:,0.791
Model:,OLS,Adj. R-squared:,0.779
Method:,Least Squares,F-statistic:,67.97
Date:,"Thu, 08 Jun 2023",Prob (F-statistic):,1.59e-07
Time:,10:00:52,Log-Likelihood:,-84.962
No. Observations:,20,AIC:,173.9
Df Residuals:,18,BIC:,175.9
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-467.0372,76.730,-6.087,0.000,-628.242,-305.833
"height, inch",8.8417,1.072,8.244,0.000,6.589,11.095

0,1,2,3
Omnibus:,1.247,Durbin-Watson:,2.49
Prob(Omnibus):,0.536,Jarque-Bera (JB):,0.993
Skew:,0.307,Prob(JB):,0.609
Kurtosis:,2.098,Cond. No.,1380.0


## Multivariate regression

In [34]:
# Short column codes used in the analysis and what each one stands for:
#   patient - patient
#   y1      - rel weight
#   y2      - fasting plasma glucose
#   x1      - glucose intolerance
#   x2      - insulin response
#   x3      - insulin resistance
# (The descriptive names used to be assigned to `cols` and were immediately
# overwritten by the short codes, so they are kept here as a legend only.)
cols = ["patient", "y1", "y2", "x1", "x2", "x3"]
# Raw string so that "\s" reaches the regex engine untouched instead of being
# parsed as an (invalid) Python string escape, which triggers a warning on
# recent Python versions.
df = pd.read_table("data/T3_4_DIABETES.DAT", header=None, sep=r"\s+", names=cols)

In [35]:
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,patient,y1,y2,x1,x2,x3
0,1,0.81,80,356,124,55
1,2,0.95,97,289,117,76
2,3,0.94,105,319,143,105
3,4,1.04,90,356,199,108
4,5,1.0,90,323,240,143


In [36]:
# Response: y1 (second column). Predictors: x1 through the last column (x1, x2, x3).
y = df['y1']
x = df.loc[:, 'x1':]

In [37]:
# Add a constant column so the fitted model includes an intercept term.
x = sm.add_constant(x)

In [38]:
# Ordinary least squares fit of the response y on the design matrix x.
model = sm.OLS(endog=y, exog=x).fit()

In [39]:
# Show the fitted model's summary tables (fit statistics, coefficients, diagnostics).
model.summary()

0,1,2,3
Dep. Variable:,y1,R-squared:,0.258
Model:,OLS,Adj. R-squared:,0.205
Method:,Least Squares,F-statistic:,4.877
Date:,"Thu, 08 Jun 2023",Prob (F-statistic):,0.00534
Time:,10:00:56,Log-Likelihood:,36.957
No. Observations:,46,AIC:,-65.91
Df Residuals:,42,BIC:,-58.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.6264,0.176,3.565,0.001,0.272,0.981
x1,0.0009,0.001,1.729,0.091,-0.000,0.002
x2,-0.0010,0.000,-2.308,0.026,-0.002,-0.000
x3,0.0015,0.000,3.505,0.001,0.001,0.002

0,1,2,3
Omnibus:,2.632,Durbin-Watson:,2.18
Prob(Omnibus):,0.268,Jarque-Bera (JB):,1.883
Skew:,0.053,Prob(JB):,0.39
Kurtosis:,3.985,Cond. No.,4170.0


## Multivariate vector regression

In [40]:
from sklearn import metrics

In [41]:
# Two responses (y1, y2) to be regressed jointly on the predictors x1, x2, x3.
y = df.loc[:, 'y1':'y2']
x = df.loc[:, 'x1':]

In [42]:
# Linear model with both response columns fitted in a single call.
# The fit(...) call is left as the last expression so the cell still shows
# the estimator's repr.
model = LinearRegression()
model.fit(X=x, y=y)

LinearRegression()

In [43]:
# Fitted coefficients: one row per response (y1, y2), one column per predictor (x1, x2, x3).
model.coef_

array([[ 0.00090822, -0.00095571,  0.00148974],
       [ 0.02870058, -0.01272444, -0.00440592]])

In [44]:
# In-sample evaluation: predictions are made on the same x the model was fitted on.
y_true = y
y_pred = model.predict(x)

# Each score is computed over both response columns together, using the
# metric functions' default handling of multi-column targets.
# NOTE(review): y1 and y2 appear to be on very different scales (see the data
# preview), so the pooled MAE/MSE are likely dominated by y2 - confirm whether
# per-response scores are wanted.
r2 = metrics.r2_score(y_true, y_pred)
mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
mse = metrics.mean_squared_error(y_true, y_pred)

# Report every score rounded to four decimals.
for label, score in (
    ('r2: ', r2),
    ('MAE: ', mean_absolute_error),
    ('MSE: ', mse),
    ('RMSE: ', np.sqrt(mse)),
):
    print(label, round(score, 4))

r2:  0.1373
MAE:  3.3378
MSE:  33.9562
RMSE:  5.8272


In [45]:
# Re-select the responses (columns 1-2) and predictors (columns 3 onward).
# NOTE(review): these are the same slices taken for the sklearn fit, and the
# formula-based MANOVA that follows reads its variables from df directly, so
# this cell looks redundant - confirm before removing.
y = df.iloc[:, 1:3]
x = df.iloc[:, 3:]

In [52]:
# Multivariate linear model given by formula: responses y1 and y2 on
# predictors x1, x2 and x3, with variables taken from df.
model = sm.MANOVA.from_formula("y1+y2~x1+x2+x3", data=df)

In [53]:
# Print the multivariate test tables (Wilks' lambda, Pillai's trace,
# Hotelling-Lawley trace, Roy's greatest root) reported per model term.
print(model.mv_test())

                 Multivariate linear model
                                                            
------------------------------------------------------------
       Intercept        Value  Num DF  Den DF F Value Pr > F
------------------------------------------------------------
          Wilks' lambda 0.4891 2.0000 41.0000 21.4134 0.0000
         Pillai's trace 0.5109 2.0000 41.0000 21.4134 0.0000
 Hotelling-Lawley trace 1.0446 2.0000 41.0000 21.4134 0.0000
    Roy's greatest root 1.0446 2.0000 41.0000 21.4134 0.0000
------------------------------------------------------------
                                                            
------------------------------------------------------------
           x1           Value  Num DF  Den DF F Value Pr > F
------------------------------------------------------------
          Wilks' lambda 0.9310 2.0000 41.0000  1.5188 0.2310
         Pillai's trace 0.0690 2.0000 41.0000  1.5188 0.2310
 Hotelling-Lawley trace 0.0741 2.0000 41.0