# HW 4. Linear Regression

In [77]:
import numpy as np
import scipy as sci
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

## One-variable regression 

In [78]:
# Height/weight table: whitespace-delimited text with no header row, so the
# column names are supplied explicitly.
# NOTE(review): "Ib" in "weight, Ib" looks like a typo for "lb"; the label is
# kept unchanged because later cells select the column by this exact name.
cols = ["person", "height, inch", "weight, Ib"]
# Raw string: "\s" is not a valid Python string escape, so the non-raw literal
# triggers an invalid-escape-sequence warning. r"\s+" is the same regex.
df = pd.read_table("data/T3_1_HEIGHTWT.DAT", header=None, sep=r"\s+", names=cols)

In [79]:
# Conversion factors: inches -> centimetres, pounds -> kilograms.
CM_PER_INCH = 2.54
KG_PER_LB = 0.45359237

# Add metric versions of both measurements next to the imperial originals.
df['height, cm'] = df['height, inch'].mul(CM_PER_INCH)
df['weight, kg'] = df['weight, Ib'].mul(KG_PER_LB)

In [80]:
# Preview the first rows only instead of rendering the whole frame.
df.head(10)

Unnamed: 0,person,"height, inch","weight, Ib","height, cm","weight, kg"
0,1,69,153,175.26,69.399633
1,2,74,175,187.96,79.378665
2,3,68,155,172.72,70.306817
3,4,70,135,177.8,61.23497
4,5,72,172,182.88,78.017888
5,6,67,150,170.18,68.038855
6,7,66,115,167.64,52.163123
7,8,70,137,177.8,62.142155
8,9,76,200,193.04,90.718474
9,10,68,130,172.72,58.967008


In [81]:
# Predictor (height) and response (weight) for the one-variable regression,
# both taken from the original imperial columns.
# NOTE(review): the cm/kg columns added earlier are not used here — confirm
# that fitting on inches/pounds is intended.
x, y = df['height, inch'], df['weight, Ib']

In [82]:
# Prepend a constant column so the fitted model has an intercept term
# (reported as `const` in the summary), then fit ordinary least squares.
x = sm.add_constant(x, prepend=True)
model = sm.OLS(endog=y, exog=x).fit()

In [83]:
# Render the fitted model's summary tables (coefficients, fit statistics, residual diagnostics).
model.summary()

0,1,2,3
Dep. Variable:,"weight, Ib",R-squared:,0.791
Model:,OLS,Adj. R-squared:,0.779
Method:,Least Squares,F-statistic:,67.97
Date:,"Wed, 07 Jun 2023",Prob (F-statistic):,1.59e-07
Time:,17:15:53,Log-Likelihood:,-84.962
No. Observations:,20,AIC:,173.9
Df Residuals:,18,BIC:,175.9
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-467.0372,76.730,-6.087,0.000,-628.242,-305.833
"height, inch",8.8417,1.072,8.244,0.000,6.589,11.095

0,1,2,3
Omnibus:,1.247,Durbin-Watson:,2.49
Prob(Omnibus):,0.536,Jarque-Bera (JB):,0.993
Skew:,0.307,Prob(JB):,0.609
Kurtosis:,2.098,Cond. No.,1380.0


## Multivariate regression

In [84]:
# Diabetes table: whitespace-delimited text with no header row, so the column
# names are supplied explicitly. Note that this rebinds `cols` and `df`,
# replacing the height/weight frame loaded earlier in the notebook.
cols = [
    "patient",
    "rel weight",
    "fasting plasma glucose",
    "glucose intolerance",
    "insulin response",
    "insulin resistance",
]
# Raw string: "\s" is not a valid Python string escape, so the non-raw literal
# triggers an invalid-escape-sequence warning. r"\s+" is the same regex.
df = pd.read_table("data/T3_4_DIABETES.DAT", header=None, sep=r"\s+", names=cols)

In [85]:
# Preview the first rows only instead of rendering the whole frame.
df.head(10)

Unnamed: 0,patient,rel weight,fasting plasma glucose,glucose intolerance,insulin response,insulin resistance
0,1,0.81,80,356,124,55
1,2,0.95,97,289,117,76
2,3,0.94,105,319,143,105
3,4,1.04,90,356,199,108
4,5,1.0,90,323,240,143
5,6,0.76,86,381,157,165
6,7,0.91,100,350,221,119
7,8,1.1,85,301,186,105
8,9,0.99,97,379,142,98
9,10,0.78,97,296,131,94


In [86]:
# Response: relative weight. Predictors: the last three measurement columns
# (the fasting plasma glucose column is left out of this model).
y = df["rel weight"]
x = df[["glucose intolerance", "insulin response", "insulin resistance"]]

In [87]:
# Prepend a constant column so the fitted model includes an intercept term.
x = sm.add_constant(x, prepend=True)

In [88]:
# Ordinary least squares of the response `y` on the design matrix `x`.
model = sm.OLS(endog=y, exog=x).fit()

In [89]:
# Render the fitted model's summary tables (coefficients, fit statistics, residual diagnostics).
model.summary()

0,1,2,3
Dep. Variable:,rel weight,R-squared:,0.258
Model:,OLS,Adj. R-squared:,0.205
Method:,Least Squares,F-statistic:,4.877
Date:,"Wed, 07 Jun 2023",Prob (F-statistic):,0.00534
Time:,17:15:53,Log-Likelihood:,36.957
No. Observations:,46,AIC:,-65.91
Df Residuals:,42,BIC:,-58.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.6264,0.176,3.565,0.001,0.272,0.981
glucose intolerance,0.0009,0.001,1.729,0.091,-0.000,0.002
insulin response,-0.0010,0.000,-2.308,0.026,-0.002,-0.000
insulin resistance,0.0015,0.000,3.505,0.001,0.001,0.002

0,1,2,3
Omnibus:,2.632,Durbin-Watson:,2.18
Prob(Omnibus):,0.268,Jarque-Bera (JB):,1.883
Skew:,0.053,Prob(JB):,0.39
Kurtosis:,3.985,Cond. No.,4170.0


## Multivariate vector regression

In [95]:
from sklearn import metrics

In [90]:
# Two responses at once (relative weight and fasting plasma glucose), regressed
# on the remaining three measurement columns.
y = df[["rel weight", "fasting plasma glucose"]]
x = df[["glucose intolerance", "insulin response", "insulin resistance"]]

In [91]:
# Fit one linear model covering both response columns. The fit call stays the
# last expression so the cell still displays the fitted estimator.
model = LinearRegression()
model.fit(X=x, y=y)

LinearRegression()

In [92]:
# Fitted slope coefficients; with two responses and three predictors this is a
# 2 x 3 array (presumably one row per response column of `y` — confirm against
# the scikit-learn `coef_` documentation).
model.coef_

array([[ 0.00090822, -0.00095571,  0.00148974],
       [ 0.02870058, -0.01272444, -0.00440592]])

In [98]:
# In-sample evaluation: predictions are made on the same rows the model was
# fitted on, so these scores describe goodness of fit, not generalisation.
y_true, y_pred = y, model.predict(x)

# Aggregate scores. `y` has two columns, and these calls use scikit-learn's
# default multi-output handling, which averages each metric across the targets.
mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
mse = metrics.mean_squared_error(y_true, y_pred)
r2 = metrics.r2_score(y_true, y_pred)

print('r2: ', round(r2,4))
print('MAE: ', round(mean_absolute_error,4))
print('MSE: ', round(mse,4))
print('RMSE: ', round(np.sqrt(mse),4))

# Per-target breakdown. The two responses are on very different scales, so the
# averaged MAE/MSE/RMSE above are dominated by the larger-valued column;
# multioutput="raw_values" reports each response separately.
per_target = pd.DataFrame(
    {
        "r2": metrics.r2_score(y_true, y_pred, multioutput="raw_values"),
        "MAE": metrics.mean_absolute_error(y_true, y_pred, multioutput="raw_values"),
        "MSE": metrics.mean_squared_error(y_true, y_pred, multioutput="raw_values"),
    },
    index=y_true.columns,
)
per_target["RMSE"] = np.sqrt(per_target["MSE"])
per_target.round(4)

r2:  0.1373
MAE:  3.3378
MSE:  33.9562
RMSE:  5.8272
