In [1]:
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# Matplotlib and seaborn for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Linear Regression to verify implementation
from sklearn.linear_model import LinearRegression

# Scipy for statistics, ArviZ for Bayesian analysis diagnostics
import scipy
import arviz as az

# PyMC3 for Bayesian Inference
# import pymc3 as pm

In [2]:
# Load the dataset from a space-delimited text file that has no header row,
# so the column names are supplied explicitly.
# NOTE: the spellings below ('lenght_of_stay', 'avelbl_services') are the
# actual DataFrame keys used by this notebook; they are kept as-is on purpose.
columns = [
    'lenght_of_stay',
    'age',
    'infection_risk',
    'routine_culturing_ratio',
    'routine_xray_ratio',
    'num_beds',
    'med_school_affil',
    'region',
    'avg_census',
    'num_nurses',
    'avelbl_services',
]
# NOTE(review): the displayed head shows an extra leading, unnamed column used
# as the row index -- presumably each row carries one more field than there are
# names, in which case pandas uses the leading field as the index. Confirm
# against the raw file.
df = pd.read_csv(
    "data/dataset.txt",
    sep=" ",
    header=None,
    names=columns,
)
df.head()

Unnamed: 0,lenght_of_stay,age,infection_risk,routine_culturing_ratio,routine_xray_ratio,num_beds,med_school_affil,region,avg_census,num_nurses,avelbl_services
1,7.13,55.7,4.1,9.0,39.6,279,2,4,207,241,60.0
2,8.82,58.2,1.6,3.8,51.7,80,2,2,51,52,40.0
3,8.34,56.9,2.7,8.1,74.0,107,2,3,82,54,20.0
4,8.95,53.7,5.6,18.9,122.8,147,2,4,53,148,40.0
5,11.2,56.5,5.7,34.5,88.9,180,2,1,134,151,40.0


In [3]:
# Separating covariates and target
# Select/drop instead of `df.pop(...)` so that `df` is left untouched. The cell
# is then idempotent: re-running it always rebuilds both `X` and `Y` from the
# loaded frame, and a missing target column raises a KeyError right here
# instead of leaving `Y` undefined (or stale) for later cells.
Y = df["infection_risk"].astype(float)
X = df.drop(columns=["infection_risk"]).astype(float)

In [4]:
# Z-score each covariate column: subtract the column mean first, then divide
# by the standard deviation of the centered column (pandas' default `std`).
X = X.sub(X.mean(), axis="columns")
X = X.div(X.std(), axis="columns")

# N = number of rows (observations), D = number of columns (covariates)
N = X.shape[0]
D = X.shape[1]
print("N:", N, "D:", D)
X

N: 113 D: 10


Unnamed: 0,lenght_of_stay,age,routine_culturing_ratio,routine_xray_ratio,num_beds,med_school_affil,region,avg_census,num_nurses,avelbl_services
1,-1.317487,0.553196,-0.663714,-2.170455,0.139139,0.418947,1.621862,0.101641,0.486497,1.107879
2,-0.433344,1.113532,-1.171789,-1.545579,-0.892791,0.418947,-0.359440,-0.912930,-0.870624,-0.207836
3,-0.684462,0.822157,-0.751650,-0.393947,-0.752780,0.418947,0.631211,-0.711316,-0.856263,-1.523551
4,-0.365333,0.104927,0.303583,2.126216,-0.545357,0.418947,1.621862,-0.899922,-0.181293,-0.207836
5,0.811780,0.732503,1.827808,0.375529,-0.374233,0.418947,-1.350091,-0.373126,-0.159751,-0.207836
...,...,...,...,...,...,...,...,...,...,...
109,1.125677,0.127340,-0.653943,1.821524,1.653326,-2.365816,-0.359440,1.623498,2.123659,1.298657
110,-0.077595,-0.881265,2.560609,-0.554039,-0.799450,0.418947,0.631211,-0.802368,-0.913707,-1.332773
111,-1.019285,0.822157,-0.351053,-0.708967,-0.638697,0.418947,1.621862,-0.691805,-0.267459,1.298657
112,4.337888,0.665263,1.036383,0.525293,3.022318,-2.365816,-1.350091,3.899779,1.678466,1.298657


In [5]:
# Reference least-squares fit (scikit-learn) on the standardized covariates.
linReg = LinearRegression()
# `fit` returns the fitted estimator; reassigning keeps the cell output empty.
linReg = linReg.fit(X, Y)

In [8]:
# Show the fitted intercept. Because the covariates were mean-centered before
# fitting, the least-squares intercept equals the sample mean of Y.
linReg.intercept_

4.354867256637168

In [6]:
# Show the fitted slope coefficients: one value per covariate column of X, in
# column order. They are on the standardized scale, since X was z-scored.
linReg.coef_

array([ 0.46303086,  0.04495977,  0.54698301,  0.24460538, -0.61192755,
        0.2018794 ,  0.30036656,  0.43496863,  0.28744209,  0.35375799])