In [None]:

from sklearn import datasets, linear_model

from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [None]:
# Load the diabetes data set and unpack its feature matrix and target vector.
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# Show the dimensions of the feature matrix as the cell output.
X.shape

In [None]:
# Single-feature experiment: keep only column 6 of X, as a 2-D column
# (one row per sample, one column).
X_single = X[:, [6]]

# NOTE: despite its name, `ntrain` is the number of rows held out for testing:
# whatever remains after flooring 90% of the row count.
ntrain = X_single.shape[0] - int(X_single.shape[0] * .90)

# Ordered split (no shuffling): leading rows train, trailing `ntrain` rows test.
X_train, X_test = X_single[:-ntrain], X_single[-ntrain:]
y_train, y_test = y[:-ntrain], y[-ntrain:]

for split in (X_train, X_test):
    print(split.shape)

In [None]:
# Ordinary least-squares fit on the single selected feature. fit() returns the
# estimator itself, so construction and fitting are chained.
lin_reg = linear_model.LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows.
pred = lin_reg.predict(X_test)

In [None]:
# Held-out points with the fitted line drawn on top of them.
plt.scatter(X_test, y_test)
plt.plot(X_test[:, 0], pred)

# Report the fitted slope and the R^2 score on the held-out rows.
r_squared = r2_score(y_test, pred)
print('coeff', lin_reg.coef_)
print('R squared', r_squared)

In [None]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Cubic fit of a small toy data set, done two ways (sklearn vs. numpy) and
# overlaid on a single plot so the two fits can be compared directly.
#
# The toy data keeps its own names (x, y2, X_poly) so that this cell does not
# overwrite the diabetes X_test / y_test / pred created by the cells above.
x = np.array([2, 3, 4, 5, 6, 7, 7, 8, 9, 11, 12])
y2 = np.array([18, 16, 15, 17, 20, 23, 25, 28, 31, 30, 29])
X_poly = x.reshape(-1, 1)  # 2-D column, as the sklearn transformers/estimators take

# sklearn: expand x into polynomial features up to degree 3 (no bias column;
# LinearRegression fits its own intercept) and run a linear regression on them.
poly3 = PolynomialFeatures(degree=3, include_bias=False)
poly3_features = poly3.fit_transform(X_poly)
poly3_reg = linear_model.LinearRegression()
poly3_reg.fit(poly3_features, y2)

# numpy: least-squares polynomial of degree 3 on the same data.
coefs = np.polyfit(x, y2, 3)

# Evaluate BOTH fits on the same dense grid. Drawing one curve through the 11
# raw points and the other through 300 grid points makes the same cubic look
# like two different curves.
X_seq = np.linspace(X_poly.min(), X_poly.max(), 300).reshape(-1, 1)
pred_sklearn = poly3_reg.predict(poly3.transform(X_seq))
pred_numpy = np.polyval(coefs, X_seq)

plt.scatter(x, y2, label='data')
plt.plot(X_seq, pred_sklearn, label='sklearn')
plt.plot(X_seq, pred_numpy, color='purple', linestyle='--', label='numpy')
plt.legend()
plt.show()



In [None]:
# Multi-feature split: every column of X, ordered split with no shuffling.
# NOTE: despite its name, `ntrain` is the number of rows held out for testing:
# whatever remains after flooring 90% of the row count.
ntrain = X.shape[0] - int(X.shape[0] * .90)

# Leading rows train, trailing `ntrain` rows test.
X_train, X_test = X[:-ntrain], X[-ntrain:]
y_train, y_test = y[:-ntrain], y[-ntrain:]

for arr in (X_train, y_train, X_test, y_test):
    print(arr.shape)

In [None]:
# Linear regression on all features; fit() returns the estimator, so it is
# chained onto the constructor.
multi_reg = linear_model.LinearRegression().fit(X_train, y_train)

# Score the held-out rows.
pred = multi_reg.predict(X_test)
print('R squared', r2_score(y_test, pred))

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats
import pandas as pd

In [None]:
# Number of bins for the histogram of the training targets; kept as a named
# constant so other cells can bin y_train the same way.
NUM_BINS = 20

# Distribution of the training targets.
plt.hist(x=y_train, bins=NUM_BINS)


In [None]:
# Put the training features and the target side by side in one DataFrame.
# Each column is named 'X' + its positional index, so the target (appended
# last) becomes the highest-numbered column.
df_train = pd.DataFrame(np.hstack((X_train, y_train.reshape(-1, 1))))
df_train = df_train.rename(lambda col: f'X{col}', axis='columns')
print(df_train.head())

# Create some groups: the index (0 .. NUM_BINS-1) of the histogram bin that
# each training target falls into.
counts, edges = np.histogram(y_train, bins=NUM_BINS)

# Digitize against the interior edges only. np.histogram's last bin is closed
# on the right, but np.digitize(y, edges) - 1 would send the maximum value
# (== edges[-1]) to an extra group NUM_BINS. With edges[1:-1], everything below
# the first interior edge maps to 0 and everything at or above the last
# interior edge maps to NUM_BINS - 1, matching the histogram.
groups = np.digitize(y_train, edges[1:-1])

# Sanity check: group sizes must reproduce the histogram counts.
assert np.array_equal(np.bincount(groups, minlength=NUM_BINS), counts)

groups


In [None]:
# Formula regressing column X10 on columns X0..X9 of df_train.
mstring = 'X10 ~ {}'.format(' + '.join([f'X{i}' for i in range(10)]))
print(mstring)

# Mixed linear model with one group label per row of df_train.
# Alternative spelling via the formula API:
#   smf.mixedlm(mstring, data=df_train, groups=groups)
model = sm.MixedLM.from_formula(mstring, groups=groups, data=df_train)


In [None]:
# Fit the mixed model; `mdf` is the results object used by the following cells.
mdf = model.fit()

In [None]:
# Show the summary of the fitted mixed model as the cell output.
mdf.summary()

In [None]:
from statsmodels.tools.eval_measures import rmse

# In-sample predictions of the mixed model, including the per-group random
# effects.
#
# MixedLMResults.fittedvalues already combines the fixed-effects mean with the
# predicted random effects, row by row in the order of the training data.
# Do not add the random effects on top of it again: that counts them twice,
# and concatenating per-group contributions yields them in group order rather
# than row order, so they would not line up with the rows either.
yp = mdf.fittedvalues

yp


In [None]:
# Root-mean-squared error between the training targets and the predictions `yp`.
rmse(y_train, yp)