In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [1]:
from sklearn import datasets

# Conjunto de datos de diabetes
X, y = datasets.load_diabetes(return_X_y=True)

In [None]:
def data_split(X, y, train_size):
  # TODO: Particionar la data de tal forma que los primeros elementos
  #       dados por train_size formen parte de la data de entrenamiento
  #       y el resto forme parte de la data de prueba
  X_train, y_train = X[:train_size], y[:train_size]
  X_test, y_test = X[train_size:], y[train_size:]

  return X_train, y_train, X_test, y_test

In [None]:
print(X.shape, y.shape)

(442, 10) (442,)


In [None]:
X_train, y_train, X_test, y_test = data_split(X, y, 300)
print("Tamano de la data de entrenamiento:", X_train.shape, y_train.shape)
print("Tamano de la data de prueba:", X_test.shape, y_test.shape)

Tamano de la data de entrenamiento: (300, 10) (300,)
Tamano de la data de prueba: (142, 10) (142,)


In [None]:
# Definimos la función de pérdida
def mse(y_true, y_pred): #[n, 1], [n,1]
    error = (y_true - y_pred)**2 / 2   # [n,1]
    return error.mean()

In [None]:
# Definimos función para aumentar una columna de unos a la data
def add_intercept(X): # X -> [n,m]  n = samples, m = features
    n, m = X.shape
    ones_column = np.ones(n).reshape(-1, 1) # [n, 1]
    X_new = np.hstack([ones_column, X])
    return X_new

In [None]:
X.shape


(442, 10)

In [None]:
#Llamamos a la función de agregación de intercepto
X_train_ones = add_intercept(X_train)
print('Tamaño de los datos: ', X_train_ones.shape)

Tamaño de los datos:  (300, 11)


In [None]:
#Definimos la función para hallar la matriz de pesos analiticamente
def normal_equations(X, y):
    #pseudoinverse --> aproximación de la inversa (siempre da resultado)
    return np.linalg.pinv(X.T @ X) @ X.T @ y

In [None]:
# Hallamos los mejores parámetros usando la función normal_equations
w_norm_eq = normal_equations(X_train_ones, y_train)

print('Mejores parámetros de solución analítica: ')
print(w_norm_eq)

Mejores parámetros de solución analítica: 
[ 152.34779643  -16.57338609 -254.66343751  560.9894609   278.90965232
 -393.45557666   97.08855335  -18.9842756   169.46616165  632.96847103
  114.21833048]


In [None]:
#Definimos la función de predicción

def predict(X, w):
  return X @ w #[n, m + 1] @ [m + 1, 1] --> [n, 1]

In [None]:
#Hacemos la predicción para los datos de prueba
y_train_pred = predict(X_train_ones, w_norm_eq)

In [None]:
y_train_pred.shape

(300,)

In [None]:
print("MSE de entrenamiento: ", mse(y_train_pred, y_train))

MSE de entrenamiento:  1461.499563163104


In [None]:
# TODO: Adicionar columna de unos a la data de prueba
X_test_ones = add_intercept(X_test)

# TODO: Realizar predicciones usando los parámetros hallados
#       con la data de entrenamiento
y_test_pred = predict(X_test_ones, w_norm_eq)

In [None]:
print("MSE de prueba: ", mse(y_test_pred, y_test))

MSE de prueba:  1397.2935004171466


## Regresión Lineal usando scikit-learn

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Initializar LinearRegression
linear_regressor = LinearRegression()

In [None]:
# Entrenar modelo con x_train e y_train
linear_regressor.fit(X_train, y_train)

In [None]:
w_lr = linear_regressor.coef_
b_lr = linear_regressor.intercept_
print(f'Pesos: {w_lr}')
print(f'Intercept: {b_lr}')
# Mejores parámetros de solución analítica (Normal equations implementación desde 0):


Pesos: [ -16.57338609 -254.66343751  560.9894609   278.90965232 -393.45557666
   97.08855335  -18.9842756   169.46616165  632.96847103  114.21833048]
Intercept: 152.34779643323938


In [None]:
y_train_pred = linear_regressor.predict(X_train)

In [None]:
print(f"MSE de entrenamiento: {mse(y_train, y_train_pred):.3f}")

MSE de entrenamiento: 1461.500


In [None]:
y_test_pred = linear_regressor.predict(X_test)

In [None]:
print(f"MSE de prueba: {mse(y_test, y_test_pred):.3f}")

MSE de prueba: 1397.294


## Con train_test_split (TTS)

In [None]:
SEED = 42
np.random.seed(SEED)


In [None]:
# Importamos método de model_selection
from sklearn.model_selection import train_test_split

# Realizamos el particionamiento dado un porcentaje
_X_train1, X_test1, _y_train1, y_test1 = train_test_split(X, y, train_size=0.8, random_state= SEED)

print('Tamaño original del dataset: ', X.shape)
print('Tamaño de la data de entrenamiento: ', _X_train1.shape)
print('Tamaño de la data de prueba: ', X_test1.shape)

Tamaño original del dataset:  (442, 10)
Tamaño de la data de entrenamiento:  (353, 10)
Tamaño de la data de prueba:  (89, 10)


In [None]:
#len(_X_train1)
len(_y_train1)

353

In [None]:
# Initializar LinearRegression
linear_regressorTTS = LinearRegression()

In [None]:
# Entrenar modelo con x_train e y_train
linear_regressorTTS.fit(_X_train1,_y_train1)

In [None]:
w_lr = linear_regressorTTS.coef_
b_lr = linear_regressorTTS.intercept_
print(f'Pesos: {w_lr}')
print(f'Intercept: {b_lr}')
# Mejores parámetros de solución analítica (Normal equations implementación desde 0):


Pesos: [  37.90402135 -241.96436231  542.42875852  347.70384391 -931.48884588
  518.06227698  163.41998299  275.31790158  736.1988589    48.67065743]
Intercept: 151.34560453985995


In [None]:
y_train_pred1 = linear_regressorTTS.predict(_X_train1)

In [None]:
print(f"MSE de entrenamiento: {mse(_y_train1, y_train_pred1):.3f}")

MSE de entrenamiento: 1434.275


In [None]:
y_test_pred1 = linear_regressorTTS.predict(X_test1)

In [None]:
print(f"MSE de prueba: {mse(y_test1, y_test_pred1):.3f}")

MSE de prueba: 1450.097


In [None]:
p = X_train.shape[1]
p
n = len(y_test1)
n

89

## Usando R2, R2 Ajustado y la métrica Mean Squared Error

In [None]:
from sklearn.metrics import r2_score
import statsmodels.api as sm
r2 = r2_score(y_test1, y_test_pred1)
print("Coeficiente de determinación R^2:", r2)

Coeficiente de determinación R^2: 0.4526027629719195


In [None]:
# Calcular el R^2 tradicional
r2 = r2_score(y_test1, y_test_pred1)

# Calcular el R^2 ajustado manualmente
n = len(y_test1)  # número de observaciones
p = X_train.shape[1]  # número de predictores (características)

r2_ajustado = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print("Coeficiente de determinación ajustado R^2:", r2_ajustado)

Coeficiente de determinación ajustado R^2: 0.38242363001960145


In [None]:
from sklearn.metrics import mean_squared_error
m_s_e = (mean_squared_error(y_test1, y_test_pred1))/2
m_s_e

1450.096814246741

## Estadisticas del modelo

In [None]:
import statsmodels.api as sm

# Añadir un intercepto a las variables independientes
X_train_const = sm.add_constant(X_train)

# Ajustar el modelo de regresión lineal
model = sm.OLS(y_train, X_train_const).fit()

# Obtener un resumen completo del modelo
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.515
Model:                            OLS   Adj. R-squared:                  0.498
Method:                 Least Squares   F-statistic:                     30.65
Date:                Wed, 28 Aug 2024   Prob (F-statistic):           5.92e-40
Time:                        21:34:58   Log-Likelihood:                -1622.7
No. Observations:                 300   AIC:                             3267.
Df Residuals:                     289   BIC:                             3308.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        152.3478      3.196     47.671      0.0

In [None]:
import statsmodels.api as sm
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Cargar el conjunto de datos
diabetes = load_diabetes()
X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.Series(diabetes.target)

# Añadir un intercepto a las variables independientes
X = sm.add_constant(X)

# Ajustar el modelo de regresión lineal
model = sm.OLS(y, X).fit()

# Resumen del modelo (incluye ANOVA)
#anova_table = sm.stats.anova_lm(model, typ=2)
#print(anova_table)

##Evaluacion de premisas del modelo

In [None]:
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [None]:
# Evaluacion de Linealidad

# visualize the relationship between the features and the response using scatterplots
p = sns.pairplot(load_diabetes, x_vars=['s1'], y_vars='target', height=4)



Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [None]:
# Evaluacion de Homocedasticidad
import matplotlib.pyplot as plt

# Gráfico de los residuos
plt.scatter(y_test_pred, y_test - y_test_pred)
plt.axhline(y=0, color='r', linestyle='-')
plt.xlabel('Predicciones')
plt.ylabel('Residuos')
plt.title('Gráfico de los residuos')
plt.show()

In [None]:
# Evaluacion de Autocorrelacion de residuos
# Calcular los residuos
residuos = y_test - y_test_pred

# Gráfico ACF (Autocorrelation Function)
sm.graphics.tsa.plot_acf(residuos, lags=40)
plt.show()

In [None]:
# Evaluacion de Normalidad de Residuos
import statsmodels.api as sm

# Gráfico Q-Q de los residuos
sm.qqplot(residuos, line='45')
plt.title('Q-Q Plot de los Residuos')
plt.show()

In [None]:
import seaborn as sns

# Histograma de Residuos
sns.histplot(residuos, kde=True)
plt.xlabel('Residuos')
plt.ylabel('Frecuencia')
plt.title('Histograma de Residuos')
plt.show()

In [None]:
# Calcular los residuos
residuos = y_test - y_test_pred

# Calcular los residuos estandarizados
residuos_estandarizados = residuos / np.std(residuos)

In [None]:
# Gráfico de Residuos Estandarizados vs. Predicciones
plt.figure(figsize=(8, 6))
plt.scatter(y_test_pred, residuos_estandarizados)
plt.axhline(y=0, color='r', linestyle='--')
plt.axhline(y=2, color='g', linestyle='--')
plt.axhline(y=-2, color='g', linestyle='--')
plt.xlabel('Predicciones')
plt.ylabel('Residuos Estandarizados')
plt.title('Gráfico de Residuos Estandarizados vs. Predicciones')
plt.show()